summary | refs | log | tree | commit | diff
path: root/kernel
diff options
context:
space:
mode:
author Ingo Molnar <mingo@kernel.org> 2024-02-14 10:45:07 +0100
committer Ingo Molnar <mingo@kernel.org> 2024-02-14 10:45:07 +0100
commit 03c11eb3b16dc0058589751dfd91f254be2be613 (patch)
tree e5f2889212fec0bb0babdce9abd781ab487e246a /kernel
parent de8c6a352131f642b82474abe0cbb5dd26a7e081 (diff)
parent 841c35169323cd833294798e58b9bf63fa4fa1de (diff)
Merge tag 'v6.8-rc4' into x86/percpu, to resolve conflicts and refresh the branch
Conflicts:
arch/x86/include/asm/percpu.h
arch/x86/include/asm/text-patching.h

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Kconfig.kexec6
-rw-r--r--kernel/Makefile1
-rw-r--r--kernel/acct.c4
-rw-r--r--kernel/async.c90
-rw-r--r--kernel/audit.c31
-rw-r--r--kernel/audit_tree.c6
-rw-r--r--kernel/audit_watch.c9
-rw-r--r--kernel/auditsc.c10
-rw-r--r--kernel/bpf/arraymap.c93
-rw-r--r--kernel/bpf/bpf_cgrp_storage.c6
-rw-r--r--kernel/bpf/bpf_iter.c8
-rw-r--r--kernel/bpf/bpf_local_storage.c49
-rw-r--r--kernel/bpf/bpf_lsm.c12
-rw-r--r--kernel/bpf/bpf_struct_ops.c57
-rw-r--r--kernel/bpf/btf.c550
-rw-r--r--kernel/bpf/cgroup.c66
-rw-r--r--kernel/bpf/cgroup_iter.c65
-rw-r--r--kernel/bpf/core.c97
-rw-r--r--kernel/bpf/cpumap.c10
-rw-r--r--kernel/bpf/cpumask.c26
-rw-r--r--kernel/bpf/devmap.c10
-rw-r--r--kernel/bpf/dispatcher.c7
-rw-r--r--kernel/bpf/hashtab.c20
-rw-r--r--kernel/bpf/helpers.c212
-rw-r--r--kernel/bpf/inode.c56
-rw-r--r--kernel/bpf/log.c504
-rw-r--r--kernel/bpf/lpm_trie.c3
-rw-r--r--kernel/bpf/map_in_map.c17
-rw-r--r--kernel/bpf/map_in_map.h2
-rw-r--r--kernel/bpf/map_iter.c6
-rw-r--r--kernel/bpf/memalloc.c154
-rw-r--r--kernel/bpf/mprog.c13
-rw-r--r--kernel/bpf/offload.c30
-rw-r--r--kernel/bpf/queue_stack_maps.c21
-rw-r--r--kernel/bpf/ringbuf.c3
-rw-r--r--kernel/bpf/stackmap.c13
-rw-r--r--kernel/bpf/syscall.c208
-rw-r--r--kernel/bpf/task_iter.c287
-rw-r--r--kernel/bpf/tcx.c12
-rw-r--r--kernel/bpf/tnum.c13
-rw-r--r--kernel/bpf/trampoline.c110
-rw-r--r--kernel/bpf/verifier.c4189
-rw-r--r--kernel/cgroup/cgroup-internal.h4
-rw-r--r--kernel/cgroup/cgroup-v1.c41
-rw-r--r--kernel/cgroup/cgroup.c155
-rw-r--r--kernel/cgroup/cpuset.c1489
-rw-r--r--kernel/cgroup/legacy_freezer.c8
-rw-r--r--kernel/cgroup/rstat.c159
-rw-r--r--kernel/configs/debug.config2
-rw-r--r--kernel/configs/hardening.config98
-rw-r--r--kernel/configs/kvm_guest.config1
-rw-r--r--kernel/configs/nopm.config2
-rw-r--r--kernel/configs/rust.config1
-rw-r--r--kernel/configs/x86_debug.config1
-rw-r--r--kernel/configs/xen.config2
-rw-r--r--kernel/cpu.c58
-rw-r--r--kernel/crash_core.c306
-rw-r--r--kernel/cred.c271
-rw-r--r--kernel/debug/debug_core.c3
-rw-r--r--kernel/debug/kdb/kdb_io.c2
-rw-r--r--kernel/debug/kdb/kdb_main.c9
-rw-r--r--kernel/dma/Kconfig13
-rw-r--r--kernel/dma/coherent.c4
-rw-r--r--kernel/dma/contiguous.c5
-rw-r--r--kernel/dma/debug.c27
-rw-r--r--kernel/dma/direct.c78
-rw-r--r--kernel/dma/direct.h1
-rw-r--r--kernel/dma/mapping.c22
-rw-r--r--kernel/dma/pool.c10
-rw-r--r--kernel/dma/swiotlb.c169
-rw-r--r--kernel/entry/common.c108
-rw-r--r--kernel/events/core.c309
-rw-r--r--kernel/events/ring_buffer.c12
-rw-r--r--kernel/events/uprobes.c10
-rw-r--r--kernel/exit.c153
-rw-r--r--kernel/exit.h30
-rw-r--r--kernel/fork.c107
-rw-r--r--kernel/freezer.c42
-rw-r--r--kernel/futex/core.c111
-rw-r--r--kernel/futex/futex.h106
-rw-r--r--kernel/futex/pi.c96
-rw-r--r--kernel/futex/requeue.c26
-rw-r--r--kernel/futex/syscalls.c235
-rw-r--r--kernel/futex/waitwake.c130
-rw-r--r--kernel/gcov/fs.c2
-rw-r--r--kernel/groups.c2
-rw-r--r--kernel/irq/debugfs.c1
-rw-r--r--kernel/irq/generic-chip.c31
-rw-r--r--kernel/irq/irqdesc.c2
-rw-r--r--kernel/irq/manage.c15
-rw-r--r--kernel/irq/matrix.c6
-rw-r--r--kernel/irq/msi.c12
-rw-r--r--kernel/kcmp.c4
-rw-r--r--kernel/kcsan/kcsan_test.c9
-rw-r--r--kernel/kcsan/selftest.c9
-rw-r--r--kernel/kexec.c2
-rw-r--r--kernel/kexec_core.c37
-rw-r--r--kernel/kexec_file.c20
-rw-r--r--kernel/kprobes.c97
-rw-r--r--kernel/kthread.c19
-rw-r--r--kernel/livepatch/core.c2
-rw-r--r--kernel/locking/lock_events.c10
-rw-r--r--kernel/locking/lockdep.c3
-rw-r--r--kernel/locking/lockdep_proc.c2
-rw-r--r--kernel/locking/locktorture.c214
-rw-r--r--kernel/locking/mutex.c8
-rw-r--r--kernel/locking/osq_lock.c37
-rw-r--r--kernel/locking/rtmutex.c37
-rw-r--r--kernel/locking/rwbase_rt.c8
-rw-r--r--kernel/locking/rwsem.c8
-rw-r--r--kernel/locking/spinlock_debug.c1
-rw-r--r--kernel/locking/spinlock_rt.c6
-rw-r--r--kernel/locking/test-ww_mutex.c48
-rw-r--r--kernel/locking/ww_rt_mutex.c2
-rw-r--r--kernel/module/Kconfig25
-rw-r--r--kernel/module/decompress.c8
-rw-r--r--kernel/module/dups.c2
-rw-r--r--kernel/module/main.c3
-rw-r--r--kernel/module/stats.c2
-rw-r--r--kernel/module/sysfs.c2
-rw-r--r--kernel/numa.c26
-rw-r--r--kernel/padata.c8
-rw-r--r--kernel/panic.c23
-rw-r--r--kernel/params.c52
-rw-r--r--kernel/pid.c4
-rw-r--r--kernel/pid_namespace.c7
-rw-r--r--kernel/power/hibernate.c28
-rw-r--r--kernel/power/main.c16
-rw-r--r--kernel/power/power.h6
-rw-r--r--kernel/power/snapshot.c36
-rw-r--r--kernel/power/swap.c86
-rw-r--r--kernel/printk/Makefile2
-rw-r--r--kernel/printk/internal.h33
-rw-r--r--kernel/printk/nbcon.c1029
-rw-r--r--kernel/printk/printk.c379
-rw-r--r--kernel/printk/printk_ringbuffer.c2
-rw-r--r--kernel/printk/printk_safe.c9
-rw-r--r--kernel/ptrace.c143
-rw-r--r--kernel/rcu/Kconfig.debug25
-rw-r--r--kernel/rcu/rcu.h21
-rw-r--r--kernel/rcu/rcu_segcblist.c4
-rw-r--r--kernel/rcu/rcutorture.c43
-rw-r--r--kernel/rcu/refscale.c6
-rw-r--r--kernel/rcu/srcutiny.c1
-rw-r--r--kernel/rcu/srcutree.c92
-rw-r--r--kernel/rcu/tasks.h47
-rw-r--r--kernel/rcu/tiny.c1
-rw-r--r--kernel/rcu/tree.c342
-rw-r--r--kernel/rcu/tree.h4
-rw-r--r--kernel/rcu/tree_exp.h9
-rw-r--r--kernel/rcu/tree_nocb.h19
-rw-r--r--kernel/rcu/tree_stall.h144
-rw-r--r--kernel/rcu/update.c15
-rw-r--r--kernel/reboot.c55
-rw-r--r--kernel/relay.c162
-rw-r--r--kernel/resource.c112
-rw-r--r--kernel/sched/build_utility.c1
-rw-r--r--kernel/sched/core.c881
-rw-r--r--kernel/sched/cpudeadline.c2
-rw-r--r--kernel/sched/cpufreq_schedutil.c163
-rw-r--r--kernel/sched/cpupri.c1
-rw-r--r--kernel/sched/deadline.c552
-rw-r--r--kernel/sched/debug.c25
-rw-r--r--kernel/sched/fair.c1038
-rw-r--r--kernel/sched/features.h2
-rw-r--r--kernel/sched/idle.c35
-rw-r--r--kernel/sched/pelt.c2
-rw-r--r--kernel/sched/pelt.h4
-rw-r--r--kernel/sched/psi.c58
-rw-r--r--kernel/sched/rt.c110
-rw-r--r--kernel/sched/sched.h189
-rw-r--r--kernel/sched/stop_task.c17
-rw-r--r--kernel/sched/topology.c215
-rw-r--r--kernel/sched/wait.c60
-rw-r--r--kernel/seccomp.c2
-rw-r--r--kernel/signal.c110
-rw-r--r--kernel/smp.c39
-rw-r--r--kernel/smpboot.c3
-rw-r--r--kernel/stacktrace.c4
-rw-r--r--kernel/sys.c127
-rw-r--r--kernel/sys_ni.c22
-rw-r--r--kernel/sysctl.c11
-rw-r--r--kernel/task_work.c1
-rw-r--r--kernel/taskstats.c5
-rw-r--r--kernel/time/alarmtimer.c11
-rw-r--r--kernel/time/clocksource.c25
-rw-r--r--kernel/time/hrtimer.c36
-rw-r--r--kernel/time/posix-clock.c36
-rw-r--r--kernel/time/posix-stubs.c45
-rw-r--r--kernel/time/tick-internal.h3
-rw-r--r--kernel/time/tick-sched.c251
-rw-r--r--kernel/time/timer.c110
-rw-r--r--kernel/torture.c75
-rw-r--r--kernel/trace/bpf_trace.c216
-rw-r--r--kernel/trace/fprobe.c36
-rw-r--r--kernel/trace/ftrace.c110
-rw-r--r--kernel/trace/rethook.c113
-rw-r--r--kernel/trace/ring_buffer.c968
-rw-r--r--kernel/trace/ring_buffer_benchmark.c10
-rw-r--r--kernel/trace/synth_event_gen_test.c11
-rw-r--r--kernel/trace/trace.c657
-rw-r--r--kernel/trace/trace.h21
-rw-r--r--kernel/trace/trace_boot.c2
-rw-r--r--kernel/trace/trace_eprobe.c5
-rw-r--r--kernel/trace/trace_events.c464
-rw-r--r--kernel/trace/trace_events_filter.c3
-rw-r--r--kernel/trace/trace_events_hist.c72
-rw-r--r--kernel/trace/trace_events_inject.c3
-rw-r--r--kernel/trace/trace_events_synth.c8
-rw-r--r--kernel/trace/trace_events_trigger.c6
-rw-r--r--kernel/trace/trace_events_user.c98
-rw-r--r--kernel/trace/trace_fprobe.c9
-rw-r--r--kernel/trace/trace_kprobe.c103
-rw-r--r--kernel/trace/trace_osnoise.c6
-rw-r--r--kernel/trace/trace_output.c8
-rw-r--r--kernel/trace/trace_probe.c32
-rw-r--r--kernel/trace/trace_probe.h4
-rw-r--r--kernel/trace/trace_seq.c9
-rw-r--r--kernel/trace/trace_syscalls.c4
-rw-r--r--kernel/trace/trace_uprobe.c2
-rw-r--r--kernel/trace/tracing_map.c7
-rw-r--r--kernel/up.c2
-rw-r--r--kernel/user.c13
-rw-r--r--kernel/user_namespace.c25
-rw-r--r--kernel/watch_queue.c4
-rw-r--r--kernel/watchdog.c47
-rw-r--r--kernel/workqueue.c256
227 files changed, 14959 insertions, 7835 deletions
diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec
index 9bfe68fe9676..946dffa048b7 100644
--- a/kernel/Kconfig.kexec
+++ b/kernel/Kconfig.kexec
@@ -36,6 +36,8 @@ config KEXEC
config KEXEC_FILE
bool "Enable kexec file based system call"
depends on ARCH_SUPPORTS_KEXEC_FILE
+ select CRYPTO
+ select CRYPTO_SHA256
select KEXEC_CORE
help
This is new version of kexec system call. This system call is
@@ -94,10 +96,8 @@ config KEXEC_JUMP
config CRASH_DUMP
bool "kernel crash dumps"
depends on ARCH_SUPPORTS_CRASH_DUMP
- depends on ARCH_SUPPORTS_KEXEC
select CRASH_CORE
select KEXEC_CORE
- select KEXEC
help
Generate crash dump after being started by kexec.
This should be normally only set in special crash dump kernels
@@ -110,7 +110,7 @@ config CRASH_DUMP
For more details see Documentation/admin-guide/kdump/kdump.rst
For s390, this option also enables zfcpdump.
- See also <file:Documentation/s390/zfcpdump.rst>
+ See also <file:Documentation/arch/s390/zfcpdump.rst>
config CRASH_HOTPLUG
bool "Update the crash elfcorehdr on system configuration changes"
diff --git a/kernel/Makefile b/kernel/Makefile
index 3947122d618b..ce105a5558fc 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -114,6 +114,7 @@ obj-$(CONFIG_SHADOW_CALL_STACK) += scs.o
obj-$(CONFIG_HAVE_STATIC_CALL) += static_call.o
obj-$(CONFIG_HAVE_STATIC_CALL_INLINE) += static_call_inline.o
obj-$(CONFIG_CFI_CLANG) += cfi.o
+obj-$(CONFIG_NUMA) += numa.o
obj-$(CONFIG_PERF_EVENTS) += events/
diff --git a/kernel/acct.c b/kernel/acct.c
index 1a9f929fe629..986c8214dabf 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -246,7 +246,7 @@ static int acct_on(struct filename *pathname)
filp_close(file, NULL);
return PTR_ERR(internal);
}
- err = __mnt_want_write(internal);
+ err = mnt_get_write_access(internal);
if (err) {
mntput(internal);
kfree(acct);
@@ -271,7 +271,7 @@ static int acct_on(struct filename *pathname)
old = xchg(&ns->bacct, &acct->pin);
mutex_unlock(&acct->lock);
pin_kill(old);
- __mnt_drop_write(mnt);
+ mnt_put_write_access(mnt);
mntput(mnt);
return 0;
}
diff --git a/kernel/async.c b/kernel/async.c
index b2c4ba5686ee..97f224a5257b 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -46,11 +46,12 @@ asynchronous and synchronous parts of the kernel.
#include <linux/async.h>
#include <linux/atomic.h>
-#include <linux/ktime.h>
#include <linux/export.h>
-#include <linux/wait.h>
+#include <linux/ktime.h>
+#include <linux/pid.h>
#include <linux/sched.h>
#include <linux/slab.h>
+#include <linux/wait.h>
#include <linux/workqueue.h>
#include "workqueue_internal.h"
@@ -145,6 +146,39 @@ static void async_run_entry_fn(struct work_struct *work)
wake_up(&async_done);
}
+static async_cookie_t __async_schedule_node_domain(async_func_t func,
+ void *data, int node,
+ struct async_domain *domain,
+ struct async_entry *entry)
+{
+ async_cookie_t newcookie;
+ unsigned long flags;
+
+ INIT_LIST_HEAD(&entry->domain_list);
+ INIT_LIST_HEAD(&entry->global_list);
+ INIT_WORK(&entry->work, async_run_entry_fn);
+ entry->func = func;
+ entry->data = data;
+ entry->domain = domain;
+
+ spin_lock_irqsave(&async_lock, flags);
+
+ /* allocate cookie and queue */
+ newcookie = entry->cookie = next_cookie++;
+
+ list_add_tail(&entry->domain_list, &domain->pending);
+ if (domain->registered)
+ list_add_tail(&entry->global_list, &async_global_pending);
+
+ atomic_inc(&entry_count);
+ spin_unlock_irqrestore(&async_lock, flags);
+
+ /* schedule for execution */
+ queue_work_node(node, system_unbound_wq, &entry->work);
+
+ return newcookie;
+}
+
/**
* async_schedule_node_domain - NUMA specific version of async_schedule_domain
* @func: function to execute asynchronously
@@ -186,29 +220,8 @@ async_cookie_t async_schedule_node_domain(async_func_t func, void *data,
func(data, newcookie);
return newcookie;
}
- INIT_LIST_HEAD(&entry->domain_list);
- INIT_LIST_HEAD(&entry->global_list);
- INIT_WORK(&entry->work, async_run_entry_fn);
- entry->func = func;
- entry->data = data;
- entry->domain = domain;
-
- spin_lock_irqsave(&async_lock, flags);
-
- /* allocate cookie and queue */
- newcookie = entry->cookie = next_cookie++;
- list_add_tail(&entry->domain_list, &domain->pending);
- if (domain->registered)
- list_add_tail(&entry->global_list, &async_global_pending);
-
- atomic_inc(&entry_count);
- spin_unlock_irqrestore(&async_lock, flags);
-
- /* schedule for execution */
- queue_work_node(node, system_unbound_wq, &entry->work);
-
- return newcookie;
+ return __async_schedule_node_domain(func, data, node, domain, entry);
}
EXPORT_SYMBOL_GPL(async_schedule_node_domain);
@@ -232,6 +245,35 @@ async_cookie_t async_schedule_node(async_func_t func, void *data, int node)
EXPORT_SYMBOL_GPL(async_schedule_node);
/**
+ * async_schedule_dev_nocall - A simplified variant of async_schedule_dev()
+ * @func: function to execute asynchronously
+ * @dev: device argument to be passed to function
+ *
+ * @dev is used as both the argument for the function and to provide NUMA
+ * context for where to run the function.
+ *
+ * If the asynchronous execution of @func is scheduled successfully, return
+ * true. Otherwise, do nothing and return false, unlike async_schedule_dev()
+ * that will run the function synchronously then.
+ */
+bool async_schedule_dev_nocall(async_func_t func, struct device *dev)
+{
+ struct async_entry *entry;
+
+ entry = kzalloc(sizeof(struct async_entry), GFP_KERNEL);
+
+ /* Give up if there is no memory or too much work. */
+ if (!entry || atomic_read(&entry_count) > MAX_WORK) {
+ kfree(entry);
+ return false;
+ }
+
+ __async_schedule_node_domain(func, dev, dev_to_node(dev),
+ &async_dfl_domain, entry);
+ return true;
+}
+
+/**
* async_synchronize_full - synchronize all asynchronous function calls
*
* This function waits until all asynchronous function calls have been done.
diff --git a/kernel/audit.c b/kernel/audit.c
index 16205dd29843..9c8e5f732c4c 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -487,15 +487,19 @@ static void auditd_conn_free(struct rcu_head *rcu)
* @pid: auditd PID
* @portid: auditd netlink portid
* @net: auditd network namespace pointer
+ * @skb: the netlink command from the audit daemon
+ * @ack: netlink ack flag, cleared if ack'd here
*
* Description:
* This function will obtain and drop network namespace references as
* necessary. Returns zero on success, negative values on failure.
*/
-static int auditd_set(struct pid *pid, u32 portid, struct net *net)
+static int auditd_set(struct pid *pid, u32 portid, struct net *net,
+ struct sk_buff *skb, bool *ack)
{
unsigned long flags;
struct auditd_connection *ac_old, *ac_new;
+ struct nlmsghdr *nlh;
if (!pid || !net)
return -EINVAL;
@@ -507,6 +511,13 @@ static int auditd_set(struct pid *pid, u32 portid, struct net *net)
ac_new->portid = portid;
ac_new->net = get_net(net);
+ /* send the ack now to avoid a race with the queue backlog */
+ if (*ack) {
+ nlh = nlmsg_hdr(skb);
+ netlink_ack(skb, nlh, 0, NULL);
+ *ack = false;
+ }
+
spin_lock_irqsave(&auditd_conn_lock, flags);
ac_old = rcu_dereference_protected(auditd_conn,
lockdep_is_held(&auditd_conn_lock));
@@ -1200,7 +1211,8 @@ static int audit_replace(struct pid *pid)
return auditd_send_unicast_skb(skb);
}
-static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
+static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
+ bool *ack)
{
u32 seq;
void *data;
@@ -1293,7 +1305,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
/* register a new auditd connection */
err = auditd_set(req_pid,
NETLINK_CB(skb).portid,
- sock_net(NETLINK_CB(skb).sk));
+ sock_net(NETLINK_CB(skb).sk),
+ skb, ack);
if (audit_enabled != AUDIT_OFF)
audit_log_config_change("audit_pid",
new_pid,
@@ -1538,9 +1551,10 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
* Parse the provided skb and deal with any messages that may be present,
* malformed skbs are discarded.
*/
-static void audit_receive(struct sk_buff *skb)
+static void audit_receive(struct sk_buff *skb)
{
struct nlmsghdr *nlh;
+ bool ack;
/*
* len MUST be signed for nlmsg_next to be able to dec it below 0
* if the nlmsg_len was not aligned
@@ -1553,9 +1567,12 @@ static void audit_receive(struct sk_buff *skb)
audit_ctl_lock();
while (nlmsg_ok(nlh, len)) {
- err = audit_receive_msg(skb, nlh);
- /* if err or if this message says it wants a response */
- if (err || (nlh->nlmsg_flags & NLM_F_ACK))
+ ack = nlh->nlmsg_flags & NLM_F_ACK;
+ err = audit_receive_msg(skb, nlh, &ack);
+
+ /* send an ack if the user asked for one and audit_receive_msg
+ * didn't already do it, or if there was an error. */
+ if (ack || err)
netlink_ack(skb, nlh, err, NULL);
nlh = nlmsg_next(nlh, &len);
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index e867c17d3f84..1b07e6f12a07 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -34,7 +34,7 @@ struct audit_chunk {
struct list_head list;
struct audit_tree *owner;
unsigned index; /* index; upper bit indicates 'will prune' */
- } owners[];
+ } owners[] __counted_by(count);
};
struct audit_tree_mark {
@@ -87,8 +87,8 @@ static struct task_struct *prune_thread;
* that makes a difference. Some.
*/
-static struct fsnotify_group *audit_tree_group;
-static struct kmem_cache *audit_tree_mark_cachep __read_mostly;
+static struct fsnotify_group *audit_tree_group __ro_after_init;
+static struct kmem_cache *audit_tree_mark_cachep __ro_after_init;
static struct audit_tree *alloc_tree(const char *s)
{
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 65075f1e4ac8..7a98cd176a12 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -527,11 +527,18 @@ int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark)
unsigned long ino;
dev_t dev;
- exe_file = get_task_exe_file(tsk);
+ /* only do exe filtering if we are recording @current events/records */
+ if (tsk != current)
+ return 0;
+
+ if (!current->mm)
+ return 0;
+ exe_file = get_mm_exe_file(current->mm);
if (!exe_file)
return 0;
ino = file_inode(exe_file)->i_ino;
dev = file_inode(exe_file)->i_sb->s_dev;
fput(exe_file);
+
return audit_mark_compare(mark, ino, dev);
}
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index b0cb7631e48b..6f0d6fb6523f 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -143,6 +143,8 @@ static const struct audit_nfcfgop_tab audit_nfcfgs[] = {
{ AUDIT_NFT_OP_OBJ_RESET, "nft_reset_obj" },
{ AUDIT_NFT_OP_FLOWTABLE_REGISTER, "nft_register_flowtable" },
{ AUDIT_NFT_OP_FLOWTABLE_UNREGISTER, "nft_unregister_flowtable" },
+ { AUDIT_NFT_OP_SETELEM_RESET, "nft_reset_setelem" },
+ { AUDIT_NFT_OP_RULE_RESET, "nft_reset_rule" },
{ AUDIT_NFT_OP_INVALID, "nft_invalid" },
};
@@ -2210,7 +2212,7 @@ __audit_reusename(const __user char *uptr)
if (!n->name)
continue;
if (n->name->uptr == uptr) {
- n->name->refcnt++;
+ atomic_inc(&n->name->refcnt);
return n->name;
}
}
@@ -2239,7 +2241,7 @@ void __audit_getname(struct filename *name)
n->name = name;
n->name_len = AUDIT_NAME_FULL;
name->aname = n;
- name->refcnt++;
+ atomic_inc(&name->refcnt);
}
static inline int audit_copy_fcaps(struct audit_names *name,
@@ -2371,7 +2373,7 @@ out_alloc:
return;
if (name) {
n->name = name;
- name->refcnt++;
+ atomic_inc(&name->refcnt);
}
out:
@@ -2498,7 +2500,7 @@ void __audit_inode_child(struct inode *parent,
if (found_parent) {
found_child->name = found_parent->name;
found_child->name_len = AUDIT_NAME_FULL;
- found_child->name->refcnt++;
+ atomic_inc(&found_child->name->refcnt);
}
}
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 2058e89b5ddd..0bdbbbeab155 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -867,11 +867,11 @@ int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file,
}
if (old_ptr)
- map->ops->map_fd_put_ptr(old_ptr);
+ map->ops->map_fd_put_ptr(map, old_ptr, true);
return 0;
}
-static long fd_array_map_delete_elem(struct bpf_map *map, void *key)
+static long __fd_array_map_delete_elem(struct bpf_map *map, void *key, bool need_defer)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
void *old_ptr;
@@ -890,13 +890,18 @@ static long fd_array_map_delete_elem(struct bpf_map *map, void *key)
}
if (old_ptr) {
- map->ops->map_fd_put_ptr(old_ptr);
+ map->ops->map_fd_put_ptr(map, old_ptr, need_defer);
return 0;
} else {
return -ENOENT;
}
}
+static long fd_array_map_delete_elem(struct bpf_map *map, void *key)
+{
+ return __fd_array_map_delete_elem(map, key, true);
+}
+
static void *prog_fd_array_get_ptr(struct bpf_map *map,
struct file *map_file, int fd)
{
@@ -913,8 +918,9 @@ static void *prog_fd_array_get_ptr(struct bpf_map *map,
return prog;
}
-static void prog_fd_array_put_ptr(void *ptr)
+static void prog_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool need_defer)
{
+ /* bpf_prog is freed after one RCU or tasks trace grace period */
bpf_prog_put(ptr);
}
@@ -924,13 +930,13 @@ static u32 prog_fd_array_sys_lookup_elem(void *ptr)
}
/* decrement refcnt of all bpf_progs that are stored in this map */
-static void bpf_fd_array_map_clear(struct bpf_map *map)
+static void bpf_fd_array_map_clear(struct bpf_map *map, bool need_defer)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
int i;
for (i = 0; i < array->map.max_entries; i++)
- fd_array_map_delete_elem(map, &i);
+ __fd_array_map_delete_elem(map, &i, need_defer);
}
static void prog_array_map_seq_show_elem(struct bpf_map *map, void *key,
@@ -1012,11 +1018,16 @@ static void prog_array_map_poke_untrack(struct bpf_map *map,
mutex_unlock(&aux->poke_mutex);
}
+void __weak bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke,
+ struct bpf_prog *new, struct bpf_prog *old)
+{
+ WARN_ON_ONCE(1);
+}
+
static void prog_array_map_poke_run(struct bpf_map *map, u32 key,
struct bpf_prog *old,
struct bpf_prog *new)
{
- u8 *old_addr, *new_addr, *old_bypass_addr;
struct prog_poke_elem *elem;
struct bpf_array_aux *aux;
@@ -1025,7 +1036,7 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key,
list_for_each_entry(elem, &aux->poke_progs, list) {
struct bpf_jit_poke_descriptor *poke;
- int i, ret;
+ int i;
for (i = 0; i < elem->aux->size_poke_tab; i++) {
poke = &elem->aux->poke_tab[i];
@@ -1044,21 +1055,10 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key,
* activated, so tail call updates can arrive from here
* while JIT is still finishing its final fixup for
* non-activated poke entries.
- * 3) On program teardown, the program's kallsym entry gets
- * removed out of RCU callback, but we can only untrack
- * from sleepable context, therefore bpf_arch_text_poke()
- * might not see that this is in BPF text section and
- * bails out with -EINVAL. As these are unreachable since
- * RCU grace period already passed, we simply skip them.
- * 4) Also programs reaching refcount of zero while patching
+ * 3) Also programs reaching refcount of zero while patching
* is in progress is okay since we're protected under
* poke_mutex and untrack the programs before the JIT
- * buffer is freed. When we're still in the middle of
- * patching and suddenly kallsyms entry of the program
- * gets evicted, we just skip the rest which is fine due
- * to point 3).
- * 5) Any other error happening below from bpf_arch_text_poke()
- * is a unexpected bug.
+ * buffer is freed.
*/
if (!READ_ONCE(poke->tailcall_target_stable))
continue;
@@ -1068,39 +1068,7 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key,
poke->tail_call.key != key)
continue;
- old_bypass_addr = old ? NULL : poke->bypass_addr;
- old_addr = old ? (u8 *)old->bpf_func + poke->adj_off : NULL;
- new_addr = new ? (u8 *)new->bpf_func + poke->adj_off : NULL;
-
- if (new) {
- ret = bpf_arch_text_poke(poke->tailcall_target,
- BPF_MOD_JUMP,
- old_addr, new_addr);
- BUG_ON(ret < 0 && ret != -EINVAL);
- if (!old) {
- ret = bpf_arch_text_poke(poke->tailcall_bypass,
- BPF_MOD_JUMP,
- poke->bypass_addr,
- NULL);
- BUG_ON(ret < 0 && ret != -EINVAL);
- }
- } else {
- ret = bpf_arch_text_poke(poke->tailcall_bypass,
- BPF_MOD_JUMP,
- old_bypass_addr,
- poke->bypass_addr);
- BUG_ON(ret < 0 && ret != -EINVAL);
- /* let other CPUs finish the execution of program
- * so that it will not possible to expose them
- * to invalid nop, stack unwind, nop state
- */
- if (!ret)
- synchronize_rcu();
- ret = bpf_arch_text_poke(poke->tailcall_target,
- BPF_MOD_JUMP,
- old_addr, NULL);
- BUG_ON(ret < 0 && ret != -EINVAL);
- }
+ bpf_arch_poke_desc_update(poke, new, old);
}
}
}
@@ -1109,7 +1077,7 @@ static void prog_array_map_clear_deferred(struct work_struct *work)
{
struct bpf_map *map = container_of(work, struct bpf_array_aux,
work)->map;
- bpf_fd_array_map_clear(map);
+ bpf_fd_array_map_clear(map, true);
bpf_map_put(map);
}
@@ -1189,7 +1157,7 @@ static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file,
{
struct bpf_event_entry *ee;
- ee = kzalloc(sizeof(*ee), GFP_ATOMIC);
+ ee = kzalloc(sizeof(*ee), GFP_KERNEL);
if (ee) {
ee->event = perf_file->private_data;
ee->perf_file = perf_file;
@@ -1239,8 +1207,9 @@ err_out:
return ee;
}
-static void perf_event_fd_array_put_ptr(void *ptr)
+static void perf_event_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool need_defer)
{
+ /* bpf_perf_event is freed after one RCU grace period */
bpf_event_entry_free_rcu(ptr);
}
@@ -1258,7 +1227,7 @@ static void perf_event_fd_array_release(struct bpf_map *map,
for (i = 0; i < array->map.max_entries; i++) {
ee = READ_ONCE(array->ptrs[i]);
if (ee && ee->map_file == map_file)
- fd_array_map_delete_elem(map, &i);
+ __fd_array_map_delete_elem(map, &i, true);
}
rcu_read_unlock();
}
@@ -1266,7 +1235,7 @@ static void perf_event_fd_array_release(struct bpf_map *map,
static void perf_event_fd_array_map_free(struct bpf_map *map)
{
if (map->map_flags & BPF_F_PRESERVE_ELEMS)
- bpf_fd_array_map_clear(map);
+ bpf_fd_array_map_clear(map, false);
fd_array_map_free(map);
}
@@ -1294,7 +1263,7 @@ static void *cgroup_fd_array_get_ptr(struct bpf_map *map,
return cgroup_get_from_fd(fd);
}
-static void cgroup_fd_array_put_ptr(void *ptr)
+static void cgroup_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool need_defer)
{
/* cgroup_put free cgrp after a rcu grace period */
cgroup_put(ptr);
@@ -1302,7 +1271,7 @@ static void cgroup_fd_array_put_ptr(void *ptr)
static void cgroup_fd_array_free(struct bpf_map *map)
{
- bpf_fd_array_map_clear(map);
+ bpf_fd_array_map_clear(map, false);
fd_array_map_free(map);
}
@@ -1347,7 +1316,7 @@ static void array_of_map_free(struct bpf_map *map)
* is protected by fdget/fdput.
*/
bpf_map_meta_free(map->inner_map_meta);
- bpf_fd_array_map_clear(map);
+ bpf_fd_array_map_clear(map, false);
fd_array_map_free(map);
}
diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c
index d44fe8dd9732..28efd0a3f220 100644
--- a/kernel/bpf/bpf_cgrp_storage.c
+++ b/kernel/bpf/bpf_cgrp_storage.c
@@ -82,7 +82,7 @@ static void *bpf_cgrp_storage_lookup_elem(struct bpf_map *map, void *key)
int fd;
fd = *(int *)key;
- cgroup = cgroup_get_from_fd(fd);
+ cgroup = cgroup_v1v2_get_from_fd(fd);
if (IS_ERR(cgroup))
return ERR_CAST(cgroup);
@@ -101,7 +101,7 @@ static long bpf_cgrp_storage_update_elem(struct bpf_map *map, void *key,
int fd;
fd = *(int *)key;
- cgroup = cgroup_get_from_fd(fd);
+ cgroup = cgroup_v1v2_get_from_fd(fd);
if (IS_ERR(cgroup))
return PTR_ERR(cgroup);
@@ -131,7 +131,7 @@ static long bpf_cgrp_storage_delete_elem(struct bpf_map *map, void *key)
int err, fd;
fd = *(int *)key;
- cgroup = cgroup_get_from_fd(fd);
+ cgroup = cgroup_v1v2_get_from_fd(fd);
if (IS_ERR(cgroup))
return PTR_ERR(cgroup);
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index 96856f130cbf..0fae79164187 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -782,9 +782,7 @@ struct bpf_iter_num_kern {
int end; /* final value, exclusive */
} __aligned(8);
-__diag_push();
-__diag_ignore_all("-Wmissing-prototypes",
- "Global functions as their definitions will be in vmlinux BTF");
+__bpf_kfunc_start_defs();
__bpf_kfunc int bpf_iter_num_new(struct bpf_iter_num *it, int start, int end)
{
@@ -793,8 +791,6 @@ __bpf_kfunc int bpf_iter_num_new(struct bpf_iter_num *it, int start, int end)
BUILD_BUG_ON(sizeof(struct bpf_iter_num_kern) != sizeof(struct bpf_iter_num));
BUILD_BUG_ON(__alignof__(struct bpf_iter_num_kern) != __alignof__(struct bpf_iter_num));
- BTF_TYPE_EMIT(struct btf_iter_num);
-
/* start == end is legit, it's an empty range and we'll just get NULL
* on first (and any subsequent) bpf_iter_num_next() call
*/
@@ -845,4 +841,4 @@ __bpf_kfunc void bpf_iter_num_destroy(struct bpf_iter_num *it)
s->cur = s->end = 0;
}
-__diag_pop();
+__bpf_kfunc_end_defs();
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index b5149cfce7d4..146824cc9689 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -553,7 +553,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
void *value, u64 map_flags, gfp_t gfp_flags)
{
struct bpf_local_storage_data *old_sdata = NULL;
- struct bpf_local_storage_elem *selem = NULL;
+ struct bpf_local_storage_elem *alloc_selem, *selem = NULL;
struct bpf_local_storage *local_storage;
unsigned long flags;
int err;
@@ -607,11 +607,12 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
}
}
- if (gfp_flags == GFP_KERNEL) {
- selem = bpf_selem_alloc(smap, owner, value, true, gfp_flags);
- if (!selem)
- return ERR_PTR(-ENOMEM);
- }
+ /* A lookup has just been done before and concluded a new selem is
+ * needed. The chance of an unnecessary alloc is unlikely.
+ */
+ alloc_selem = selem = bpf_selem_alloc(smap, owner, value, true, gfp_flags);
+ if (!alloc_selem)
+ return ERR_PTR(-ENOMEM);
raw_spin_lock_irqsave(&local_storage->lock, flags);
@@ -623,13 +624,13 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
* simple.
*/
err = -EAGAIN;
- goto unlock_err;
+ goto unlock;
}
old_sdata = bpf_local_storage_lookup(local_storage, smap, false);
err = check_flags(old_sdata, map_flags);
if (err)
- goto unlock_err;
+ goto unlock;
if (old_sdata && (map_flags & BPF_F_LOCK)) {
copy_map_value_locked(&smap->map, old_sdata->data, value,
@@ -638,23 +639,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
goto unlock;
}
- if (gfp_flags != GFP_KERNEL) {
- /* local_storage->lock is held. Hence, we are sure
- * we can unlink and uncharge the old_sdata successfully
- * later. Hence, instead of charging the new selem now
- * and then uncharge the old selem later (which may cause
- * a potential but unnecessary charge failure), avoid taking
- * a charge at all here (the "!old_sdata" check) and the
- * old_sdata will not be uncharged later during
- * bpf_selem_unlink_storage_nolock().
- */
- selem = bpf_selem_alloc(smap, owner, value, !old_sdata, gfp_flags);
- if (!selem) {
- err = -ENOMEM;
- goto unlock_err;
- }
- }
-
+ alloc_selem = NULL;
/* First, link the new selem to the map */
bpf_selem_link_map(smap, selem);
@@ -665,20 +650,16 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
if (old_sdata) {
bpf_selem_unlink_map(SELEM(old_sdata));
bpf_selem_unlink_storage_nolock(local_storage, SELEM(old_sdata),
- false, false);
+ true, false);
}
unlock:
raw_spin_unlock_irqrestore(&local_storage->lock, flags);
- return SDATA(selem);
-
-unlock_err:
- raw_spin_unlock_irqrestore(&local_storage->lock, flags);
- if (selem) {
+ if (alloc_selem) {
mem_uncharge(smap, owner, smap->elem_size);
- bpf_selem_free(selem, smap, true);
+ bpf_selem_free(alloc_selem, smap, true);
}
- return ERR_PTR(err);
+ return err ? ERR_PTR(err) : SDATA(selem);
}
static u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache)
@@ -779,7 +760,7 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage)
* of the loop will set the free_cgroup_storage to true.
*/
free_storage = bpf_selem_unlink_storage_nolock(
- local_storage, selem, false, true);
+ local_storage, selem, true, true);
}
raw_spin_unlock_irqrestore(&local_storage->lock, flags);
diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index e14c822f8911..e8e910395bf6 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -298,6 +298,18 @@ BTF_ID(func, bpf_lsm_kernel_module_request)
BTF_ID(func, bpf_lsm_kernel_read_file)
BTF_ID(func, bpf_lsm_kernfs_init_security)
+#ifdef CONFIG_SECURITY_PATH
+BTF_ID(func, bpf_lsm_path_unlink)
+BTF_ID(func, bpf_lsm_path_mkdir)
+BTF_ID(func, bpf_lsm_path_rmdir)
+BTF_ID(func, bpf_lsm_path_truncate)
+BTF_ID(func, bpf_lsm_path_symlink)
+BTF_ID(func, bpf_lsm_path_link)
+BTF_ID(func, bpf_lsm_path_rename)
+BTF_ID(func, bpf_lsm_path_chmod)
+BTF_ID(func, bpf_lsm_path_chown)
+#endif /* CONFIG_SECURITY_PATH */
+
#ifdef CONFIG_KEYS
BTF_ID(func, bpf_lsm_key_free)
#endif /* CONFIG_KEYS */
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index fdc3e8705a3c..02068bd0e4d9 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -352,18 +352,24 @@ const struct bpf_link_ops bpf_struct_ops_link_lops = {
int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
struct bpf_tramp_link *link,
const struct btf_func_model *model,
- void *image, void *image_end)
+ void *stub_func, void *image, void *image_end)
{
- u32 flags;
+ u32 flags = BPF_TRAMP_F_INDIRECT;
+ int size;
tlinks[BPF_TRAMP_FENTRY].links[0] = link;
tlinks[BPF_TRAMP_FENTRY].nr_links = 1;
- /* BPF_TRAMP_F_RET_FENTRY_RET is only used by bpf_struct_ops,
- * and it must be used alone.
- */
- flags = model->ret_size > 0 ? BPF_TRAMP_F_RET_FENTRY_RET : 0;
+
+ if (model->ret_size > 0)
+ flags |= BPF_TRAMP_F_RET_FENTRY_RET;
+
+ size = arch_bpf_trampoline_size(model, flags, tlinks, NULL);
+ if (size < 0)
+ return size;
+ if (size > (unsigned long)image_end - (unsigned long)image)
+ return -E2BIG;
return arch_prepare_bpf_trampoline(NULL, image, image_end,
- model, flags, tlinks, NULL);
+ model, flags, tlinks, stub_func);
}
static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
@@ -497,11 +503,12 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
err = bpf_struct_ops_prepare_trampoline(tlinks, link,
&st_ops->func_models[i],
+ *(void **)(st_ops->cfi_stubs + moff),
image, image_end);
if (err < 0)
goto reset_unlock;
- *(void **)(kdata + moff) = image;
+ *(void **)(kdata + moff) = image + cfi_get_offset();
image += err;
/* put prog_id to udata */
@@ -515,7 +522,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
if (err)
goto reset_unlock;
}
- set_memory_rox((long)st_map->image, 1);
+ arch_protect_bpf_trampoline(st_map->image, PAGE_SIZE);
/* Let bpf_link handle registration & unregistration.
*
* Pair with smp_load_acquire() during lookup_elem().
@@ -524,7 +531,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
goto unlock;
}
- set_memory_rox((long)st_map->image, 1);
+ arch_protect_bpf_trampoline(st_map->image, PAGE_SIZE);
err = st_ops->reg(kdata);
if (likely(!err)) {
/* This refcnt increment on the map here after
@@ -547,8 +554,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
* there was a race in registering the struct_ops (under the same name) to
* a sub-system through different struct_ops's maps.
*/
- set_memory_nx((long)st_map->image, 1);
- set_memory_rw((long)st_map->image, 1);
+ arch_unprotect_bpf_trampoline(st_map->image, PAGE_SIZE);
reset_unlock:
bpf_struct_ops_map_put_progs(st_map);
@@ -615,7 +621,10 @@ static void __bpf_struct_ops_map_free(struct bpf_map *map)
if (st_map->links)
bpf_struct_ops_map_put_progs(st_map);
bpf_map_area_free(st_map->links);
- bpf_jit_free_exec(st_map->image);
+ if (st_map->image) {
+ arch_free_bpf_trampoline(st_map->image, PAGE_SIZE);
+ bpf_jit_uncharge_modmem(PAGE_SIZE);
+ }
bpf_map_area_free(st_map->uvalue);
bpf_map_area_free(st_map);
}
@@ -657,6 +666,7 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
struct bpf_struct_ops_map *st_map;
const struct btf_type *t, *vt;
struct bpf_map *map;
+ int ret;
st_ops = bpf_struct_ops_find_value(attr->btf_vmlinux_value_type_id);
if (!st_ops)
@@ -681,18 +691,32 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
st_map->st_ops = st_ops;
map = &st_map->map;
+ ret = bpf_jit_charge_modmem(PAGE_SIZE);
+ if (ret) {
+ __bpf_struct_ops_map_free(map);
+ return ERR_PTR(ret);
+ }
+
+ st_map->image = arch_alloc_bpf_trampoline(PAGE_SIZE);
+ if (!st_map->image) {
+ /* __bpf_struct_ops_map_free() uses st_map->image as flag
+ * for "charged or not". In this case, we need to uncharge
+ * here.
+ */
+ bpf_jit_uncharge_modmem(PAGE_SIZE);
+ __bpf_struct_ops_map_free(map);
+ return ERR_PTR(-ENOMEM);
+ }
st_map->uvalue = bpf_map_area_alloc(vt->size, NUMA_NO_NODE);
st_map->links =
bpf_map_area_alloc(btf_type_vlen(t) * sizeof(struct bpf_links *),
NUMA_NO_NODE);
- st_map->image = bpf_jit_alloc_exec(PAGE_SIZE);
- if (!st_map->uvalue || !st_map->links || !st_map->image) {
+ if (!st_map->uvalue || !st_map->links) {
__bpf_struct_ops_map_free(map);
return ERR_PTR(-ENOMEM);
}
mutex_init(&st_map->lock);
- set_vm_flush_reset_perms(st_map->image);
bpf_map_init_from_attr(map, attr);
return map;
@@ -907,4 +931,3 @@ err_out:
kfree(link);
return err;
}
-
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 1095bbe29859..596471189176 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -3293,6 +3293,8 @@ static int btf_find_kptr(const struct btf *btf, const struct btf_type *t,
type = BPF_KPTR_UNREF;
else if (!strcmp("kptr", __btf_name_by_offset(btf, t->name_off)))
type = BPF_KPTR_REF;
+ else if (!strcmp("percpu_kptr", __btf_name_by_offset(btf, t->name_off)))
+ type = BPF_KPTR_PERCPU;
else
return -EINVAL;
@@ -3308,10 +3310,10 @@ static int btf_find_kptr(const struct btf *btf, const struct btf_type *t,
return BTF_FIELD_FOUND;
}
-static const char *btf_find_decl_tag_value(const struct btf *btf,
- const struct btf_type *pt,
- int comp_idx, const char *tag_key)
+const char *btf_find_decl_tag_value(const struct btf *btf, const struct btf_type *pt,
+ int comp_idx, const char *tag_key)
{
+ const char *value = NULL;
int i;
for (i = 1; i < btf_nr_types(btf); i++) {
@@ -3325,9 +3327,14 @@ static const char *btf_find_decl_tag_value(const struct btf *btf,
continue;
if (strncmp(__btf_name_by_offset(btf, t->name_off), tag_key, len))
continue;
- return __btf_name_by_offset(btf, t->name_off) + len;
+ /* Prevent duplicate entries for same type */
+ if (value)
+ return ERR_PTR(-EEXIST);
+ value = __btf_name_by_offset(btf, t->name_off) + len;
}
- return NULL;
+ if (!value)
+ return ERR_PTR(-ENOENT);
+ return value;
}
static int
@@ -3345,7 +3352,7 @@ btf_find_graph_root(const struct btf *btf, const struct btf_type *pt,
if (t->size != sz)
return BTF_FIELD_IGNORE;
value_type = btf_find_decl_tag_value(btf, pt, comp_idx, "contains:");
- if (!value_type)
+ if (IS_ERR(value_type))
return -EINVAL;
node_field_name = strstr(value_type, ":");
if (!node_field_name)
@@ -3457,6 +3464,7 @@ static int btf_find_struct_field(const struct btf *btf,
break;
case BPF_KPTR_UNREF:
case BPF_KPTR_REF:
+ case BPF_KPTR_PERCPU:
ret = btf_find_kptr(btf, member_type, off, sz,
idx < info_cnt ? &info[idx] : &tmp);
if (ret < 0)
@@ -3523,6 +3531,7 @@ static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t,
break;
case BPF_KPTR_UNREF:
case BPF_KPTR_REF:
+ case BPF_KPTR_PERCPU:
ret = btf_find_kptr(btf, var_type, off, sz,
idx < info_cnt ? &info[idx] : &tmp);
if (ret < 0)
@@ -3783,6 +3792,7 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type
break;
case BPF_KPTR_UNREF:
case BPF_KPTR_REF:
+ case BPF_KPTR_PERCPU:
ret = btf_parse_kptr(btf, &rec->fields[i], &info_arr[i]);
if (ret < 0)
goto end;
@@ -3830,9 +3840,6 @@ end:
return ERR_PTR(ret);
}
-#define GRAPH_ROOT_MASK (BPF_LIST_HEAD | BPF_RB_ROOT)
-#define GRAPH_NODE_MASK (BPF_LIST_NODE | BPF_RB_NODE)
-
int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec)
{
int i;
@@ -3845,13 +3852,13 @@ int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec)
* Hence we only need to ensure that bpf_{list_head,rb_root} ownership
* does not form cycles.
*/
- if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & GRAPH_ROOT_MASK))
+ if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & BPF_GRAPH_ROOT))
return 0;
for (i = 0; i < rec->cnt; i++) {
struct btf_struct_meta *meta;
u32 btf_id;
- if (!(rec->fields[i].type & GRAPH_ROOT_MASK))
+ if (!(rec->fields[i].type & BPF_GRAPH_ROOT))
continue;
btf_id = rec->fields[i].graph_root.value_btf_id;
meta = btf_find_struct_meta(btf, btf_id);
@@ -3863,7 +3870,7 @@ int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec)
* to check ownership cycle for a type unless it's also a
* node type.
*/
- if (!(rec->field_mask & GRAPH_NODE_MASK))
+ if (!(rec->field_mask & BPF_GRAPH_NODE))
continue;
/* We need to ensure ownership acyclicity among all types. The
@@ -3899,7 +3906,7 @@ int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec)
* - A is both an root and node.
* - B is only an node.
*/
- if (meta->record->field_mask & GRAPH_ROOT_MASK)
+ if (meta->record->field_mask & BPF_GRAPH_ROOT)
return -ELOOP;
}
return 0;
@@ -5608,21 +5615,46 @@ static u8 bpf_ctx_convert_map[] = {
#undef BPF_MAP_TYPE
#undef BPF_LINK_TYPE
-const struct btf_member *
-btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf,
- const struct btf_type *t, enum bpf_prog_type prog_type,
- int arg)
+static const struct btf_type *find_canonical_prog_ctx_type(enum bpf_prog_type prog_type)
{
const struct btf_type *conv_struct;
- const struct btf_type *ctx_struct;
const struct btf_member *ctx_type;
- const char *tname, *ctx_tname;
conv_struct = bpf_ctx_convert.t;
- if (!conv_struct) {
- bpf_log(log, "btf_vmlinux is malformed\n");
+ if (!conv_struct)
return NULL;
- }
+ /* prog_type is valid bpf program type. No need for bounds check. */
+ ctx_type = btf_type_member(conv_struct) + bpf_ctx_convert_map[prog_type] * 2;
+ /* ctx_type is a pointer to prog_ctx_type in vmlinux.
+ * Like 'struct __sk_buff'
+ */
+ return btf_type_by_id(btf_vmlinux, ctx_type->type);
+}
+
+static int find_kern_ctx_type_id(enum bpf_prog_type prog_type)
+{
+ const struct btf_type *conv_struct;
+ const struct btf_member *ctx_type;
+
+ conv_struct = bpf_ctx_convert.t;
+ if (!conv_struct)
+ return -EFAULT;
+ /* prog_type is valid bpf program type. No need for bounds check. */
+ ctx_type = btf_type_member(conv_struct) + bpf_ctx_convert_map[prog_type] * 2 + 1;
+ /* ctx_type is a pointer to kern_ctx_type in vmlinux.
+ * Like 'struct sk_buff'
+ */
+ return ctx_type->type;
+}
+
+const struct btf_type *
+btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf,
+ const struct btf_type *t, enum bpf_prog_type prog_type,
+ int arg)
+{
+ const struct btf_type *ctx_type;
+ const char *tname, *ctx_tname;
+
t = btf_type_by_id(btf, t->type);
while (btf_type_is_modifier(t))
t = btf_type_by_id(btf, t->type);
@@ -5639,17 +5671,15 @@ btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf,
bpf_log(log, "arg#%d struct doesn't have a name\n", arg);
return NULL;
}
- /* prog_type is valid bpf program type. No need for bounds check. */
- ctx_type = btf_type_member(conv_struct) + bpf_ctx_convert_map[prog_type] * 2;
- /* ctx_struct is a pointer to prog_ctx_type in vmlinux.
- * Like 'struct __sk_buff'
- */
- ctx_struct = btf_type_by_id(btf_vmlinux, ctx_type->type);
- if (!ctx_struct)
+
+ ctx_type = find_canonical_prog_ctx_type(prog_type);
+ if (!ctx_type) {
+ bpf_log(log, "btf_vmlinux is malformed\n");
/* should not happen */
return NULL;
+ }
again:
- ctx_tname = btf_name_by_offset(btf_vmlinux, ctx_struct->name_off);
+ ctx_tname = btf_name_by_offset(btf_vmlinux, ctx_type->name_off);
if (!ctx_tname) {
/* should not happen */
bpf_log(log, "Please fix kernel include/linux/bpf_types.h\n");
@@ -5670,28 +5700,167 @@ again:
/* bpf_user_pt_regs_t is a typedef, so resolve it to
* underlying struct and check name again
*/
- if (!btf_type_is_modifier(ctx_struct))
+ if (!btf_type_is_modifier(ctx_type))
return NULL;
- while (btf_type_is_modifier(ctx_struct))
- ctx_struct = btf_type_by_id(btf_vmlinux, ctx_struct->type);
+ while (btf_type_is_modifier(ctx_type))
+ ctx_type = btf_type_by_id(btf_vmlinux, ctx_type->type);
goto again;
}
return ctx_type;
}
+/* forward declarations for arch-specific underlying types of
+ * bpf_user_pt_regs_t; this avoids the need for arch-specific #ifdef
+ * compilation guards below for BPF_PROG_TYPE_PERF_EVENT checks, but still
+ * works correctly with __builtin_types_compatible_p() on respective
+ * architectures
+ */
+struct user_regs_struct;
+struct user_pt_regs;
+
+static int btf_validate_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf,
+ const struct btf_type *t, int arg,
+ enum bpf_prog_type prog_type,
+ enum bpf_attach_type attach_type)
+{
+ const struct btf_type *ctx_type;
+ const char *tname, *ctx_tname;
+
+ if (!btf_is_ptr(t)) {
+ bpf_log(log, "arg#%d type isn't a pointer\n", arg);
+ return -EINVAL;
+ }
+ t = btf_type_by_id(btf, t->type);
+
+ /* KPROBE and PERF_EVENT programs allow bpf_user_pt_regs_t typedef */
+ if (prog_type == BPF_PROG_TYPE_KPROBE || prog_type == BPF_PROG_TYPE_PERF_EVENT) {
+ while (btf_type_is_modifier(t) && !btf_type_is_typedef(t))
+ t = btf_type_by_id(btf, t->type);
+
+ if (btf_type_is_typedef(t)) {
+ tname = btf_name_by_offset(btf, t->name_off);
+ if (tname && strcmp(tname, "bpf_user_pt_regs_t") == 0)
+ return 0;
+ }
+ }
+
+ /* all other program types don't use typedefs for context type */
+ while (btf_type_is_modifier(t))
+ t = btf_type_by_id(btf, t->type);
+
+ /* `void *ctx __arg_ctx` is always valid */
+ if (btf_type_is_void(t))
+ return 0;
+
+ tname = btf_name_by_offset(btf, t->name_off);
+ if (str_is_empty(tname)) {
+ bpf_log(log, "arg#%d type doesn't have a name\n", arg);
+ return -EINVAL;
+ }
+
+ /* special cases */
+ switch (prog_type) {
+ case BPF_PROG_TYPE_KPROBE:
+ if (__btf_type_is_struct(t) && strcmp(tname, "pt_regs") == 0)
+ return 0;
+ break;
+ case BPF_PROG_TYPE_PERF_EVENT:
+ if (__builtin_types_compatible_p(bpf_user_pt_regs_t, struct pt_regs) &&
+ __btf_type_is_struct(t) && strcmp(tname, "pt_regs") == 0)
+ return 0;
+ if (__builtin_types_compatible_p(bpf_user_pt_regs_t, struct user_pt_regs) &&
+ __btf_type_is_struct(t) && strcmp(tname, "user_pt_regs") == 0)
+ return 0;
+ if (__builtin_types_compatible_p(bpf_user_pt_regs_t, struct user_regs_struct) &&
+ __btf_type_is_struct(t) && strcmp(tname, "user_regs_struct") == 0)
+ return 0;
+ break;
+ case BPF_PROG_TYPE_RAW_TRACEPOINT:
+ case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE:
+ /* allow u64* as ctx */
+ if (btf_is_int(t) && t->size == 8)
+ return 0;
+ break;
+ case BPF_PROG_TYPE_TRACING:
+ switch (attach_type) {
+ case BPF_TRACE_RAW_TP:
+ /* tp_btf program is TRACING, so need special case here */
+ if (__btf_type_is_struct(t) &&
+ strcmp(tname, "bpf_raw_tracepoint_args") == 0)
+ return 0;
+ /* allow u64* as ctx */
+ if (btf_is_int(t) && t->size == 8)
+ return 0;
+ break;
+ case BPF_TRACE_ITER:
+ /* allow struct bpf_iter__xxx types only */
+ if (__btf_type_is_struct(t) &&
+ strncmp(tname, "bpf_iter__", sizeof("bpf_iter__") - 1) == 0)
+ return 0;
+ break;
+ case BPF_TRACE_FENTRY:
+ case BPF_TRACE_FEXIT:
+ case BPF_MODIFY_RETURN:
+ /* allow u64* as ctx */
+ if (btf_is_int(t) && t->size == 8)
+ return 0;
+ break;
+ default:
+ break;
+ }
+ break;
+ case BPF_PROG_TYPE_LSM:
+ case BPF_PROG_TYPE_STRUCT_OPS:
+ /* allow u64* as ctx */
+ if (btf_is_int(t) && t->size == 8)
+ return 0;
+ break;
+ case BPF_PROG_TYPE_TRACEPOINT:
+ case BPF_PROG_TYPE_SYSCALL:
+ case BPF_PROG_TYPE_EXT:
+ return 0; /* anything goes */
+ default:
+ break;
+ }
+
+ ctx_type = find_canonical_prog_ctx_type(prog_type);
+ if (!ctx_type) {
+ /* should not happen */
+ bpf_log(log, "btf_vmlinux is malformed\n");
+ return -EINVAL;
+ }
+
+ /* resolve typedefs and check that underlying structs are matching as well */
+ while (btf_type_is_modifier(ctx_type))
+ ctx_type = btf_type_by_id(btf_vmlinux, ctx_type->type);
+
+ /* if program type doesn't have distinctly named struct type for
+ * context, then __arg_ctx argument can only be `void *`, which we
+ * already checked above
+ */
+ if (!__btf_type_is_struct(ctx_type)) {
+ bpf_log(log, "arg#%d should be void pointer\n", arg);
+ return -EINVAL;
+ }
+
+ ctx_tname = btf_name_by_offset(btf_vmlinux, ctx_type->name_off);
+ if (!__btf_type_is_struct(t) || strcmp(ctx_tname, tname) != 0) {
+ bpf_log(log, "arg#%d should be `struct %s *`\n", arg, ctx_tname);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static int btf_translate_to_vmlinux(struct bpf_verifier_log *log,
struct btf *btf,
const struct btf_type *t,
enum bpf_prog_type prog_type,
int arg)
{
- const struct btf_member *prog_ctx_type, *kern_ctx_type;
-
- prog_ctx_type = btf_get_prog_ctx_type(log, btf, t, prog_type, arg);
- if (!prog_ctx_type)
+ if (!btf_get_prog_ctx_type(log, btf, t, prog_type, arg))
return -ENOENT;
- kern_ctx_type = prog_ctx_type + 1;
- return kern_ctx_type->type;
+ return find_kern_ctx_type_id(prog_type);
}
int get_kern_ctx_btf_id(struct bpf_verifier_log *log, enum bpf_prog_type prog_type)
@@ -6758,222 +6927,64 @@ int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *pr
return btf_check_func_type_match(log, btf1, t1, btf2, t2);
}
-static int btf_check_func_arg_match(struct bpf_verifier_env *env,
- const struct btf *btf, u32 func_id,
- struct bpf_reg_state *regs,
- bool ptr_to_mem_ok,
- bool processing_call)
+static bool btf_is_dynptr_ptr(const struct btf *btf, const struct btf_type *t)
{
- enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
- struct bpf_verifier_log *log = &env->log;
- const char *func_name, *ref_tname;
- const struct btf_type *t, *ref_t;
- const struct btf_param *args;
- u32 i, nargs, ref_id;
- int ret;
-
- t = btf_type_by_id(btf, func_id);
- if (!t || !btf_type_is_func(t)) {
- /* These checks were already done by the verifier while loading
- * struct bpf_func_info or in add_kfunc_call().
- */
- bpf_log(log, "BTF of func_id %u doesn't point to KIND_FUNC\n",
- func_id);
- return -EFAULT;
- }
- func_name = btf_name_by_offset(btf, t->name_off);
-
- t = btf_type_by_id(btf, t->type);
- if (!t || !btf_type_is_func_proto(t)) {
- bpf_log(log, "Invalid BTF of func %s\n", func_name);
- return -EFAULT;
- }
- args = (const struct btf_param *)(t + 1);
- nargs = btf_type_vlen(t);
- if (nargs > MAX_BPF_FUNC_REG_ARGS) {
- bpf_log(log, "Function %s has %d > %d args\n", func_name, nargs,
- MAX_BPF_FUNC_REG_ARGS);
- return -EINVAL;
- }
-
- /* check that BTF function arguments match actual types that the
- * verifier sees.
- */
- for (i = 0; i < nargs; i++) {
- enum bpf_arg_type arg_type = ARG_DONTCARE;
- u32 regno = i + 1;
- struct bpf_reg_state *reg = &regs[regno];
-
- t = btf_type_skip_modifiers(btf, args[i].type, NULL);
- if (btf_type_is_scalar(t)) {
- if (reg->type == SCALAR_VALUE)
- continue;
- bpf_log(log, "R%d is not a scalar\n", regno);
- return -EINVAL;
- }
-
- if (!btf_type_is_ptr(t)) {
- bpf_log(log, "Unrecognized arg#%d type %s\n",
- i, btf_type_str(t));
- return -EINVAL;
- }
-
- ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id);
- ref_tname = btf_name_by_offset(btf, ref_t->name_off);
-
- ret = check_func_arg_reg_off(env, reg, regno, arg_type);
- if (ret < 0)
- return ret;
+ const char *name;
- if (btf_get_prog_ctx_type(log, btf, t, prog_type, i)) {
- /* If function expects ctx type in BTF check that caller
- * is passing PTR_TO_CTX.
- */
- if (reg->type != PTR_TO_CTX) {
- bpf_log(log,
- "arg#%d expected pointer to ctx, but got %s\n",
- i, btf_type_str(t));
- return -EINVAL;
- }
- } else if (ptr_to_mem_ok && processing_call) {
- const struct btf_type *resolve_ret;
- u32 type_size;
+ t = btf_type_by_id(btf, t->type); /* skip PTR */
- resolve_ret = btf_resolve_size(btf, ref_t, &type_size);
- if (IS_ERR(resolve_ret)) {
- bpf_log(log,
- "arg#%d reference type('%s %s') size cannot be determined: %ld\n",
- i, btf_type_str(ref_t), ref_tname,
- PTR_ERR(resolve_ret));
- return -EINVAL;
- }
+ while (btf_type_is_modifier(t))
+ t = btf_type_by_id(btf, t->type);
- if (check_mem_reg(env, reg, regno, type_size))
- return -EINVAL;
- } else {
- bpf_log(log, "reg type unsupported for arg#%d function %s#%d\n", i,
- func_name, func_id);
- return -EINVAL;
- }
+ /* allow either struct or struct forward declaration */
+ if (btf_type_is_struct(t) ||
+ (btf_type_is_fwd(t) && btf_type_kflag(t) == 0)) {
+ name = btf_str_by_offset(btf, t->name_off);
+ return name && strcmp(name, "bpf_dynptr") == 0;
}
- return 0;
-}
-
-/* Compare BTF of a function declaration with given bpf_reg_state.
- * Returns:
- * EFAULT - there is a verifier bug. Abort verification.
- * EINVAL - there is a type mismatch or BTF is not available.
- * 0 - BTF matches with what bpf_reg_state expects.
- * Only PTR_TO_CTX and SCALAR_VALUE states are recognized.
- */
-int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog,
- struct bpf_reg_state *regs)
-{
- struct bpf_prog *prog = env->prog;
- struct btf *btf = prog->aux->btf;
- bool is_global;
- u32 btf_id;
- int err;
-
- if (!prog->aux->func_info)
- return -EINVAL;
-
- btf_id = prog->aux->func_info[subprog].type_id;
- if (!btf_id)
- return -EFAULT;
-
- if (prog->aux->func_info_aux[subprog].unreliable)
- return -EINVAL;
-
- is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL;
- err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global, false);
-
- /* Compiler optimizations can remove arguments from static functions
- * or mismatched type can be passed into a global function.
- * In such cases mark the function as unreliable from BTF point of view.
- */
- if (err)
- prog->aux->func_info_aux[subprog].unreliable = true;
- return err;
-}
-
-/* Compare BTF of a function call with given bpf_reg_state.
- * Returns:
- * EFAULT - there is a verifier bug. Abort verification.
- * EINVAL - there is a type mismatch or BTF is not available.
- * 0 - BTF matches with what bpf_reg_state expects.
- * Only PTR_TO_CTX and SCALAR_VALUE states are recognized.
- *
- * NOTE: the code is duplicated from btf_check_subprog_arg_match()
- * because btf_check_func_arg_match() is still doing both. Once that
- * function is split in 2, we can call from here btf_check_subprog_arg_match()
- * first, and then treat the calling part in a new code path.
- */
-int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog,
- struct bpf_reg_state *regs)
-{
- struct bpf_prog *prog = env->prog;
- struct btf *btf = prog->aux->btf;
- bool is_global;
- u32 btf_id;
- int err;
-
- if (!prog->aux->func_info)
- return -EINVAL;
-
- btf_id = prog->aux->func_info[subprog].type_id;
- if (!btf_id)
- return -EFAULT;
-
- if (prog->aux->func_info_aux[subprog].unreliable)
- return -EINVAL;
-
- is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL;
- err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global, true);
-
- /* Compiler optimizations can remove arguments from static functions
- * or mismatched type can be passed into a global function.
- * In such cases mark the function as unreliable from BTF point of view.
- */
- if (err)
- prog->aux->func_info_aux[subprog].unreliable = true;
- return err;
+ return false;
}
-/* Convert BTF of a function into bpf_reg_state if possible
+/* Process BTF of a function to produce high-level expectation of function
+ * arguments (like ARG_PTR_TO_CTX, or ARG_PTR_TO_MEM, etc). This information
+ * is cached in subprog info for reuse.
* Returns:
* EFAULT - there is a verifier bug. Abort verification.
* EINVAL - cannot convert BTF.
- * 0 - Successfully converted BTF into bpf_reg_state
- * (either PTR_TO_CTX or SCALAR_VALUE).
+ * 0 - Successfully processed BTF and constructed argument expectations.
*/
-int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
- struct bpf_reg_state *regs)
+int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog)
{
+ bool is_global = subprog_aux(env, subprog)->linkage == BTF_FUNC_GLOBAL;
+ struct bpf_subprog_info *sub = subprog_info(env, subprog);
struct bpf_verifier_log *log = &env->log;
struct bpf_prog *prog = env->prog;
enum bpf_prog_type prog_type = prog->type;
struct btf *btf = prog->aux->btf;
const struct btf_param *args;
- const struct btf_type *t, *ref_t;
+ const struct btf_type *t, *ref_t, *fn_t;
u32 i, nargs, btf_id;
const char *tname;
- if (!prog->aux->func_info ||
- prog->aux->func_info_aux[subprog].linkage != BTF_FUNC_GLOBAL) {
+ if (sub->args_cached)
+ return 0;
+
+ if (!prog->aux->func_info) {
bpf_log(log, "Verifier bug\n");
return -EFAULT;
}
btf_id = prog->aux->func_info[subprog].type_id;
if (!btf_id) {
+ if (!is_global) /* not fatal for static funcs */
+ return -EINVAL;
bpf_log(log, "Global functions need valid BTF\n");
return -EFAULT;
}
- t = btf_type_by_id(btf, btf_id);
- if (!t || !btf_type_is_func(t)) {
+ fn_t = btf_type_by_id(btf, btf_id);
+ if (!fn_t || !btf_type_is_func(fn_t)) {
/* These checks were already done by the verifier while loading
* struct bpf_func_info
*/
@@ -6981,11 +6992,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
subprog);
return -EFAULT;
}
- tname = btf_name_by_offset(btf, t->name_off);
-
- if (log->level & BPF_LOG_LEVEL)
- bpf_log(log, "Validating %s() func#%d...\n",
- tname, subprog);
+ tname = btf_name_by_offset(btf, fn_t->name_off);
if (prog->aux->func_info_aux[subprog].unreliable) {
bpf_log(log, "Verifier bug in function %s()\n", tname);
@@ -6994,7 +7001,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
if (prog_type == BPF_PROG_TYPE_EXT)
prog_type = prog->aux->dst_prog->type;
- t = btf_type_by_id(btf, t->type);
+ t = btf_type_by_id(btf, fn_t->type);
if (!t || !btf_type_is_func_proto(t)) {
bpf_log(log, "Invalid type of function %s()\n", tname);
return -EFAULT;
@@ -7006,7 +7013,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
tname, nargs, MAX_BPF_FUNC_REG_ARGS);
return -EINVAL;
}
- /* check that function returns int */
+ /* check that function returns int, exception cb also requires this */
t = btf_type_by_id(btf, t->type);
while (btf_type_is_modifier(t))
t = btf_type_by_id(btf, t->type);
@@ -7020,24 +7027,54 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
* Only PTR_TO_CTX and SCALAR are supported atm.
*/
for (i = 0; i < nargs; i++) {
- struct bpf_reg_state *reg = &regs[i + 1];
+ bool is_nonnull = false;
+ const char *tag;
t = btf_type_by_id(btf, args[i].type);
+
+ tag = btf_find_decl_tag_value(btf, fn_t, i, "arg:");
+ if (IS_ERR(tag) && PTR_ERR(tag) == -ENOENT) {
+ tag = NULL;
+ } else if (IS_ERR(tag)) {
+ bpf_log(log, "arg#%d type's tag fetching failure: %ld\n", i, PTR_ERR(tag));
+ return PTR_ERR(tag);
+ }
+ /* 'arg:<tag>' decl_tag takes precedence over derivation of
+ * register type from BTF type itself
+ */
+ if (tag) {
+ /* disallow arg tags in static subprogs */
+ if (!is_global) {
+ bpf_log(log, "arg#%d type tag is not supported in static functions\n", i);
+ return -EOPNOTSUPP;
+ }
+ if (strcmp(tag, "ctx") == 0) {
+ sub->args[i].arg_type = ARG_PTR_TO_CTX;
+ continue;
+ }
+ if (strcmp(tag, "nonnull") == 0)
+ is_nonnull = true;
+ }
+
while (btf_type_is_modifier(t))
t = btf_type_by_id(btf, t->type);
if (btf_type_is_int(t) || btf_is_any_enum(t)) {
- reg->type = SCALAR_VALUE;
+ sub->args[i].arg_type = ARG_ANYTHING;
continue;
}
- if (btf_type_is_ptr(t)) {
- if (btf_get_prog_ctx_type(log, btf, t, prog_type, i)) {
- reg->type = PTR_TO_CTX;
- continue;
- }
+ if (btf_type_is_ptr(t) && btf_get_prog_ctx_type(log, btf, t, prog_type, i)) {
+ sub->args[i].arg_type = ARG_PTR_TO_CTX;
+ continue;
+ }
+ if (btf_type_is_ptr(t) && btf_is_dynptr_ptr(btf, t)) {
+ sub->args[i].arg_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY;
+ continue;
+ }
+ if (is_global && btf_type_is_ptr(t)) {
+ u32 mem_size;
t = btf_type_skip_modifiers(btf, t->type, NULL);
-
- ref_t = btf_resolve_size(btf, t, &reg->mem_size);
+ ref_t = btf_resolve_size(btf, t, &mem_size);
if (IS_ERR(ref_t)) {
bpf_log(log,
"arg#%d reference type('%s %s') size cannot be determined: %ld\n",
@@ -7046,15 +7083,39 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
return -EINVAL;
}
- reg->type = PTR_TO_MEM | PTR_MAYBE_NULL;
- reg->id = ++env->id_gen;
-
+ sub->args[i].arg_type = is_nonnull ? ARG_PTR_TO_MEM : ARG_PTR_TO_MEM_OR_NULL;
+ sub->args[i].mem_size = mem_size;
continue;
}
+ if (is_nonnull) {
+ bpf_log(log, "arg#%d marked as non-null, but is not a pointer type\n", i);
+ return -EINVAL;
+ }
bpf_log(log, "Arg#%d type %s in %s() is not supported yet.\n",
i, btf_type_str(t), tname);
return -EINVAL;
}
+
+ for (i = 0; i < nargs; i++) {
+ const char *tag;
+
+ if (sub->args[i].arg_type != ARG_PTR_TO_CTX)
+ continue;
+
+ /* check if arg has "arg:ctx" tag */
+ t = btf_type_by_id(btf, args[i].type);
+ tag = btf_find_decl_tag_value(btf, fn_t, i, "arg:");
+ if (IS_ERR_OR_NULL(tag) || strcmp(tag, "ctx") != 0)
+ continue;
+
+ if (btf_validate_prog_ctx_type(log, btf, t, i, prog_type,
+ prog->expected_attach_type))
+ return -EINVAL;
+ }
+
+ sub->arg_cnt = nargs;
+ sub->args_cached = true;
+
return 0;
}
@@ -7832,6 +7893,7 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type)
case BPF_PROG_TYPE_SYSCALL:
return BTF_KFUNC_HOOK_SYSCALL;
case BPF_PROG_TYPE_CGROUP_SKB:
+ case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
return BTF_KFUNC_HOOK_CGROUP_SKB;
case BPF_PROG_TYPE_SCHED_ACT:
return BTF_KFUNC_HOOK_SCHED_ACT;
@@ -8501,7 +8563,7 @@ bool btf_nested_type_is_trusted(struct bpf_verifier_log *log,
tname = btf_name_by_offset(btf, walk_type->name_off);
ret = snprintf(safe_tname, sizeof(safe_tname), "%s%s", tname, suffix);
- if (ret < 0)
+ if (ret >= sizeof(safe_tname))
return false;
safe_id = btf_find_by_name_kind(btf, safe_tname, BTF_INFO_KIND(walk_type->info));
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 5b2741aa0d9b..491d20038cbe 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -785,7 +785,8 @@ found:
* to descendants
* @cgrp: The cgroup which descendants to traverse
* @link: A link for which to replace BPF program
- * @type: Type of attach operation
+ * @new_prog: &struct bpf_prog for the target BPF program with its refcnt
+ * incremented
*
* Must be called with cgroup_mutex held.
*/
@@ -1334,7 +1335,7 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr,
* __cgroup_bpf_run_filter_skb() - Run a program for packet filtering
* @sk: The socket sending or receiving traffic
* @skb: The skb that is being sent or received
- * @type: The type of program to be executed
+ * @atype: The type of program to be executed
*
* If no socket is passed, or the socket is not of type INET or INET6,
* this function does nothing and returns 0.
@@ -1424,7 +1425,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb);
/**
* __cgroup_bpf_run_filter_sk() - Run a program on a sock
* @sk: sock structure to manipulate
- * @type: The type of program to be executed
+ * @atype: The type of program to be executed
*
* socket is passed is expected to be of type INET or INET6.
*
@@ -1449,18 +1450,22 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
* provided by user sockaddr
* @sk: sock struct that will use sockaddr
* @uaddr: sockaddr struct provided by user
- * @type: The type of program to be executed
+ * @uaddrlen: Pointer to the size of the sockaddr struct provided by user. It is
+ * read-only for AF_INET[6] uaddr but can be modified for AF_UNIX
+ * uaddr.
+ * @atype: The type of program to be executed
* @t_ctx: Pointer to attach type specific context
* @flags: Pointer to u32 which contains higher bits of BPF program
* return value (OR'ed together).
*
- * socket is expected to be of type INET or INET6.
+ * socket is expected to be of type INET, INET6 or UNIX.
*
* This function will return %-EPERM if an attached program is found and
* returned value != 1 during execution. In all other cases, 0 is returned.
*/
int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
struct sockaddr *uaddr,
+ int *uaddrlen,
enum cgroup_bpf_attach_type atype,
void *t_ctx,
u32 *flags)
@@ -1472,21 +1477,31 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
};
struct sockaddr_storage unspec;
struct cgroup *cgrp;
+ int ret;
/* Check socket family since not all sockets represent network
* endpoint (e.g. AF_UNIX).
*/
- if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)
+ if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6 &&
+ sk->sk_family != AF_UNIX)
return 0;
if (!ctx.uaddr) {
memset(&unspec, 0, sizeof(unspec));
ctx.uaddr = (struct sockaddr *)&unspec;
+ ctx.uaddrlen = 0;
+ } else {
+ ctx.uaddrlen = *uaddrlen;
}
cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
- return bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run,
- 0, flags);
+ ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run,
+ 0, flags);
+
+ if (!ret && uaddr)
+ *uaddrlen = ctx.uaddrlen;
+
+ return ret;
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr);
@@ -1496,7 +1511,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr);
* @sock_ops: bpf_sock_ops_kern struct to pass to program. Contains
* sk with connection information (IP addresses, etc.) May not contain
* cgroup info if it is a req sock.
- * @type: The type of program to be executed
+ * @atype: The type of program to be executed
*
* socket passed is expected to be of type INET or INET6.
*
@@ -1670,7 +1685,7 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = {
* @ppos: value-result argument: value is position at which read from or write
* to sysctl is happening, result is new position if program overrode it,
* initial value otherwise
- * @type: type of program to be executed
+ * @atype: type of program to be executed
*
* Program is run when sysctl is being accessed, either read or written, and
* can allow or deny such access.
@@ -1785,7 +1800,7 @@ static bool sockopt_buf_allocated(struct bpf_sockopt_kern *ctx,
}
int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
- int *optname, char __user *optval,
+ int *optname, sockptr_t optval,
int *optlen, char **kernel_optval)
{
struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
@@ -1808,7 +1823,8 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
ctx.optlen = *optlen;
- if (copy_from_user(ctx.optval, optval, min(*optlen, max_optlen)) != 0) {
+ if (copy_from_sockptr(ctx.optval, optval,
+ min(*optlen, max_optlen))) {
ret = -EFAULT;
goto out;
}
@@ -1875,8 +1891,8 @@ out:
}
int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
- int optname, char __user *optval,
- int __user *optlen, int max_optlen,
+ int optname, sockptr_t optval,
+ sockptr_t optlen, int max_optlen,
int retval)
{
struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
@@ -1903,8 +1919,8 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
* one that kernel returned as well to let
* BPF programs inspect the value.
*/
-
- if (get_user(ctx.optlen, optlen)) {
+ if (copy_from_sockptr(&ctx.optlen, optlen,
+ sizeof(ctx.optlen))) {
ret = -EFAULT;
goto out;
}
@@ -1915,8 +1931,8 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
}
orig_optlen = ctx.optlen;
- if (copy_from_user(ctx.optval, optval,
- min(ctx.optlen, max_optlen)) != 0) {
+ if (copy_from_sockptr(ctx.optval, optval,
+ min(ctx.optlen, max_optlen))) {
ret = -EFAULT;
goto out;
}
@@ -1930,7 +1946,8 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
if (ret < 0)
goto out;
- if (optval && (ctx.optlen > max_optlen || ctx.optlen < 0)) {
+ if (!sockptr_is_null(optval) &&
+ (ctx.optlen > max_optlen || ctx.optlen < 0)) {
if (orig_optlen > PAGE_SIZE && ctx.optlen >= 0) {
pr_info_once("bpf getsockopt: ignoring program buffer with optlen=%d (max_optlen=%d)\n",
ctx.optlen, max_optlen);
@@ -1942,11 +1959,12 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
}
if (ctx.optlen != 0) {
- if (optval && copy_to_user(optval, ctx.optval, ctx.optlen)) {
+ if (!sockptr_is_null(optval) &&
+ copy_to_sockptr(optval, ctx.optval, ctx.optlen)) {
ret = -EFAULT;
goto out;
}
- if (put_user(ctx.optlen, optlen)) {
+ if (copy_to_sockptr(optlen, &ctx.optlen, sizeof(ctx.optlen))) {
ret = -EFAULT;
goto out;
}
@@ -2519,10 +2537,13 @@ cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_CGROUP_SOCK_OPS:
case BPF_CGROUP_UDP4_RECVMSG:
case BPF_CGROUP_UDP6_RECVMSG:
+ case BPF_CGROUP_UNIX_RECVMSG:
case BPF_CGROUP_INET4_GETPEERNAME:
case BPF_CGROUP_INET6_GETPEERNAME:
+ case BPF_CGROUP_UNIX_GETPEERNAME:
case BPF_CGROUP_INET4_GETSOCKNAME:
case BPF_CGROUP_INET6_GETSOCKNAME:
+ case BPF_CGROUP_UNIX_GETSOCKNAME:
return NULL;
default:
return &bpf_get_retval_proto;
@@ -2534,10 +2555,13 @@ cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_CGROUP_SOCK_OPS:
case BPF_CGROUP_UDP4_RECVMSG:
case BPF_CGROUP_UDP6_RECVMSG:
+ case BPF_CGROUP_UNIX_RECVMSG:
case BPF_CGROUP_INET4_GETPEERNAME:
case BPF_CGROUP_INET6_GETPEERNAME:
+ case BPF_CGROUP_UNIX_GETPEERNAME:
case BPF_CGROUP_INET4_GETSOCKNAME:
case BPF_CGROUP_INET6_GETSOCKNAME:
+ case BPF_CGROUP_UNIX_GETSOCKNAME:
return NULL;
default:
return &bpf_set_retval_proto;
diff --git a/kernel/bpf/cgroup_iter.c b/kernel/bpf/cgroup_iter.c
index 810378f04fbc..f04a468cf6a7 100644
--- a/kernel/bpf/cgroup_iter.c
+++ b/kernel/bpf/cgroup_iter.c
@@ -282,7 +282,7 @@ static struct bpf_iter_reg bpf_cgroup_reg_info = {
.ctx_arg_info_size = 1,
.ctx_arg_info = {
{ offsetof(struct bpf_iter__cgroup, cgroup),
- PTR_TO_BTF_ID_OR_NULL },
+ PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
},
.seq_info = &cgroup_iter_seq_info,
};
@@ -294,3 +294,66 @@ static int __init bpf_cgroup_iter_init(void)
}
late_initcall(bpf_cgroup_iter_init);
+
+struct bpf_iter_css {
+ __u64 __opaque[3];
+} __attribute__((aligned(8)));
+
+struct bpf_iter_css_kern {
+ struct cgroup_subsys_state *start;
+ struct cgroup_subsys_state *pos;
+ unsigned int flags;
+} __attribute__((aligned(8)));
+
+__bpf_kfunc_start_defs();
+
+__bpf_kfunc int bpf_iter_css_new(struct bpf_iter_css *it,
+ struct cgroup_subsys_state *start, unsigned int flags)
+{
+ struct bpf_iter_css_kern *kit = (void *)it;
+
+ BUILD_BUG_ON(sizeof(struct bpf_iter_css_kern) > sizeof(struct bpf_iter_css));
+ BUILD_BUG_ON(__alignof__(struct bpf_iter_css_kern) != __alignof__(struct bpf_iter_css));
+
+ kit->start = NULL;
+ switch (flags) {
+ case BPF_CGROUP_ITER_DESCENDANTS_PRE:
+ case BPF_CGROUP_ITER_DESCENDANTS_POST:
+ case BPF_CGROUP_ITER_ANCESTORS_UP:
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ kit->start = start;
+ kit->pos = NULL;
+ kit->flags = flags;
+ return 0;
+}
+
+__bpf_kfunc struct cgroup_subsys_state *bpf_iter_css_next(struct bpf_iter_css *it)
+{
+ struct bpf_iter_css_kern *kit = (void *)it;
+
+ if (!kit->start)
+ return NULL;
+
+ switch (kit->flags) {
+ case BPF_CGROUP_ITER_DESCENDANTS_PRE:
+ kit->pos = css_next_descendant_pre(kit->pos, kit->start);
+ break;
+ case BPF_CGROUP_ITER_DESCENDANTS_POST:
+ kit->pos = css_next_descendant_post(kit->pos, kit->start);
+ break;
+ case BPF_CGROUP_ITER_ANCESTORS_UP:
+ kit->pos = kit->pos ? kit->pos->parent : kit->start;
+ }
+
+ return kit->pos;
+}
+
+__bpf_kfunc void bpf_iter_css_destroy(struct bpf_iter_css *it)
+{
+}
+
+__bpf_kfunc_end_defs();
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 0f8f036d8bd1..ea6843be2616 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -121,6 +121,9 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag
#endif
INIT_LIST_HEAD_RCU(&fp->aux->ksym.lnode);
+#ifdef CONFIG_FINEIBT
+ INIT_LIST_HEAD_RCU(&fp->aux->ksym_prefix.lnode);
+#endif
mutex_init(&fp->aux->used_maps_mutex);
mutex_init(&fp->aux->dst_mutex);
@@ -212,7 +215,7 @@ void bpf_prog_fill_jited_linfo(struct bpf_prog *prog,
const struct bpf_line_info *linfo;
void **jited_linfo;
- if (!prog->aux->jited_linfo)
+ if (!prog->aux->jited_linfo || prog->aux->func_idx > prog->aux->func_cnt)
/* Userspace did not provide linfo */
return;
@@ -371,14 +374,18 @@ static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, s32 end_old,
static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, s32 end_old,
s32 end_new, s32 curr, const bool probe_pass)
{
- const s32 off_min = S16_MIN, off_max = S16_MAX;
+ s64 off_min, off_max, off;
s32 delta = end_new - end_old;
- s32 off;
- if (insn->code == (BPF_JMP32 | BPF_JA))
+ if (insn->code == (BPF_JMP32 | BPF_JA)) {
off = insn->imm;
- else
+ off_min = S32_MIN;
+ off_max = S32_MAX;
+ } else {
off = insn->off;
+ off_min = S16_MIN;
+ off_max = S16_MAX;
+ }
if (curr < pos && curr + off + 1 >= end_old)
off += delta;
@@ -539,7 +546,7 @@ static void bpf_prog_kallsyms_del_subprogs(struct bpf_prog *fp)
{
int i;
- for (i = 0; i < fp->aux->func_cnt; i++)
+ for (i = 0; i < fp->aux->real_func_cnt; i++)
bpf_prog_kallsyms_del(fp->aux->func[i]);
}
@@ -589,7 +596,7 @@ bpf_prog_ksym_set_name(struct bpf_prog *prog)
sym = bin2hex(sym, prog->tag, sizeof(prog->tag));
/* prog->aux->name will be ignored if full btf name is available */
- if (prog->aux->func_info_cnt) {
+ if (prog->aux->func_info_cnt && prog->aux->func_idx < prog->aux->func_info_cnt) {
type = btf_type_by_id(prog->aux->btf,
prog->aux->func_info[prog->aux->func_idx].type_id);
func_name = btf_name_by_offset(prog->aux->btf, type->name_off);
@@ -623,7 +630,11 @@ static __always_inline int bpf_tree_comp(void *key, struct latch_tree_node *n)
if (val < ksym->start)
return -1;
- if (val >= ksym->end)
+ /* Ensure that we detect return addresses as part of the program, when
+ * the final instruction is a call for a program part of the stack
+ * trace. Therefore, do val > ksym->end instead of val >= ksym->end.
+ */
+ if (val > ksym->end)
return 1;
return 0;
@@ -679,6 +690,23 @@ void bpf_prog_kallsyms_add(struct bpf_prog *fp)
fp->aux->ksym.prog = true;
bpf_ksym_add(&fp->aux->ksym);
+
+#ifdef CONFIG_FINEIBT
+ /*
+ * When FineIBT, code in the __cfi_foo() symbols can get executed
+ * and hence unwinder needs help.
+ */
+ if (cfi_mode != CFI_FINEIBT)
+ return;
+
+ snprintf(fp->aux->ksym_prefix.name, KSYM_NAME_LEN,
+ "__cfi_%s", fp->aux->ksym.name);
+
+ fp->aux->ksym_prefix.start = (unsigned long) fp->bpf_func - 16;
+ fp->aux->ksym_prefix.end = (unsigned long) fp->bpf_func;
+
+ bpf_ksym_add(&fp->aux->ksym_prefix);
+#endif
}
void bpf_prog_kallsyms_del(struct bpf_prog *fp)
@@ -687,6 +715,11 @@ void bpf_prog_kallsyms_del(struct bpf_prog *fp)
return;
bpf_ksym_del(&fp->aux->ksym);
+#ifdef CONFIG_FINEIBT
+ if (cfi_mode != CFI_FINEIBT)
+ return;
+ bpf_ksym_del(&fp->aux->ksym_prefix);
+#endif
}
static struct bpf_ksym *bpf_ksym_find(unsigned long addr)
@@ -733,7 +766,7 @@ bool is_bpf_text_address(unsigned long addr)
return ret;
}
-static struct bpf_prog *bpf_prog_ksym_find(unsigned long addr)
+struct bpf_prog *bpf_prog_ksym_find(unsigned long addr)
{
struct bpf_ksym *ksym = bpf_ksym_find(addr);
@@ -870,7 +903,7 @@ static struct bpf_prog_pack *alloc_new_pack(bpf_jit_fill_hole_t bpf_fill_ill_ins
GFP_KERNEL);
if (!pack)
return NULL;
- pack->ptr = module_alloc(BPF_PROG_PACK_SIZE);
+ pack->ptr = bpf_jit_alloc_exec(BPF_PROG_PACK_SIZE);
if (!pack->ptr) {
kfree(pack);
return NULL;
@@ -894,7 +927,7 @@ void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns)
mutex_lock(&pack_mutex);
if (size > BPF_PROG_PACK_SIZE) {
size = round_up(size, PAGE_SIZE);
- ptr = module_alloc(size);
+ ptr = bpf_jit_alloc_exec(size);
if (ptr) {
bpf_fill_ill_insns(ptr, size);
set_vm_flush_reset_perms(ptr);
@@ -924,20 +957,20 @@ out:
return ptr;
}
-void bpf_prog_pack_free(struct bpf_binary_header *hdr)
+void bpf_prog_pack_free(void *ptr, u32 size)
{
struct bpf_prog_pack *pack = NULL, *tmp;
unsigned int nbits;
unsigned long pos;
mutex_lock(&pack_mutex);
- if (hdr->size > BPF_PROG_PACK_SIZE) {
- module_memfree(hdr);
+ if (size > BPF_PROG_PACK_SIZE) {
+ bpf_jit_free_exec(ptr);
goto out;
}
list_for_each_entry(tmp, &pack_list, list) {
- if ((void *)hdr >= tmp->ptr && (tmp->ptr + BPF_PROG_PACK_SIZE) > (void *)hdr) {
+ if (ptr >= tmp->ptr && (tmp->ptr + BPF_PROG_PACK_SIZE) > ptr) {
pack = tmp;
break;
}
@@ -946,17 +979,17 @@ void bpf_prog_pack_free(struct bpf_binary_header *hdr)
if (WARN_ONCE(!pack, "bpf_prog_pack bug\n"))
goto out;
- nbits = BPF_PROG_SIZE_TO_NBITS(hdr->size);
- pos = ((unsigned long)hdr - (unsigned long)pack->ptr) >> BPF_PROG_CHUNK_SHIFT;
+ nbits = BPF_PROG_SIZE_TO_NBITS(size);
+ pos = ((unsigned long)ptr - (unsigned long)pack->ptr) >> BPF_PROG_CHUNK_SHIFT;
- WARN_ONCE(bpf_arch_text_invalidate(hdr, hdr->size),
+ WARN_ONCE(bpf_arch_text_invalidate(ptr, size),
"bpf_prog_pack bug: missing bpf_arch_text_invalidate?\n");
bitmap_clear(pack->bitmap, pos, nbits);
if (bitmap_find_next_zero_area(pack->bitmap, BPF_PROG_CHUNK_COUNT, 0,
BPF_PROG_CHUNK_COUNT, 0) == 0) {
list_del(&pack->list);
- module_memfree(pack->ptr);
+ bpf_jit_free_exec(pack->ptr);
kfree(pack);
}
out:
@@ -1096,8 +1129,7 @@ bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr,
*rw_header = kvmalloc(size, GFP_KERNEL);
if (!*rw_header) {
- bpf_arch_text_copy(&ro_header->size, &size, sizeof(size));
- bpf_prog_pack_free(ro_header);
+ bpf_prog_pack_free(ro_header, size);
bpf_jit_uncharge_modmem(size);
return NULL;
}
@@ -1128,7 +1160,7 @@ int bpf_jit_binary_pack_finalize(struct bpf_prog *prog,
kvfree(rw_header);
if (IS_ERR(ptr)) {
- bpf_prog_pack_free(ro_header);
+ bpf_prog_pack_free(ro_header, ro_header->size);
return PTR_ERR(ptr);
}
return 0;
@@ -1149,7 +1181,7 @@ void bpf_jit_binary_pack_free(struct bpf_binary_header *ro_header,
{
u32 size = ro_header->size;
- bpf_prog_pack_free(ro_header);
+ bpf_prog_pack_free(ro_header, size);
kvfree(rw_header);
bpf_jit_uncharge_modmem(size);
}
@@ -1208,7 +1240,7 @@ int bpf_jit_get_func_addr(const struct bpf_prog *prog,
if (!extra_pass)
addr = NULL;
else if (prog->aux->func &&
- off >= 0 && off < prog->aux->func_cnt)
+ off >= 0 && off < prog->aux->real_func_cnt)
addr = (u8 *)prog->aux->func[off]->bpf_func;
else
return -EINVAL;
@@ -2660,12 +2692,16 @@ void __bpf_free_used_maps(struct bpf_prog_aux *aux,
struct bpf_map **used_maps, u32 len)
{
struct bpf_map *map;
+ bool sleepable;
u32 i;
+ sleepable = aux->sleepable;
for (i = 0; i < len; i++) {
map = used_maps[i];
if (map->ops->map_poke_untrack)
map->ops->map_poke_untrack(map, aux);
+ if (sleepable)
+ atomic64_dec(&map->sleepable_refcnt);
bpf_map_put(map);
}
}
@@ -2721,7 +2757,7 @@ static void bpf_prog_free_deferred(struct work_struct *work)
#endif
if (aux->dst_trampoline)
bpf_trampoline_put(aux->dst_trampoline);
- for (i = 0; i < aux->func_cnt; i++) {
+ for (i = 0; i < aux->real_func_cnt; i++) {
/* We can just unlink the subprog poke descriptor table as
* it was originally linked to the main program and is also
* released along with it.
@@ -2729,7 +2765,7 @@ static void bpf_prog_free_deferred(struct work_struct *work)
aux->func[i]->aux->poke_tab = NULL;
bpf_jit_free(aux->func[i]);
}
- if (aux->func_cnt) {
+ if (aux->real_func_cnt) {
kfree(aux->func);
bpf_prog_unlock_free(aux->prog);
} else {
@@ -2914,6 +2950,15 @@ int __weak bpf_arch_text_invalidate(void *dst, size_t len)
return -ENOTSUPP;
}
+bool __weak bpf_jit_supports_exceptions(void)
+{
+ return false;
+}
+
+void __weak arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie)
+{
+}
+
#ifdef CONFIG_BPF_SYSCALL
static int __init bpf_global_ma_init(void)
{
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index e42a1bdb7f53..8a0bb80fe48a 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -764,6 +764,16 @@ void __cpu_map_flush(void)
}
}
+#ifdef CONFIG_DEBUG_NET
+bool cpu_map_check_flush(void)
+{
+ if (list_empty(this_cpu_ptr(&cpu_map_flush_list)))
+ return false;
+ __cpu_map_flush();
+ return true;
+}
+#endif
+
static int __init cpu_map_init(void)
{
int cpu;
diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c
index 6983af8e093c..2e73533a3811 100644
--- a/kernel/bpf/cpumask.c
+++ b/kernel/bpf/cpumask.c
@@ -34,9 +34,7 @@ static bool cpu_valid(u32 cpu)
return cpu < nr_cpu_ids;
}
-__diag_push();
-__diag_ignore_all("-Wmissing-prototypes",
- "Global kfuncs as their definitions will be in BTF");
+__bpf_kfunc_start_defs();
/**
* bpf_cpumask_create() - Create a mutable BPF cpumask.
@@ -98,6 +96,12 @@ __bpf_kfunc void bpf_cpumask_release(struct bpf_cpumask *cpumask)
migrate_enable();
}
+__bpf_kfunc void bpf_cpumask_release_dtor(void *cpumask)
+{
+ bpf_cpumask_release(cpumask);
+}
+CFI_NOSEAL(bpf_cpumask_release_dtor);
+
/**
* bpf_cpumask_first() - Get the index of the first nonzero bit in the cpumask.
* @cpumask: The cpumask being queried.
@@ -407,7 +411,18 @@ __bpf_kfunc u32 bpf_cpumask_any_and_distribute(const struct cpumask *src1,
return cpumask_any_and_distribute(src1, src2);
}
-__diag_pop();
+/**
+ * bpf_cpumask_weight() - Return the number of set bits in @cpumask.
+ * @cpumask: The cpumask being queried.
+ *
+ * Count the number of set bits in the given cpumask.
+ */
+__bpf_kfunc u32 bpf_cpumask_weight(const struct cpumask *cpumask)
+{
+ return cpumask_weight(cpumask);
+}
+
+__bpf_kfunc_end_defs();
BTF_SET8_START(cpumask_kfunc_btf_ids)
BTF_ID_FLAGS(func, bpf_cpumask_create, KF_ACQUIRE | KF_RET_NULL)
@@ -434,6 +449,7 @@ BTF_ID_FLAGS(func, bpf_cpumask_full, KF_RCU)
BTF_ID_FLAGS(func, bpf_cpumask_copy, KF_RCU)
BTF_ID_FLAGS(func, bpf_cpumask_any_distribute, KF_RCU)
BTF_ID_FLAGS(func, bpf_cpumask_any_and_distribute, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_weight, KF_RCU)
BTF_SET8_END(cpumask_kfunc_btf_ids)
static const struct btf_kfunc_id_set cpumask_kfunc_set = {
@@ -443,7 +459,7 @@ static const struct btf_kfunc_id_set cpumask_kfunc_set = {
BTF_ID_LIST(cpumask_dtor_ids)
BTF_ID(struct, bpf_cpumask)
-BTF_ID(func, bpf_cpumask_release)
+BTF_ID(func, bpf_cpumask_release_dtor)
static int __init cpumask_kfunc_init(void)
{
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 4d42f6ed6c11..a936c704d4e7 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -418,6 +418,16 @@ void __dev_flush(void)
}
}
+#ifdef CONFIG_DEBUG_NET
+bool dev_check_flush(void)
+{
+ if (list_empty(this_cpu_ptr(&dev_flush_list)))
+ return false;
+ __dev_flush();
+ return true;
+}
+#endif
+
/* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or
* by local_bh_disable() (from XDP calls inside NAPI). The
* rcu_read_lock_bh_held() below makes lockdep accept both.
diff --git a/kernel/bpf/dispatcher.c b/kernel/bpf/dispatcher.c
index fa3e9225aedc..70fb82bf1637 100644
--- a/kernel/bpf/dispatcher.c
+++ b/kernel/bpf/dispatcher.c
@@ -150,14 +150,11 @@ void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from,
goto out;
d->rw_image = bpf_jit_alloc_exec(PAGE_SIZE);
if (!d->rw_image) {
- u32 size = PAGE_SIZE;
-
- bpf_arch_text_copy(d->image, &size, sizeof(size));
- bpf_prog_pack_free((struct bpf_binary_header *)d->image);
+ bpf_prog_pack_free(d->image, PAGE_SIZE);
d->image = NULL;
goto out;
}
- bpf_image_ksym_add(d->image, &d->ksym);
+ bpf_image_ksym_add(d->image, PAGE_SIZE, &d->ksym);
}
prev_num_progs = d->num_progs;
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index a8c7e1c5abfa..03a6a2500b6a 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -7,6 +7,7 @@
#include <linux/jhash.h>
#include <linux/filter.h>
#include <linux/rculist_nulls.h>
+#include <linux/rcupdate_wait.h>
#include <linux/random.h>
#include <uapi/linux/btf.h>
#include <linux/rcupdate_trace.h>
@@ -155,13 +156,15 @@ static inline int htab_lock_bucket(const struct bpf_htab *htab,
hash = hash & min_t(u32, HASHTAB_MAP_LOCK_MASK, htab->n_buckets - 1);
preempt_disable();
+ local_irq_save(flags);
if (unlikely(__this_cpu_inc_return(*(htab->map_locked[hash])) != 1)) {
__this_cpu_dec(*(htab->map_locked[hash]));
+ local_irq_restore(flags);
preempt_enable();
return -EBUSY;
}
- raw_spin_lock_irqsave(&b->raw_lock, flags);
+ raw_spin_lock(&b->raw_lock);
*pflags = flags;
return 0;
@@ -172,8 +175,9 @@ static inline void htab_unlock_bucket(const struct bpf_htab *htab,
unsigned long flags)
{
hash = hash & min_t(u32, HASHTAB_MAP_LOCK_MASK, htab->n_buckets - 1);
- raw_spin_unlock_irqrestore(&b->raw_lock, flags);
+ raw_spin_unlock(&b->raw_lock);
__this_cpu_dec(*(htab->map_locked[hash]));
+ local_irq_restore(flags);
preempt_enable();
}
@@ -894,7 +898,7 @@ static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l)
if (map->ops->map_fd_put_ptr) {
ptr = fd_htab_map_get_ptr(map, l);
- map->ops->map_fd_put_ptr(ptr);
+ map->ops->map_fd_put_ptr(map, ptr, true);
}
}
@@ -2481,7 +2485,7 @@ static void fd_htab_map_free(struct bpf_map *map)
hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
void *ptr = fd_htab_map_get_ptr(map, l);
- map->ops->map_fd_put_ptr(ptr);
+ map->ops->map_fd_put_ptr(map, ptr, false);
}
}
@@ -2520,9 +2524,15 @@ int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file,
if (IS_ERR(ptr))
return PTR_ERR(ptr);
+ /* The htab bucket lock is always held during update operations in fd
+ * htab map, and the following rcu_read_lock() is only used to avoid
+ * the WARN_ON_ONCE in htab_map_update_elem().
+ */
+ rcu_read_lock();
ret = htab_map_update_elem(map, key, &ptr, map_flags);
+ rcu_read_unlock();
if (ret)
- map->ops->map_fd_put_ptr(ptr);
+ map->ops->map_fd_put_ptr(map, ptr, false);
return ret;
}
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 8bd3812fb8df..be72824f32b2 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -22,6 +22,7 @@
#include <linux/security.h>
#include <linux/btf_ids.h>
#include <linux/bpf_mem_alloc.h>
+#include <linux/kasan.h>
#include "../../lib/kstrtox.h"
@@ -31,12 +32,13 @@
*
* Different map implementations will rely on rcu in map methods
* lookup/update/delete, therefore eBPF programs must run under rcu lock
- * if program is allowed to access maps, so check rcu_read_lock_held in
- * all three functions.
+ * if program is allowed to access maps, so check rcu_read_lock_held() or
+ * rcu_read_lock_trace_held() in all three functions.
*/
BPF_CALL_2(bpf_map_lookup_elem, struct bpf_map *, map, void *, key)
{
- WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
+ WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
+ !rcu_read_lock_bh_held());
return (unsigned long) map->ops->map_lookup_elem(map, key);
}
@@ -52,7 +54,8 @@ const struct bpf_func_proto bpf_map_lookup_elem_proto = {
BPF_CALL_4(bpf_map_update_elem, struct bpf_map *, map, void *, key,
void *, value, u64, flags)
{
- WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
+ WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
+ !rcu_read_lock_bh_held());
return map->ops->map_update_elem(map, key, value, flags);
}
@@ -69,7 +72,8 @@ const struct bpf_func_proto bpf_map_update_elem_proto = {
BPF_CALL_2(bpf_map_delete_elem, struct bpf_map *, map, void *, key)
{
- WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
+ WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
+ !rcu_read_lock_bh_held());
return map->ops->map_delete_elem(map, key);
}
@@ -1176,13 +1180,6 @@ BPF_CALL_3(bpf_timer_init, struct bpf_timer_kern *, timer, struct bpf_map *, map
ret = -EBUSY;
goto out;
}
- if (!atomic64_read(&map->usercnt)) {
- /* maps with timers must be either held by user space
- * or pinned in bpffs.
- */
- ret = -EPERM;
- goto out;
- }
/* allocate hrtimer via map_kmalloc to use memcg accounting */
t = bpf_map_kmalloc_node(map, sizeof(*t), GFP_ATOMIC, map->numa_node);
if (!t) {
@@ -1195,7 +1192,21 @@ BPF_CALL_3(bpf_timer_init, struct bpf_timer_kern *, timer, struct bpf_map *, map
rcu_assign_pointer(t->callback_fn, NULL);
hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT);
t->timer.function = bpf_timer_cb;
- timer->timer = t;
+ WRITE_ONCE(timer->timer, t);
+ /* Guarantee the order between timer->timer and map->usercnt. So
+ * when there are concurrent uref release and bpf timer init, either
+ * bpf_timer_cancel_and_free() called by uref release reads a non-NULL
+ * timer or atomic64_read() below returns a zero usercnt.
+ */
+ smp_mb();
+ if (!atomic64_read(&map->usercnt)) {
+ /* maps with timers must be either held by user space
+ * or pinned in bpffs.
+ */
+ WRITE_ONCE(timer->timer, NULL);
+ kfree(t);
+ ret = -EPERM;
+ }
out:
__bpf_spin_unlock_irqrestore(&timer->lock);
return ret;
@@ -1271,7 +1282,7 @@ BPF_CALL_3(bpf_timer_start, struct bpf_timer_kern *, timer, u64, nsecs, u64, fla
if (in_nmi())
return -EOPNOTSUPP;
- if (flags > BPF_F_TIMER_ABS)
+ if (flags & ~(BPF_F_TIMER_ABS | BPF_F_TIMER_CPU_PIN))
return -EINVAL;
__bpf_spin_lock_irqsave(&timer->lock);
t = timer->timer;
@@ -1285,6 +1296,9 @@ BPF_CALL_3(bpf_timer_start, struct bpf_timer_kern *, timer, u64, nsecs, u64, fla
else
mode = HRTIMER_MODE_REL_SOFT;
+ if (flags & BPF_F_TIMER_CPU_PIN)
+ mode |= HRTIMER_MODE_PINNED;
+
hrtimer_start(&t->timer, ns_to_ktime(nsecs), mode);
out:
__bpf_spin_unlock_irqrestore(&timer->lock);
@@ -1370,7 +1384,7 @@ void bpf_timer_cancel_and_free(void *val)
/* The subsequent bpf_timer_start/cancel() helpers won't be able to use
* this timer, since it won't be initialized.
*/
- timer->timer = NULL;
+ WRITE_ONCE(timer->timer, NULL);
out:
__bpf_spin_unlock_irqrestore(&timer->lock);
if (!t)
@@ -1807,8 +1821,6 @@ bpf_base_func_proto(enum bpf_func_id func_id)
}
}
-void __bpf_obj_drop_impl(void *p, const struct btf_record *rec);
-
void bpf_list_head_free(const struct btf_field *field, void *list_head,
struct bpf_spin_lock *spin_lock)
{
@@ -1840,7 +1852,7 @@ unlock:
* bpf_list_head which needs to be freed.
*/
migrate_disable();
- __bpf_obj_drop_impl(obj, field->graph_root.value_rec);
+ __bpf_obj_drop_impl(obj, field->graph_root.value_rec, false);
migrate_enable();
}
}
@@ -1879,14 +1891,12 @@ void bpf_rb_root_free(const struct btf_field *field, void *rb_root,
migrate_disable();
- __bpf_obj_drop_impl(obj, field->graph_root.value_rec);
+ __bpf_obj_drop_impl(obj, field->graph_root.value_rec, false);
migrate_enable();
}
}
-__diag_push();
-__diag_ignore_all("-Wmissing-prototypes",
- "Global functions as their definitions will be in vmlinux BTF");
+__bpf_kfunc_start_defs();
__bpf_kfunc void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign)
{
@@ -1902,9 +1912,19 @@ __bpf_kfunc void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign)
return p;
}
+__bpf_kfunc void *bpf_percpu_obj_new_impl(u64 local_type_id__k, void *meta__ign)
+{
+ u64 size = local_type_id__k;
+
+ /* The verifier has ensured that meta__ign must be NULL */
+ return bpf_mem_alloc(&bpf_global_percpu_ma, size);
+}
+
/* Must be called under migrate_disable(), as required by bpf_mem_free */
-void __bpf_obj_drop_impl(void *p, const struct btf_record *rec)
+void __bpf_obj_drop_impl(void *p, const struct btf_record *rec, bool percpu)
{
+ struct bpf_mem_alloc *ma;
+
if (rec && rec->refcount_off >= 0 &&
!refcount_dec_and_test((refcount_t *)(p + rec->refcount_off))) {
/* Object is refcounted and refcount_dec didn't result in 0
@@ -1916,10 +1936,11 @@ void __bpf_obj_drop_impl(void *p, const struct btf_record *rec)
if (rec)
bpf_obj_free_fields(rec, p);
- if (rec && rec->refcount_off >= 0)
- bpf_mem_free_rcu(&bpf_global_ma, p);
+ if (percpu)
+ ma = &bpf_global_percpu_ma;
else
- bpf_mem_free(&bpf_global_ma, p);
+ ma = &bpf_global_ma;
+ bpf_mem_free_rcu(ma, p);
}
__bpf_kfunc void bpf_obj_drop_impl(void *p__alloc, void *meta__ign)
@@ -1927,7 +1948,13 @@ __bpf_kfunc void bpf_obj_drop_impl(void *p__alloc, void *meta__ign)
struct btf_struct_meta *meta = meta__ign;
void *p = p__alloc;
- __bpf_obj_drop_impl(p, meta ? meta->record : NULL);
+ __bpf_obj_drop_impl(p, meta ? meta->record : NULL, false);
+}
+
+__bpf_kfunc void bpf_percpu_obj_drop_impl(void *p__alloc, void *meta__ign)
+{
+ /* The verifier has ensured that meta__ign must be NULL */
+ bpf_mem_free_rcu(&bpf_global_percpu_ma, p__alloc);
}
__bpf_kfunc void *bpf_refcount_acquire_impl(void *p__refcounted_kptr, void *meta__ign)
@@ -1965,7 +1992,7 @@ static int __bpf_list_add(struct bpf_list_node_kern *node,
*/
if (cmpxchg(&node->owner, NULL, BPF_PTR_POISON)) {
/* Only called from BPF prog, no need to migrate_disable */
- __bpf_obj_drop_impl((void *)n - off, rec);
+ __bpf_obj_drop_impl((void *)n - off, rec, false);
return -EINVAL;
}
@@ -2064,7 +2091,7 @@ static int __bpf_rbtree_add(struct bpf_rb_root *root,
*/
if (cmpxchg(&node->owner, NULL, BPF_PTR_POISON)) {
/* Only called from BPF prog, no need to migrate_disable */
- __bpf_obj_drop_impl((void *)n - off, rec);
+ __bpf_obj_drop_impl((void *)n - off, rec, false);
return -EINVAL;
}
@@ -2123,6 +2150,12 @@ __bpf_kfunc void bpf_task_release(struct task_struct *p)
put_task_struct_rcu_user(p);
}
+__bpf_kfunc void bpf_task_release_dtor(void *p)
+{
+ put_task_struct_rcu_user(p);
+}
+CFI_NOSEAL(bpf_task_release_dtor);
+
#ifdef CONFIG_CGROUPS
/**
* bpf_cgroup_acquire - Acquire a reference to a cgroup. A cgroup acquired by
@@ -2147,6 +2180,12 @@ __bpf_kfunc void bpf_cgroup_release(struct cgroup *cgrp)
cgroup_put(cgrp);
}
+__bpf_kfunc void bpf_cgroup_release_dtor(void *cgrp)
+{
+ cgroup_put(cgrp);
+}
+CFI_NOSEAL(bpf_cgroup_release_dtor);
+
/**
* bpf_cgroup_ancestor - Perform a lookup on an entry in a cgroup's ancestor
* array. A cgroup returned by this kfunc which is not subsequently stored in a
@@ -2197,7 +2236,31 @@ __bpf_kfunc struct cgroup *bpf_cgroup_from_id(u64 cgid)
__bpf_kfunc long bpf_task_under_cgroup(struct task_struct *task,
struct cgroup *ancestor)
{
- return task_under_cgroup_hierarchy(task, ancestor);
+ long ret;
+
+ rcu_read_lock();
+ ret = task_under_cgroup_hierarchy(task, ancestor);
+ rcu_read_unlock();
+ return ret;
+}
+
+/**
+ * bpf_task_get_cgroup1 - Acquires the associated cgroup of a task within a
+ * specific cgroup1 hierarchy. The cgroup1 hierarchy is identified by its
+ * hierarchy ID.
+ * @task: The target task
+ * @hierarchy_id: The ID of a cgroup1 hierarchy
+ *
+ * On success, the cgroup is returned. On failure, NULL is returned.
+ */
+__bpf_kfunc struct cgroup *
+bpf_task_get_cgroup1(struct task_struct *task, int hierarchy_id)
+{
+ struct cgroup *cgrp = task_get_cgroup1(task, hierarchy_id);
+
+ if (IS_ERR(cgrp))
+ return NULL;
+ return cgrp;
}
#endif /* CONFIG_CGROUPS */
@@ -2435,15 +2498,60 @@ __bpf_kfunc void bpf_rcu_read_unlock(void)
rcu_read_unlock();
}
-__diag_pop();
+struct bpf_throw_ctx {
+ struct bpf_prog_aux *aux;
+ u64 sp;
+ u64 bp;
+ int cnt;
+};
+
+static bool bpf_stack_walker(void *cookie, u64 ip, u64 sp, u64 bp)
+{
+ struct bpf_throw_ctx *ctx = cookie;
+ struct bpf_prog *prog;
+
+ if (!is_bpf_text_address(ip))
+ return !ctx->cnt;
+ prog = bpf_prog_ksym_find(ip);
+ ctx->cnt++;
+ if (bpf_is_subprog(prog))
+ return true;
+ ctx->aux = prog->aux;
+ ctx->sp = sp;
+ ctx->bp = bp;
+ return false;
+}
+
+__bpf_kfunc void bpf_throw(u64 cookie)
+{
+ struct bpf_throw_ctx ctx = {};
+
+ arch_bpf_stack_walk(bpf_stack_walker, &ctx);
+ WARN_ON_ONCE(!ctx.aux);
+ if (ctx.aux)
+ WARN_ON_ONCE(!ctx.aux->exception_boundary);
+ WARN_ON_ONCE(!ctx.bp);
+ WARN_ON_ONCE(!ctx.cnt);
+ /* Prevent KASAN false positives for CONFIG_KASAN_STACK by unpoisoning
+ * deeper stack depths than ctx.sp as we do not return from bpf_throw,
+ * which skips compiler generated instrumentation to do the same.
+ */
+ kasan_unpoison_task_stack_below((void *)(long)ctx.sp);
+ ctx.aux->bpf_exception_cb(cookie, ctx.sp, ctx.bp, 0, 0);
+ WARN(1, "A call to BPF exception callback should never return\n");
+}
+
+__bpf_kfunc_end_defs();
BTF_SET8_START(generic_btf_ids)
#ifdef CONFIG_KEXEC_CORE
BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE)
#endif
BTF_ID_FLAGS(func, bpf_obj_new_impl, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_percpu_obj_new_impl, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_obj_drop_impl, KF_RELEASE)
-BTF_ID_FLAGS(func, bpf_refcount_acquire_impl, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_percpu_obj_drop_impl, KF_RELEASE)
+BTF_ID_FLAGS(func, bpf_refcount_acquire_impl, KF_ACQUIRE | KF_RET_NULL | KF_RCU)
BTF_ID_FLAGS(func, bpf_list_push_front_impl)
BTF_ID_FLAGS(func, bpf_list_push_back_impl)
BTF_ID_FLAGS(func, bpf_list_pop_front, KF_ACQUIRE | KF_RET_NULL)
@@ -2460,8 +2568,10 @@ BTF_ID_FLAGS(func, bpf_cgroup_release, KF_RELEASE)
BTF_ID_FLAGS(func, bpf_cgroup_ancestor, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_cgroup_from_id, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_task_under_cgroup, KF_RCU)
+BTF_ID_FLAGS(func, bpf_task_get_cgroup1, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
#endif
BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_throw)
BTF_SET8_END(generic_btf_ids)
static const struct btf_kfunc_id_set generic_kfunc_set = {
@@ -2472,10 +2582,10 @@ static const struct btf_kfunc_id_set generic_kfunc_set = {
BTF_ID_LIST(generic_dtor_ids)
BTF_ID(struct, task_struct)
-BTF_ID(func, bpf_task_release)
+BTF_ID(func, bpf_task_release_dtor)
#ifdef CONFIG_CGROUPS
BTF_ID(struct, cgroup)
-BTF_ID(func, bpf_cgroup_release)
+BTF_ID(func, bpf_cgroup_release_dtor)
#endif
BTF_SET8_START(common_btf_ids)
@@ -2488,6 +2598,20 @@ BTF_ID_FLAGS(func, bpf_dynptr_slice_rdwr, KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_iter_num_new, KF_ITER_NEW)
BTF_ID_FLAGS(func, bpf_iter_num_next, KF_ITER_NEXT | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_iter_num_destroy, KF_ITER_DESTROY)
+BTF_ID_FLAGS(func, bpf_iter_task_vma_new, KF_ITER_NEW | KF_RCU)
+BTF_ID_FLAGS(func, bpf_iter_task_vma_next, KF_ITER_NEXT | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_iter_task_vma_destroy, KF_ITER_DESTROY)
+#ifdef CONFIG_CGROUPS
+BTF_ID_FLAGS(func, bpf_iter_css_task_new, KF_ITER_NEW | KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_iter_css_task_next, KF_ITER_NEXT | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_iter_css_task_destroy, KF_ITER_DESTROY)
+BTF_ID_FLAGS(func, bpf_iter_css_new, KF_ITER_NEW | KF_TRUSTED_ARGS | KF_RCU_PROTECTED)
+BTF_ID_FLAGS(func, bpf_iter_css_next, KF_ITER_NEXT | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_iter_css_destroy, KF_ITER_DESTROY)
+#endif
+BTF_ID_FLAGS(func, bpf_iter_task_new, KF_ITER_NEW | KF_TRUSTED_ARGS | KF_RCU_PROTECTED)
+BTF_ID_FLAGS(func, bpf_iter_task_next, KF_ITER_NEXT | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_iter_task_destroy, KF_ITER_DESTROY)
BTF_ID_FLAGS(func, bpf_dynptr_adjust)
BTF_ID_FLAGS(func, bpf_dynptr_is_null)
BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly)
@@ -2518,6 +2642,7 @@ static int __init kfunc_init(void)
ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &generic_kfunc_set);
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &generic_kfunc_set);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &generic_kfunc_set);
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &generic_kfunc_set);
ret = ret ?: register_btf_id_dtor_kfuncs(generic_dtors,
ARRAY_SIZE(generic_dtors),
@@ -2526,3 +2651,22 @@ static int __init kfunc_init(void)
}
late_initcall(kfunc_init);
+
+/* Get a pointer to dynptr data up to len bytes for read only access. If
+ * the dynptr doesn't have contiguous data up to len bytes, return NULL.
+ *
+ * Thin wrapper: forwards to bpf_dynptr_slice() with 0 and NULL as the
+ * second and third arguments and returns its result unchanged.
+ */
+const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u32 len)
+{
+ return bpf_dynptr_slice(ptr, 0, NULL, len);
+}
+
+/* Get a pointer to dynptr data up to len bytes for read write access. If
+ * the dynptr doesn't have contiguous data up to len bytes, or the dynptr
+ * is read only, return NULL.
+ */
+void *__bpf_dynptr_data_rw(const struct bpf_dynptr_kern *ptr, u32 len)
+{
+ if (__bpf_dynptr_is_rdonly(ptr))
+ return NULL;
+ /* the read-only check above is what justifies dropping const here */
+ return (void *)__bpf_dynptr_data(ptr, len);
+}
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 99d0625b6c82..41e0a55c35f5 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -118,8 +118,7 @@ static struct inode *bpf_get_inode(struct super_block *sb,
return ERR_PTR(-ENOSPC);
inode->i_ino = get_next_ino();
- inode->i_atime = inode_set_ctime_current(inode);
- inode->i_mtime = inode->i_atime;
+ simple_inode_init_ts(inode);
inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
@@ -147,7 +146,7 @@ static void bpf_dentry_finalize(struct dentry *dentry, struct inode *inode,
d_instantiate(dentry, inode);
dget(dentry);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
}
static int bpf_mkdir(struct mnt_idmap *idmap, struct inode *dir,
@@ -600,8 +599,15 @@ EXPORT_SYMBOL(bpf_prog_get_type_path);
*/
static int bpf_show_options(struct seq_file *m, struct dentry *root)
{
- umode_t mode = d_inode(root)->i_mode & S_IALLUGO & ~S_ISVTX;
+ struct inode *inode = d_inode(root);
+ umode_t mode = inode->i_mode & S_IALLUGO & ~S_ISVTX;
+ if (!uid_eq(inode->i_uid, GLOBAL_ROOT_UID))
+ seq_printf(m, ",uid=%u",
+ from_kuid_munged(&init_user_ns, inode->i_uid));
+ if (!gid_eq(inode->i_gid, GLOBAL_ROOT_GID))
+ seq_printf(m, ",gid=%u",
+ from_kgid_munged(&init_user_ns, inode->i_gid));
if (mode != S_IRWXUGO)
seq_printf(m, ",mode=%o", mode);
return 0;
@@ -626,15 +632,21 @@ static const struct super_operations bpf_super_ops = {
};
enum {
+ OPT_UID,
+ OPT_GID,
OPT_MODE,
};
static const struct fs_parameter_spec bpf_fs_parameters[] = {
+ fsparam_u32 ("uid", OPT_UID),
+ fsparam_u32 ("gid", OPT_GID),
fsparam_u32oct ("mode", OPT_MODE),
{}
};
struct bpf_mount_opts {
+ kuid_t uid;
+ kgid_t gid;
umode_t mode;
};
@@ -642,6 +654,8 @@ static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
struct bpf_mount_opts *opts = fc->fs_private;
struct fs_parse_result result;
+ kuid_t uid;
+ kgid_t gid;
int opt;
opt = fs_parse(fc, bpf_fs_parameters, param, &result);
@@ -663,12 +677,42 @@ static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param)
}
switch (opt) {
+ case OPT_UID:
+ uid = make_kuid(current_user_ns(), result.uint_32);
+ if (!uid_valid(uid))
+ goto bad_value;
+
+ /*
+ * The requested uid must be representable in the
+ * filesystem's idmapping.
+ */
+ if (!kuid_has_mapping(fc->user_ns, uid))
+ goto bad_value;
+
+ opts->uid = uid;
+ break;
+ case OPT_GID:
+ gid = make_kgid(current_user_ns(), result.uint_32);
+ if (!gid_valid(gid))
+ goto bad_value;
+
+ /*
+ * The requested gid must be representable in the
+ * filesystem's idmapping.
+ */
+ if (!kgid_has_mapping(fc->user_ns, gid))
+ goto bad_value;
+
+ opts->gid = gid;
+ break;
case OPT_MODE:
opts->mode = result.uint_32 & S_IALLUGO;
break;
}
return 0;
+bad_value:
+ return invalfc(fc, "Bad value for '%s'", param->key);
}
struct bpf_preload_ops *bpf_preload_ops;
@@ -751,6 +795,8 @@ static int bpf_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_op = &bpf_super_ops;
inode = sb->s_root->d_inode;
+ inode->i_uid = opts->uid;
+ inode->i_gid = opts->gid;
inode->i_op = &bpf_dir_iops;
inode->i_mode &= ~S_IALLUGO;
populate_bpffs(sb->s_root);
@@ -786,6 +832,8 @@ static int bpf_init_fs_context(struct fs_context *fc)
return -ENOMEM;
opts->mode = S_IRWXUGO;
+ opts->uid = current_fsuid();
+ opts->gid = current_fsgid();
fc->fs_private = opts;
fc->ops = &bpf_context_ops;
diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c
index 850494423530..594a234f122b 100644
--- a/kernel/bpf/log.c
+++ b/kernel/bpf/log.c
@@ -10,6 +10,8 @@
#include <linux/bpf_verifier.h>
#include <linux/math64.h>
+#define verbose(env, fmt, args...) bpf_verifier_log_write(env, fmt, ##args)
+
static bool bpf_verifier_log_attr_valid(const struct bpf_verifier_log *log)
{
/* ubuf and len_total should both be specified (or not) together */
@@ -325,3 +327,505 @@ __printf(2, 3) void bpf_log(struct bpf_verifier_log *log,
va_end(args);
}
EXPORT_SYMBOL_GPL(bpf_log);
+
+/* Look up the line-info record covering instruction offset @insn_off.
+ *
+ * Returns NULL when the program has no line info or @insn_off is not below
+ * prog->len. Otherwise returns the entry just before the first entry (from
+ * index 1 on) whose insn_off is greater than @insn_off, or the last entry
+ * if there is none.
+ *
+ * NOTE(review): this assumes linfo[] is sorted by ascending insn_off -
+ * confirm where the array is validated.
+ */
+static const struct bpf_line_info *
+find_linfo(const struct bpf_verifier_env *env, u32 insn_off)
+{
+ const struct bpf_line_info *linfo;
+ const struct bpf_prog *prog;
+ u32 i, nr_linfo;
+
+ prog = env->prog;
+ nr_linfo = prog->aux->nr_linfo;
+
+ if (!nr_linfo || insn_off >= prog->len)
+ return NULL;
+
+ linfo = prog->aux->linfo;
+ /* start at 1 so that linfo[i - 1] below is always a valid entry */
+ for (i = 1; i < nr_linfo; i++)
+ if (insn_off < linfo[i].insn_off)
+ break;
+
+ return &linfo[i - 1];
+}
+
+/* Return a pointer to the first character of @s for which isspace() is
+ * false; the string itself is not modified.
+ */
+static const char *ltrim(const char *s)
+{
+ while (isspace(*s))
+ s++;
+
+ return s;
+}
+
+/* Log the source line recorded in line info for instruction @insn_off,
+ * optionally preceded by a prefix formatted from @prefix_fmt.
+ *
+ * Nothing is printed when the log is not needed, when find_linfo() finds no
+ * record, or when the record is the one remembered in env->prev_linfo (so
+ * the same line is not repeated back to back).
+ */
+__printf(3, 4) void verbose_linfo(struct bpf_verifier_env *env,
+ u32 insn_off,
+ const char *prefix_fmt, ...)
+{
+ const struct bpf_line_info *linfo;
+
+ if (!bpf_verifier_log_needed(&env->log))
+ return;
+
+ linfo = find_linfo(env, insn_off);
+ if (!linfo || linfo == env->prev_linfo)
+ return;
+
+ if (prefix_fmt) {
+ va_list args;
+
+ va_start(args, prefix_fmt);
+ bpf_verifier_vlog(&env->log, prefix_fmt, args);
+ va_end(args);
+ }
+
+ /* the line text comes from the program BTF, with leading whitespace
+ * stripped
+ */
+ verbose(env, "%s\n",
+ ltrim(btf_name_by_offset(env->prog->aux->btf,
+ linfo->line_off)));
+
+ env->prev_linfo = linfo;
+}
+
+/* Return the name string of BTF type @id in @btf.
+ *
+ * The result of btf_type_by_id() is dereferenced without a check, so
+ * callers are presumably expected to pass an id known to be valid.
+ */
+static const char *btf_type_name(const struct btf *btf, u32 id)
+{
+ return btf_name_by_offset(btf, btf_type_by_id(btf, id)->name_off);
+}
+
+/* string representation of 'enum bpf_reg_type'
+ *
+ * The string is built as <flag prefixes><base type name><null postfix> and
+ * written into env->tmp_str_buf, which is also the returned pointer.
+ *
+ * Note that reg_type_str() can not appear more than once in a single verbose()
+ * statement: every call overwrites the same env->tmp_str_buf.
+ */
+const char *reg_type_str(struct bpf_verifier_env *env, enum bpf_reg_type type)
+{
+ char postfix[16] = {0}, prefix[64] = {0};
+ static const char * const str[] = {
+ [NOT_INIT] = "?",
+ [SCALAR_VALUE] = "scalar",
+ [PTR_TO_CTX] = "ctx",
+ [CONST_PTR_TO_MAP] = "map_ptr",
+ [PTR_TO_MAP_VALUE] = "map_value",
+ [PTR_TO_STACK] = "fp",
+ [PTR_TO_PACKET] = "pkt",
+ [PTR_TO_PACKET_META] = "pkt_meta",
+ [PTR_TO_PACKET_END] = "pkt_end",
+ [PTR_TO_FLOW_KEYS] = "flow_keys",
+ [PTR_TO_SOCKET] = "sock",
+ [PTR_TO_SOCK_COMMON] = "sock_common",
+ [PTR_TO_TCP_SOCK] = "tcp_sock",
+ [PTR_TO_TP_BUFFER] = "tp_buffer",
+ [PTR_TO_XDP_SOCK] = "xdp_sock",
+ [PTR_TO_BTF_ID] = "ptr_",
+ [PTR_TO_MEM] = "mem",
+ [PTR_TO_BUF] = "buf",
+ [PTR_TO_FUNC] = "func",
+ [PTR_TO_MAP_KEY] = "map_key",
+ [CONST_PTR_TO_DYNPTR] = "dynptr_ptr",
+ };
+
+ /* "ptr_" already ends in '_', so for BTF pointers the underscore goes
+ * after "or_null" instead of before it.
+ */
+ if (type & PTR_MAYBE_NULL) {
+ if (base_type(type) == PTR_TO_BTF_ID)
+ strncpy(postfix, "or_null_", 16);
+ else
+ strncpy(postfix, "_or_null", 16);
+ }
+
+ /* one fixed-order prefix per modifier flag set in @type */
+ snprintf(prefix, sizeof(prefix), "%s%s%s%s%s%s%s",
+ type & MEM_RDONLY ? "rdonly_" : "",
+ type & MEM_RINGBUF ? "ringbuf_" : "",
+ type & MEM_USER ? "user_" : "",
+ type & MEM_PERCPU ? "percpu_" : "",
+ type & MEM_RCU ? "rcu_" : "",
+ type & PTR_UNTRUSTED ? "untrusted_" : "",
+ type & PTR_TRUSTED ? "trusted_" : ""
+ );
+
+ snprintf(env->tmp_str_buf, TMP_STR_BUF_LEN, "%s%s%s",
+ prefix, str[base_type(type)], postfix);
+ return env->tmp_str_buf;
+}
+
+/* Map a dynptr type to a static, human-readable name for log output.
+ * Unrecognized values trigger a one-time warning and yield "<unknown>".
+ */
+const char *dynptr_type_str(enum bpf_dynptr_type type)
+{
+ switch (type) {
+ case BPF_DYNPTR_TYPE_LOCAL:
+ return "local";
+ case BPF_DYNPTR_TYPE_RINGBUF:
+ return "ringbuf";
+ case BPF_DYNPTR_TYPE_SKB:
+ return "skb";
+ case BPF_DYNPTR_TYPE_XDP:
+ return "xdp";
+ case BPF_DYNPTR_TYPE_INVALID:
+ return "<invalid>";
+ default:
+ WARN_ONCE(1, "unknown dynptr type %d\n", type);
+ return "<unknown>";
+ }
+}
+
+/* Return the iterator type name for BTF type @btf_id with the leading
+ * ITER_PREFIX characters skipped, or "<invalid>" when @btf is NULL or
+ * @btf_id is 0.
+ */
+const char *iter_type_str(const struct btf *btf, u32 btf_id)
+{
+ if (!btf || btf_id == 0)
+ return "<invalid>";
+
+ /* we already validated that type is valid and has conforming name */
+ return btf_type_name(btf, btf_id) + sizeof(ITER_PREFIX) - 1;
+}
+
+/* Map an iterator state to a static, human-readable name for log output.
+ * Unrecognized values trigger a one-time warning and yield "<unknown>".
+ */
+const char *iter_state_str(enum bpf_iter_state state)
+{
+ switch (state) {
+ case BPF_ITER_STATE_ACTIVE:
+ return "active";
+ case BPF_ITER_STATE_DRAINED:
+ return "drained";
+ case BPF_ITER_STATE_INVALID:
+ return "<invalid>";
+ default:
+ WARN_ONCE(1, "unknown iter state %d\n", state);
+ return "<unknown>";
+ }
+}
+
+/* One-character tag per stack slot type; print_verifier_state() indexes
+ * this table with each byte's slot_type to build the per-slot type string.
+ */
+static char slot_type_char[] = {
+ [STACK_INVALID] = '?',
+ [STACK_SPILL] = 'r',
+ [STACK_MISC] = 'm',
+ [STACK_ZERO] = '0',
+ [STACK_DYNPTR] = 'd',
+ [STACK_ITER] = 'i',
+};
+
+/* Append a liveness suffix to the log: "_" followed by 'r', 'w' and/or 'D'
+ * for the READ, WRITTEN and DONE bits set in @live. Nothing is printed when
+ * none of those bits is set.
+ */
+static void print_liveness(struct bpf_verifier_env *env,
+ enum bpf_reg_liveness live)
+{
+ if (live & (REG_LIVE_READ | REG_LIVE_WRITTEN | REG_LIVE_DONE))
+ verbose(env, "_");
+ if (live & REG_LIVE_READ)
+ verbose(env, "r");
+ if (live & REG_LIVE_WRITTEN)
+ verbose(env, "w");
+ if (live & REG_LIVE_DONE)
+ verbose(env, "D");
+}
+
+/* Bounds of the value ranges that the log helpers print in decimal; values
+ * outside these bounds are printed in hex instead.
+ */
+#define UNUM_MAX_DECIMAL U16_MAX
+#define SNUM_MAX_DECIMAL S16_MAX
+#define SNUM_MIN_DECIMAL S16_MIN
+
+/* True when unsigned @num is at most UNUM_MAX_DECIMAL. */
+static bool is_unum_decimal(u64 num)
+{
+ return num <= UNUM_MAX_DECIMAL;
+}
+
+/* True when signed @num lies within [SNUM_MIN_DECIMAL, SNUM_MAX_DECIMAL]. */
+static bool is_snum_decimal(s64 num)
+{
+ return num >= SNUM_MIN_DECIMAL && num <= SNUM_MAX_DECIMAL;
+}
+
+/* Log unsigned @num: decimal if is_unum_decimal() accepts it, otherwise
+ * 0x-prefixed hex.
+ */
+static void verbose_unum(struct bpf_verifier_env *env, u64 num)
+{
+ if (is_unum_decimal(num))
+ verbose(env, "%llu", num);
+ else
+ verbose(env, "%#llx", num);
+}
+
+/* Log signed @num: decimal if is_snum_decimal() accepts it, otherwise
+ * 0x-prefixed hex of its bit pattern.
+ */
+static void verbose_snum(struct bpf_verifier_env *env, s64 num)
+{
+ if (is_snum_decimal(num))
+ verbose(env, "%lld", num);
+ else
+ verbose(env, "%#llx", num);
+}
+
+/* Format tnum @a into @str (at most @size bytes, via snprintf()).
+ *
+ * A fully known tnum (mask == 0) is printed as a single constant, decimal
+ * or hex depending on is_unum_decimal(); otherwise it is printed as
+ * "(value; mask)" in hex. Returns whatever snprintf() returns.
+ */
+int tnum_strn(char *str, size_t size, struct tnum a)
+{
+ /* print as a constant, if tnum is fully known */
+ if (a.mask == 0) {
+ if (is_unum_decimal(a.value))
+ return snprintf(str, size, "%llu", a.value);
+ else
+ return snprintf(str, size, "%#llx", a.value);
+ }
+ return snprintf(str, size, "(%#llx; %#llx)", a.value, a.mask);
+}
+EXPORT_SYMBOL_GPL(tnum_strn);
+
+/* Log the min/max bounds of @reg as a list of "name=value" items, each
+ * preceded by *@sep; *@sep is set to "," once something has been printed.
+ *
+ * Bounds still at their widest possible value are omitted. Bounds of the
+ * same kind (min with min, max with max) that share a value are merged
+ * into one item, e.g. "smin=umin=<value>".
+ */
+static void print_scalar_ranges(struct bpf_verifier_env *env,
+ const struct bpf_reg_state *reg,
+ const char **sep)
+{
+ /* For signed ranges, we want to unify 64-bit and 32-bit values in the
+ * output as much as possible, but there is a bit of a complication.
+ * If we choose to print values as decimals, this is natural to do,
+ * because negative 64-bit and 32-bit values >= S32_MIN have the same
+ * representation due to sign extension. But if we choose to print
+ * them in hex format (see is_snum_decimal()), then sign extension is
+ * misleading.
+ * E.g., smin=-2 and smin32=-2 are exactly the same in decimal, but in
+ * hex they will be smin=0xfffffffffffffffe and smin32=0xfffffffe, two
+ * very different numbers.
+ * So we avoid sign extension if we choose to print values in hex.
+ */
+ struct {
+ const char *name;
+ u64 val;
+ bool omit;
+ } minmaxs[] = {
+ {"smin", reg->smin_value, reg->smin_value == S64_MIN},
+ {"smax", reg->smax_value, reg->smax_value == S64_MAX},
+ {"umin", reg->umin_value, reg->umin_value == 0},
+ {"umax", reg->umax_value, reg->umax_value == U64_MAX},
+ {"smin32",
+ is_snum_decimal((s64)reg->s32_min_value)
+ ? (s64)reg->s32_min_value
+ : (u32)reg->s32_min_value, reg->s32_min_value == S32_MIN},
+ {"smax32",
+ is_snum_decimal((s64)reg->s32_max_value)
+ ? (s64)reg->s32_max_value
+ : (u32)reg->s32_max_value, reg->s32_max_value == S32_MAX},
+ {"umin32", reg->u32_min_value, reg->u32_min_value == 0},
+ {"umax32", reg->u32_max_value, reg->u32_max_value == U32_MAX},
+ }, *m1, *m2, *mend = &minmaxs[ARRAY_SIZE(minmaxs)];
+ bool neg1, neg2;
+
+ for (m1 = &minmaxs[0]; m1 < mend; m1++) {
+ if (m1->omit)
+ continue;
+
+ neg1 = m1->name[0] == 's' && (s64)m1->val < 0;
+
+ verbose(env, "%s%s=", *sep, m1->name);
+ *sep = ",";
+
+ /* The table alternates min, max, min, max, ... so stepping by
+ * two from m1 visits only later bounds of the same kind.
+ */
+ for (m2 = m1 + 2; m2 < mend; m2 += 2) {
+ if (m2->omit || m2->val != m1->val)
+ continue;
+ /* don't mix negatives with positives */
+ neg2 = m2->name[0] == 's' && (s64)m2->val < 0;
+ if (neg2 != neg1)
+ continue;
+ /* merged into m1's item; skip it when the outer loop
+ * reaches it
+ */
+ m2->omit = true;
+ verbose(env, "%s=", m2->name);
+ }
+
+ if (m1->name[0] == 's')
+ verbose_snum(env, m1->val);
+ else
+ verbose_unum(env, m1->val);
+ }
+}
+
+/* True when the base type of @t is one of the map-related pointer types
+ * (CONST_PTR_TO_MAP, PTR_TO_MAP_KEY, PTR_TO_MAP_VALUE); print_reg_state()
+ * uses this to decide whether to read reg->map_ptr.
+ */
+static bool type_is_map_ptr(enum bpf_reg_type t) {
+ switch (base_type(t)) {
+ case CONST_PTR_TO_MAP:
+ case PTR_TO_MAP_KEY:
+ case PTR_TO_MAP_VALUE:
+ return true;
+ default:
+ return false;
+ }
+}
+
+/*
+ * _a stands for append, was shortened to avoid multiline statements below.
+ * This macro is used to output a comma separated list of attributes.
+ *
+ * It expands to code that uses the identifiers 'env' and 'sep' from the
+ * calling scope: 'sep' is printed before the formatted text and is then
+ * reassigned to ",".
+ */
+#define verbose_a(fmt, ...) ({ verbose(env, "%s" fmt, sep, ##__VA_ARGS__); sep = ","; })
+
+/* Log a textual description of register state @reg; @state is the frame
+ * the register is printed for.
+ *
+ * Constant scalars, and stack pointers with a constant offset, are printed
+ * in short form. Everything else is printed as the type name followed by a
+ * parenthesized, comma-separated attribute list.
+ */
+static void print_reg_state(struct bpf_verifier_env *env,
+ const struct bpf_func_state *state,
+ const struct bpf_reg_state *reg)
+{
+ enum bpf_reg_type t;
+ const char *sep = "";
+
+ t = reg->type;
+ if (t == SCALAR_VALUE && reg->precise)
+ verbose(env, "P");
+ if (t == SCALAR_VALUE && tnum_is_const(reg->var_off)) {
+ /* reg->off should be 0 for SCALAR_VALUE */
+ verbose_snum(env, reg->var_off.value + reg->off);
+ return;
+ }
+
+ verbose(env, "%s", reg_type_str(env, t));
+ if (t == PTR_TO_STACK) {
+ /* frame number is shown only when it differs from @state's */
+ if (state->frameno != reg->frameno)
+ verbose(env, "[%d]", reg->frameno);
+ if (tnum_is_const(reg->var_off)) {
+ verbose_snum(env, reg->var_off.value + reg->off);
+ return;
+ }
+ }
+ if (base_type(t) == PTR_TO_BTF_ID)
+ verbose(env, "%s", btf_type_name(reg->btf, reg->btf_id));
+ verbose(env, "(");
+ if (reg->id)
+ verbose_a("id=%d", reg->id);
+ if (reg->ref_obj_id)
+ verbose_a("ref_obj_id=%d", reg->ref_obj_id);
+ if (type_is_non_owning_ref(reg->type))
+ verbose_a("%s", "non_own_ref");
+ if (type_is_map_ptr(t)) {
+ if (reg->map_ptr->name[0])
+ verbose_a("map=%s", reg->map_ptr->name);
+ verbose_a("ks=%d,vs=%d",
+ reg->map_ptr->key_size,
+ reg->map_ptr->value_size);
+ }
+ if (t != SCALAR_VALUE && reg->off) {
+ verbose_a("off=");
+ verbose_snum(env, reg->off);
+ }
+ if (type_is_pkt_pointer(t)) {
+ verbose_a("r=");
+ verbose_unum(env, reg->range);
+ }
+ if (base_type(t) == PTR_TO_MEM) {
+ verbose_a("sz=");
+ verbose_unum(env, reg->mem_size);
+ }
+ if (t == CONST_PTR_TO_DYNPTR)
+ verbose_a("type=%s", dynptr_type_str(reg->dynptr.type));
+ if (tnum_is_const(reg->var_off)) {
+ /* a pointer register with fixed offset */
+ if (reg->var_off.value) {
+ verbose_a("imm=");
+ verbose_snum(env, reg->var_off.value);
+ }
+ } else {
+ print_scalar_ranges(env, reg, &sep);
+ if (!tnum_is_unknown(reg->var_off)) {
+ char tn_buf[48];
+
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+ verbose_a("var_off=%s", tn_buf);
+ }
+ }
+ verbose(env, ")");
+}
+
+/* Log one line describing function frame @state: initialized registers,
+ * then stack slots, then acquired reference ids and callback markers,
+ * terminated by a newline.
+ *
+ * With @print_all false only registers and stack slots reported as
+ * scratched are printed, and mark_verifier_state_clean() is called at the
+ * end.
+ */
+void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_func_state *state,
+ bool print_all)
+{
+ const struct bpf_reg_state *reg;
+ int i;
+
+ if (state->frameno)
+ verbose(env, " frame%d:", state->frameno);
+ for (i = 0; i < MAX_BPF_REG; i++) {
+ reg = &state->regs[i];
+ if (reg->type == NOT_INIT)
+ continue;
+ if (!print_all && !reg_scratched(env, i))
+ continue;
+ verbose(env, " R%d", i);
+ print_liveness(env, reg->live);
+ verbose(env, "=");
+ print_reg_state(env, state, reg);
+ }
+ for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
+ char types_buf[BPF_REG_SIZE + 1];
+ const char *sep = "";
+ bool valid = false;
+ u8 slot_type;
+ int j;
+
+ if (!print_all && !stack_slot_scratched(env, i))
+ continue;
+
+ /* build the per-byte type string; slots whose bytes are all
+ * STACK_INVALID are not printed
+ */
+ for (j = 0; j < BPF_REG_SIZE; j++) {
+ slot_type = state->stack[i].slot_type[j];
+ if (slot_type != STACK_INVALID)
+ valid = true;
+ types_buf[j] = slot_type_char[slot_type];
+ }
+ types_buf[BPF_REG_SIZE] = 0;
+ if (!valid)
+ continue;
+
+ reg = &state->stack[i].spilled_ptr;
+ /* the type of the slot's last byte selects the output format */
+ switch (state->stack[i].slot_type[BPF_REG_SIZE - 1]) {
+ case STACK_SPILL:
+ /* print MISC/ZERO/INVALID slots above subreg spill */
+ for (j = 0; j < BPF_REG_SIZE; j++)
+ if (state->stack[i].slot_type[j] == STACK_SPILL)
+ break;
+ types_buf[j] = '\0';
+
+ verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
+ print_liveness(env, reg->live);
+ verbose(env, "=%s", types_buf);
+ print_reg_state(env, state, reg);
+ break;
+ case STACK_DYNPTR:
+ /* skip to main dynptr slot */
+ i += BPF_DYNPTR_NR_SLOTS - 1;
+ reg = &state->stack[i].spilled_ptr;
+
+ verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
+ print_liveness(env, reg->live);
+ verbose(env, "=dynptr_%s(", dynptr_type_str(reg->dynptr.type));
+ if (reg->id)
+ verbose_a("id=%d", reg->id);
+ if (reg->ref_obj_id)
+ verbose_a("ref_id=%d", reg->ref_obj_id);
+ if (reg->dynptr_id)
+ verbose_a("dynptr_id=%d", reg->dynptr_id);
+ verbose(env, ")");
+ break;
+ case STACK_ITER:
+ /* only main slot has ref_obj_id set; skip others */
+ if (!reg->ref_obj_id)
+ continue;
+
+ verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
+ print_liveness(env, reg->live);
+ verbose(env, "=iter_%s(ref_id=%d,state=%s,depth=%u)",
+ iter_type_str(reg->iter.btf, reg->iter.btf_id),
+ reg->ref_obj_id, iter_state_str(reg->iter.state),
+ reg->iter.depth);
+ break;
+ case STACK_MISC:
+ case STACK_ZERO:
+ default:
+ verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
+ print_liveness(env, reg->live);
+ verbose(env, "=%s", types_buf);
+ break;
+ }
+ }
+ /* the refs list is printed only when refs[0].id is non-zero; later
+ * entries with a zero id are skipped
+ */
+ if (state->acquired_refs && state->refs[0].id) {
+ verbose(env, " refs=%d", state->refs[0].id);
+ for (i = 1; i < state->acquired_refs; i++)
+ if (state->refs[i].id)
+ verbose(env, ",%d", state->refs[i].id);
+ }
+ if (state->in_callback_fn)
+ verbose(env, " cb");
+ if (state->in_async_callback_fn)
+ verbose(env, " async_cb");
+ verbose(env, "\n");
+ if (!print_all)
+ mark_verifier_state_clean(env);
+}
+
+/* Compute the padding width print_insn_state() passes to "%*c" before
+ * appending register state to an already printed instruction line.
+ *
+ * NOTE(review): @pos is presumably the length of the instruction text
+ * already on the line - confirm where env->prev_insn_print_pos is set.
+ */
+static inline u32 vlog_alignment(u32 pos)
+{
+ return round_up(max(pos + BPF_LOG_MIN_ALIGNMENT / 2, BPF_LOG_ALIGNMENT),
+ BPF_LOG_MIN_ALIGNMENT) - pos - 1;
+}
+
+/* Log the scratched part of @state for the current instruction.
+ *
+ * If env->prev_log_pos is non-zero and equals the current log end
+ * position, the state is appended to the line already ending there:
+ * the trailing newline is dropped, then padding and ';' are printed.
+ * Otherwise a new line is started with "<insn_idx>:".
+ */
+void print_insn_state(struct bpf_verifier_env *env, const struct bpf_func_state *state)
+{
+ if (env->prev_log_pos && env->prev_log_pos == env->log.end_pos) {
+ /* remove new line character */
+ bpf_vlog_reset(&env->log, env->prev_log_pos - 1);
+ verbose(env, "%*c;", vlog_alignment(env->prev_insn_print_pos), ' ');
+ } else {
+ verbose(env, "%d:", env->insn_idx);
+ }
+ print_verifier_state(env, state, false);
+}
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 17c7e7782a1f..b32be680da6c 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -231,6 +231,9 @@ static void *trie_lookup_elem(struct bpf_map *map, void *_key)
struct lpm_trie_node *node, *found = NULL;
struct bpf_lpm_trie_key *key = _key;
+ if (key->prefixlen > trie->max_prefixlen)
+ return NULL;
+
/* Start walking the trie from the root node ... */
for (node = rcu_dereference_check(trie->root, rcu_read_lock_bh_held());
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
index cd5eafaba97e..8ef269e66ba5 100644
--- a/kernel/bpf/map_in_map.c
+++ b/kernel/bpf/map_in_map.c
@@ -127,12 +127,21 @@ void *bpf_map_fd_get_ptr(struct bpf_map *map,
return inner_map;
}
-void bpf_map_fd_put_ptr(void *ptr)
+void bpf_map_fd_put_ptr(struct bpf_map *map, void *ptr, bool need_defer)
{
- /* ptr->ops->map_free() has to go through one
- * rcu grace period by itself.
+ struct bpf_map *inner_map = ptr;
+
+ /* Defer the freeing of inner map according to the sleepable attribute
+ * of bpf program which owns the outer map, so unnecessary waiting for
+ * RCU tasks trace grace period can be avoided.
*/
- bpf_map_put(ptr);
+ if (need_defer) {
+ if (atomic64_read(&map->sleepable_refcnt))
+ WRITE_ONCE(inner_map->free_after_mult_rcu_gp, true);
+ else
+ WRITE_ONCE(inner_map->free_after_rcu_gp, true);
+ }
+ bpf_map_put(inner_map);
}
u32 bpf_map_fd_sys_lookup_elem(void *ptr)
diff --git a/kernel/bpf/map_in_map.h b/kernel/bpf/map_in_map.h
index bcb7534afb3c..7d61602354de 100644
--- a/kernel/bpf/map_in_map.h
+++ b/kernel/bpf/map_in_map.h
@@ -13,7 +13,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd);
void bpf_map_meta_free(struct bpf_map *map_meta);
void *bpf_map_fd_get_ptr(struct bpf_map *map, struct file *map_file,
int ufd);
-void bpf_map_fd_put_ptr(void *ptr);
+void bpf_map_fd_put_ptr(struct bpf_map *map, void *ptr, bool need_defer);
u32 bpf_map_fd_sys_lookup_elem(void *ptr);
#endif
diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c
index 6fc9dae9edc8..6abd7c5df4b3 100644
--- a/kernel/bpf/map_iter.c
+++ b/kernel/bpf/map_iter.c
@@ -193,9 +193,7 @@ static int __init bpf_map_iter_init(void)
late_initcall(bpf_map_iter_init);
-__diag_push();
-__diag_ignore_all("-Wmissing-prototypes",
- "Global functions as their definitions will be in vmlinux BTF");
+__bpf_kfunc_start_defs();
__bpf_kfunc s64 bpf_map_sum_elem_count(const struct bpf_map *map)
{
@@ -213,7 +211,7 @@ __bpf_kfunc s64 bpf_map_sum_elem_count(const struct bpf_map *map)
return ret;
}
-__diag_pop();
+__bpf_kfunc_end_defs();
BTF_SET8_START(bpf_map_iter_kfunc_ids)
BTF_ID_FLAGS(func, bpf_map_sum_elem_count, KF_TRUSTED_ARGS)
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index 9c49ae53deaf..550f02e2cb13 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -121,6 +121,8 @@ struct bpf_mem_caches {
struct bpf_mem_cache cache[NUM_CACHES];
};
+/* Unit size assigned to cache[i] of each struct bpf_mem_caches; note that
+ * the entries are not in ascending order (96 and 192 come first).
+ */
+static const u16 sizes[NUM_CACHES] = {96, 192, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096};
+
static struct llist_node notrace *__llist_del_first(struct llist_head *head)
{
struct llist_node *entry, *next;
@@ -340,6 +342,7 @@ static void free_bulk(struct bpf_mem_cache *c)
int cnt;
WARN_ON_ONCE(tgt->unit_size != c->unit_size);
+ WARN_ON_ONCE(tgt->percpu_size != c->percpu_size);
do {
inc_active(c, &flags);
@@ -365,6 +368,9 @@ static void __free_by_rcu(struct rcu_head *head)
struct bpf_mem_cache *tgt = c->tgt;
struct llist_node *llnode;
+ WARN_ON_ONCE(tgt->unit_size != c->unit_size);
+ WARN_ON_ONCE(tgt->percpu_size != c->percpu_size);
+
llnode = llist_del_all(&c->waiting_for_gp);
if (!llnode)
goto out;
@@ -458,12 +464,17 @@ static void notrace irq_work_raise(struct bpf_mem_cache *c)
* consume ~ 11 Kbyte per cpu.
* Typical case will be between 11K and 116K closer to 11K.
* bpf progs can and should share bpf_mem_cache when possible.
+ *
+ * Percpu allocation is typically rare. To avoid potential unnecessary large
+ * memory consumption, set low_mark = 1 and high_mark = 3, resulting in c->batch = 1.
*/
-
-static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
+static void init_refill_work(struct bpf_mem_cache *c)
{
init_irq_work(&c->refill_work, bpf_mem_refill);
- if (c->unit_size <= 256) {
+ if (c->percpu_size) {
+ c->low_watermark = 1;
+ c->high_watermark = 3;
+ } else if (c->unit_size <= 256) {
c->low_watermark = 32;
c->high_watermark = 96;
} else {
@@ -476,12 +487,20 @@ static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
c->high_watermark = max(96 * 256 / c->unit_size, 3);
}
c->batch = max((c->high_watermark - c->low_watermark) / 4 * 3, 1);
+}
- /* To avoid consuming memory assume that 1st run of bpf
- * prog won't be doing more than 4 map_update_elem from
- * irq disabled region
+static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
+{
+ int cnt = 1;
+
+ /* To avoid consuming memory, for non-percpu allocation, assume that
+ * 1st run of bpf prog won't be doing more than 4 map_update_elem from
+ * irq disabled region if unit size is less than or equal to 256.
+ * For all other cases, let us just do one allocation.
*/
- alloc_bulk(c, c->unit_size <= 256 ? 4 : 1, cpu_to_node(cpu), false);
+ if (!c->percpu_size && c->unit_size <= 256)
+ cnt = 4;
+ alloc_bulk(c, cnt, cpu_to_node(cpu), false);
}
/* When size != 0 bpf_mem_cache for each cpu.
@@ -493,21 +512,25 @@ static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
*/
int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
{
- static u16 sizes[NUM_CACHES] = {96, 192, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096};
struct bpf_mem_caches *cc, __percpu *pcc;
struct bpf_mem_cache *c, __percpu *pc;
struct obj_cgroup *objcg = NULL;
int cpu, i, unit_size, percpu_size = 0;
+ if (percpu && size == 0)
+ return -EINVAL;
+
+ /* room for llist_node and per-cpu pointer */
+ if (percpu)
+ percpu_size = LLIST_NODE_SZ + sizeof(void *);
+ ma->percpu = percpu;
+
if (size) {
pc = __alloc_percpu_gfp(sizeof(*pc), 8, GFP_KERNEL);
if (!pc)
return -ENOMEM;
- if (percpu)
- /* room for llist_node and per-cpu pointer */
- percpu_size = LLIST_NODE_SZ + sizeof(void *);
- else
+ if (!percpu)
size += LLIST_NODE_SZ; /* room for llist_node */
unit_size = size;
@@ -515,39 +538,93 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
if (memcg_bpf_enabled())
objcg = get_obj_cgroup_from_current();
#endif
+ ma->objcg = objcg;
+
for_each_possible_cpu(cpu) {
c = per_cpu_ptr(pc, cpu);
c->unit_size = unit_size;
c->objcg = objcg;
c->percpu_size = percpu_size;
c->tgt = c;
+ init_refill_work(c);
prefill_mem_cache(c, cpu);
}
ma->cache = pc;
return 0;
}
- /* size == 0 && percpu is an invalid combination */
- if (WARN_ON_ONCE(percpu))
- return -EINVAL;
-
pcc = __alloc_percpu_gfp(sizeof(*cc), 8, GFP_KERNEL);
if (!pcc)
return -ENOMEM;
#ifdef CONFIG_MEMCG_KMEM
objcg = get_obj_cgroup_from_current();
#endif
+ ma->objcg = objcg;
for_each_possible_cpu(cpu) {
cc = per_cpu_ptr(pcc, cpu);
for (i = 0; i < NUM_CACHES; i++) {
c = &cc->cache[i];
c->unit_size = sizes[i];
c->objcg = objcg;
+ c->percpu_size = percpu_size;
c->tgt = c;
+
+ init_refill_work(c);
prefill_mem_cache(c, cpu);
}
}
+
+ ma->caches = pcc;
+ return 0;
+}
+
+/* Set up @ma as a per-cpu allocator: allocate the per-cpu array of
+ * struct bpf_mem_caches and record @objcg and the percpu flag in @ma.
+ *
+ * The individual caches are not initialized here; that is done per unit
+ * size by bpf_mem_alloc_percpu_unit_init().
+ *
+ * Returns 0 on success or -ENOMEM if the per-cpu allocation fails.
+ */
+int bpf_mem_alloc_percpu_init(struct bpf_mem_alloc *ma, struct obj_cgroup *objcg)
+{
+ struct bpf_mem_caches __percpu *pcc;
+
+ pcc = __alloc_percpu_gfp(sizeof(struct bpf_mem_caches), 8, GFP_KERNEL);
+ if (!pcc)
+ return -ENOMEM;
+
ma->caches = pcc;
+ ma->objcg = objcg;
+ ma->percpu = true;
+ return 0;
+}
+
+/* Initialize, on every possible cpu, the cache of @ma that serves
+ * allocations of @size bytes.
+ *
+ * Returns -EINVAL when bpf_mem_cache_idx() has no cache for @size, and
+ * 0 otherwise.
+ */
+int bpf_mem_alloc_percpu_unit_init(struct bpf_mem_alloc *ma, int size)
+{
+ struct bpf_mem_caches *cc, __percpu *pcc;
+ int cpu, i, unit_size, percpu_size;
+ struct obj_cgroup *objcg;
+ struct bpf_mem_cache *c;
+
+ i = bpf_mem_cache_idx(size);
+ if (i < 0)
+ return -EINVAL;
+
+ /* room for llist_node and per-cpu pointer */
+ percpu_size = LLIST_NODE_SZ + sizeof(void *);
+
+ unit_size = sizes[i];
+ objcg = ma->objcg;
+ pcc = ma->caches;
+
+ for_each_possible_cpu(cpu) {
+ cc = per_cpu_ptr(pcc, cpu);
+ c = &cc->cache[i];
+ /* A non-zero unit_size means this cache was set up by an
+ * earlier call. The loop stops at the first such cpu,
+ * presumably because all cpus are always initialized together
+ * - TODO confirm.
+ */
+ if (c->unit_size)
+ break;
+
+ c->unit_size = unit_size;
+ c->objcg = objcg;
+ c->percpu_size = percpu_size;
+ c->tgt = c;
+
+ init_refill_work(c);
+ prefill_mem_cache(c, cpu);
+ }
+
return 0;
}
@@ -682,9 +759,8 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress);
rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
}
- /* objcg is the same across cpus */
- if (c->objcg)
- obj_cgroup_put(c->objcg);
+ if (ma->objcg)
+ obj_cgroup_put(ma->objcg);
destroy_mem_alloc(ma, rcu_in_progress);
}
if (ma->caches) {
@@ -700,8 +776,8 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
}
}
- if (c->objcg)
- obj_cgroup_put(c->objcg);
+ if (ma->objcg)
+ obj_cgroup_put(ma->objcg);
destroy_mem_alloc(ma, rcu_in_progress);
}
}
@@ -734,12 +810,17 @@ static void notrace *unit_alloc(struct bpf_mem_cache *c)
}
}
local_dec(&c->active);
- local_irq_restore(flags);
WARN_ON(cnt < 0);
if (cnt < c->low_watermark)
irq_work_raise(c);
+ /* Enable IRQ after the enqueue of irq work completes, so irq work
+ * will run after IRQ is enabled and free_llist may be refilled by
+ * irq work before other task preempts current task.
+ */
+ local_irq_restore(flags);
+
return llnode;
}
@@ -775,11 +856,16 @@ static void notrace unit_free(struct bpf_mem_cache *c, void *ptr)
llist_add(llnode, &c->free_llist_extra);
}
local_dec(&c->active);
- local_irq_restore(flags);
if (cnt > c->high_watermark)
/* free few objects from current cpu into global kmalloc pool */
irq_work_raise(c);
+ /* Enable IRQ after irq_work_raise() completes, otherwise when current
+ * task is preempted by a task which does unit_alloc(), unit_alloc() may
+ * return NULL unexpectedly because irq work is already pending but
+ * cannot be triggered, so free_llist cannot be refilled in time.
+ */
+ local_irq_restore(flags);
}
static void notrace unit_free_rcu(struct bpf_mem_cache *c, void *ptr)
@@ -797,10 +883,10 @@ static void notrace unit_free_rcu(struct bpf_mem_cache *c, void *ptr)
llist_add(llnode, &c->free_llist_extra_rcu);
}
local_dec(&c->active);
- local_irq_restore(flags);
if (!atomic_read(&c->call_rcu_in_progress))
irq_work_raise(c);
+ local_irq_restore(flags);
}
/* Called from BPF program or from sys_bpf syscall.
@@ -812,9 +898,11 @@ void notrace *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size)
void *ret;
if (!size)
- return ZERO_SIZE_PTR;
+ return NULL;
- idx = bpf_mem_cache_idx(size + LLIST_NODE_SZ);
+ if (!ma->percpu)
+ size += LLIST_NODE_SZ;
+ idx = bpf_mem_cache_idx(size);
if (idx < 0)
return NULL;
@@ -824,13 +912,15 @@ void notrace *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size)
void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr)
{
+ struct bpf_mem_cache *c;
int idx;
if (!ptr)
return;
- idx = bpf_mem_cache_idx(ksize(ptr - LLIST_NODE_SZ));
- if (idx < 0)
+ c = *(void **)(ptr - LLIST_NODE_SZ);
+ idx = bpf_mem_cache_idx(c->unit_size);
+ if (WARN_ON_ONCE(idx < 0))
return;
unit_free(this_cpu_ptr(ma->caches)->cache + idx, ptr);
@@ -838,13 +928,15 @@ void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr)
void notrace bpf_mem_free_rcu(struct bpf_mem_alloc *ma, void *ptr)
{
+ struct bpf_mem_cache *c;
int idx;
if (!ptr)
return;
- idx = bpf_mem_cache_idx(ksize(ptr - LLIST_NODE_SZ));
- if (idx < 0)
+ c = *(void **)(ptr - LLIST_NODE_SZ);
+ idx = bpf_mem_cache_idx(c->unit_size);
+ if (WARN_ON_ONCE(idx < 0))
return;
unit_free_rcu(this_cpu_ptr(ma->caches)->cache + idx, ptr);
@@ -910,6 +1002,8 @@ void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags)
memcg = get_memcg(c);
old_memcg = set_active_memcg(memcg);
ret = __alloc(c, NUMA_NO_NODE, GFP_KERNEL | __GFP_NOWARN | __GFP_ACCOUNT);
+ if (ret)
+ *(struct bpf_mem_cache **)ret = c;
set_active_memcg(old_memcg);
mem_cgroup_put(memcg);
}
diff --git a/kernel/bpf/mprog.c b/kernel/bpf/mprog.c
index 32d2c4829eb8..1394168062e8 100644
--- a/kernel/bpf/mprog.c
+++ b/kernel/bpf/mprog.c
@@ -253,6 +253,9 @@ int bpf_mprog_attach(struct bpf_mprog_entry *entry,
goto out;
}
idx = tidx;
+ } else if (bpf_mprog_total(entry) == bpf_mprog_max()) {
+ ret = -ERANGE;
+ goto out;
}
if (flags & BPF_F_BEFORE) {
tidx = bpf_mprog_pos_before(entry, &rtuple);
@@ -398,14 +401,16 @@ int bpf_mprog_query(const union bpf_attr *attr, union bpf_attr __user *uattr,
struct bpf_mprog_cp *cp;
struct bpf_prog *prog;
const u32 flags = 0;
+ u32 id, count = 0;
+ u64 revision = 1;
int i, ret = 0;
- u32 id, count;
- u64 revision;
if (attr->query.query_flags || attr->query.attach_flags)
return -EINVAL;
- revision = bpf_mprog_revision(entry);
- count = bpf_mprog_total(entry);
+ if (entry) {
+ revision = bpf_mprog_revision(entry);
+ count = bpf_mprog_total(entry);
+ }
if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)))
return -EFAULT;
if (copy_to_user(&uattr->query.revision, &revision, sizeof(revision)))
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index 3e4f2ec1af06..1a4fec330eaa 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -199,12 +199,14 @@ static int __bpf_prog_dev_bound_init(struct bpf_prog *prog, struct net_device *n
offload->netdev = netdev;
ondev = bpf_offload_find_netdev(offload->netdev);
+ /* When program is offloaded require presence of "true"
+ * bpf_offload_netdev, avoid the one created for !ondev case below.
+ */
+ if (bpf_prog_is_offloaded(prog->aux) && (!ondev || !ondev->offdev)) {
+ err = -EINVAL;
+ goto err_free;
+ }
if (!ondev) {
- if (bpf_prog_is_offloaded(prog->aux)) {
- err = -EINVAL;
- goto err_free;
- }
-
/* When only binding to the device, explicitly
* create an entry in the hashtable.
*/
@@ -232,7 +234,14 @@ int bpf_prog_dev_bound_init(struct bpf_prog *prog, union bpf_attr *attr)
attr->prog_type != BPF_PROG_TYPE_XDP)
return -EINVAL;
- if (attr->prog_flags & ~BPF_F_XDP_DEV_BOUND_ONLY)
+ if (attr->prog_flags & ~(BPF_F_XDP_DEV_BOUND_ONLY | BPF_F_XDP_HAS_FRAGS))
+ return -EINVAL;
+
+ /* Frags are allowed only if program is dev-bound-only, but not
+ * if it is requesting bpf offload.
+ */
+ if (attr->prog_flags & BPF_F_XDP_HAS_FRAGS &&
+ !(attr->prog_flags & BPF_F_XDP_DEV_BOUND_ONLY))
return -EINVAL;
if (attr->prog_type == BPF_PROG_TYPE_SCHED_CLS &&
@@ -845,10 +854,11 @@ void *bpf_dev_bound_resolve_kfunc(struct bpf_prog *prog, u32 func_id)
if (!ops)
goto out;
- if (func_id == bpf_xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_TIMESTAMP))
- p = ops->xmo_rx_timestamp;
- else if (func_id == bpf_xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_HASH))
- p = ops->xmo_rx_hash;
+#define XDP_METADATA_KFUNC(name, _, __, xmo) \
+ if (func_id == bpf_xdp_metadata_kfunc_id(name)) p = ops->xmo;
+ XDP_METADATA_KFUNC_xxx
+#undef XDP_METADATA_KFUNC
+
out:
up_read(&bpf_devs_lock);
diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c
index 8d2ddcb7566b..d869f51ea93a 100644
--- a/kernel/bpf/queue_stack_maps.c
+++ b/kernel/bpf/queue_stack_maps.c
@@ -98,7 +98,12 @@ static long __queue_map_get(struct bpf_map *map, void *value, bool delete)
int err = 0;
void *ptr;
- raw_spin_lock_irqsave(&qs->lock, flags);
+ if (in_nmi()) {
+ if (!raw_spin_trylock_irqsave(&qs->lock, flags))
+ return -EBUSY;
+ } else {
+ raw_spin_lock_irqsave(&qs->lock, flags);
+ }
if (queue_stack_map_is_empty(qs)) {
memset(value, 0, qs->map.value_size);
@@ -128,7 +133,12 @@ static long __stack_map_get(struct bpf_map *map, void *value, bool delete)
void *ptr;
u32 index;
- raw_spin_lock_irqsave(&qs->lock, flags);
+ if (in_nmi()) {
+ if (!raw_spin_trylock_irqsave(&qs->lock, flags))
+ return -EBUSY;
+ } else {
+ raw_spin_lock_irqsave(&qs->lock, flags);
+ }
if (queue_stack_map_is_empty(qs)) {
memset(value, 0, qs->map.value_size);
@@ -193,7 +203,12 @@ static long queue_stack_map_push_elem(struct bpf_map *map, void *value,
if (flags & BPF_NOEXIST || flags > BPF_EXIST)
return -EINVAL;
- raw_spin_lock_irqsave(&qs->lock, irq_flags);
+ if (in_nmi()) {
+ if (!raw_spin_trylock_irqsave(&qs->lock, irq_flags))
+ return -EBUSY;
+ } else {
+ raw_spin_lock_irqsave(&qs->lock, irq_flags);
+ }
if (queue_stack_map_is_full(qs)) {
if (!replace) {
diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c
index f045fde632e5..0ee653a936ea 100644
--- a/kernel/bpf/ringbuf.c
+++ b/kernel/bpf/ringbuf.c
@@ -770,8 +770,7 @@ schedule_work_return:
/* Prevent the clearing of the busy-bit from being reordered before the
* storing of any rb consumer or producer positions.
*/
- smp_mb__before_atomic();
- atomic_set(&rb->busy, 0);
+ atomic_set_release(&rb->busy, 0);
if (flags & BPF_RB_FORCE_WAKEUP)
irq_work_queue(&rb->work);
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 458bb80b14d5..dff7ba539701 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -28,7 +28,7 @@ struct bpf_stack_map {
void *elems;
struct pcpu_freelist freelist;
u32 n_buckets;
- struct stack_map_bucket *buckets[];
+ struct stack_map_bucket *buckets[] __counted_by(n_buckets);
};
static inline bool stack_map_use_build_id(struct bpf_map *map)
@@ -388,6 +388,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
{
u32 trace_nr, copy_len, elem_size, num_elem, max_depth;
bool user_build_id = flags & BPF_F_USER_BUILD_ID;
+ bool crosstask = task && task != current;
u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
bool user = flags & BPF_F_USER_STACK;
struct perf_callchain_entry *trace;
@@ -410,6 +411,14 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
if (task && user && !user_mode(regs))
goto err_fault;
+ /* get_perf_callchain does not support crosstask user stack walking
+ * but returns an empty stack instead of NULL.
+ */
+ if (crosstask && user) {
+ err = -EOPNOTSUPP;
+ goto clear;
+ }
+
num_elem = size / elem_size;
max_depth = num_elem + skip;
if (sysctl_perf_event_max_stack < max_depth)
@@ -421,7 +430,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
trace = get_callchain_entry_for_task(task, max_depth);
else
trace = get_perf_callchain(regs, 0, kernel, user, max_depth,
- false, false);
+ crosstask, false);
if (unlikely(!trace))
goto err_fault;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index ebeb0695305a..a1f18681721c 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -35,8 +35,9 @@
#include <linux/rcupdate_trace.h>
#include <linux/memcontrol.h>
#include <linux/trace_events.h>
-#include <net/netfilter/nf_bpf_link.h>
+#include <net/netfilter/nf_bpf_link.h>
+#include <net/netkit.h>
#include <net/tcx.h>
#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
@@ -141,9 +142,13 @@ static u32 bpf_map_value_size(const struct bpf_map *map)
static void maybe_wait_bpf_programs(struct bpf_map *map)
{
- /* Wait for any running BPF programs to complete so that
- * userspace, when we return to it, knows that all programs
- * that could be running use the new map value.
+ /* Wait for any running non-sleepable BPF programs to complete so that
+ * userspace, when we return to it, knows that all non-sleepable
+ * programs that could be running use the new map value. For sleepable
+ * BPF programs, synchronize_rcu_tasks_trace() should be used to wait
+ * for these programs to complete, but since the waiting time can be
+ * very long and userspace may think it has hung forever, sleepable
+ * BPF programs are not handled for now.
*/
if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS ||
map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
@@ -179,15 +184,11 @@ static int bpf_map_update_value(struct bpf_map *map, struct file *map_file,
err = bpf_percpu_cgroup_storage_update(map, key, value,
flags);
} else if (IS_FD_ARRAY(map)) {
- rcu_read_lock();
err = bpf_fd_array_map_update_elem(map, map_file, key, value,
flags);
- rcu_read_unlock();
} else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) {
- rcu_read_lock();
err = bpf_fd_htab_map_update_elem(map, map_file, key, value,
flags);
- rcu_read_unlock();
} else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
/* rcu_read_lock() is not needed */
err = bpf_fd_reuseport_array_update_elem(map, key, value,
@@ -202,7 +203,6 @@ static int bpf_map_update_value(struct bpf_map *map, struct file *map_file,
rcu_read_unlock();
}
bpf_enable_instrumentation();
- maybe_wait_bpf_programs(map);
return err;
}
@@ -263,7 +263,6 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value,
}
bpf_enable_instrumentation();
- maybe_wait_bpf_programs(map);
return err;
}
@@ -514,6 +513,7 @@ void btf_record_free(struct btf_record *rec)
switch (rec->fields[i].type) {
case BPF_KPTR_UNREF:
case BPF_KPTR_REF:
+ case BPF_KPTR_PERCPU:
if (rec->fields[i].kptr.module)
module_put(rec->fields[i].kptr.module);
btf_put(rec->fields[i].kptr.btf);
@@ -560,6 +560,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec)
switch (fields[i].type) {
case BPF_KPTR_UNREF:
case BPF_KPTR_REF:
+ case BPF_KPTR_PERCPU:
btf_get(fields[i].kptr.btf);
if (fields[i].kptr.module && !try_module_get(fields[i].kptr.module)) {
ret = -ENXIO;
@@ -624,8 +625,6 @@ void bpf_obj_free_timer(const struct btf_record *rec, void *obj)
bpf_timer_cancel_and_free(obj + rec->timer_off);
}
-extern void __bpf_obj_drop_impl(void *p, const struct btf_record *rec);
-
void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
{
const struct btf_field *fields;
@@ -650,6 +649,7 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
WRITE_ONCE(*(u64 *)field_ptr, 0);
break;
case BPF_KPTR_REF:
+ case BPF_KPTR_PERCPU:
xchgd_field = (void *)xchg((unsigned long *)field_ptr, 0);
if (!xchgd_field)
break;
@@ -659,8 +659,8 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
field->kptr.btf_id);
migrate_disable();
__bpf_obj_drop_impl(xchgd_field, pointee_struct_meta ?
- pointee_struct_meta->record :
- NULL);
+ pointee_struct_meta->record : NULL,
+ fields[i].type == BPF_KPTR_PERCPU);
migrate_enable();
} else {
field->kptr.dtor(xchgd_field);
@@ -692,6 +692,7 @@ static void bpf_map_free_deferred(struct work_struct *work)
{
struct bpf_map *map = container_of(work, struct bpf_map, work);
struct btf_record *rec = map->record;
+ struct btf *btf = map->btf;
security_bpf_map_free(map);
bpf_map_release_memcg(map);
@@ -707,6 +708,10 @@ static void bpf_map_free_deferred(struct work_struct *work)
* template bpf_map struct used during verification.
*/
btf_record_free(rec);
+ /* Delay freeing of btf for maps, as map_free callback may need
+ * struct_meta info which will be freed with btf_put().
+ */
+ btf_put(btf);
}
static void bpf_map_put_uref(struct bpf_map *map)
@@ -717,6 +722,28 @@ static void bpf_map_put_uref(struct bpf_map *map)
}
}
+static void bpf_map_free_in_work(struct bpf_map *map)
+{
+ INIT_WORK(&map->work, bpf_map_free_deferred);
+ /* Avoid spawning kworkers, since they all might contend
+ * for the same mutex like slab_mutex.
+ */
+ queue_work(system_unbound_wq, &map->work);
+}
+
+static void bpf_map_free_rcu_gp(struct rcu_head *rcu)
+{
+ bpf_map_free_in_work(container_of(rcu, struct bpf_map, rcu));
+}
+
+static void bpf_map_free_mult_rcu_gp(struct rcu_head *rcu)
+{
+ if (rcu_trace_implies_rcu_gp())
+ bpf_map_free_rcu_gp(rcu);
+ else
+ call_rcu(rcu, bpf_map_free_rcu_gp);
+}
+
/* decrement map refcnt and schedule it for freeing via workqueue
* (underlying map implementation ops->map_free() might sleep)
*/
@@ -725,12 +752,14 @@ void bpf_map_put(struct bpf_map *map)
if (atomic64_dec_and_test(&map->refcnt)) {
/* bpf_map_free_id() must be called first */
bpf_map_free_id(map);
- btf_put(map->btf);
- INIT_WORK(&map->work, bpf_map_free_deferred);
- /* Avoid spawning kworkers, since they all might contend
- * for the same mutex like slab_mutex.
- */
- queue_work(system_unbound_wq, &map->work);
+
+ WARN_ON_ONCE(atomic64_read(&map->sleepable_refcnt));
+ if (READ_ONCE(map->free_after_mult_rcu_gp))
+ call_rcu_tasks_trace(&map->rcu, bpf_map_free_mult_rcu_gp);
+ else if (READ_ONCE(map->free_after_rcu_gp))
+ call_rcu(&map->rcu, bpf_map_free_rcu_gp);
+ else
+ bpf_map_free_in_work(map);
}
}
EXPORT_SYMBOL_GPL(bpf_map_put);
@@ -1045,6 +1074,7 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
break;
case BPF_KPTR_UNREF:
case BPF_KPTR_REF:
+ case BPF_KPTR_PERCPU:
case BPF_REFCOUNT:
if (map->map_type != BPF_MAP_TYPE_HASH &&
map->map_type != BPF_MAP_TYPE_PERCPU_HASH &&
@@ -1521,6 +1551,8 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr)
}
err = bpf_map_update_value(map, f.file, key, value, attr->flags);
+ if (!err)
+ maybe_wait_bpf_programs(map);
kvfree(value);
free_key:
@@ -1576,7 +1608,8 @@ static int map_delete_elem(union bpf_attr *attr, bpfptr_t uattr)
err = map->ops->map_delete_elem(map, key);
rcu_read_unlock();
bpf_enable_instrumentation();
- maybe_wait_bpf_programs(map);
+ if (!err)
+ maybe_wait_bpf_programs(map);
out:
kvfree(key);
err_put:
@@ -1673,6 +1706,9 @@ int generic_map_delete_batch(struct bpf_map *map,
if (!max_count)
return 0;
+ if (put_user(0, &uattr->batch.count))
+ return -EFAULT;
+
key = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
if (!key)
return -ENOMEM;
@@ -1702,7 +1738,6 @@ int generic_map_delete_batch(struct bpf_map *map,
kvfree(key);
- maybe_wait_bpf_programs(map);
return err;
}
@@ -1730,6 +1765,9 @@ int generic_map_update_batch(struct bpf_map *map, struct file *map_file,
if (!max_count)
return 0;
+ if (put_user(0, &uattr->batch.count))
+ return -EFAULT;
+
key = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
if (!key)
return -ENOMEM;
@@ -1760,6 +1798,7 @@ int generic_map_update_batch(struct bpf_map *map, struct file *map_file,
kvfree(value);
kvfree(key);
+
return err;
}
@@ -2442,14 +2481,19 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
case BPF_CGROUP_INET6_BIND:
case BPF_CGROUP_INET4_CONNECT:
case BPF_CGROUP_INET6_CONNECT:
+ case BPF_CGROUP_UNIX_CONNECT:
case BPF_CGROUP_INET4_GETPEERNAME:
case BPF_CGROUP_INET6_GETPEERNAME:
+ case BPF_CGROUP_UNIX_GETPEERNAME:
case BPF_CGROUP_INET4_GETSOCKNAME:
case BPF_CGROUP_INET6_GETSOCKNAME:
+ case BPF_CGROUP_UNIX_GETSOCKNAME:
case BPF_CGROUP_UDP4_SENDMSG:
case BPF_CGROUP_UDP6_SENDMSG:
+ case BPF_CGROUP_UNIX_SENDMSG:
case BPF_CGROUP_UDP4_RECVMSG:
case BPF_CGROUP_UDP6_RECVMSG:
+ case BPF_CGROUP_UNIX_RECVMSG:
return 0;
default:
return -EINVAL;
@@ -2565,7 +2609,8 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
BPF_F_SLEEPABLE |
BPF_F_TEST_RND_HI32 |
BPF_F_XDP_HAS_FRAGS |
- BPF_F_XDP_DEV_BOUND_ONLY))
+ BPF_F_XDP_DEV_BOUND_ONLY |
+ BPF_F_TEST_REG_INVARIANTS))
return -EINVAL;
if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
@@ -2693,6 +2738,22 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
goto free_prog_sec;
}
+ /*
+ * Bookkeeping for managing the program attachment chain.
+ *
+ * It might be tempting to set attach_tracing_prog flag at the attachment
+ * time, but this will not prevent loading a bunch of tracing progs
+ * first and then attaching them one to another.
+ *
+ * The flag attach_tracing_prog is set for the whole program lifecycle, and
+ * doesn't have to be cleared in bpf_tracing_link_release, since tracing
+ * programs cannot change attachment target.
+ */
+ if (type == BPF_PROG_TYPE_TRACING && dst_prog &&
+ dst_prog->type == BPF_PROG_TYPE_TRACING) {
+ prog->aux->attach_tracing_prog = true;
+ }
+
/* find program type: socket_filter vs tracing_filter */
err = find_prog_type(type, prog);
if (err < 0)
@@ -2745,7 +2806,7 @@ free_used_maps:
* period before we can tear down JIT memory since symbols
* are already exposed under kallsyms.
*/
- __bpf_prog_put_noref(prog, prog->aux->func_cnt);
+ __bpf_prog_put_noref(prog, prog->aux->real_func_cnt);
return err;
free_prog_sec:
free_uid(prog->aux->user);
@@ -3126,7 +3187,12 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog,
}
if (tgt_prog_fd) {
- /* For now we only allow new targets for BPF_PROG_TYPE_EXT */
+ /*
+ * For now we only allow new targets for BPF_PROG_TYPE_EXT. If this
+ * part is ever changed to implement the same for
+ * BPF_PROG_TYPE_TRACING, do not forget to update the way the
+ * attach_tracing_prog flag is set.
+ */
if (prog->type != BPF_PROG_TYPE_EXT) {
err = -EINVAL;
goto out_put_prog;
@@ -3171,6 +3237,10 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog,
*
* - if prog->aux->dst_trampoline and tgt_prog is NULL, the program
* was detached and is going for re-attachment.
+ *
+ * - if prog->aux->dst_trampoline is NULL and tgt_prog and prog->aux->attach_btf
+ * are NULL, then program was already attached and user did not provide
+ * tgt_prog_fd so we have no way to find out or create trampoline
*/
if (!prog->aux->dst_trampoline && !tgt_prog) {
/*
@@ -3184,6 +3254,11 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog,
err = -EINVAL;
goto out_unlock;
}
+ /* We can allow re-attach only if we have valid attach_btf. */
+ if (!prog->aux->attach_btf) {
+ err = -EINVAL;
+ goto out_unlock;
+ }
btf_id = prog->aux->attach_btf_id;
key = bpf_trampoline_compute_key(NULL, prog->aux->attach_btf, btf_id);
}
@@ -3370,7 +3445,7 @@ static void bpf_perf_link_dealloc(struct bpf_link *link)
static int bpf_perf_link_fill_common(const struct perf_event *event,
char __user *uname, u32 ulen,
u64 *probe_offset, u64 *probe_addr,
- u32 *fd_type)
+ u32 *fd_type, unsigned long *missed)
{
const char *buf;
u32 prog_id;
@@ -3381,7 +3456,7 @@ static int bpf_perf_link_fill_common(const struct perf_event *event,
return -EINVAL;
err = bpf_get_perf_event_info(event, &prog_id, fd_type, &buf,
- probe_offset, probe_addr);
+ probe_offset, probe_addr, missed);
if (err)
return err;
if (!uname)
@@ -3404,6 +3479,7 @@ static int bpf_perf_link_fill_common(const struct perf_event *event,
static int bpf_perf_link_fill_kprobe(const struct perf_event *event,
struct bpf_link_info *info)
{
+ unsigned long missed;
char __user *uname;
u64 addr, offset;
u32 ulen, type;
@@ -3412,7 +3488,7 @@ static int bpf_perf_link_fill_kprobe(const struct perf_event *event,
uname = u64_to_user_ptr(info->perf_event.kprobe.func_name);
ulen = info->perf_event.kprobe.name_len;
err = bpf_perf_link_fill_common(event, uname, ulen, &offset, &addr,
- &type);
+ &type, &missed);
if (err)
return err;
if (type == BPF_FD_TYPE_KRETPROBE)
@@ -3421,6 +3497,7 @@ static int bpf_perf_link_fill_kprobe(const struct perf_event *event,
info->perf_event.type = BPF_PERF_EVENT_KPROBE;
info->perf_event.kprobe.offset = offset;
+ info->perf_event.kprobe.missed = missed;
if (!kallsyms_show_value(current_cred()))
addr = 0;
info->perf_event.kprobe.addr = addr;
@@ -3440,7 +3517,7 @@ static int bpf_perf_link_fill_uprobe(const struct perf_event *event,
uname = u64_to_user_ptr(info->perf_event.uprobe.file_name);
ulen = info->perf_event.uprobe.name_len;
err = bpf_perf_link_fill_common(event, uname, ulen, &offset, &addr,
- &type);
+ &type, NULL);
if (err)
return err;
@@ -3476,7 +3553,7 @@ static int bpf_perf_link_fill_tracepoint(const struct perf_event *event,
uname = u64_to_user_ptr(info->perf_event.tracepoint.tp_name);
ulen = info->perf_event.tracepoint.name_len;
info->perf_event.type = BPF_PERF_EVENT_TRACEPOINT;
- return bpf_perf_link_fill_common(event, uname, ulen, NULL, NULL, NULL);
+ return bpf_perf_link_fill_common(event, uname, ulen, NULL, NULL, NULL, NULL);
}
static int bpf_perf_link_fill_perf_event(const struct perf_event *event,
@@ -3672,14 +3749,19 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
case BPF_CGROUP_INET6_BIND:
case BPF_CGROUP_INET4_CONNECT:
case BPF_CGROUP_INET6_CONNECT:
+ case BPF_CGROUP_UNIX_CONNECT:
case BPF_CGROUP_INET4_GETPEERNAME:
case BPF_CGROUP_INET6_GETPEERNAME:
+ case BPF_CGROUP_UNIX_GETPEERNAME:
case BPF_CGROUP_INET4_GETSOCKNAME:
case BPF_CGROUP_INET6_GETSOCKNAME:
+ case BPF_CGROUP_UNIX_GETSOCKNAME:
case BPF_CGROUP_UDP4_SENDMSG:
case BPF_CGROUP_UDP6_SENDMSG:
+ case BPF_CGROUP_UNIX_SENDMSG:
case BPF_CGROUP_UDP4_RECVMSG:
case BPF_CGROUP_UDP6_RECVMSG:
+ case BPF_CGROUP_UNIX_RECVMSG:
return BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
case BPF_CGROUP_SOCK_OPS:
return BPF_PROG_TYPE_SOCK_OPS;
@@ -3716,6 +3798,8 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
return BPF_PROG_TYPE_LSM;
case BPF_TCX_INGRESS:
case BPF_TCX_EGRESS:
+ case BPF_NETKIT_PRIMARY:
+ case BPF_NETKIT_PEER:
return BPF_PROG_TYPE_SCHED_CLS;
default:
return BPF_PROG_TYPE_UNSPEC;
@@ -3767,7 +3851,9 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
return 0;
case BPF_PROG_TYPE_SCHED_CLS:
if (attach_type != BPF_TCX_INGRESS &&
- attach_type != BPF_TCX_EGRESS)
+ attach_type != BPF_TCX_EGRESS &&
+ attach_type != BPF_NETKIT_PRIMARY &&
+ attach_type != BPF_NETKIT_PEER)
return -EINVAL;
return 0;
default:
@@ -3796,7 +3882,6 @@ static int bpf_prog_attach(const union bpf_attr *attr)
{
enum bpf_prog_type ptype;
struct bpf_prog *prog;
- u32 mask;
int ret;
if (CHECK_ATTR(BPF_PROG_ATTACH))
@@ -3805,10 +3890,16 @@ static int bpf_prog_attach(const union bpf_attr *attr)
ptype = attach_type_to_prog_type(attr->attach_type);
if (ptype == BPF_PROG_TYPE_UNSPEC)
return -EINVAL;
- mask = bpf_mprog_supported(ptype) ?
- BPF_F_ATTACH_MASK_MPROG : BPF_F_ATTACH_MASK_BASE;
- if (attr->attach_flags & ~mask)
- return -EINVAL;
+ if (bpf_mprog_supported(ptype)) {
+ if (attr->attach_flags & ~BPF_F_ATTACH_MASK_MPROG)
+ return -EINVAL;
+ } else {
+ if (attr->attach_flags & ~BPF_F_ATTACH_MASK_BASE)
+ return -EINVAL;
+ if (attr->relative_fd ||
+ attr->expected_revision)
+ return -EINVAL;
+ }
prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
if (IS_ERR(prog))
@@ -3845,7 +3936,11 @@ static int bpf_prog_attach(const union bpf_attr *attr)
ret = cgroup_bpf_prog_attach(attr, ptype, prog);
break;
case BPF_PROG_TYPE_SCHED_CLS:
- ret = tcx_prog_attach(attr, prog);
+ if (attr->attach_type == BPF_TCX_INGRESS ||
+ attr->attach_type == BPF_TCX_EGRESS)
+ ret = tcx_prog_attach(attr, prog);
+ else
+ ret = netkit_prog_attach(attr, prog);
break;
default:
ret = -EINVAL;
@@ -3878,6 +3973,10 @@ static int bpf_prog_detach(const union bpf_attr *attr)
if (IS_ERR(prog))
return PTR_ERR(prog);
}
+ } else if (attr->attach_flags ||
+ attr->relative_fd ||
+ attr->expected_revision) {
+ return -EINVAL;
}
switch (ptype) {
@@ -3902,7 +4001,11 @@ static int bpf_prog_detach(const union bpf_attr *attr)
ret = cgroup_bpf_prog_detach(attr, ptype);
break;
case BPF_PROG_TYPE_SCHED_CLS:
- ret = tcx_prog_detach(attr, prog);
+ if (attr->attach_type == BPF_TCX_INGRESS ||
+ attr->attach_type == BPF_TCX_EGRESS)
+ ret = tcx_prog_detach(attr, prog);
+ else
+ ret = netkit_prog_detach(attr, prog);
break;
default:
ret = -EINVAL;
@@ -3913,7 +4016,7 @@ static int bpf_prog_detach(const union bpf_attr *attr)
return ret;
}
-#define BPF_PROG_QUERY_LAST_FIELD query.link_attach_flags
+#define BPF_PROG_QUERY_LAST_FIELD query.revision
static int bpf_prog_query(const union bpf_attr *attr,
union bpf_attr __user *uattr)
@@ -3936,14 +4039,19 @@ static int bpf_prog_query(const union bpf_attr *attr,
case BPF_CGROUP_INET6_POST_BIND:
case BPF_CGROUP_INET4_CONNECT:
case BPF_CGROUP_INET6_CONNECT:
+ case BPF_CGROUP_UNIX_CONNECT:
case BPF_CGROUP_INET4_GETPEERNAME:
case BPF_CGROUP_INET6_GETPEERNAME:
+ case BPF_CGROUP_UNIX_GETPEERNAME:
case BPF_CGROUP_INET4_GETSOCKNAME:
case BPF_CGROUP_INET6_GETSOCKNAME:
+ case BPF_CGROUP_UNIX_GETSOCKNAME:
case BPF_CGROUP_UDP4_SENDMSG:
case BPF_CGROUP_UDP6_SENDMSG:
+ case BPF_CGROUP_UNIX_SENDMSG:
case BPF_CGROUP_UDP4_RECVMSG:
case BPF_CGROUP_UDP6_RECVMSG:
+ case BPF_CGROUP_UNIX_RECVMSG:
case BPF_CGROUP_SOCK_OPS:
case BPF_CGROUP_DEVICE:
case BPF_CGROUP_SYSCTL:
@@ -3964,6 +4072,9 @@ static int bpf_prog_query(const union bpf_attr *attr,
case BPF_TCX_INGRESS:
case BPF_TCX_EGRESS:
return tcx_prog_query(attr, uattr);
+ case BPF_NETKIT_PRIMARY:
+ case BPF_NETKIT_PEER:
+ return netkit_prog_query(attr, uattr);
default:
return -EINVAL;
}
@@ -4809,7 +4920,7 @@ static int bpf_task_fd_query(const union bpf_attr *attr,
err = bpf_get_perf_event_info(event, &prog_id, &fd_type,
&buf, &probe_offset,
- &probe_addr);
+ &probe_addr, NULL);
if (!err)
err = bpf_task_fd_query_copy(attr, uattr, prog_id,
fd_type, buf,
@@ -4875,8 +4986,10 @@ static int bpf_map_do_batch(const union bpf_attr *attr,
else
BPF_DO_BATCH(map->ops->map_delete_batch, map, attr, uattr);
err_put:
- if (has_write)
+ if (has_write) {
+ maybe_wait_bpf_programs(map);
bpf_map_write_active_dec(map);
+ }
fdput(f);
return err;
}
@@ -4945,7 +5058,11 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
ret = bpf_xdp_link_attach(attr, prog);
break;
case BPF_PROG_TYPE_SCHED_CLS:
- ret = tcx_link_attach(attr, prog);
+ if (attr->link_create.attach_type == BPF_TCX_INGRESS ||
+ attr->link_create.attach_type == BPF_TCX_EGRESS)
+ ret = tcx_link_attach(attr, prog);
+ else
+ ret = netkit_link_attach(attr, prog);
break;
case BPF_PROG_TYPE_NETFILTER:
ret = bpf_nf_link_attach(attr, prog);
@@ -5274,6 +5391,11 @@ static int bpf_prog_bind_map(union bpf_attr *attr)
goto out_unlock;
}
+ /* The bpf program will not access the bpf map, but for the sake of
+ * simplicity, increase sleepable_refcnt for sleepable program as well.
+ */
+ if (prog->aux->sleepable)
+ atomic64_inc(&map->sleepable_refcnt);
memcpy(used_maps_new, used_maps_old,
sizeof(used_maps_old[0]) * prog->aux->used_map_cnt);
used_maps_new[prog->aux->used_map_cnt] = map;
@@ -5502,9 +5624,9 @@ int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size)
}
run_ctx.bpf_cookie = 0;
- run_ctx.saved_run_ctx = NULL;
if (!__bpf_prog_enter_sleepable_recur(prog, &run_ctx)) {
/* recursion detected */
+ __bpf_prog_exit_sleepable_recur(prog, 0, &run_ctx);
bpf_prog_put(prog);
return -EBUSY;
}
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index c4ab9d6cdbe9..e5c3500443c6 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -7,7 +7,9 @@
#include <linux/fs.h>
#include <linux/fdtable.h>
#include <linux/filter.h>
+#include <linux/bpf_mem_alloc.h>
#include <linux/btf_ids.h>
+#include <linux/mm_types.h>
#include "mmap_unlock_work.h"
static const char * const iter_task_type_names[] = {
@@ -35,16 +37,13 @@ static struct task_struct *task_group_seq_get_next(struct bpf_iter_seq_task_comm
u32 *tid,
bool skip_if_dup_files)
{
- struct task_struct *task, *next_task;
+ struct task_struct *task;
struct pid *pid;
- u32 saved_tid;
+ u32 next_tid;
if (!*tid) {
/* The first time, the iterator calls this function. */
pid = find_pid_ns(common->pid, common->ns);
- if (!pid)
- return NULL;
-
task = get_pid_task(pid, PIDTYPE_TGID);
if (!task)
return NULL;
@@ -66,44 +65,25 @@ static struct task_struct *task_group_seq_get_next(struct bpf_iter_seq_task_comm
return task;
}
- pid = find_pid_ns(common->pid_visiting, common->ns);
- if (!pid)
- return NULL;
-
- task = get_pid_task(pid, PIDTYPE_PID);
+ task = find_task_by_pid_ns(common->pid_visiting, common->ns);
if (!task)
return NULL;
retry:
- if (!pid_alive(task)) {
- put_task_struct(task);
- return NULL;
- }
-
- next_task = next_thread(task);
- put_task_struct(task);
- if (!next_task)
- return NULL;
-
- saved_tid = *tid;
- *tid = __task_pid_nr_ns(next_task, PIDTYPE_PID, common->ns);
- if (!*tid || *tid == common->pid) {
- /* Run out of tasks of a process. The tasks of a
- * thread_group are linked as circular linked list.
- */
- *tid = saved_tid;
+ task = __next_thread(task);
+ if (!task)
return NULL;
- }
- get_task_struct(next_task);
- common->pid_visiting = *tid;
+ next_tid = __task_pid_nr_ns(task, PIDTYPE_PID, common->ns);
+ if (!next_tid)
+ goto retry;
- if (skip_if_dup_files && task->files == task->group_leader->files) {
- task = next_task;
+ if (skip_if_dup_files && task->files == task->group_leader->files)
goto retry;
- }
- return next_task;
+ *tid = common->pid_visiting = next_tid;
+ get_task_struct(task);
+ return task;
}
static struct task_struct *task_seq_get_next(struct bpf_iter_seq_task_common *common,
@@ -308,11 +288,9 @@ again:
rcu_read_lock();
for (;; curr_fd++) {
struct file *f;
- f = task_lookup_next_fd_rcu(curr_task, &curr_fd);
+ f = task_lookup_next_fdget_rcu(curr_task, &curr_fd);
if (!f)
break;
- if (!get_file_rcu(f))
- continue;
/* set info->fd */
info->fd = curr_fd;
@@ -724,7 +702,7 @@ static struct bpf_iter_reg task_reg_info = {
.ctx_arg_info_size = 1,
.ctx_arg_info = {
{ offsetof(struct bpf_iter__task, task),
- PTR_TO_BTF_ID_OR_NULL },
+ PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
},
.seq_info = &task_seq_info,
.fill_link_info = bpf_iter_fill_link_info,
@@ -823,6 +801,239 @@ const struct bpf_func_proto bpf_find_vma_proto = {
.arg5_type = ARG_ANYTHING,
};
+struct bpf_iter_task_vma_kern_data {
+ struct task_struct *task;
+ struct mm_struct *mm;
+ struct mmap_unlock_irq_work *work;
+ struct vma_iterator vmi;
+};
+
+struct bpf_iter_task_vma {
+ /* opaque iterator state; having __u64 here allows preserving correct
+ * alignment requirements in vmlinux.h, generated from BTF
+ */
+ __u64 __opaque[1];
+} __attribute__((aligned(8)));
+
+/* Non-opaque version of bpf_iter_task_vma */
+struct bpf_iter_task_vma_kern {
+ struct bpf_iter_task_vma_kern_data *data;
+} __attribute__((aligned(8)));
+
+__bpf_kfunc_start_defs();
+
+__bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it,
+ struct task_struct *task, u64 addr)
+{
+ struct bpf_iter_task_vma_kern *kit = (void *)it;
+ bool irq_work_busy = false;
+ int err;
+
+ BUILD_BUG_ON(sizeof(struct bpf_iter_task_vma_kern) != sizeof(struct bpf_iter_task_vma));
+ BUILD_BUG_ON(__alignof__(struct bpf_iter_task_vma_kern) != __alignof__(struct bpf_iter_task_vma));
+
+ /* is_iter_reg_valid_uninit guarantees that kit hasn't been initialized
+ * before, so non-NULL kit->data doesn't point to previously
+ * bpf_mem_alloc'd bpf_iter_task_vma_kern_data
+ */
+ kit->data = bpf_mem_alloc(&bpf_global_ma, sizeof(struct bpf_iter_task_vma_kern_data));
+ if (!kit->data)
+ return -ENOMEM;
+
+ kit->data->task = get_task_struct(task);
+ kit->data->mm = task->mm;
+ if (!kit->data->mm) {
+ err = -ENOENT;
+ goto err_cleanup_iter;
+ }
+
+ /* kit->data->work == NULL is valid after bpf_mmap_unlock_get_irq_work */
+ irq_work_busy = bpf_mmap_unlock_get_irq_work(&kit->data->work);
+ if (irq_work_busy || !mmap_read_trylock(kit->data->mm)) {
+ err = -EBUSY;
+ goto err_cleanup_iter;
+ }
+
+ vma_iter_init(&kit->data->vmi, kit->data->mm, addr);
+ return 0;
+
+err_cleanup_iter:
+ if (kit->data->task)
+ put_task_struct(kit->data->task);
+ bpf_mem_free(&bpf_global_ma, kit->data);
+ /* NULL kit->data signals failed bpf_iter_task_vma initialization */
+ kit->data = NULL;
+ return err;
+}
+
+__bpf_kfunc struct vm_area_struct *bpf_iter_task_vma_next(struct bpf_iter_task_vma *it)
+{
+ struct bpf_iter_task_vma_kern *kit = (void *)it;
+
+ if (!kit->data) /* bpf_iter_task_vma_new failed */
+ return NULL;
+ return vma_next(&kit->data->vmi);
+}
+
+__bpf_kfunc void bpf_iter_task_vma_destroy(struct bpf_iter_task_vma *it)
+{
+ struct bpf_iter_task_vma_kern *kit = (void *)it;
+
+ if (kit->data) {
+ bpf_mmap_unlock_mm(kit->data->work, kit->data->mm);
+ put_task_struct(kit->data->task);
+ bpf_mem_free(&bpf_global_ma, kit->data);
+ }
+}
+
+__bpf_kfunc_end_defs();
+
+#ifdef CONFIG_CGROUPS
+
+struct bpf_iter_css_task {
+ __u64 __opaque[1];
+} __attribute__((aligned(8)));
+
+struct bpf_iter_css_task_kern {
+ struct css_task_iter *css_it;
+} __attribute__((aligned(8)));
+
+__bpf_kfunc_start_defs();
+
+__bpf_kfunc int bpf_iter_css_task_new(struct bpf_iter_css_task *it,
+ struct cgroup_subsys_state *css, unsigned int flags)
+{
+ struct bpf_iter_css_task_kern *kit = (void *)it;
+
+ BUILD_BUG_ON(sizeof(struct bpf_iter_css_task_kern) != sizeof(struct bpf_iter_css_task));
+ BUILD_BUG_ON(__alignof__(struct bpf_iter_css_task_kern) !=
+ __alignof__(struct bpf_iter_css_task));
+ kit->css_it = NULL;
+ switch (flags) {
+ case CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED:
+ case CSS_TASK_ITER_PROCS:
+ case 0:
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ kit->css_it = bpf_mem_alloc(&bpf_global_ma, sizeof(struct css_task_iter));
+ if (!kit->css_it)
+ return -ENOMEM;
+ css_task_iter_start(css, flags, kit->css_it);
+ return 0;
+}
+
+__bpf_kfunc struct task_struct *bpf_iter_css_task_next(struct bpf_iter_css_task *it)
+{
+ struct bpf_iter_css_task_kern *kit = (void *)it;
+
+ if (!kit->css_it)
+ return NULL;
+ return css_task_iter_next(kit->css_it);
+}
+
+__bpf_kfunc void bpf_iter_css_task_destroy(struct bpf_iter_css_task *it)
+{
+ struct bpf_iter_css_task_kern *kit = (void *)it;
+
+ if (!kit->css_it)
+ return;
+ css_task_iter_end(kit->css_it);
+ bpf_mem_free(&bpf_global_ma, kit->css_it);
+}
+
+__bpf_kfunc_end_defs();
+
+#endif /* CONFIG_CGROUPS */
+
+struct bpf_iter_task {
+ __u64 __opaque[3];
+} __attribute__((aligned(8)));
+
+struct bpf_iter_task_kern {
+ struct task_struct *task;
+ struct task_struct *pos;
+ unsigned int flags;
+} __attribute__((aligned(8)));
+
+enum {
+ /* all processes in the system */
+ BPF_TASK_ITER_ALL_PROCS,
+ /* all threads in the system */
+ BPF_TASK_ITER_ALL_THREADS,
+ /* all threads of a specific process */
+ BPF_TASK_ITER_PROC_THREADS
+};
+
+__bpf_kfunc_start_defs();
+
+__bpf_kfunc int bpf_iter_task_new(struct bpf_iter_task *it,
+ struct task_struct *task__nullable, unsigned int flags)
+{
+ struct bpf_iter_task_kern *kit = (void *)it;
+
+ BUILD_BUG_ON(sizeof(struct bpf_iter_task_kern) > sizeof(struct bpf_iter_task));
+ BUILD_BUG_ON(__alignof__(struct bpf_iter_task_kern) !=
+ __alignof__(struct bpf_iter_task));
+
+ switch (flags) {
+ case BPF_TASK_ITER_ALL_THREADS:
+ case BPF_TASK_ITER_ALL_PROCS:
+ break;
+ case BPF_TASK_ITER_PROC_THREADS:
+ if (!task__nullable)
+ return -EINVAL;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (flags == BPF_TASK_ITER_PROC_THREADS)
+ kit->task = task__nullable;
+ else
+ kit->task = &init_task;
+ kit->pos = kit->task;
+ kit->flags = flags;
+ return 0;
+}
+
+__bpf_kfunc struct task_struct *bpf_iter_task_next(struct bpf_iter_task *it)
+{
+ struct bpf_iter_task_kern *kit = (void *)it;
+ struct task_struct *pos;
+ unsigned int flags;
+
+ flags = kit->flags;
+ pos = kit->pos;
+
+ if (!pos)
+ return pos;
+
+ if (flags == BPF_TASK_ITER_ALL_PROCS)
+ goto get_next_task;
+
+ kit->pos = __next_thread(kit->pos);
+ if (kit->pos || flags == BPF_TASK_ITER_PROC_THREADS)
+ return pos;
+
+get_next_task:
+ kit->task = next_task(kit->task);
+ if (kit->task == &init_task)
+ kit->pos = NULL;
+ else
+ kit->pos = kit->task;
+
+ return pos;
+}
+
+__bpf_kfunc void bpf_iter_task_destroy(struct bpf_iter_task *it)
+{
+}
+
+__bpf_kfunc_end_defs();
+
DEFINE_PER_CPU(struct mmap_unlock_irq_work, mmap_unlock_work);
static void do_mmap_read_unlock(struct irq_work *entry)
diff --git a/kernel/bpf/tcx.c b/kernel/bpf/tcx.c
index 13f0b5dc8262..2e4885e7781f 100644
--- a/kernel/bpf/tcx.c
+++ b/kernel/bpf/tcx.c
@@ -123,7 +123,6 @@ int tcx_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr)
{
bool ingress = attr->query.attach_type == BPF_TCX_INGRESS;
struct net *net = current->nsproxy->net_ns;
- struct bpf_mprog_entry *entry;
struct net_device *dev;
int ret;
@@ -133,12 +132,7 @@ int tcx_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr)
ret = -ENODEV;
goto out;
}
- entry = tcx_entry_fetch(dev, ingress);
- if (!entry) {
- ret = -ENOENT;
- goto out;
- }
- ret = bpf_mprog_query(attr, uattr, entry);
+ ret = bpf_mprog_query(attr, uattr, tcx_entry_fetch(dev, ingress));
out:
rtnl_unlock();
return ret;
@@ -256,7 +250,7 @@ static void tcx_link_dealloc(struct bpf_link *link)
static void tcx_link_fdinfo(const struct bpf_link *link, struct seq_file *seq)
{
- const struct tcx_link *tcx = tcx_link_const(link);
+ const struct tcx_link *tcx = tcx_link(link);
u32 ifindex = 0;
rtnl_lock();
@@ -273,7 +267,7 @@ static void tcx_link_fdinfo(const struct bpf_link *link, struct seq_file *seq)
static int tcx_link_fill_info(const struct bpf_link *link,
struct bpf_link_info *info)
{
- const struct tcx_link *tcx = tcx_link_const(link);
+ const struct tcx_link *tcx = tcx_link(link);
u32 ifindex = 0;
rtnl_lock();
diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c
index 3d7127f439a1..9dbc31b25e3d 100644
--- a/kernel/bpf/tnum.c
+++ b/kernel/bpf/tnum.c
@@ -172,12 +172,6 @@ bool tnum_in(struct tnum a, struct tnum b)
return a.value == b.value;
}
-int tnum_strn(char *str, size_t size, struct tnum a)
-{
- return snprintf(str, size, "(%#llx; %#llx)", a.value, a.mask);
-}
-EXPORT_SYMBOL_GPL(tnum_strn);
-
int tnum_sbin(char *str, size_t size, struct tnum a)
{
size_t n;
@@ -208,7 +202,12 @@ struct tnum tnum_clear_subreg(struct tnum a)
return tnum_lshift(tnum_rshift(a, 32), 32);
}
+struct tnum tnum_with_subreg(struct tnum reg, struct tnum subreg)
+{
+ return tnum_or(tnum_clear_subreg(reg), tnum_subreg(subreg));
+}
+
struct tnum tnum_const_subreg(struct tnum a, u32 value)
{
- return tnum_or(tnum_clear_subreg(a), tnum_const(value));
+ return tnum_with_subreg(a, tnum_const(value));
}
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 78acf28d4873..d382f5ebe06c 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -115,10 +115,10 @@ bool bpf_prog_has_trampoline(const struct bpf_prog *prog)
(ptype == BPF_PROG_TYPE_LSM && eatype == BPF_LSM_MAC);
}
-void bpf_image_ksym_add(void *data, struct bpf_ksym *ksym)
+void bpf_image_ksym_add(void *data, unsigned int size, struct bpf_ksym *ksym)
{
ksym->start = (unsigned long) data;
- ksym->end = ksym->start + PAGE_SIZE;
+ ksym->end = ksym->start + size;
bpf_ksym_add(ksym);
perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, ksym->start,
PAGE_SIZE, false, ksym->name);
@@ -254,8 +254,8 @@ bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total, bool *ip_a
static void bpf_tramp_image_free(struct bpf_tramp_image *im)
{
bpf_image_ksym_del(&im->ksym);
- bpf_jit_free_exec(im->image);
- bpf_jit_uncharge_modmem(PAGE_SIZE);
+ arch_free_bpf_trampoline(im->image, im->size);
+ bpf_jit_uncharge_modmem(im->size);
percpu_ref_exit(&im->pcref);
kfree_rcu(im, rcu);
}
@@ -349,7 +349,7 @@ static void bpf_tramp_image_put(struct bpf_tramp_image *im)
call_rcu_tasks_trace(&im->rcu, __bpf_tramp_image_put_rcu_tasks);
}
-static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key)
+static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, int size)
{
struct bpf_tramp_image *im;
struct bpf_ksym *ksym;
@@ -360,15 +360,15 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key)
if (!im)
goto out;
- err = bpf_jit_charge_modmem(PAGE_SIZE);
+ err = bpf_jit_charge_modmem(size);
if (err)
goto out_free_im;
+ im->size = size;
err = -ENOMEM;
- im->image = image = bpf_jit_alloc_exec(PAGE_SIZE);
+ im->image = image = arch_alloc_bpf_trampoline(size);
if (!image)
goto out_uncharge;
- set_vm_flush_reset_perms(image);
err = percpu_ref_init(&im->pcref, __bpf_tramp_image_release, 0, GFP_KERNEL);
if (err)
@@ -377,13 +377,13 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key)
ksym = &im->ksym;
INIT_LIST_HEAD_RCU(&ksym->lnode);
snprintf(ksym->name, KSYM_NAME_LEN, "bpf_trampoline_%llu", key);
- bpf_image_ksym_add(image, ksym);
+ bpf_image_ksym_add(image, size, ksym);
return im;
out_free_image:
- bpf_jit_free_exec(im->image);
+ arch_free_bpf_trampoline(im->image, im->size);
out_uncharge:
- bpf_jit_uncharge_modmem(PAGE_SIZE);
+ bpf_jit_uncharge_modmem(size);
out_free_im:
kfree(im);
out:
@@ -396,7 +396,7 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mut
struct bpf_tramp_links *tlinks;
u32 orig_flags = tr->flags;
bool ip_arg = false;
- int err, total;
+ int err, total, size;
tlinks = bpf_trampoline_get_progs(tr, &total, &ip_arg);
if (IS_ERR(tlinks))
@@ -409,14 +409,8 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mut
goto out;
}
- im = bpf_tramp_image_alloc(tr->key);
- if (IS_ERR(im)) {
- err = PTR_ERR(im);
- goto out;
- }
-
- /* clear all bits except SHARE_IPMODIFY */
- tr->flags &= BPF_TRAMP_F_SHARE_IPMODIFY;
+ /* clear all bits except SHARE_IPMODIFY and TAIL_CALL_CTX */
+ tr->flags &= (BPF_TRAMP_F_SHARE_IPMODIFY | BPF_TRAMP_F_TAIL_CALL_CTX);
if (tlinks[BPF_TRAMP_FEXIT].nr_links ||
tlinks[BPF_TRAMP_MODIFY_RETURN].nr_links) {
@@ -438,13 +432,31 @@ again:
tr->flags |= BPF_TRAMP_F_ORIG_STACK;
#endif
- err = arch_prepare_bpf_trampoline(im, im->image, im->image + PAGE_SIZE,
+ size = arch_bpf_trampoline_size(&tr->func.model, tr->flags,
+ tlinks, tr->func.addr);
+ if (size < 0) {
+ err = size;
+ goto out;
+ }
+
+ if (size > PAGE_SIZE) {
+ err = -E2BIG;
+ goto out;
+ }
+
+ im = bpf_tramp_image_alloc(tr->key, size);
+ if (IS_ERR(im)) {
+ err = PTR_ERR(im);
+ goto out;
+ }
+
+ err = arch_prepare_bpf_trampoline(im, im->image, im->image + size,
&tr->func.model, tr->flags, tlinks,
tr->func.addr);
if (err < 0)
goto out_free;
- set_memory_rox((long)im->image, 1);
+ arch_protect_bpf_trampoline(im->image, im->size);
WARN_ON(tr->cur_image && total == 0);
if (tr->cur_image)
@@ -464,9 +476,8 @@ again:
tr->fops->func = NULL;
tr->fops->trampoline = 0;
- /* reset im->image memory attr for arch_prepare_bpf_trampoline */
- set_memory_nx((long)im->image, 1);
- set_memory_rw((long)im->image, 1);
+ /* free im memory and reallocate later */
+ bpf_tramp_image_free(im);
goto again;
}
#endif
@@ -926,13 +937,12 @@ u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog,
migrate_disable();
might_fault();
+ run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
+
if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
bpf_prog_inc_misses_counter(prog);
return 0;
}
-
- run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
-
return bpf_prog_start_time();
}
@@ -1033,10 +1043,50 @@ bpf_trampoline_exit_t bpf_trampoline_exit(const struct bpf_prog *prog)
}
int __weak
-arch_prepare_bpf_trampoline(struct bpf_tramp_image *tr, void *image, void *image_end,
+arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end,
const struct btf_func_model *m, u32 flags,
struct bpf_tramp_links *tlinks,
- void *orig_call)
+ void *func_addr)
+{
+ return -ENOTSUPP;
+}
+
+void * __weak arch_alloc_bpf_trampoline(unsigned int size)
+{
+ void *image;
+
+ if (WARN_ON_ONCE(size > PAGE_SIZE))
+ return NULL;
+ image = bpf_jit_alloc_exec(PAGE_SIZE);
+ if (image)
+ set_vm_flush_reset_perms(image);
+ return image;
+}
+
+void __weak arch_free_bpf_trampoline(void *image, unsigned int size)
+{
+ WARN_ON_ONCE(size > PAGE_SIZE);
+ /* bpf_jit_free_exec doesn't need "size", but
+ * bpf_prog_pack_free() needs it.
+ */
+ bpf_jit_free_exec(image);
+}
+
+void __weak arch_protect_bpf_trampoline(void *image, unsigned int size)
+{
+ WARN_ON_ONCE(size > PAGE_SIZE);
+ set_memory_rox((long)image, 1);
+}
+
+void __weak arch_unprotect_bpf_trampoline(void *image, unsigned int size)
+{
+ WARN_ON_ONCE(size > PAGE_SIZE);
+ set_memory_nx((long)image, 1);
+ set_memory_rw((long)image, 1);
+}
+
+int __weak arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags,
+ struct bpf_tramp_links *tlinks, void *func_addr)
{
return -ENOTSUPP;
}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index bb78212fa5b2..65f598694d55 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -26,6 +26,7 @@
#include <linux/poison.h>
#include <linux/module.h>
#include <linux/cpumask.h>
+#include <linux/bpf_mem_alloc.h>
#include <net/xdp.h>
#include "disasm.h"
@@ -41,6 +42,9 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = {
#undef BPF_LINK_TYPE
};
+struct bpf_mem_alloc bpf_global_percpu_ma;
+static bool bpf_global_percpu_ma_set;
+
/* bpf_check() is a static code analyzer that walks eBPF program
* instruction by instruction and updates register/stack state.
* All paths of conditional branches are analyzed until 'bpf_exit' insn.
@@ -191,6 +195,8 @@ struct bpf_verifier_stack_elem {
POISON_POINTER_DELTA))
#define BPF_MAP_PTR(X) ((struct bpf_map *)((X) & ~BPF_MAP_PTR_UNPRIV))
+#define BPF_GLOBAL_PERCPU_MA_MAX_SIZE 512
+
static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx);
static int release_reference(struct bpf_verifier_env *env, int ref_obj_id);
static void invalidate_non_owning_refs(struct bpf_verifier_env *env);
@@ -304,7 +310,7 @@ struct bpf_kfunc_call_arg_meta {
/* arg_{btf,btf_id,owning_ref} are used by kfunc-specific handling,
* generally to pass info about user-defined local kptr types to later
* verification logic
- * bpf_obj_drop
+ * bpf_obj_drop/bpf_percpu_obj_drop
* Record the local kptr type to be drop'd
* bpf_refcount_acquire (via KF_ARG_PTR_TO_REFCOUNTED_KPTR arg type)
* Record the local kptr type to be refcount_incr'd and use
@@ -335,29 +341,14 @@ struct bpf_kfunc_call_arg_meta {
struct btf *btf_vmlinux;
-static DEFINE_MUTEX(bpf_verifier_lock);
-
-static const struct bpf_line_info *
-find_linfo(const struct bpf_verifier_env *env, u32 insn_off)
+static const char *btf_type_name(const struct btf *btf, u32 id)
{
- const struct bpf_line_info *linfo;
- const struct bpf_prog *prog;
- u32 i, nr_linfo;
-
- prog = env->prog;
- nr_linfo = prog->aux->nr_linfo;
-
- if (!nr_linfo || insn_off >= prog->len)
- return NULL;
-
- linfo = prog->aux->linfo;
- for (i = 1; i < nr_linfo; i++)
- if (insn_off < linfo[i].insn_off)
- break;
-
- return &linfo[i - 1];
+ return btf_name_by_offset(btf, btf_type_by_id(btf, id)->name_off);
}
+static DEFINE_MUTEX(bpf_verifier_lock);
+static DEFINE_MUTEX(bpf_percpu_ma_lock);
+
__printf(2, 3) static void verbose(void *private_data, const char *fmt, ...)
{
struct bpf_verifier_env *env = private_data;
@@ -371,73 +362,25 @@ __printf(2, 3) static void verbose(void *private_data, const char *fmt, ...)
va_end(args);
}
-static const char *ltrim(const char *s)
-{
- while (isspace(*s))
- s++;
-
- return s;
-}
-
-__printf(3, 4) static void verbose_linfo(struct bpf_verifier_env *env,
- u32 insn_off,
- const char *prefix_fmt, ...)
-{
- const struct bpf_line_info *linfo;
-
- if (!bpf_verifier_log_needed(&env->log))
- return;
-
- linfo = find_linfo(env, insn_off);
- if (!linfo || linfo == env->prev_linfo)
- return;
-
- if (prefix_fmt) {
- va_list args;
-
- va_start(args, prefix_fmt);
- bpf_verifier_vlog(&env->log, prefix_fmt, args);
- va_end(args);
- }
-
- verbose(env, "%s\n",
- ltrim(btf_name_by_offset(env->prog->aux->btf,
- linfo->line_off)));
-
- env->prev_linfo = linfo;
-}
-
static void verbose_invalid_scalar(struct bpf_verifier_env *env,
struct bpf_reg_state *reg,
- struct tnum *range, const char *ctx,
+ struct bpf_retval_range range, const char *ctx,
const char *reg_name)
{
- char tn_buf[48];
+ bool unknown = true;
- verbose(env, "At %s the register %s ", ctx, reg_name);
- if (!tnum_is_unknown(reg->var_off)) {
- tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose(env, "has value %s", tn_buf);
- } else {
- verbose(env, "has unknown scalar value");
+ verbose(env, "%s the register %s has", ctx, reg_name);
+ if (reg->smin_value > S64_MIN) {
+ verbose(env, " smin=%lld", reg->smin_value);
+ unknown = false;
}
- tnum_strn(tn_buf, sizeof(tn_buf), *range);
- verbose(env, " should have been in %s\n", tn_buf);
-}
-
-static bool type_is_pkt_pointer(enum bpf_reg_type type)
-{
- type = base_type(type);
- return type == PTR_TO_PACKET ||
- type == PTR_TO_PACKET_META;
-}
-
-static bool type_is_sk_pointer(enum bpf_reg_type type)
-{
- return type == PTR_TO_SOCKET ||
- type == PTR_TO_SOCK_COMMON ||
- type == PTR_TO_TCP_SOCK ||
- type == PTR_TO_XDP_SOCK;
+ if (reg->smax_value < S64_MAX) {
+ verbose(env, " smax=%lld", reg->smax_value);
+ unknown = false;
+ }
+ if (unknown)
+ verbose(env, " unknown scalar value");
+ verbose(env, " should have been in [%d, %d]\n", range.minval, range.maxval);
}
static bool type_may_be_null(u32 type)
@@ -463,16 +406,6 @@ static bool reg_not_null(const struct bpf_reg_state *reg)
type == PTR_TO_MEM;
}
-static bool type_is_ptr_alloc_obj(u32 type)
-{
- return base_type(type) == PTR_TO_BTF_ID && type_flag(type) & MEM_ALLOC;
-}
-
-static bool type_is_non_owning_ref(u32 type)
-{
- return type_is_ptr_alloc_obj(type) && type_flag(type) & NON_OWN_REF;
-}
-
static struct btf_record *reg_btf_record(const struct bpf_reg_state *reg)
{
struct btf_record *rec = NULL;
@@ -495,6 +428,31 @@ static bool subprog_is_global(const struct bpf_verifier_env *env, int subprog)
return aux && aux[subprog].linkage == BTF_FUNC_GLOBAL;
}
+static const char *subprog_name(const struct bpf_verifier_env *env, int subprog)
+{
+ struct bpf_func_info *info;
+
+ if (!env->prog->aux->func_info)
+ return "";
+
+ info = &env->prog->aux->func_info[subprog];
+ return btf_type_name(env->prog->aux->btf, info->type_id);
+}
+
+static void mark_subprog_exc_cb(struct bpf_verifier_env *env, int subprog)
+{
+ struct bpf_subprog_info *info = subprog_info(env, subprog);
+
+ info->is_cb = true;
+ info->is_async_cb = true;
+ info->is_exception_cb = true;
+}
+
+static bool subprog_is_exc_cb(struct bpf_verifier_env *env, int subprog)
+{
+ return subprog_info(env, subprog)->is_exception_cb;
+}
+
static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg)
{
return btf_record_has_field(reg_btf_record(reg), BPF_SPIN_LOCK);
@@ -542,12 +500,12 @@ static bool is_dynptr_ref_function(enum bpf_func_id func_id)
return func_id == BPF_FUNC_dynptr_data;
}
-static bool is_callback_calling_kfunc(u32 btf_id);
+static bool is_sync_callback_calling_kfunc(u32 btf_id);
+static bool is_bpf_throw_kfunc(struct bpf_insn *insn);
-static bool is_callback_calling_function(enum bpf_func_id func_id)
+static bool is_sync_callback_calling_function(enum bpf_func_id func_id)
{
return func_id == BPF_FUNC_for_each_map_elem ||
- func_id == BPF_FUNC_timer_set_callback ||
func_id == BPF_FUNC_find_vma ||
func_id == BPF_FUNC_loop ||
func_id == BPF_FUNC_user_ringbuf_drain;
@@ -558,6 +516,18 @@ static bool is_async_callback_calling_function(enum bpf_func_id func_id)
return func_id == BPF_FUNC_timer_set_callback;
}
+static bool is_callback_calling_function(enum bpf_func_id func_id)
+{
+ return is_sync_callback_calling_function(func_id) ||
+ is_async_callback_calling_function(func_id);
+}
+
+static bool is_sync_callback_calling_insn(struct bpf_insn *insn)
+{
+ return (bpf_helper_call(insn) && is_sync_callback_calling_function(insn->imm)) ||
+ (bpf_pseudo_kfunc_call(insn) && is_sync_callback_calling_kfunc(insn->imm));
+}
+
static bool is_storage_get_function(enum bpf_func_id func_id)
{
return func_id == BPF_FUNC_sk_storage_get ||
@@ -588,83 +558,6 @@ static bool is_cmpxchg_insn(const struct bpf_insn *insn)
insn->imm == BPF_CMPXCHG;
}
-/* string representation of 'enum bpf_reg_type'
- *
- * Note that reg_type_str() can not appear more than once in a single verbose()
- * statement.
- */
-static const char *reg_type_str(struct bpf_verifier_env *env,
- enum bpf_reg_type type)
-{
- char postfix[16] = {0}, prefix[64] = {0};
- static const char * const str[] = {
- [NOT_INIT] = "?",
- [SCALAR_VALUE] = "scalar",
- [PTR_TO_CTX] = "ctx",
- [CONST_PTR_TO_MAP] = "map_ptr",
- [PTR_TO_MAP_VALUE] = "map_value",
- [PTR_TO_STACK] = "fp",
- [PTR_TO_PACKET] = "pkt",
- [PTR_TO_PACKET_META] = "pkt_meta",
- [PTR_TO_PACKET_END] = "pkt_end",
- [PTR_TO_FLOW_KEYS] = "flow_keys",
- [PTR_TO_SOCKET] = "sock",
- [PTR_TO_SOCK_COMMON] = "sock_common",
- [PTR_TO_TCP_SOCK] = "tcp_sock",
- [PTR_TO_TP_BUFFER] = "tp_buffer",
- [PTR_TO_XDP_SOCK] = "xdp_sock",
- [PTR_TO_BTF_ID] = "ptr_",
- [PTR_TO_MEM] = "mem",
- [PTR_TO_BUF] = "buf",
- [PTR_TO_FUNC] = "func",
- [PTR_TO_MAP_KEY] = "map_key",
- [CONST_PTR_TO_DYNPTR] = "dynptr_ptr",
- };
-
- if (type & PTR_MAYBE_NULL) {
- if (base_type(type) == PTR_TO_BTF_ID)
- strncpy(postfix, "or_null_", 16);
- else
- strncpy(postfix, "_or_null", 16);
- }
-
- snprintf(prefix, sizeof(prefix), "%s%s%s%s%s%s%s",
- type & MEM_RDONLY ? "rdonly_" : "",
- type & MEM_RINGBUF ? "ringbuf_" : "",
- type & MEM_USER ? "user_" : "",
- type & MEM_PERCPU ? "percpu_" : "",
- type & MEM_RCU ? "rcu_" : "",
- type & PTR_UNTRUSTED ? "untrusted_" : "",
- type & PTR_TRUSTED ? "trusted_" : ""
- );
-
- snprintf(env->tmp_str_buf, TMP_STR_BUF_LEN, "%s%s%s",
- prefix, str[base_type(type)], postfix);
- return env->tmp_str_buf;
-}
-
-static char slot_type_char[] = {
- [STACK_INVALID] = '?',
- [STACK_SPILL] = 'r',
- [STACK_MISC] = 'm',
- [STACK_ZERO] = '0',
- [STACK_DYNPTR] = 'd',
- [STACK_ITER] = 'i',
-};
-
-static void print_liveness(struct bpf_verifier_env *env,
- enum bpf_reg_liveness live)
-{
- if (live & (REG_LIVE_READ | REG_LIVE_WRITTEN | REG_LIVE_DONE))
- verbose(env, "_");
- if (live & REG_LIVE_READ)
- verbose(env, "r");
- if (live & REG_LIVE_WRITTEN)
- verbose(env, "w");
- if (live & REG_LIVE_DONE)
- verbose(env, "D");
-}
-
static int __get_spi(s32 off)
{
return (-off - 1) / BPF_REG_SIZE;
@@ -729,92 +622,6 @@ static int iter_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
return stack_slot_obj_get_spi(env, reg, "iter", nr_slots);
}
-static const char *btf_type_name(const struct btf *btf, u32 id)
-{
- return btf_name_by_offset(btf, btf_type_by_id(btf, id)->name_off);
-}
-
-static const char *dynptr_type_str(enum bpf_dynptr_type type)
-{
- switch (type) {
- case BPF_DYNPTR_TYPE_LOCAL:
- return "local";
- case BPF_DYNPTR_TYPE_RINGBUF:
- return "ringbuf";
- case BPF_DYNPTR_TYPE_SKB:
- return "skb";
- case BPF_DYNPTR_TYPE_XDP:
- return "xdp";
- case BPF_DYNPTR_TYPE_INVALID:
- return "<invalid>";
- default:
- WARN_ONCE(1, "unknown dynptr type %d\n", type);
- return "<unknown>";
- }
-}
-
-static const char *iter_type_str(const struct btf *btf, u32 btf_id)
-{
- if (!btf || btf_id == 0)
- return "<invalid>";
-
- /* we already validated that type is valid and has conforming name */
- return btf_type_name(btf, btf_id) + sizeof(ITER_PREFIX) - 1;
-}
-
-static const char *iter_state_str(enum bpf_iter_state state)
-{
- switch (state) {
- case BPF_ITER_STATE_ACTIVE:
- return "active";
- case BPF_ITER_STATE_DRAINED:
- return "drained";
- case BPF_ITER_STATE_INVALID:
- return "<invalid>";
- default:
- WARN_ONCE(1, "unknown iter state %d\n", state);
- return "<unknown>";
- }
-}
-
-static void mark_reg_scratched(struct bpf_verifier_env *env, u32 regno)
-{
- env->scratched_regs |= 1U << regno;
-}
-
-static void mark_stack_slot_scratched(struct bpf_verifier_env *env, u32 spi)
-{
- env->scratched_stack_slots |= 1ULL << spi;
-}
-
-static bool reg_scratched(const struct bpf_verifier_env *env, u32 regno)
-{
- return (env->scratched_regs >> regno) & 1;
-}
-
-static bool stack_slot_scratched(const struct bpf_verifier_env *env, u64 regno)
-{
- return (env->scratched_stack_slots >> regno) & 1;
-}
-
-static bool verifier_state_scratched(const struct bpf_verifier_env *env)
-{
- return env->scratched_regs || env->scratched_stack_slots;
-}
-
-static void mark_verifier_state_clean(struct bpf_verifier_env *env)
-{
- env->scratched_regs = 0U;
- env->scratched_stack_slots = 0ULL;
-}
-
-/* Used for printing the entire verifier state. */
-static void mark_verifier_state_scratched(struct bpf_verifier_env *env)
-{
- env->scratched_regs = ~0U;
- env->scratched_stack_slots = ~0ULL;
-}
-
static enum bpf_dynptr_type arg_to_dynptr_type(enum bpf_arg_type arg_type)
{
switch (arg_type & DYNPTR_TYPE_FLAG_MASK) {
@@ -1172,7 +979,12 @@ static bool is_dynptr_type_expected(struct bpf_verifier_env *env, struct bpf_reg
static void __mark_reg_known_zero(struct bpf_reg_state *reg);
+static bool in_rcu_cs(struct bpf_verifier_env *env);
+
+static bool is_kfunc_rcu_protected(struct bpf_kfunc_call_arg_meta *meta);
+
static int mark_stack_slots_iter(struct bpf_verifier_env *env,
+ struct bpf_kfunc_call_arg_meta *meta,
struct bpf_reg_state *reg, int insn_idx,
struct btf *btf, u32 btf_id, int nr_slots)
{
@@ -1193,6 +1005,12 @@ static int mark_stack_slots_iter(struct bpf_verifier_env *env,
__mark_reg_known_zero(st);
st->type = PTR_TO_STACK; /* we don't have dedicated reg type */
+ if (is_kfunc_rcu_protected(meta)) {
+ if (in_rcu_cs(env))
+ st->type |= MEM_RCU;
+ else
+ st->type |= PTR_UNTRUSTED;
+ }
st->live |= REG_LIVE_WRITTEN;
st->ref_obj_id = i == 0 ? id : 0;
st->iter.btf = btf;
@@ -1267,7 +1085,7 @@ static bool is_iter_reg_valid_uninit(struct bpf_verifier_env *env,
return true;
}
-static bool is_iter_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+static int is_iter_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
struct btf *btf, u32 btf_id, int nr_slots)
{
struct bpf_func_state *state = func(env, reg);
@@ -1275,26 +1093,28 @@ static bool is_iter_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_
spi = iter_get_spi(env, reg, nr_slots);
if (spi < 0)
- return false;
+ return -EINVAL;
for (i = 0; i < nr_slots; i++) {
struct bpf_stack_state *slot = &state->stack[spi - i];
struct bpf_reg_state *st = &slot->spilled_ptr;
+ if (st->type & PTR_UNTRUSTED)
+ return -EPROTO;
/* only main (first) slot has ref_obj_id set */
if (i == 0 && !st->ref_obj_id)
- return false;
+ return -EINVAL;
if (i != 0 && st->ref_obj_id)
- return false;
+ return -EINVAL;
if (st->iter.btf != btf || st->iter.btf_id != btf_id)
- return false;
+ return -EINVAL;
for (j = 0; j < BPF_REG_SIZE; j++)
if (slot->slot_type[j] != STACK_ITER)
- return false;
+ return -EINVAL;
}
- return true;
+ return 0;
}
/* Check if given stack slot is "special":
@@ -1335,206 +1155,25 @@ static bool is_spilled_scalar_reg(const struct bpf_stack_state *stack)
stack->spilled_ptr.type == SCALAR_VALUE;
}
-static void scrub_spilled_slot(u8 *stype)
-{
- if (*stype != STACK_INVALID)
- *stype = STACK_MISC;
-}
-
-static void print_verifier_state(struct bpf_verifier_env *env,
- const struct bpf_func_state *state,
- bool print_all)
-{
- const struct bpf_reg_state *reg;
- enum bpf_reg_type t;
- int i;
-
- if (state->frameno)
- verbose(env, " frame%d:", state->frameno);
- for (i = 0; i < MAX_BPF_REG; i++) {
- reg = &state->regs[i];
- t = reg->type;
- if (t == NOT_INIT)
- continue;
- if (!print_all && !reg_scratched(env, i))
- continue;
- verbose(env, " R%d", i);
- print_liveness(env, reg->live);
- verbose(env, "=");
- if (t == SCALAR_VALUE && reg->precise)
- verbose(env, "P");
- if ((t == SCALAR_VALUE || t == PTR_TO_STACK) &&
- tnum_is_const(reg->var_off)) {
- /* reg->off should be 0 for SCALAR_VALUE */
- verbose(env, "%s", t == SCALAR_VALUE ? "" : reg_type_str(env, t));
- verbose(env, "%lld", reg->var_off.value + reg->off);
- } else {
- const char *sep = "";
-
- verbose(env, "%s", reg_type_str(env, t));
- if (base_type(t) == PTR_TO_BTF_ID)
- verbose(env, "%s", btf_type_name(reg->btf, reg->btf_id));
- verbose(env, "(");
-/*
- * _a stands for append, was shortened to avoid multiline statements below.
- * This macro is used to output a comma separated list of attributes.
+/* Mark stack slot as STACK_MISC, unless it is already STACK_INVALID, in which
+ * case they are equivalent, or it's STACK_ZERO, in which case we preserve
+ * more precise STACK_ZERO.
+ * Note, in unprivileged mode leaving STACK_INVALID is wrong, so we take
+ * env->allow_ptr_leaks into account and force STACK_MISC, if necessary.
*/
-#define verbose_a(fmt, ...) ({ verbose(env, "%s" fmt, sep, __VA_ARGS__); sep = ","; })
-
- if (reg->id)
- verbose_a("id=%d", reg->id);
- if (reg->ref_obj_id)
- verbose_a("ref_obj_id=%d", reg->ref_obj_id);
- if (type_is_non_owning_ref(reg->type))
- verbose_a("%s", "non_own_ref");
- if (t != SCALAR_VALUE)
- verbose_a("off=%d", reg->off);
- if (type_is_pkt_pointer(t))
- verbose_a("r=%d", reg->range);
- else if (base_type(t) == CONST_PTR_TO_MAP ||
- base_type(t) == PTR_TO_MAP_KEY ||
- base_type(t) == PTR_TO_MAP_VALUE)
- verbose_a("ks=%d,vs=%d",
- reg->map_ptr->key_size,
- reg->map_ptr->value_size);
- if (tnum_is_const(reg->var_off)) {
- /* Typically an immediate SCALAR_VALUE, but
- * could be a pointer whose offset is too big
- * for reg->off
- */
- verbose_a("imm=%llx", reg->var_off.value);
- } else {
- if (reg->smin_value != reg->umin_value &&
- reg->smin_value != S64_MIN)
- verbose_a("smin=%lld", (long long)reg->smin_value);
- if (reg->smax_value != reg->umax_value &&
- reg->smax_value != S64_MAX)
- verbose_a("smax=%lld", (long long)reg->smax_value);
- if (reg->umin_value != 0)
- verbose_a("umin=%llu", (unsigned long long)reg->umin_value);
- if (reg->umax_value != U64_MAX)
- verbose_a("umax=%llu", (unsigned long long)reg->umax_value);
- if (!tnum_is_unknown(reg->var_off)) {
- char tn_buf[48];
-
- tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose_a("var_off=%s", tn_buf);
- }
- if (reg->s32_min_value != reg->smin_value &&
- reg->s32_min_value != S32_MIN)
- verbose_a("s32_min=%d", (int)(reg->s32_min_value));
- if (reg->s32_max_value != reg->smax_value &&
- reg->s32_max_value != S32_MAX)
- verbose_a("s32_max=%d", (int)(reg->s32_max_value));
- if (reg->u32_min_value != reg->umin_value &&
- reg->u32_min_value != U32_MIN)
- verbose_a("u32_min=%d", (int)(reg->u32_min_value));
- if (reg->u32_max_value != reg->umax_value &&
- reg->u32_max_value != U32_MAX)
- verbose_a("u32_max=%d", (int)(reg->u32_max_value));
- }
-#undef verbose_a
-
- verbose(env, ")");
- }
- }
- for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
- char types_buf[BPF_REG_SIZE + 1];
- bool valid = false;
- int j;
-
- for (j = 0; j < BPF_REG_SIZE; j++) {
- if (state->stack[i].slot_type[j] != STACK_INVALID)
- valid = true;
- types_buf[j] = slot_type_char[state->stack[i].slot_type[j]];
- }
- types_buf[BPF_REG_SIZE] = 0;
- if (!valid)
- continue;
- if (!print_all && !stack_slot_scratched(env, i))
- continue;
- switch (state->stack[i].slot_type[BPF_REG_SIZE - 1]) {
- case STACK_SPILL:
- reg = &state->stack[i].spilled_ptr;
- t = reg->type;
-
- verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
- print_liveness(env, reg->live);
- verbose(env, "=%s", t == SCALAR_VALUE ? "" : reg_type_str(env, t));
- if (t == SCALAR_VALUE && reg->precise)
- verbose(env, "P");
- if (t == SCALAR_VALUE && tnum_is_const(reg->var_off))
- verbose(env, "%lld", reg->var_off.value + reg->off);
- break;
- case STACK_DYNPTR:
- i += BPF_DYNPTR_NR_SLOTS - 1;
- reg = &state->stack[i].spilled_ptr;
-
- verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
- print_liveness(env, reg->live);
- verbose(env, "=dynptr_%s", dynptr_type_str(reg->dynptr.type));
- if (reg->ref_obj_id)
- verbose(env, "(ref_id=%d)", reg->ref_obj_id);
- break;
- case STACK_ITER:
- /* only main slot has ref_obj_id set; skip others */
- reg = &state->stack[i].spilled_ptr;
- if (!reg->ref_obj_id)
- continue;
-
- verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
- print_liveness(env, reg->live);
- verbose(env, "=iter_%s(ref_id=%d,state=%s,depth=%u)",
- iter_type_str(reg->iter.btf, reg->iter.btf_id),
- reg->ref_obj_id, iter_state_str(reg->iter.state),
- reg->iter.depth);
- break;
- case STACK_MISC:
- case STACK_ZERO:
- default:
- reg = &state->stack[i].spilled_ptr;
-
- for (j = 0; j < BPF_REG_SIZE; j++)
- types_buf[j] = slot_type_char[state->stack[i].slot_type[j]];
- types_buf[BPF_REG_SIZE] = 0;
-
- verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
- print_liveness(env, reg->live);
- verbose(env, "=%s", types_buf);
- break;
- }
- }
- if (state->acquired_refs && state->refs[0].id) {
- verbose(env, " refs=%d", state->refs[0].id);
- for (i = 1; i < state->acquired_refs; i++)
- if (state->refs[i].id)
- verbose(env, ",%d", state->refs[i].id);
- }
- if (state->in_callback_fn)
- verbose(env, " cb");
- if (state->in_async_callback_fn)
- verbose(env, " async_cb");
- verbose(env, "\n");
- mark_verifier_state_clean(env);
-}
-
-static inline u32 vlog_alignment(u32 pos)
+static void mark_stack_slot_misc(struct bpf_verifier_env *env, u8 *stype)
{
- return round_up(max(pos + BPF_LOG_MIN_ALIGNMENT / 2, BPF_LOG_ALIGNMENT),
- BPF_LOG_MIN_ALIGNMENT) - pos - 1;
+ if (*stype == STACK_ZERO)
+ return;
+ if (env->allow_ptr_leaks && *stype == STACK_INVALID)
+ return;
+ *stype = STACK_MISC;
}
-static void print_insn_state(struct bpf_verifier_env *env,
- const struct bpf_func_state *state)
+static void scrub_spilled_slot(u8 *stype)
{
- if (env->prev_log_pos && env->prev_log_pos == env->log.end_pos) {
- /* remove new line character */
- bpf_vlog_reset(&env->log, env->prev_log_pos - 1);
- verbose(env, "%*c;", vlog_alignment(env->prev_insn_print_pos), ' ');
- } else {
- verbose(env, "%d:", env->insn_idx);
- }
- print_verifier_state(env, state, false);
+ if (*stype != STACK_INVALID)
+ *stype = STACK_MISC;
}
/* copy array src of length n * size bytes to dst. dst is reallocated if it's too
@@ -1631,9 +1270,16 @@ static int resize_reference_state(struct bpf_func_state *state, size_t n)
return 0;
}
-static int grow_stack_state(struct bpf_func_state *state, int size)
+/* Possibly update state->allocated_stack to be at least size bytes. Also
+ * possibly update the function's high-water mark in its bpf_subprog_info.
+ */
+static int grow_stack_state(struct bpf_verifier_env *env, struct bpf_func_state *state, int size)
{
- size_t old_n = state->allocated_stack / BPF_REG_SIZE, n = size / BPF_REG_SIZE;
+ size_t old_n = state->allocated_stack / BPF_REG_SIZE, n;
+
+ /* The stack size is always a multiple of BPF_REG_SIZE. */
+ size = round_up(size, BPF_REG_SIZE);
+ n = size / BPF_REG_SIZE;
if (old_n >= n)
return 0;
@@ -1643,6 +1289,11 @@ static int grow_stack_state(struct bpf_func_state *state, int size)
return -ENOMEM;
state->allocated_stack = size;
+
+ /* update known max for given subprogram */
+ if (env->subprog_info[state->subprogno].stack_depth < size)
+ env->subprog_info[state->subprogno].stack_depth = size;
+
return 0;
}
@@ -1742,13 +1393,15 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
int i, err;
dst_state->jmp_history = copy_array(dst_state->jmp_history, src->jmp_history,
- src->jmp_history_cnt, sizeof(struct bpf_idx_pair),
- GFP_USER);
+ src->jmp_history_cnt, sizeof(*dst_state->jmp_history),
+ GFP_USER);
if (!dst_state->jmp_history)
return -ENOMEM;
dst_state->jmp_history_cnt = src->jmp_history_cnt;
- /* if dst has more stack frames then src frame, free them */
+ /* if dst has more stack frames then src frame, free them, this is also
+ * necessary in case of exceptional exits using bpf_throw.
+ */
for (i = src->curframe + 1; i <= dst_state->curframe; i++) {
free_func_state(dst_state->frame[i]);
dst_state->frame[i] = NULL;
@@ -1762,6 +1415,9 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
dst_state->parent = src->parent;
dst_state->first_insn_idx = src->first_insn_idx;
dst_state->last_insn_idx = src->last_insn_idx;
+ dst_state->dfs_depth = src->dfs_depth;
+ dst_state->callback_unroll_depth = src->callback_unroll_depth;
+ dst_state->used_as_loop_entry = src->used_as_loop_entry;
for (i = 0; i <= src->curframe; i++) {
dst = dst_state->frame[i];
if (!dst) {
@@ -1777,11 +1433,203 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
return 0;
}
+static u32 state_htab_size(struct bpf_verifier_env *env)
+{
+ return env->prog->len;
+}
+
+static struct bpf_verifier_state_list **explored_state(struct bpf_verifier_env *env, int idx)
+{
+ struct bpf_verifier_state *cur = env->cur_state;
+ struct bpf_func_state *state = cur->frame[cur->curframe];
+
+ return &env->explored_states[(idx ^ state->callsite) % state_htab_size(env)];
+}
+
+static bool same_callsites(struct bpf_verifier_state *a, struct bpf_verifier_state *b)
+{
+ int fr;
+
+ if (a->curframe != b->curframe)
+ return false;
+
+ for (fr = a->curframe; fr >= 0; fr--)
+ if (a->frame[fr]->callsite != b->frame[fr]->callsite)
+ return false;
+
+ return true;
+}
+
+/* Open coded iterators allow back-edges in the state graph in order to
+ * check unbounded loops that use iterators.
+ *
+ * In is_state_visited() it is necessary to know if explored states are
+ * part of some loops in order to decide whether non-exact states
+ * comparison could be used:
+ * - non-exact states comparison establishes sub-state relation and uses
+ * read and precision marks to do so, these marks are propagated from
+ * children states and thus are not guaranteed to be final in a loop;
+ * - exact states comparison just checks if current and explored states
+ * are identical (and thus form a back-edge).
+ *
+ * Paper "A New Algorithm for Identifying Loops in Decompilation"
+ * by Tao Wei, Jian Mao, Wei Zou and Yu Chen [1] presents a convenient
+ * algorithm for loop structure detection and gives an overview of
+ * relevant terminology. It also has helpful illustrations.
+ *
+ * [1] https://api.semanticscholar.org/CorpusID:15784067
+ *
+ * We use a similar algorithm but because loop nested structure is
+ * irrelevant for verifier ours is significantly simpler and resembles
+ * strongly connected components algorithm from Sedgewick's textbook.
+ *
+ * Define topmost loop entry as a first node of the loop traversed in a
+ * depth first search starting from initial state. The goal of the loop
+ * tracking algorithm is to associate topmost loop entries with states
+ * derived from these entries.
+ *
+ * For each step in the DFS states traversal algorithm needs to identify
+ * the following situations:
+ *
+ *    initial                     initial                   initial
+ *       |                           |                         |
+ *       V                           V                         V
+ *      ...                         ...           .---------> hdr
+ *       |                           |            |            |
+ *       V                           V            |            V
+ *      cur                     .-> succ          |    .------...
+ *       |                      |    |            |    |       |
+ *       V                      |    V            |    V       V
+ *      succ                    '-- cur           |   ...     ...
+ *                                                |    |       |
+ *                                                |    V       V
+ *                                                |   succ <- cur
+ *                                                |    |
+ *                                                |    V
+ *                                                |   ...
+ *                                                |    |
+ *                                                '----'
+ *
+ *  (A) successor state of cur   (B) successor state of cur or its entry
+ *      not yet traversed            are in current DFS path, thus cur and succ
+ *                                   are members of the same outermost loop
+ *
+ *                   initial                               initial
+ *                      |                                     |
+ *                      V                                     V
+ *                     ...                                   ...
+ *                      |                                     |
+ *                      V                                     V
+ *              .------...                            .------...
+ *              |       |                             |       |
+ *              V       V                             V       V
+ *         .-> hdr     ...                           ...     ...
+ *         |    |       |                             |       |
+ *         |    V       V                             V       V
+ *         |   succ <- cur                           succ <- cur
+ *         |    |                                     |
+ *         |    V                                     V
+ *         |   ...                                   ...
+ *         |    |                                     |
+ *         '----'                                    exit
+ *
+ * (C) successor state of cur is a part of some loop but this loop
+ *     does not include cur or successor state is not in a loop at all.
+ *
+ * Algorithm could be described as the following python code:
+ *
+ *   traversed = set()   # Set of traversed nodes
+ *   entries = {}        # Mapping from node to loop entry
+ *   depths = {}         # Depth level assigned to graph node
+ *   path = set()        # Current DFS path
+ *
+ *   # Find outermost loop entry known for n
+ *   def get_loop_entry(n):
+ *       h = entries.get(n, None)
+ *       while h in entries and entries[h] != h:
+ *           h = entries[h]
+ *       return h
+ *
+ *   # Update n's loop entry if h's outermost entry comes
+ *   # before n's outermost entry in current DFS path.
+ *   def update_loop_entry(n, h):
+ *       n1 = get_loop_entry(n) or n
+ *       h1 = get_loop_entry(h) or h
+ *       if h1 in path and depths[h1] <= depths[n1]:
+ *           entries[n] = h1
+ *
+ *   def dfs(n, depth):
+ *       traversed.add(n)
+ *       path.add(n)
+ *       depths[n] = depth
+ *       for succ in G.successors(n):
+ *           if succ not in traversed:
+ *               # Case A: explore succ and update cur's loop entry
+ *               # only if succ's entry is in current DFS path.
+ *               dfs(succ, depth + 1)
+ *               h = get_loop_entry(succ)
+ *               update_loop_entry(n, h)
+ *           else:
+ *               # Case B or C depending on `h1 in path` check in update_loop_entry().
+ *               update_loop_entry(n, succ)
+ *       path.remove(n)
+ *
+ * To adapt this algorithm for use with verifier:
+ * - use st->branches == 0 as a signal that DFS of succ had been finished
+ *   and cur's loop entry has to be updated (case A), handle this in
+ *   update_branch_counts();
+ * - use st->branches > 0 as a signal that st is in the current DFS path;
+ * - handle cases B and C in is_state_visited();
+ * - update topmost loop entry for intermediate states in get_loop_entry().
+ */
+static struct bpf_verifier_state *get_loop_entry(struct bpf_verifier_state *st)
+{
+ struct bpf_verifier_state *topmost = st->loop_entry, *old;
+
+ while (topmost && topmost->loop_entry && topmost != topmost->loop_entry)
+ topmost = topmost->loop_entry;
+ /* Update loop entries for intermediate states to avoid this
+ * traversal in future get_loop_entry() calls.
+ */
+ while (st && st->loop_entry != topmost) {
+ old = st->loop_entry;
+ st->loop_entry = topmost;
+ st = old;
+ }
+ return topmost;
+}
+
+static void update_loop_entry(struct bpf_verifier_state *cur, struct bpf_verifier_state *hdr)
+{
+ struct bpf_verifier_state *cur1, *hdr1;
+
+ cur1 = get_loop_entry(cur) ?: cur;
+ hdr1 = get_loop_entry(hdr) ?: hdr;
+ /* The hdr1->branches check decides between cases B and C in
+ * comment for get_loop_entry(). If hdr1->branches == 0 then
+ * hdr's topmost loop entry is not in current DFS path,
+ * hence 'cur' and 'hdr' are not in the same loop and there is
+ * no need to update cur->loop_entry.
+ */
+ if (hdr1->branches && hdr1->dfs_depth <= cur1->dfs_depth) {
+ cur->loop_entry = hdr;
+ hdr->used_as_loop_entry = true;
+ }
+}
+
static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
{
while (st) {
u32 br = --st->branches;
+ /* br == 0 signals that DFS exploration for 'st' is finished,
+ * thus it is necessary to update parent's loop entry if it
+ * turned out that st is a part of some loop.
+ * This is a part of 'case A' in get_loop_entry() comment.
+ */
+ if (br == 0 && st->parent && st->loop_entry)
+ update_loop_entry(st->parent, st->loop_entry);
+
/* WARN_ON(br > 1) technically makes sense here,
* but see comment in push_stack(), hence:
*/
@@ -1921,10 +1769,14 @@ static void __mark_reg_known_zero(struct bpf_reg_state *reg)
__mark_reg_known(reg, 0);
}
-static void __mark_reg_const_zero(struct bpf_reg_state *reg)
+static void __mark_reg_const_zero(const struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
__mark_reg_known(reg, 0);
reg->type = SCALAR_VALUE;
+ /* all scalars are assumed imprecise initially (unless unprivileged,
+ * in which case everything is forced to be precise)
+ */
+ reg->precise = !env->bpf_capable;
}
static void mark_reg_known_zero(struct bpf_verifier_env *env,
@@ -2090,69 +1942,214 @@ static void __update_reg_bounds(struct bpf_reg_state *reg)
/* Uses signed min/max values to inform unsigned, and vice-versa */
static void __reg32_deduce_bounds(struct bpf_reg_state *reg)
{
- /* Learn sign from signed bounds.
- * If we cannot cross the sign boundary, then signed and unsigned bounds
- * are the same, so combine. This works even in the negative case, e.g.
- * -3 s<= x s<= -1 implies 0xf...fd u<= x u<= 0xf...ff.
+ /* If upper 32 bits of u64/s64 range don't change, we can use lower 32
+ * bits to improve our u32/s32 boundaries.
+ *
+ * E.g., the case where we have upper 32 bits as zero ([10, 20] in
+ * u64) is pretty trivial, it's obvious that in u32 we'll also have
+ * [10, 20] range. But this property holds for any 64-bit range as
+ * long as upper 32 bits in that entire range of values stay the same.
+ *
+ * E.g., u64 range [0x10000000A, 0x10000000F] ([4294967306, 4294967311]
+ * in decimal) has the same upper 32 bits throughout all the values in
+ * that range. As such, lower 32 bits form a valid [0xA, 0xF] ([10, 15])
+ * range.
+ *
+ * Note also, that [0xA, 0xF] is a valid range both in u32 and in s32,
+ * following the rules outlined below about u64/s64 correspondence
+ * (which equally applies to u32 vs s32 correspondence). In general it
+ * depends on actual hexadecimal values of 32-bit range. They can form
+ * only valid u32, or only valid s32 ranges in some cases.
+ *
+ * So we use all these insights to derive bounds for subregisters here.
*/
- if (reg->s32_min_value >= 0 || reg->s32_max_value < 0) {
- reg->s32_min_value = reg->u32_min_value =
- max_t(u32, reg->s32_min_value, reg->u32_min_value);
- reg->s32_max_value = reg->u32_max_value =
- min_t(u32, reg->s32_max_value, reg->u32_max_value);
- return;
+ if ((reg->umin_value >> 32) == (reg->umax_value >> 32)) {
+ /* u64 to u32 casting preserves validity of low 32 bits as
+ * a range, if upper 32 bits are the same
+ */
+ reg->u32_min_value = max_t(u32, reg->u32_min_value, (u32)reg->umin_value);
+ reg->u32_max_value = min_t(u32, reg->u32_max_value, (u32)reg->umax_value);
+
+ if ((s32)reg->umin_value <= (s32)reg->umax_value) {
+ reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->umin_value);
+ reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->umax_value);
+ }
+ }
+ if ((reg->smin_value >> 32) == (reg->smax_value >> 32)) {
+ /* low 32 bits should form a proper u32 range */
+ if ((u32)reg->smin_value <= (u32)reg->smax_value) {
+ reg->u32_min_value = max_t(u32, reg->u32_min_value, (u32)reg->smin_value);
+ reg->u32_max_value = min_t(u32, reg->u32_max_value, (u32)reg->smax_value);
+ }
+ /* low 32 bits should form a proper s32 range */
+ if ((s32)reg->smin_value <= (s32)reg->smax_value) {
+ reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->smin_value);
+ reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->smax_value);
+ }
+ }
+ /* Special case where upper bits form a small sequence of two
+ * sequential numbers (in 32-bit unsigned space, so 0xffffffff to
+ * 0x00000000 is also valid), while lower bits form a proper s32 range
+ * going from negative numbers to positive numbers. E.g., let's say we
+ * have s64 range [-1, 1] ([0xffffffffffffffff, 0x0000000000000001]).
+ * Possible s64 values are {-1, 0, 1} ({0xffffffffffffffff,
+ * 0x0000000000000000, 0x0000000000000001}). Ignoring upper 32 bits,
+ * we still get a valid s32 range [-1, 1] ([0xffffffff, 0x00000001]).
+ * Note that it doesn't have to be 0xffffffff going to 0x00000000 in
+ * upper 32 bits. As a random example, s64 range
+ * [0xfffffff0fffffff0; 0xfffffff100000010], forms a valid s32 range
+ * [-16, 16] ([0xfffffff0; 0x00000010]) in its 32 bit subregister.
+ */
+ if ((u32)(reg->umin_value >> 32) + 1 == (u32)(reg->umax_value >> 32) &&
+ (s32)reg->umin_value < 0 && (s32)reg->umax_value >= 0) {
+ reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->umin_value);
+ reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->umax_value);
+ }
+ if ((u32)(reg->smin_value >> 32) + 1 == (u32)(reg->smax_value >> 32) &&
+ (s32)reg->smin_value < 0 && (s32)reg->smax_value >= 0) {
+ reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->smin_value);
+ reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->smax_value);
+ }
+ /* if u32 range forms a valid s32 range (due to matching sign bit),
+ * try to learn from that
+ */
+ if ((s32)reg->u32_min_value <= (s32)reg->u32_max_value) {
+ reg->s32_min_value = max_t(s32, reg->s32_min_value, reg->u32_min_value);
+ reg->s32_max_value = min_t(s32, reg->s32_max_value, reg->u32_max_value);
}
- /* Learn sign from unsigned bounds. Signed bounds cross the sign
- * boundary, so we must be careful.
+ /* If we cannot cross the sign boundary, then signed and unsigned bounds
+ * are the same, so combine. This works even in the negative case, e.g.
+ * -3 s<= x s<= -1 implies 0xf...fd u<= x u<= 0xf...ff.
*/
- if ((s32)reg->u32_max_value >= 0) {
- /* Positive. We can't learn anything from the smin, but smax
- * is positive, hence safe.
- */
- reg->s32_min_value = reg->u32_min_value;
- reg->s32_max_value = reg->u32_max_value =
- min_t(u32, reg->s32_max_value, reg->u32_max_value);
- } else if ((s32)reg->u32_min_value < 0) {
- /* Negative. We can't learn anything from the smax, but smin
- * is negative, hence safe.
- */
- reg->s32_min_value = reg->u32_min_value =
- max_t(u32, reg->s32_min_value, reg->u32_min_value);
- reg->s32_max_value = reg->u32_max_value;
+ if ((u32)reg->s32_min_value <= (u32)reg->s32_max_value) {
+ reg->u32_min_value = max_t(u32, reg->s32_min_value, reg->u32_min_value);
+ reg->u32_max_value = min_t(u32, reg->s32_max_value, reg->u32_max_value);
}
}
static void __reg64_deduce_bounds(struct bpf_reg_state *reg)
{
- /* Learn sign from signed bounds.
- * If we cannot cross the sign boundary, then signed and unsigned bounds
+ /* If u64 range forms a valid s64 range (due to matching sign bit),
+ * try to learn from that. Let's do a bit of ASCII art to see when
+ * this is happening. Let's take u64 range first:
+ *
+ * 0             0x7fffffffffffffff 0x8000000000000000        U64_MAX
+ * |-------------------------------|--------------------------------|
+ *
+ * Valid u64 range is formed when umin and umax are anywhere in the
+ * range [0, U64_MAX], and umin <= umax. u64 case is simple and
+ * straightforward. Let's see how s64 range maps onto the same range
+ * of values, annotated below the line for comparison:
+ *
+ * 0             0x7fffffffffffffff 0x8000000000000000        U64_MAX
+ * |-------------------------------|--------------------------------|
+ * 0                        S64_MAX S64_MIN                        -1
+ *
+ * So s64 values basically start in the middle and they are logically
+ * contiguous to the right of it, wrapping around from -1 to 0, and
+ * then finishing as S64_MAX (0x7fffffffffffffff) right before
+ * S64_MIN. We can try drawing the continuity of u64 vs s64 values
+ * more visually as mapped to sign-agnostic range of hex values.
+ *
+ *  u64 start                                               u64 end
+ *  _______________________________________________________________
+ * /                                                               \
+ * 0             0x7fffffffffffffff 0x8000000000000000        U64_MAX
+ * |-------------------------------|--------------------------------|
+ * 0                        S64_MAX S64_MIN                        -1
+ *                                / \
+ * >------------------------------   ------------------------------->
+ * s64 continues...        s64 end   s64 start         s64 "midpoint"
+ *
+ * What this means is that, in general, we can't always derive
+ * something new about u64 from any random s64 range, and vice versa.
+ *
+ * But we can do that in two particular cases. One is when entire
+ * u64/s64 range is *entirely* contained within left half of the above
+ * diagram or when it is *entirely* contained in the right half. I.e.:
+ *
+ * |-------------------------------|--------------------------------|
+ * ^                   ^                   ^                   ^
+ * A                   B                   C                   D
+ *
+ * [A, B] and [C, D] are contained entirely in their respective halves
+ * and form valid contiguous ranges as both u64 and s64 values. [A, B]
+ * will be non-negative both as u64 and s64 (and in fact it will be
+ * identical ranges no matter the signedness). [C, D] treated as s64
+ * will be a range of negative values, while in u64 it will be
+ * non-negative range of values larger than 0x8000000000000000.
+ *
+ * Now, any other range here can't be represented in both u64 and s64
+ * simultaneously. E.g., [A, C], [A, D], [B, C], [B, D] are valid
+ * contiguous u64 ranges, but they are discontinuous in s64. [B, C]
+ * in s64 would be properly presented as [S64_MIN, C] and [B, S64_MAX],
+ * for example. Similarly, valid s64 range [D, A] (going from negative
+ * to positive values), would be two separate [D, U64_MAX] and [0, A]
+ * ranges as u64. Currently reg_state can't represent two segments per
+ * numeric domain, so in such situations we can only derive maximal
+ * possible range ([0, U64_MAX] for u64, and [S64_MIN, S64_MAX] for s64).
+ *
+ * So we use these facts to derive umin/umax from smin/smax and vice
+ * versa only if they stay within the same "half". This is equivalent
+ * to checking sign bit: lower half will have sign bit as zero, upper
+ * half have sign bit 1. Below in code we simplify this by just
+ * casting umin/umax as smin/smax and checking if they form valid
+ * range, and vice versa. Those are equivalent checks.
+ */
+ if ((s64)reg->umin_value <= (s64)reg->umax_value) {
+ reg->smin_value = max_t(s64, reg->smin_value, reg->umin_value);
+ reg->smax_value = min_t(s64, reg->smax_value, reg->umax_value);
+ }
+ /* If we cannot cross the sign boundary, then signed and unsigned bounds
* are the same, so combine. This works even in the negative case, e.g.
* -3 s<= x s<= -1 implies 0xf...fd u<= x u<= 0xf...ff.
*/
- if (reg->smin_value >= 0 || reg->smax_value < 0) {
- reg->smin_value = reg->umin_value = max_t(u64, reg->smin_value,
- reg->umin_value);
- reg->smax_value = reg->umax_value = min_t(u64, reg->smax_value,
- reg->umax_value);
- return;
+ if ((u64)reg->smin_value <= (u64)reg->smax_value) {
+ reg->umin_value = max_t(u64, reg->smin_value, reg->umin_value);
+ reg->umax_value = min_t(u64, reg->smax_value, reg->umax_value);
}
- /* Learn sign from unsigned bounds. Signed bounds cross the sign
- * boundary, so we must be careful.
+}
+
+static void __reg_deduce_mixed_bounds(struct bpf_reg_state *reg)
+{
+ /* Try to tighten 64-bit bounds from 32-bit knowledge, using 32-bit
+ * values on both sides of 64-bit range in hope to have tighter range.
+ * E.g., if r1 is [0x1'00000000, 0x3'80000000], and we learn from
+ * 32-bit signed > 0 operation that s32 bounds are now [1; 0x7fffffff].
+ * With this, we can substitute 1 as low 32-bits of _low_ 64-bit bound
+ * (0x100000000 -> 0x100000001) and 0x7fffffff as low 32-bits of
+ * _high_ 64-bit bound (0x380000000 -> 0x37fffffff) and arrive at a
+ * better overall bounds for r1 as [0x1'00000001; 0x3'7fffffff].
+ * We just need to make sure that derived bounds we are intersecting
+ * with are well-formed ranges in respective s64 or u64 domain, just
+ * like we do with similar kinds of 32-to-64 or 64-to-32 adjustments.
*/
- if ((s64)reg->umax_value >= 0) {
- /* Positive. We can't learn anything from the smin, but smax
- * is positive, hence safe.
- */
- reg->smin_value = reg->umin_value;
- reg->smax_value = reg->umax_value = min_t(u64, reg->smax_value,
- reg->umax_value);
- } else if ((s64)reg->umin_value < 0) {
- /* Negative. We can't learn anything from the smax, but smin
- * is negative, hence safe.
- */
- reg->smin_value = reg->umin_value = max_t(u64, reg->smin_value,
- reg->umin_value);
- reg->smax_value = reg->umax_value;
+ __u64 new_umin, new_umax;
+ __s64 new_smin, new_smax;
+
+ /* u32 -> u64 tightening, it's always well-formed */
+ new_umin = (reg->umin_value & ~0xffffffffULL) | reg->u32_min_value;
+ new_umax = (reg->umax_value & ~0xffffffffULL) | reg->u32_max_value;
+ reg->umin_value = max_t(u64, reg->umin_value, new_umin);
+ reg->umax_value = min_t(u64, reg->umax_value, new_umax);
+ /* u32 -> s64 tightening, u32 range embedded into s64 preserves range validity */
+ new_smin = (reg->smin_value & ~0xffffffffULL) | reg->u32_min_value;
+ new_smax = (reg->smax_value & ~0xffffffffULL) | reg->u32_max_value;
+ reg->smin_value = max_t(s64, reg->smin_value, new_smin);
+ reg->smax_value = min_t(s64, reg->smax_value, new_smax);
+
+ /* if s32 can be treated as valid u32 range, we can use it as well */
+ if ((u32)reg->s32_min_value <= (u32)reg->s32_max_value) {
+ /* s32 -> u64 tightening */
+ new_umin = (reg->umin_value & ~0xffffffffULL) | (u32)reg->s32_min_value;
+ new_umax = (reg->umax_value & ~0xffffffffULL) | (u32)reg->s32_max_value;
+ reg->umin_value = max_t(u64, reg->umin_value, new_umin);
+ reg->umax_value = min_t(u64, reg->umax_value, new_umax);
+ /* s32 -> s64 tightening */
+ new_smin = (reg->smin_value & ~0xffffffffULL) | (u32)reg->s32_min_value;
+ new_smax = (reg->smax_value & ~0xffffffffULL) | (u32)reg->s32_max_value;
+ reg->smin_value = max_t(s64, reg->smin_value, new_smin);
+ reg->smax_value = min_t(s64, reg->smax_value, new_smax);
}
}
@@ -2160,6 +2157,7 @@ static void __reg_deduce_bounds(struct bpf_reg_state *reg)
{
__reg32_deduce_bounds(reg);
__reg64_deduce_bounds(reg);
+ __reg_deduce_mixed_bounds(reg);
}
/* Attempts to improve var_off based on unsigned min/max information */
@@ -2181,6 +2179,7 @@ static void reg_bounds_sync(struct bpf_reg_state *reg)
__update_reg_bounds(reg);
/* We might have learned something about the sign bit. */
__reg_deduce_bounds(reg);
+ __reg_deduce_bounds(reg);
/* We might have learned some bits from the bounds. */
__reg_bound_offset(reg);
/* Intersecting with the old var_off might have improved our bounds
@@ -2190,6 +2189,56 @@ static void reg_bounds_sync(struct bpf_reg_state *reg)
__update_reg_bounds(reg);
}
+static int reg_bounds_sanity_check(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, const char *ctx)
+{
+ const char *msg;
+
+ if (reg->umin_value > reg->umax_value ||
+ reg->smin_value > reg->smax_value ||
+ reg->u32_min_value > reg->u32_max_value ||
+ reg->s32_min_value > reg->s32_max_value) {
+ msg = "range bounds violation";
+ goto out;
+ }
+
+ if (tnum_is_const(reg->var_off)) {
+ u64 uval = reg->var_off.value;
+ s64 sval = (s64)uval;
+
+ if (reg->umin_value != uval || reg->umax_value != uval ||
+ reg->smin_value != sval || reg->smax_value != sval) {
+ msg = "const tnum out of sync with range bounds";
+ goto out;
+ }
+ }
+
+ if (tnum_subreg_is_const(reg->var_off)) {
+ u32 uval32 = tnum_subreg(reg->var_off).value;
+ s32 sval32 = (s32)uval32;
+
+ if (reg->u32_min_value != uval32 || reg->u32_max_value != uval32 ||
+ reg->s32_min_value != sval32 || reg->s32_max_value != sval32) {
+ msg = "const subreg tnum out of sync with range bounds";
+ goto out;
+ }
+ }
+
+ return 0;
+out:
+ verbose(env, "REG INVARIANTS VIOLATION (%s): %s u64=[%#llx, %#llx] "
+ "s64=[%#llx, %#llx] u32=[%#x, %#x] s32=[%#x, %#x] var_off=(%#llx, %#llx)\n",
+ ctx, msg, reg->umin_value, reg->umax_value,
+ reg->smin_value, reg->smax_value,
+ reg->u32_min_value, reg->u32_max_value,
+ reg->s32_min_value, reg->s32_max_value,
+ reg->var_off.value, reg->var_off.mask);
+ if (env->test_reg_invariants)
+ return -EFAULT;
+ __mark_reg_unbounded(reg);
+ return 0;
+}
+
static bool __reg32_bound_s64(s32 a)
{
return a >= 0 && a <= S32_MAX;
@@ -2214,51 +2263,6 @@ static void __reg_assign_32_into_64(struct bpf_reg_state *reg)
}
}
-static void __reg_combine_32_into_64(struct bpf_reg_state *reg)
-{
- /* special case when 64-bit register has upper 32-bit register
- * zeroed. Typically happens after zext or <<32, >>32 sequence
- * allowing us to use 32-bit bounds directly,
- */
- if (tnum_equals_const(tnum_clear_subreg(reg->var_off), 0)) {
- __reg_assign_32_into_64(reg);
- } else {
- /* Otherwise the best we can do is push lower 32bit known and
- * unknown bits into register (var_off set from jmp logic)
- * then learn as much as possible from the 64-bit tnum
- * known and unknown bits. The previous smin/smax bounds are
- * invalid here because of jmp32 compare so mark them unknown
- * so they do not impact tnum bounds calculation.
- */
- __mark_reg64_unbounded(reg);
- }
- reg_bounds_sync(reg);
-}
-
-static bool __reg64_bound_s32(s64 a)
-{
- return a >= S32_MIN && a <= S32_MAX;
-}
-
-static bool __reg64_bound_u32(u64 a)
-{
- return a >= U32_MIN && a <= U32_MAX;
-}
-
-static void __reg_combine_64_into_32(struct bpf_reg_state *reg)
-{
- __mark_reg32_unbounded(reg);
- if (__reg64_bound_s32(reg->smin_value) && __reg64_bound_s32(reg->smax_value)) {
- reg->s32_min_value = (s32)reg->smin_value;
- reg->s32_max_value = (s32)reg->smax_value;
- }
- if (__reg64_bound_u32(reg->umin_value) && __reg64_bound_u32(reg->umax_value)) {
- reg->u32_min_value = (u32)reg->umin_value;
- reg->u32_max_value = (u32)reg->umax_value;
- }
- reg_bounds_sync(reg);
-}
-
/* Mark a register as having a completely unknown (scalar) value. */
static void __mark_reg_unknown(const struct bpf_verifier_env *env,
struct bpf_reg_state *reg)
@@ -2346,6 +2350,11 @@ static void init_reg_state(struct bpf_verifier_env *env,
regs[BPF_REG_FP].frameno = state->frameno;
}
+static struct bpf_retval_range retval_range(s32 minval, s32 maxval)
+{
+ return (struct bpf_retval_range){ minval, maxval };
+}
+
#define BPF_MAIN_FUNC (-1)
static void init_func_state(struct bpf_verifier_env *env,
struct bpf_func_state *state,
@@ -2354,7 +2363,7 @@ static void init_func_state(struct bpf_verifier_env *env,
state->callsite = callsite;
state->frameno = frameno;
state->subprogno = subprogno;
- state->callback_ret_range = tnum_range(0, 0);
+ state->callback_ret_range = retval_range(0, 0);
init_reg_state(env, state);
mark_verifier_state_scratched(env);
}
@@ -2454,6 +2463,68 @@ static int add_subprog(struct bpf_verifier_env *env, int off)
return env->subprog_cnt - 1;
}
+static int bpf_find_exception_callback_insn_off(struct bpf_verifier_env *env)
+{
+ struct bpf_prog_aux *aux = env->prog->aux;
+ struct btf *btf = aux->btf;
+ const struct btf_type *t;
+ u32 main_btf_id, id;
+ const char *name;
+ int ret, i;
+
+ /* Non-zero func_info_cnt implies valid btf */
+ if (!aux->func_info_cnt)
+ return 0;
+ main_btf_id = aux->func_info[0].type_id;
+
+ t = btf_type_by_id(btf, main_btf_id);
+ if (!t) {
+ verbose(env, "invalid btf id for main subprog in func_info\n");
+ return -EINVAL;
+ }
+
+ name = btf_find_decl_tag_value(btf, t, -1, "exception_callback:");
+ if (IS_ERR(name)) {
+ ret = PTR_ERR(name);
+ /* If there is no tag present, there is no exception callback */
+ if (ret == -ENOENT)
+ ret = 0;
+ else if (ret == -EEXIST)
+ verbose(env, "multiple exception callback tags for main subprog\n");
+ return ret;
+ }
+
+ ret = btf_find_by_name_kind(btf, name, BTF_KIND_FUNC);
+ if (ret < 0) {
+ verbose(env, "exception callback '%s' could not be found in BTF\n", name);
+ return ret;
+ }
+ id = ret;
+ t = btf_type_by_id(btf, id);
+ if (btf_func_linkage(t) != BTF_FUNC_GLOBAL) {
+ verbose(env, "exception callback '%s' must have global linkage\n", name);
+ return -EINVAL;
+ }
+ ret = 0;
+ for (i = 0; i < aux->func_info_cnt; i++) {
+ if (aux->func_info[i].type_id != id)
+ continue;
+ ret = aux->func_info[i].insn_off;
+ /* Further func_info and subprog checks will also happen
+ * later, so assume this is the right insn_off for now.
+ */
+ if (!ret) {
+ verbose(env, "invalid exception callback insn_off in func_info: 0\n");
+ ret = -EINVAL;
+ }
+ }
+ if (!ret) {
+ verbose(env, "exception callback type id not found in func_info\n");
+ ret = -EINVAL;
+ }
+ return ret;
+}
+
#define MAX_KFUNC_DESCS 256
#define MAX_KFUNC_BTFS 256
@@ -2793,8 +2864,8 @@ bpf_jit_find_kfunc_model(const struct bpf_prog *prog,
static int add_subprog_and_kfunc(struct bpf_verifier_env *env)
{
struct bpf_subprog_info *subprog = env->subprog_info;
+ int i, ret, insn_cnt = env->prog->len, ex_cb_insn;
struct bpf_insn *insn = env->prog->insnsi;
- int i, ret, insn_cnt = env->prog->len;
/* Add entry function. */
ret = add_subprog(env, 0);
@@ -2820,6 +2891,27 @@ static int add_subprog_and_kfunc(struct bpf_verifier_env *env)
return ret;
}
+ ret = bpf_find_exception_callback_insn_off(env);
+ if (ret < 0)
+ return ret;
+ ex_cb_insn = ret;
+
+ /* If ex_cb_insn > 0, this means that the main program has a subprog
+ * marked using BTF decl tag to serve as the exception callback.
+ */
+ if (ex_cb_insn) {
+ ret = add_subprog(env, ex_cb_insn);
+ if (ret < 0)
+ return ret;
+ for (i = 1; i < env->subprog_cnt; i++) {
+ if (env->subprog_info[i].start != ex_cb_insn)
+ continue;
+ env->exception_callback_subprog = i;
+ mark_subprog_exc_cb(env, i);
+ break;
+ }
+ }
+
/* Add a fake 'exit' subprog which could simplify subprog iteration
* logic. 'subprog_cnt' should not be increased.
*/
@@ -2868,7 +2960,7 @@ next:
if (i == subprog_end - 1) {
/* to avoid fall-through from one subprog into another
* the last insn of the subprog should be either exit
- * or unconditional jump back
+ * or unconditional jump back or bpf_throw call
*/
if (code != (BPF_JMP | BPF_EXIT) &&
code != (BPF_JMP32 | BPF_JA) &&
@@ -3029,7 +3121,7 @@ static bool is_reg64(struct bpf_verifier_env *env, struct bpf_insn *insn,
if (class == BPF_LDX) {
if (t != SRC_OP)
- return BPF_SIZE(code) == BPF_DW;
+ return BPF_SIZE(code) == BPF_DW || BPF_MODE(code) == BPF_MEMSX;
/* LDX source must be ptr. */
return true;
}
@@ -3118,13 +3210,11 @@ static void mark_insn_zext(struct bpf_verifier_env *env,
reg->subreg_def = DEF_NOT_SUBREG;
}
-static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
- enum reg_arg_type t)
+static int __check_reg_arg(struct bpf_verifier_env *env, struct bpf_reg_state *regs, u32 regno,
+ enum reg_arg_type t)
{
- struct bpf_verifier_state *vstate = env->cur_state;
- struct bpf_func_state *state = vstate->frame[vstate->curframe];
struct bpf_insn *insn = env->prog->insnsi + env->insn_idx;
- struct bpf_reg_state *reg, *regs = state->regs;
+ struct bpf_reg_state *reg;
bool rw64;
if (regno >= MAX_BPF_REG) {
@@ -3165,6 +3255,30 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
return 0;
}
+static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
+ enum reg_arg_type t)
+{
+ struct bpf_verifier_state *vstate = env->cur_state;
+ struct bpf_func_state *state = vstate->frame[vstate->curframe];
+
+ return __check_reg_arg(env, state->regs, regno, t);
+}
+
+static int insn_stack_access_flags(int frameno, int spi)
+{
+ return INSN_F_STACK_ACCESS | (spi << INSN_F_SPI_SHIFT) | frameno;
+}
+
+static int insn_stack_access_spi(int insn_flags)
+{
+ return (insn_flags >> INSN_F_SPI_SHIFT) & INSN_F_SPI_MASK;
+}
+
+static int insn_stack_access_frameno(int insn_flags)
+{
+ return insn_flags & INSN_F_FRAMENO_MASK;
+}
+
static void mark_jmp_point(struct bpf_verifier_env *env, int idx)
{
env->insn_aux_data[idx].jmp_point = true;
@@ -3176,36 +3290,76 @@ static bool is_jmp_point(struct bpf_verifier_env *env, int insn_idx)
}
/* for any branch, call, exit record the history of jmps in the given state */
-static int push_jmp_history(struct bpf_verifier_env *env,
- struct bpf_verifier_state *cur)
+static int push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_state *cur,
+ int insn_flags)
{
u32 cnt = cur->jmp_history_cnt;
- struct bpf_idx_pair *p;
+ struct bpf_jmp_history_entry *p;
size_t alloc_size;
- if (!is_jmp_point(env, env->insn_idx))
+ /* combine instruction flags if we already recorded this instruction */
+ if (env->cur_hist_ent) {
+ /* atomic instructions push insn_flags twice, for READ and
+ * WRITE sides, but they should agree on stack slot
+ */
+ WARN_ONCE((env->cur_hist_ent->flags & insn_flags) &&
+ (env->cur_hist_ent->flags & insn_flags) != insn_flags,
+ "verifier insn history bug: insn_idx %d cur flags %x new flags %x\n",
+ env->insn_idx, env->cur_hist_ent->flags, insn_flags);
+ env->cur_hist_ent->flags |= insn_flags;
return 0;
+ }
cnt++;
alloc_size = kmalloc_size_roundup(size_mul(cnt, sizeof(*p)));
p = krealloc(cur->jmp_history, alloc_size, GFP_USER);
if (!p)
return -ENOMEM;
- p[cnt - 1].idx = env->insn_idx;
- p[cnt - 1].prev_idx = env->prev_insn_idx;
cur->jmp_history = p;
+
+ p = &cur->jmp_history[cnt - 1];
+ p->idx = env->insn_idx;
+ p->prev_idx = env->prev_insn_idx;
+ p->flags = insn_flags;
cur->jmp_history_cnt = cnt;
+ env->cur_hist_ent = p;
+
return 0;
}
+static struct bpf_jmp_history_entry *get_jmp_hist_entry(struct bpf_verifier_state *st,
+ u32 hist_end, int insn_idx)
+{
+ if (hist_end > 0 && st->jmp_history[hist_end - 1].idx == insn_idx)
+ return &st->jmp_history[hist_end - 1];
+ return NULL;
+}
+
/* Backtrack one insn at a time. If idx is not at the top of recorded
* history then previous instruction came from straight line execution.
+ * Return -ENOENT if we exhausted all instructions within given state.
+ *
+ * It's legal to have a bit of looping with the same starting and ending
+ * insn index within the same state, e.g.: 3->4->5->3, so just because the
+ * current instruction index is the same as the state's first_idx doesn't mean
+ * we are done. If there is still some jump history left, we should keep going.
+ * We need to take into account that we might have a jump history between the
+ * given state's parent and itself, due to checkpointing. In this case, we'll
+ * have a history entry recording a jump from the last instruction of the
+ * parent state to the first instruction of the given state.
*/
static int get_prev_insn_idx(struct bpf_verifier_state *st, int i,
u32 *history)
{
u32 cnt = *history;
+ if (i == st->first_insn_idx) {
+ if (cnt == 0)
+ return -ENOENT;
+ if (cnt == 1 && st->jmp_history[0].idx == i)
+ return -ENOENT;
+ }
+
if (cnt && st->jmp_history[cnt - 1].idx == i) {
i = st->jmp_history[cnt - 1].prev_idx;
(*history)--;
@@ -3307,16 +3461,6 @@ static inline void bt_clear_frame_slot(struct backtrack_state *bt, u32 frame, u3
bt->stack_masks[frame] &= ~(1ull << slot);
}
-static inline void bt_set_slot(struct backtrack_state *bt, u32 slot)
-{
- bt_set_frame_slot(bt, bt->frame, slot);
-}
-
-static inline void bt_clear_slot(struct backtrack_state *bt, u32 slot)
-{
- bt_clear_frame_slot(bt, bt->frame, slot);
-}
-
static inline u32 bt_frame_reg_mask(struct backtrack_state *bt, u32 frame)
{
return bt->reg_masks[frame];
@@ -3342,9 +3486,9 @@ static inline bool bt_is_reg_set(struct backtrack_state *bt, u32 reg)
return bt->reg_masks[bt->frame] & (1 << reg);
}
-static inline bool bt_is_slot_set(struct backtrack_state *bt, u32 slot)
+static inline bool bt_is_frame_slot_set(struct backtrack_state *bt, u32 frame, u32 slot)
{
- return bt->stack_masks[bt->frame] & (1ull << slot);
+ return bt->stack_masks[frame] & (1ull << slot);
}
/* format registers bitmask, e.g., "r0,r2,r4" for 0x15 mask */
@@ -3386,6 +3530,8 @@ static void fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask)
}
}
+static bool calls_callback(struct bpf_verifier_env *env, int insn_idx);
+
/* For given verifier state backtrack_insn() is called from the last insn to
* the first insn. Its purpose is to compute a bitmask of registers and
* stack slots that needs precision in the parent verifier state.
@@ -3396,7 +3542,7 @@ static void fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask)
* - *was* processed previously during backtracking.
*/
static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
- struct backtrack_state *bt)
+ struct bpf_jmp_history_entry *hist, struct backtrack_state *bt)
{
const struct bpf_insn_cbs cbs = {
.cb_call = disasm_kfunc_name,
@@ -3409,7 +3555,7 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
u8 mode = BPF_MODE(insn->code);
u32 dreg = insn->dst_reg;
u32 sreg = insn->src_reg;
- u32 spi, i;
+ u32 spi, i, fr;
if (insn->code == 0)
return 0;
@@ -3426,7 +3572,12 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
if (class == BPF_ALU || class == BPF_ALU64) {
if (!bt_is_reg_set(bt, dreg))
return 0;
- if (opcode == BPF_MOV) {
+ if (opcode == BPF_END || opcode == BPF_NEG) {
+ /* sreg is reserved and unused;
+ * dreg still needs precision before this insn
+ */
+ return 0;
+ } else if (opcode == BPF_MOV) {
if (BPF_SRC(insn->code) == BPF_X) {
/* dreg = sreg or dreg = (s8, s16, s32)sreg
* dreg needs precision after this insn
@@ -3465,20 +3616,15 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
* by 'precise' mark in corresponding register of this state.
* No further tracking necessary.
*/
- if (insn->src_reg != BPF_REG_FP)
+ if (!hist || !(hist->flags & INSN_F_STACK_ACCESS))
return 0;
-
/* dreg = *(u64 *)[fp - off] was a fill from the stack.
* that [fp - off] slot contains scalar that needs to be
* tracked with precision
*/
- spi = (-insn->off - 1) / BPF_REG_SIZE;
- if (spi >= 64) {
- verbose(env, "BUG spi %d\n", spi);
- WARN_ONCE(1, "verifier backtracking bug");
- return -EFAULT;
- }
- bt_set_slot(bt, spi);
+ spi = insn_stack_access_spi(hist->flags);
+ fr = insn_stack_access_frameno(hist->flags);
+ bt_set_frame_slot(bt, fr, spi);
} else if (class == BPF_STX || class == BPF_ST) {
if (bt_is_reg_set(bt, dreg))
/* stx & st shouldn't be using _scalar_ dst_reg
@@ -3487,17 +3633,13 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
*/
return -ENOTSUPP;
/* scalars can only be spilled into stack */
- if (insn->dst_reg != BPF_REG_FP)
+ if (!hist || !(hist->flags & INSN_F_STACK_ACCESS))
return 0;
- spi = (-insn->off - 1) / BPF_REG_SIZE;
- if (spi >= 64) {
- verbose(env, "BUG spi %d\n", spi);
- WARN_ONCE(1, "verifier backtracking bug");
- return -EFAULT;
- }
- if (!bt_is_slot_set(bt, spi))
+ spi = insn_stack_access_spi(hist->flags);
+ fr = insn_stack_access_frameno(hist->flags);
+ if (!bt_is_frame_slot_set(bt, fr, spi))
return 0;
- bt_clear_slot(bt, spi);
+ bt_clear_frame_slot(bt, fr, spi);
if (class == BPF_STX)
bt_set_reg(bt, sreg);
} else if (class == BPF_JMP || class == BPF_JMP32) {
@@ -3541,10 +3683,14 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
WARN_ONCE(1, "verifier backtracking bug");
return -EFAULT;
}
- /* we don't track register spills perfectly,
- * so fallback to force-precise instead of failing */
- if (bt_stack_mask(bt) != 0)
- return -ENOTSUPP;
+ /* we are now tracking register spills correctly,
+ * so any instance of leftover slots is a bug
+ */
+ if (bt_stack_mask(bt) != 0) {
+ verbose(env, "BUG stack slots %llx\n", bt_stack_mask(bt));
+ WARN_ONCE(1, "verifier backtracking bug (subprog leftover stack slots)");
+ return -EFAULT;
+ }
/* propagate r1-r5 to the caller */
for (i = BPF_REG_1; i <= BPF_REG_5; i++) {
if (bt_is_reg_set(bt, i)) {
@@ -3556,24 +3702,24 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
return -EFAULT;
return 0;
}
- } else if ((bpf_helper_call(insn) &&
- is_callback_calling_function(insn->imm) &&
- !is_async_callback_calling_function(insn->imm)) ||
- (bpf_pseudo_kfunc_call(insn) && is_callback_calling_kfunc(insn->imm))) {
- /* callback-calling helper or kfunc call, which means
- * we are exiting from subprog, but unlike the subprog
- * call handling above, we shouldn't propagate
- * precision of r1-r5 (if any requested), as they are
- * not actually arguments passed directly to callback
- * subprogs
+ } else if (is_sync_callback_calling_insn(insn) && idx != subseq_idx - 1) {
+ /* exit from callback subprog to callback-calling helper or
+ * kfunc call. Use idx/subseq_idx check to discern it from
+ * straight line code backtracking.
+ * Unlike the subprog call handling above, we shouldn't
+ * propagate precision of r1-r5 (if any requested), as they are
+ * not actually arguments passed directly to callback subprogs
*/
if (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) {
verbose(env, "BUG regs %x\n", bt_reg_mask(bt));
WARN_ONCE(1, "verifier backtracking bug");
return -EFAULT;
}
- if (bt_stack_mask(bt) != 0)
- return -ENOTSUPP;
+ if (bt_stack_mask(bt) != 0) {
+ verbose(env, "BUG stack slots %llx\n", bt_stack_mask(bt));
+ WARN_ONCE(1, "verifier backtracking bug (callback leftover stack slots)");
+ return -EFAULT;
+ }
/* clear r1-r5 in callback subprog's mask */
for (i = BPF_REG_1; i <= BPF_REG_5; i++)
bt_clear_reg(bt, i);
@@ -3600,10 +3746,18 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
} else if (opcode == BPF_EXIT) {
bool r0_precise;
+ /* Backtracking to a nested function call: 'idx' is a part of
+ * the inner frame, 'subseq_idx' is a part of the outer frame.
+ * In case of a regular function call, instructions giving
+ * precision to registers R1-R5 should have been found already.
+ * In case of a callback, it is ok to have R1-R5 marked for
+ * backtracking, as these registers are set by the function
+ * invoking the callback.
+ */
+ if (subseq_idx >= 0 && calls_callback(env, subseq_idx))
+ for (i = BPF_REG_1; i <= BPF_REG_5; i++)
+ bt_clear_reg(bt, i);
if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) {
- /* if backtracing was looking for registers R1-R5
- * they should have been found already.
- */
verbose(env, "BUG regs %x\n", bt_reg_mask(bt));
WARN_ONCE(1, "verifier backtracking bug");
return -EFAULT;
@@ -4002,6 +4156,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno)
for (;;) {
DECLARE_BITMAP(mask, 64);
u32 history = st->jmp_history_cnt;
+ struct bpf_jmp_history_entry *hist;
if (env->log.level & BPF_LOG_LEVEL2) {
verbose(env, "mark_precise: frame%d: last_idx %d first_idx %d subseq_idx %d \n",
@@ -4047,11 +4202,9 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno)
bitmap_from_u64(mask, bt_reg_mask(bt));
for_each_set_bit(i, mask, 32) {
reg = &st->frame[0]->regs[i];
- if (reg->type != SCALAR_VALUE) {
- bt_clear_reg(bt, i);
- continue;
- }
- reg->precise = true;
+ bt_clear_reg(bt, i);
+ if (reg->type == SCALAR_VALUE)
+ reg->precise = true;
}
return 0;
}
@@ -4067,7 +4220,8 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno)
err = 0;
skip_first = false;
} else {
- err = backtrack_insn(env, i, subseq_idx, bt);
+ hist = get_jmp_hist_entry(st, history, i);
+ err = backtrack_insn(env, i, subseq_idx, hist, bt);
}
if (err == -ENOTSUPP) {
mark_all_scalars_precise(env, env->cur_state);
@@ -4082,10 +4236,10 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno)
* Nothing to be tracked further in the parent state.
*/
return 0;
- if (i == first_idx)
- break;
subseq_idx = i;
i = get_prev_insn_idx(st, i, &history);
+ if (i == -ENOENT)
+ break;
if (i >= env->prog->len) {
/* This can happen if backtracking reached insn 0
* and there are still reg_mask or stack_mask
@@ -4120,22 +4274,10 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno)
bitmap_from_u64(mask, bt_frame_stack_mask(bt, fr));
for_each_set_bit(i, mask, 64) {
if (i >= func->allocated_stack / BPF_REG_SIZE) {
- /* the sequence of instructions:
- * 2: (bf) r3 = r10
- * 3: (7b) *(u64 *)(r3 -8) = r0
- * 4: (79) r4 = *(u64 *)(r10 -8)
- * doesn't contain jmps. It's backtracked
- * as a single block.
- * During backtracking insn 3 is not recognized as
- * stack access, so at the end of backtracking
- * stack slot fp-8 is still marked in stack_mask.
- * However the parent state may not have accessed
- * fp-8 and it's "unallocated" stack space.
- * In such case fallback to conservative.
- */
- mark_all_scalars_precise(env, env->cur_state);
- bt_reset(bt);
- return 0;
+ verbose(env, "BUG backtracking (stack slot %d, total slots %d)\n",
+ i, func->allocated_stack / BPF_REG_SIZE);
+ WARN_ONCE(1, "verifier backtracking bug (stack slot out of bounds)");
+ return -EFAULT;
}
if (!is_spilled_scalar_reg(&func->stack[i])) {
@@ -4225,9 +4367,17 @@ static bool register_is_null(struct bpf_reg_state *reg)
return reg->type == SCALAR_VALUE && tnum_equals_const(reg->var_off, 0);
}
-static bool register_is_const(struct bpf_reg_state *reg)
+/* check if register is a constant scalar value */
+static bool is_reg_const(struct bpf_reg_state *reg, bool subreg32)
+{
+ return reg->type == SCALAR_VALUE &&
+ tnum_is_const(subreg32 ? tnum_subreg(reg->var_off) : reg->var_off);
+}
+
+/* assuming is_reg_const() is true, return constant value of a register */
+static u64 reg_const_value(struct bpf_reg_state *reg, bool subreg32)
{
- return reg->type == SCALAR_VALUE && tnum_is_const(reg->var_off);
+ return subreg32 ? tnum_subreg(reg->var_off).value : reg->var_off.value;
}
static bool __is_scalar_unbounded(struct bpf_reg_state *reg)
@@ -4264,7 +4414,8 @@ static void copy_register_state(struct bpf_reg_state *dst, const struct bpf_reg_
dst->live = live;
}
-static void save_register_state(struct bpf_func_state *state,
+static void save_register_state(struct bpf_verifier_env *env,
+ struct bpf_func_state *state,
int spi, struct bpf_reg_state *reg,
int size)
{
@@ -4279,7 +4430,7 @@ static void save_register_state(struct bpf_func_state *state,
/* size < 8 bytes spill */
for (; i; i--)
- scrub_spilled_slot(&state->stack[spi].slot_type[i - 1]);
+ mark_stack_slot_misc(env, &state->stack[spi].slot_type[i - 1]);
}
static bool is_bpf_st_mem(struct bpf_insn *insn)
@@ -4300,16 +4451,13 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err;
struct bpf_insn *insn = &env->prog->insnsi[insn_idx];
struct bpf_reg_state *reg = NULL;
- u32 dst_reg = insn->dst_reg;
+ int insn_flags = insn_stack_access_flags(state->frameno, spi);
- err = grow_stack_state(state, round_up(slot + 1, BPF_REG_SIZE));
- if (err)
- return err;
/* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0,
* so it's aligned access and [off, off + size) are within stack limits
*/
if (!env->allow_ptr_leaks &&
- state->stack[spi].slot_type[0] == STACK_SPILL &&
+ is_spilled_reg(&state->stack[spi]) &&
size != BPF_REG_SIZE) {
verbose(env, "attempt to corrupt spilled pointer on stack\n");
return -EACCES;
@@ -4339,20 +4487,8 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
return err;
mark_stack_slot_scratched(env, spi);
- if (reg && !(off % BPF_REG_SIZE) && register_is_bounded(reg) &&
- !register_is_null(reg) && env->bpf_capable) {
- if (dst_reg != BPF_REG_FP) {
- /* The backtracking logic can only recognize explicit
- * stack slot address like [fp - 8]. Other spill of
- * scalar via different register has to be conservative.
- * Backtrack from here and mark all registers as precise
- * that contributed into 'reg' being a constant.
- */
- err = mark_chain_precision(env, value_regno);
- if (err)
- return err;
- }
- save_register_state(state, spi, reg, size);
+ if (reg && !(off % BPF_REG_SIZE) && register_is_bounded(reg) && env->bpf_capable) {
+ save_register_state(env, state, spi, reg, size);
/* Break the relation on a narrowing spill. */
if (fls64(reg->umax_value) > BITS_PER_BYTE * size)
state->stack[spi].spilled_ptr.id = 0;
@@ -4360,9 +4496,9 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
insn->imm != 0 && env->bpf_capable) {
struct bpf_reg_state fake_reg = {};
- __mark_reg_known(&fake_reg, (u32)insn->imm);
+ __mark_reg_known(&fake_reg, insn->imm);
fake_reg.type = SCALAR_VALUE;
- save_register_state(state, spi, &fake_reg, size);
+ save_register_state(env, state, spi, &fake_reg, size);
} else if (reg && is_spillable_regtype(reg->type)) {
/* register containing pointer is being spilled into stack */
if (size != BPF_REG_SIZE) {
@@ -4374,7 +4510,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
verbose(env, "cannot spill pointers to stack into stack frame of the caller\n");
return -EINVAL;
}
- save_register_state(state, spi, reg, size);
+ save_register_state(env, state, spi, reg, size);
} else {
u8 type = STACK_MISC;
@@ -4399,7 +4535,12 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
/* when we zero initialize stack slots mark them as such */
if ((reg && register_is_null(reg)) ||
(!reg && is_bpf_st_mem(insn) && insn->imm == 0)) {
- /* backtracking doesn't work for STACK_ZERO yet. */
+ /* STACK_ZERO case happened because register spill
+ * wasn't properly aligned at the stack slot boundary,
+ * so it's not a register spill anymore; force
+ * originating register to be precise to make
+ * STACK_ZERO correct for subsequent states
+ */
err = mark_chain_precision(env, value_regno);
if (err)
return err;
@@ -4408,9 +4549,12 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
/* Mark slots affected by this stack write. */
for (i = 0; i < size; i++)
- state->stack[spi].slot_type[(slot - i) % BPF_REG_SIZE] =
- type;
+ state->stack[spi].slot_type[(slot - i) % BPF_REG_SIZE] = type;
+ insn_flags = 0; /* not a register spill */
}
+
+ if (insn_flags)
+ return push_jmp_history(env, env->cur_state, insn_flags);
return 0;
}
@@ -4460,10 +4604,6 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env,
(!value_reg && is_bpf_st_mem(insn) && insn->imm == 0))
writing_zero = true;
- err = grow_stack_state(state, round_up(-min_off, BPF_REG_SIZE));
- if (err)
- return err;
-
for (i = min_off; i < max_off; i++) {
int spi;
@@ -4562,21 +4702,10 @@ static void mark_reg_stack_read(struct bpf_verifier_env *env,
zeros++;
}
if (zeros == max_off - min_off) {
- /* any access_size read into register is zero extended,
- * so the whole register == const_zero
+ /* Any access_size read into register is zero extended,
+ * so the whole register == const_zero.
*/
- __mark_reg_const_zero(&state->regs[dst_regno]);
- /* backtracking doesn't support STACK_ZERO yet,
- * so mark it precise here, so that later
- * backtracking can stop here.
- * Backtracking may not need this if this register
- * doesn't participate in pointer adjustment.
- * Forward propagation of precise flag is not
- * necessary either. This mark is only to stop
- * backtracking. Any register that contributed
- * to const 0 was marked precise before spill.
- */
- state->regs[dst_regno].precise = true;
+ __mark_reg_const_zero(env, &state->regs[dst_regno]);
} else {
/* have read misc data from the stack */
mark_reg_unknown(env, state->regs, dst_regno);
@@ -4603,6 +4732,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
int i, slot = -off - 1, spi = slot / BPF_REG_SIZE;
struct bpf_reg_state *reg;
u8 *stype, type;
+ int insn_flags = insn_stack_access_flags(reg_state->frameno, spi);
stype = reg_state->stack[spi].slot_type;
reg = &reg_state->stack[spi].spilled_ptr;
@@ -4635,25 +4765,42 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
copy_register_state(&state->regs[dst_regno], reg);
state->regs[dst_regno].subreg_def = subreg_def;
} else {
+ int spill_cnt = 0, zero_cnt = 0;
+
for (i = 0; i < size; i++) {
type = stype[(slot - i) % BPF_REG_SIZE];
- if (type == STACK_SPILL)
+ if (type == STACK_SPILL) {
+ spill_cnt++;
continue;
+ }
if (type == STACK_MISC)
continue;
+ if (type == STACK_ZERO) {
+ zero_cnt++;
+ continue;
+ }
if (type == STACK_INVALID && env->allow_uninit_stack)
continue;
verbose(env, "invalid read from stack off %d+%d size %d\n",
off, i, size);
return -EACCES;
}
- mark_reg_unknown(env, state->regs, dst_regno);
+
+ if (spill_cnt == size &&
+ tnum_is_const(reg->var_off) && reg->var_off.value == 0) {
+ __mark_reg_const_zero(env, &state->regs[dst_regno]);
+ /* this IS register fill, so keep insn_flags */
+ } else if (zero_cnt == size) {
+ /* similarly to mark_reg_stack_read(), preserve zeroes */
+ __mark_reg_const_zero(env, &state->regs[dst_regno]);
+ insn_flags = 0; /* not restoring original register state */
+ } else {
+ mark_reg_unknown(env, state->regs, dst_regno);
+ insn_flags = 0; /* not restoring original register state */
+ }
}
state->regs[dst_regno].live |= REG_LIVE_WRITTEN;
- return 0;
- }
-
- if (dst_regno >= 0) {
+ } else if (dst_regno >= 0) {
/* restore register state from stack */
copy_register_state(&state->regs[dst_regno], reg);
/* mark reg as written since spilled pointer state likely
@@ -4689,7 +4836,10 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64);
if (dst_regno >= 0)
mark_reg_stack_read(env, reg_state, off, off + size, dst_regno);
+ insn_flags = 0; /* we are not restoring spilled register */
}
+ if (insn_flags)
+ return push_jmp_history(env, env->cur_state, insn_flags);
return 0;
}
@@ -4979,8 +5129,8 @@ static int __check_ptr_off_reg(struct bpf_verifier_env *env,
return 0;
}
-int check_ptr_off_reg(struct bpf_verifier_env *env,
- const struct bpf_reg_state *reg, int regno)
+static int check_ptr_off_reg(struct bpf_verifier_env *env,
+ const struct bpf_reg_state *reg, int regno)
{
return __check_ptr_off_reg(env, reg, regno, false);
}
@@ -5001,6 +5151,8 @@ static int map_kptr_match_type(struct bpf_verifier_env *env,
perm_flags |= PTR_UNTRUSTED;
} else {
perm_flags = PTR_MAYBE_NULL | MEM_ALLOC;
+ if (kptr_field->type == BPF_KPTR_PERCPU)
+ perm_flags |= MEM_PERCPU;
}
if (base_type(reg->type) != PTR_TO_BTF_ID || (type_flag(reg->type) & ~perm_flags))
@@ -5044,7 +5196,7 @@ static int map_kptr_match_type(struct bpf_verifier_env *env,
*/
if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off,
kptr_field->kptr.btf, kptr_field->kptr.btf_id,
- kptr_field->type == BPF_KPTR_REF))
+ kptr_field->type != BPF_KPTR_UNREF))
goto bad_type;
return 0;
bad_type:
@@ -5072,7 +5224,9 @@ static bool in_rcu_cs(struct bpf_verifier_env *env)
/* Once GCC supports btf_type_tag the following mechanism will be replaced with tag check */
BTF_SET_START(rcu_protected_types)
BTF_ID(struct, prog_test_ref_kfunc)
+#ifdef CONFIG_CGROUPS
BTF_ID(struct, cgroup)
+#endif
BTF_ID(struct, bpf_cpumask)
BTF_ID(struct, task_struct)
BTF_SET_END(rcu_protected_types)
@@ -5080,15 +5234,52 @@ BTF_SET_END(rcu_protected_types)
static bool rcu_protected_object(const struct btf *btf, u32 btf_id)
{
if (!btf_is_kernel(btf))
- return false;
+ return true;
return btf_id_set_contains(&rcu_protected_types, btf_id);
}
+static struct btf_record *kptr_pointee_btf_record(struct btf_field *kptr_field)
+{
+ struct btf_struct_meta *meta;
+
+ if (btf_is_kernel(kptr_field->kptr.btf))
+ return NULL;
+
+ meta = btf_find_struct_meta(kptr_field->kptr.btf,
+ kptr_field->kptr.btf_id);
+
+ return meta ? meta->record : NULL;
+}
+
static bool rcu_safe_kptr(const struct btf_field *field)
{
const struct btf_field_kptr *kptr = &field->kptr;
- return field->type == BPF_KPTR_REF && rcu_protected_object(kptr->btf, kptr->btf_id);
+ return field->type == BPF_KPTR_PERCPU ||
+ (field->type == BPF_KPTR_REF && rcu_protected_object(kptr->btf, kptr->btf_id));
+}
+
+static u32 btf_ld_kptr_type(struct bpf_verifier_env *env, struct btf_field *kptr_field)
+{
+ struct btf_record *rec;
+ u32 ret;
+
+ ret = PTR_MAYBE_NULL;
+ if (rcu_safe_kptr(kptr_field) && in_rcu_cs(env)) {
+ ret |= MEM_RCU;
+ if (kptr_field->type == BPF_KPTR_PERCPU)
+ ret |= MEM_PERCPU;
+ else if (!btf_is_kernel(kptr_field->kptr.btf))
+ ret |= MEM_ALLOC;
+
+ rec = kptr_pointee_btf_record(kptr_field);
+ if (rec && btf_record_has_field(rec, BPF_GRAPH_NODE))
+ ret |= NON_OWN_REF;
+ } else {
+ ret |= PTR_UNTRUSTED;
+ }
+
+ return ret;
}
static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
@@ -5114,7 +5305,8 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
/* We only allow loading referenced kptr, since it will be marked as
* untrusted, similar to unreferenced kptr.
*/
- if (class != BPF_LDX && kptr_field->type == BPF_KPTR_REF) {
+ if (class != BPF_LDX &&
+ (kptr_field->type == BPF_KPTR_REF || kptr_field->type == BPF_KPTR_PERCPU)) {
verbose(env, "store to referenced kptr disallowed\n");
return -EACCES;
}
@@ -5125,10 +5317,7 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
* value from map as PTR_TO_BTF_ID, with the correct type.
*/
mark_btf_ld_reg(env, cur_regs(env), value_regno, PTR_TO_BTF_ID, kptr_field->kptr.btf,
- kptr_field->kptr.btf_id,
- rcu_safe_kptr(kptr_field) && in_rcu_cs(env) ?
- PTR_MAYBE_NULL | MEM_RCU :
- PTR_MAYBE_NULL | PTR_UNTRUSTED);
+ kptr_field->kptr.btf_id, btf_ld_kptr_type(env, kptr_field));
/* For mark_ptr_or_null_reg */
val_reg->id = ++env->id_gen;
} else if (class == BPF_STX) {
@@ -5182,6 +5371,7 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
switch (field->type) {
case BPF_KPTR_UNREF:
case BPF_KPTR_REF:
+ case BPF_KPTR_PERCPU:
if (src != ACCESS_DIRECT) {
verbose(env, "kptr cannot be accessed indirectly by helper\n");
return -EACCES;
@@ -5578,20 +5768,6 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
strict);
}
-static int update_stack_depth(struct bpf_verifier_env *env,
- const struct bpf_func_state *func,
- int off)
-{
- u16 stack = env->subprog_info[func->subprogno].stack_depth;
-
- if (stack >= -off)
- return 0;
-
- /* update known max for given subprogram */
- env->subprog_info[func->subprogno].stack_depth = -off;
- return 0;
-}
-
/* starting from main bpf function walk all instructions of the function
* and recursively walk all callees that given function can call.
* Ignore jump and exit insns.
@@ -5649,6 +5825,27 @@ continue_func:
for (; i < subprog_end; i++) {
int next_insn, sidx;
+ if (bpf_pseudo_kfunc_call(insn + i) && !insn[i].off) {
+ bool err = false;
+
+ if (!is_bpf_throw_kfunc(insn + i))
+ continue;
+ if (subprog[idx].is_cb)
+ err = true;
+ for (int c = 0; c < frame && !err; c++) {
+ if (subprog[ret_prog[c]].is_cb) {
+ err = true;
+ break;
+ }
+ }
+ if (!err)
+ continue;
+ verbose(env,
+ "bpf_throw kfunc (insn %d) cannot be called from callback subprog %d\n",
+ i, idx);
+ return -EINVAL;
+ }
+
if (!bpf_pseudo_call(insn + i) && !bpf_pseudo_func(insn + i))
continue;
/* remember insn and function to return to */
@@ -5671,6 +5868,10 @@ continue_func:
/* async callbacks don't increase bpf prog stack size unless called directly */
if (!bpf_pseudo_call(insn + i))
continue;
+ if (subprog[sidx].is_exception_cb) {
+ verbose(env, "insn %d cannot call exception cb directly\n", i);
+ return -EINVAL;
+ }
}
i = next_insn;
idx = sidx;
@@ -5692,8 +5893,13 @@ continue_func:
* tail call counter throughout bpf2bpf calls combined with tailcalls
*/
if (tail_call_reachable)
- for (j = 0; j < frame; j++)
+ for (j = 0; j < frame; j++) {
+ if (subprog[ret_prog[j]].is_exception_cb) {
+ verbose(env, "cannot tail call within exception cb\n");
+ return -EINVAL;
+ }
subprog[ret_prog[j]].tail_call_reachable = true;
+ }
if (subprog[0].tail_call_reachable)
env->prog->aux->tail_call_reachable = true;
@@ -5833,9 +6039,10 @@ static void coerce_reg_to_size(struct bpf_reg_state *reg, int size)
* values are also truncated so we push 64-bit bounds into
* 32-bit bounds. Above were truncated < 32-bits already.
*/
- if (size >= 4)
- return;
- __reg_combine_64_into_32(reg);
+ if (size < 4) {
+ __mark_reg32_unbounded(reg);
+ reg_bounds_sync(reg);
+ }
}
static void set_sext64_default_val(struct bpf_reg_state *reg, int size)
@@ -6209,7 +6416,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
}
if (type_is_alloc(reg->type) && !type_is_non_owning_ref(reg->type) &&
- !reg->ref_obj_id) {
+ !(reg->type & MEM_RCU) && !reg->ref_obj_id) {
verbose(env, "verifier internal error: ref_obj_id for allocated object must be non-zero\n");
return -EFAULT;
}
@@ -6350,13 +6557,14 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env,
* The minimum valid offset is -MAX_BPF_STACK for writes, and
* -state->allocated_stack for reads.
*/
-static int check_stack_slot_within_bounds(int off,
- struct bpf_func_state *state,
- enum bpf_access_type t)
+static int check_stack_slot_within_bounds(struct bpf_verifier_env *env,
+ s64 off,
+ struct bpf_func_state *state,
+ enum bpf_access_type t)
{
int min_valid_off;
- if (t == BPF_WRITE)
+ if (t == BPF_WRITE || env->allow_uninit_stack)
min_valid_off = -MAX_BPF_STACK;
else
min_valid_off = -state->allocated_stack;
@@ -6379,7 +6587,7 @@ static int check_stack_access_within_bounds(
struct bpf_reg_state *regs = cur_regs(env);
struct bpf_reg_state *reg = regs + regno;
struct bpf_func_state *state = func(env, reg);
- int min_off, max_off;
+ s64 min_off, max_off;
int err;
char *err_extra;
@@ -6392,11 +6600,8 @@ static int check_stack_access_within_bounds(
err_extra = " write to";
if (tnum_is_const(reg->var_off)) {
- min_off = reg->var_off.value + off;
- if (access_size > 0)
- max_off = min_off + access_size - 1;
- else
- max_off = min_off;
+ min_off = (s64)reg->var_off.value + off;
+ max_off = min_off + access_size;
} else {
if (reg->smax_value >= BPF_MAX_VAR_OFF ||
reg->smin_value <= -BPF_MAX_VAR_OFF) {
@@ -6405,15 +6610,12 @@ static int check_stack_access_within_bounds(
return -EACCES;
}
min_off = reg->smin_value + off;
- if (access_size > 0)
- max_off = reg->smax_value + off + access_size - 1;
- else
- max_off = min_off;
+ max_off = reg->smax_value + off + access_size;
}
- err = check_stack_slot_within_bounds(min_off, state, type);
- if (!err)
- err = check_stack_slot_within_bounds(max_off, state, type);
+ err = check_stack_slot_within_bounds(env, min_off, state, type);
+ if (!err && max_off > 0)
+ err = -EINVAL; /* out of stack access into non-negative offsets */
if (err) {
if (tnum_is_const(reg->var_off)) {
@@ -6423,11 +6625,16 @@ static int check_stack_access_within_bounds(
char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose(env, "invalid variable-offset%s stack R%d var_off=%s size=%d\n",
- err_extra, regno, tn_buf, access_size);
+ verbose(env, "invalid variable-offset%s stack R%d var_off=%s off=%d size=%d\n",
+ err_extra, regno, tn_buf, off, access_size);
}
+ return err;
}
- return err;
+
+ /* Note that there is no stack access with offset zero, so the needed stack
+ * size is -min_off, not -min_off+1.
+ */
+ return grow_stack_state(env, state, -min_off /* size */);
}
/* check whether memory at (regno + off) is accessible for t = (read | write)
@@ -6442,7 +6649,6 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
{
struct bpf_reg_state *regs = cur_regs(env);
struct bpf_reg_state *reg = regs + regno;
- struct bpf_func_state *state;
int size, err = 0;
size = bpf_size_to_bytes(bpf_size);
@@ -6585,11 +6791,6 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
if (err)
return err;
- state = func(env, reg);
- err = update_stack_depth(env, state, off);
- if (err)
- return err;
-
if (t == BPF_READ)
err = check_stack_read(env, regno, off, size,
value_regno);
@@ -6778,13 +6979,13 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i
BPF_SIZE(insn->code), BPF_WRITE, -1, true, false);
if (err)
return err;
-
return 0;
}
/* When register 'regno' is used to read the stack (either directly or through
* a helper function) make sure that it's within stack boundary and, depending
- * on the access type, that all elements of the stack are initialized.
+ * on the access type and privileges, that all elements of the stack are
+ * initialized.
*
* 'off' includes 'regno->off', but not its dynamic part (if any).
*
@@ -6892,8 +7093,11 @@ static int check_stack_range_initialized(
slot = -i - 1;
spi = slot / BPF_REG_SIZE;
- if (state->allocated_stack <= slot)
- goto err;
+ if (state->allocated_stack <= slot) {
+ verbose(env, "verifier bug: allocated_stack too small");
+ return -EFAULT;
+ }
+
stype = &state->stack[spi].slot_type[slot % BPF_REG_SIZE];
if (*stype == STACK_MISC)
goto mark;
@@ -6917,7 +7121,6 @@ static int check_stack_range_initialized(
goto mark;
}
-err:
if (tnum_is_const(reg->var_off)) {
verbose(env, "invalid%s read from stack R%d off %d+%d size %d\n",
err_extra, regno, min_off, i - min_off, access_size);
@@ -6942,7 +7145,7 @@ mark:
* helper may write to the entire memory range.
*/
}
- return update_stack_depth(env, state, min_off);
+ return 0;
}
static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
@@ -7038,6 +7241,12 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
}
}
+/* verify arguments to helpers or kfuncs consisting of a pointer and an access
+ * size.
+ *
+ * @regno is the register containing the access size. regno-1 is the register
+ * containing the pointer.
+ */
static int check_mem_size_reg(struct bpf_verifier_env *env,
struct bpf_reg_state *reg, u32 regno,
bool zero_size_allowed,
@@ -7072,12 +7281,10 @@ static int check_mem_size_reg(struct bpf_verifier_env *env,
return -EACCES;
}
- if (reg->umin_value == 0) {
- err = check_helper_mem_access(env, regno - 1, 0,
- zero_size_allowed,
- meta);
- if (err)
- return err;
+ if (reg->umin_value == 0 && !zero_size_allowed) {
+ verbose(env, "R%d invalid zero-sized read: u64=[%lld,%lld]\n",
+ regno, reg->umin_value, reg->umax_value);
+ return -EACCES;
}
if (reg->umax_value >= BPF_MAX_VAR_SIZ) {
@@ -7093,8 +7300,8 @@ static int check_mem_size_reg(struct bpf_verifier_env *env,
return err;
}
-int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
- u32 regno, u32 mem_size)
+static int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+ u32 regno, u32 mem_size)
{
bool may_be_null = type_may_be_null(reg->type);
struct bpf_reg_state saved_reg;
@@ -7320,7 +7527,7 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno,
verbose(env, "off=%d doesn't point to kptr\n", kptr_off);
return -EACCES;
}
- if (kptr_field->type != BPF_KPTR_REF) {
+ if (kptr_field->type != BPF_KPTR_REF && kptr_field->type != BPF_KPTR_PERCPU) {
verbose(env, "off=%d kptr isn't referenced kptr\n", kptr_off);
return -EACCES;
}
@@ -7491,15 +7698,24 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id
return err;
}
- err = mark_stack_slots_iter(env, reg, insn_idx, meta->btf, btf_id, nr_slots);
+ err = mark_stack_slots_iter(env, meta, reg, insn_idx, meta->btf, btf_id, nr_slots);
if (err)
return err;
} else {
/* iter_next() or iter_destroy() expect initialized iter state*/
- if (!is_iter_reg_valid_init(env, reg, meta->btf, btf_id, nr_slots)) {
+ err = is_iter_reg_valid_init(env, reg, meta->btf, btf_id, nr_slots);
+ switch (err) {
+ case 0:
+ break;
+ case -EINVAL:
verbose(env, "expected an initialized iter_%s as arg #%d\n",
iter_type_str(meta->btf, btf_id), regno);
- return -EINVAL;
+ return err;
+ case -EPROTO:
+ verbose(env, "expected an RCU CS when using %s\n", meta->func_name);
+ return err;
+ default:
+ return err;
}
spi = iter_get_spi(env, reg, nr_slots);
@@ -7525,6 +7741,81 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id
return 0;
}
+/* Look for a previous loop entry at insn_idx: nearest parent state
+ * stopped at insn_idx with callsites matching those in cur->frame.
+ */
+static struct bpf_verifier_state *find_prev_entry(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *cur,
+ int insn_idx)
+{
+ struct bpf_verifier_state_list *sl;
+ struct bpf_verifier_state *st;
+
+ /* Explored states are pushed in stack order, most recent states come first */
+ sl = *explored_state(env, insn_idx);
+ for (; sl; sl = sl->next) {
+ /* If st->branches != 0 state is a part of current DFS verification path,
+ * hence cur & st form a loop.
+ */
+ st = &sl->state;
+ if (st->insn_idx == insn_idx && st->branches && same_callsites(st, cur) &&
+ st->dfs_depth < cur->dfs_depth)
+ return st;
+ }
+
+ return NULL;
+}
+
+static void reset_idmap_scratch(struct bpf_verifier_env *env);
+static bool regs_exact(const struct bpf_reg_state *rold,
+ const struct bpf_reg_state *rcur,
+ struct bpf_idmap *idmap);
+
+static void maybe_widen_reg(struct bpf_verifier_env *env,
+ struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
+ struct bpf_idmap *idmap)
+{
+ if (rold->type != SCALAR_VALUE)
+ return;
+ if (rold->type != rcur->type)
+ return;
+ if (rold->precise || rcur->precise || regs_exact(rold, rcur, idmap))
+ return;
+ __mark_reg_unknown(env, rcur);
+}
+
+static int widen_imprecise_scalars(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *old,
+ struct bpf_verifier_state *cur)
+{
+ struct bpf_func_state *fold, *fcur;
+ int i, fr;
+
+ reset_idmap_scratch(env);
+ for (fr = old->curframe; fr >= 0; fr--) {
+ fold = old->frame[fr];
+ fcur = cur->frame[fr];
+
+ for (i = 0; i < MAX_BPF_REG; i++)
+ maybe_widen_reg(env,
+ &fold->regs[i],
+ &fcur->regs[i],
+ &env->idmap_scratch);
+
+ for (i = 0; i < fold->allocated_stack / BPF_REG_SIZE; i++) {
+ if (!is_spilled_reg(&fold->stack[i]) ||
+ !is_spilled_reg(&fcur->stack[i]))
+ continue;
+
+ maybe_widen_reg(env,
+ &fold->stack[i].spilled_ptr,
+ &fcur->stack[i].spilled_ptr,
+ &env->idmap_scratch);
+ }
+ }
+ return 0;
+}
+
/* process_iter_next_call() is called when verifier gets to iterator's next
* "method" (e.g., bpf_iter_num_next() for numbers iterator) call. We'll refer
* to it as just "iter_next()" in comments below.
@@ -7566,25 +7857,47 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id
* is some statically known limit on number of iterations (e.g., if there is
* an explicit `if n > 100 then break;` statement somewhere in the loop).
*
- * One very subtle but very important aspect is that we *always* simulate NULL
- * condition first (as the current state) before we simulate non-NULL case.
- * This has to do with intricacies of scalar precision tracking. By simulating
- * "exit condition" of iter_next() returning NULL first, we make sure all the
- * relevant precision marks *that will be set **after** we exit iterator loop*
- * are propagated backwards to common parent state of NULL and non-NULL
- * branches. Thanks to that, state equivalence checks done later in forked
- * state, when reaching iter_next() for ACTIVE iterator, can assume that
- * precision marks are finalized and won't change. Because simulating another
- * ACTIVE iterator iteration won't change them (because given same input
- * states we'll end up with exactly same output states which we are currently
- * comparing; and verification after the loop already propagated back what
- * needs to be **additionally** tracked as precise). It's subtle, grok
- * precision tracking for more intuitive understanding.
+ * Iteration convergence logic in is_state_visited() relies on exact
+ * states comparison, which ignores read and precision marks.
+ * This is necessary because read and precision marks are not finalized
+ * while in the loop. Exact comparison might preclude convergence for
+ * simple programs like below:
+ *
+ * i = 0;
+ * while(iter_next(&it))
+ * i++;
+ *
+ * At each iteration step i++ would produce a new distinct state and
+ * eventually instruction processing limit would be reached.
+ *
+ * To avoid such behavior, speculatively forget (widen) the range of
+ * imprecise scalar registers if those registers were not precise at the
+ * end of the previous iteration and do not match exactly.
+ *
+ * This is a conservative heuristic that allows verifying a wide range of programs,
+ * however it precludes verification of programs that conjure an
+ * imprecise value on the first loop iteration and use it as precise on the second.
+ * For example, the following safe program would fail to verify:
+ *
+ * struct bpf_num_iter it;
+ * int arr[10];
+ * int i = 0, a = 0;
+ * bpf_iter_num_new(&it, 0, 10);
+ * while (bpf_iter_num_next(&it)) {
+ * if (a == 0) {
+ * a = 1;
+ * i = 7; // Because i changed, the verifier would forget
+ * // its range on second loop entry.
+ * } else {
+ * arr[i] = 42; // This would fail to verify.
+ * }
+ * }
+ * bpf_iter_num_destroy(&it);
*/
static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx,
struct bpf_kfunc_call_arg_meta *meta)
{
- struct bpf_verifier_state *cur_st = env->cur_state, *queued_st;
+ struct bpf_verifier_state *cur_st = env->cur_state, *queued_st, *prev_st;
struct bpf_func_state *cur_fr = cur_st->frame[cur_st->curframe], *queued_fr;
struct bpf_reg_state *cur_iter, *queued_iter;
int iter_frameno = meta->iter.frameno;
@@ -7602,6 +7915,19 @@ static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx,
}
if (cur_iter->iter.state == BPF_ITER_STATE_ACTIVE) {
+ /* Because the iter_next() call is a checkpoint, is_state_visited()
+ * should guarantee a parent state with the same call sites and insn_idx.
+ */
+ if (!cur_st->parent || cur_st->parent->insn_idx != insn_idx ||
+ !same_callsites(cur_st->parent, cur_st)) {
+ verbose(env, "bug: bad parent state for iter next call");
+ return -EFAULT;
+ }
+ /* Note cur_st->parent in the call below, it is necessary to skip
+ * checkpoint created for cur_st by is_state_visited()
+ * right at this instruction.
+ */
+ prev_st = find_prev_entry(env, cur_st->parent, insn_idx);
/* branch out active iter state */
queued_st = push_stack(env, insn_idx + 1, insn_idx, false);
if (!queued_st)
@@ -7610,6 +7936,8 @@ static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx,
queued_iter = &queued_st->frame[iter_frameno]->stack[iter_spi].spilled_ptr;
queued_iter->iter.state = BPF_ITER_STATE_ACTIVE;
queued_iter->iter.depth++;
+ if (prev_st)
+ widen_imprecise_scalars(env, prev_st, queued_st);
queued_fr = queued_st->frame[queued_st->curframe];
mark_ptr_not_null_reg(&queued_fr->regs[BPF_REG_0]);
@@ -7618,7 +7946,7 @@ static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx,
/* switch to DRAINED state, but keep the depth unchanged */
/* mark current iter state as drained and assume returned NULL */
cur_iter->iter.state = BPF_ITER_STATE_DRAINED;
- __mark_reg_const_zero(&cur_fr->regs[BPF_REG_0]);
+ __mark_reg_const_zero(env, &cur_fr->regs[BPF_REG_0]);
return 0;
}
@@ -7753,6 +8081,7 @@ static const struct bpf_reg_types btf_ptr_types = {
static const struct bpf_reg_types percpu_btf_ptr_types = {
.types = {
PTR_TO_BTF_ID | MEM_PERCPU,
+ PTR_TO_BTF_ID | MEM_PERCPU | MEM_RCU,
PTR_TO_BTF_ID | MEM_PERCPU | PTR_TRUSTED,
}
};
@@ -7831,8 +8160,10 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
if (base_type(arg_type) == ARG_PTR_TO_MEM)
type &= ~DYNPTR_TYPE_FLAG_MASK;
- if (meta->func_id == BPF_FUNC_kptr_xchg && type_is_alloc(type))
+ if (meta->func_id == BPF_FUNC_kptr_xchg && type_is_alloc(type)) {
type &= ~MEM_ALLOC;
+ type &= ~MEM_PERCPU;
+ }
for (i = 0; i < ARRAY_SIZE(compatible->types); i++) {
expected = compatible->types[i];
@@ -7915,6 +8246,7 @@ found:
break;
}
case PTR_TO_BTF_ID | MEM_ALLOC:
+ case PTR_TO_BTF_ID | MEM_PERCPU | MEM_ALLOC:
if (meta->func_id != BPF_FUNC_spin_lock && meta->func_id != BPF_FUNC_spin_unlock &&
meta->func_id != BPF_FUNC_kptr_xchg) {
verbose(env, "verifier internal error: unimplemented handling of MEM_ALLOC\n");
@@ -7926,6 +8258,7 @@ found:
}
break;
case PTR_TO_BTF_ID | MEM_PERCPU:
+ case PTR_TO_BTF_ID | MEM_PERCPU | MEM_RCU:
case PTR_TO_BTF_ID | MEM_PERCPU | PTR_TRUSTED:
/* Handled by helper specific checks */
break;
@@ -7953,9 +8286,9 @@ reg_find_field_offset(const struct bpf_reg_state *reg, s32 off, u32 fields)
return field;
}
-int check_func_arg_reg_off(struct bpf_verifier_env *env,
- const struct bpf_reg_state *reg, int regno,
- enum bpf_arg_type arg_type)
+static int check_func_arg_reg_off(struct bpf_verifier_env *env,
+ const struct bpf_reg_state *reg, int regno,
+ enum bpf_arg_type arg_type)
{
u32 type = reg->type;
@@ -8089,6 +8422,54 @@ static enum bpf_dynptr_type dynptr_get_type(struct bpf_verifier_env *env,
return state->stack[spi].spilled_ptr.dynptr.type;
}
+static int check_reg_const_str(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, u32 regno)
+{
+ struct bpf_map *map = reg->map_ptr;
+ int err;
+ int map_off;
+ u64 map_addr;
+ char *str_ptr;
+
+ if (reg->type != PTR_TO_MAP_VALUE)
+ return -EINVAL;
+
+ if (!bpf_map_is_rdonly(map)) {
+ verbose(env, "R%d does not point to a readonly map'\n", regno);
+ return -EACCES;
+ }
+
+ if (!tnum_is_const(reg->var_off)) {
+ verbose(env, "R%d is not a constant address'\n", regno);
+ return -EACCES;
+ }
+
+ if (!map->ops->map_direct_value_addr) {
+ verbose(env, "no direct value access support for this map type\n");
+ return -EACCES;
+ }
+
+ err = check_map_access(env, regno, reg->off,
+ map->value_size - reg->off, false,
+ ACCESS_HELPER);
+ if (err)
+ return err;
+
+ map_off = reg->off + reg->var_off.value;
+ err = map->ops->map_direct_value_addr(map, &map_addr, map_off);
+ if (err) {
+ verbose(env, "direct value access on string failed\n");
+ return err;
+ }
+
+ str_ptr = (char *)(long)(map_addr);
+ if (!strnchr(str_ptr + map_off, map->value_size - map_off, 0)) {
+ verbose(env, "string is not zero-terminated\n");
+ return -EINVAL;
+ }
+ return 0;
+}
+
static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
struct bpf_call_arg_meta *meta,
const struct bpf_func_proto *fn,
@@ -8333,44 +8714,9 @@ skip_type_check:
}
case ARG_PTR_TO_CONST_STR:
{
- struct bpf_map *map = reg->map_ptr;
- int map_off;
- u64 map_addr;
- char *str_ptr;
-
- if (!bpf_map_is_rdonly(map)) {
- verbose(env, "R%d does not point to a readonly map'\n", regno);
- return -EACCES;
- }
-
- if (!tnum_is_const(reg->var_off)) {
- verbose(env, "R%d is not a constant address'\n", regno);
- return -EACCES;
- }
-
- if (!map->ops->map_direct_value_addr) {
- verbose(env, "no direct value access support for this map type\n");
- return -EACCES;
- }
-
- err = check_map_access(env, regno, reg->off,
- map->value_size - reg->off, false,
- ACCESS_HELPER);
+ err = check_reg_const_str(env, reg, regno);
if (err)
return err;
-
- map_off = reg->off + reg->var_off.value;
- err = map->ops->map_direct_value_addr(map, &map_addr, map_off);
- if (err) {
- verbose(env, "direct value access on string failed\n");
- return err;
- }
-
- str_ptr = (char *)(long)(map_addr);
- if (!strnchr(str_ptr + map_off, map->value_size - map_off, 0)) {
- verbose(env, "string is not zero-terminated\n");
- return -EINVAL;
- }
break;
}
case ARG_PTR_TO_KPTR:
@@ -8839,7 +9185,7 @@ static void clear_caller_saved_regs(struct bpf_verifier_env *env,
/* after the call registers r0 - r5 were scratched */
for (i = 0; i < CALLER_SAVED_REGS; i++) {
mark_reg_not_init(env, regs, caller_saved[i]);
- check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
+ __check_reg_arg(env, regs, caller_saved[i], DST_OP_NO_MARK);
}
}
@@ -8852,11 +9198,10 @@ static int set_callee_state(struct bpf_verifier_env *env,
struct bpf_func_state *caller,
struct bpf_func_state *callee, int insn_idx);
-static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
- int *insn_idx, int subprog,
- set_callee_state_fn set_callee_state_cb)
+static int setup_func_entry(struct bpf_verifier_env *env, int subprog, int callsite,
+ set_callee_state_fn set_callee_state_cb,
+ struct bpf_verifier_state *state)
{
- struct bpf_verifier_state *state = env->cur_state;
struct bpf_func_state *caller, *callee;
int err;
@@ -8866,53 +9211,168 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn
return -E2BIG;
}
- caller = state->frame[state->curframe];
if (state->frame[state->curframe + 1]) {
verbose(env, "verifier bug. Frame %d already allocated\n",
state->curframe + 1);
return -EFAULT;
}
- err = btf_check_subprog_call(env, subprog, caller->regs);
- if (err == -EFAULT)
- return err;
- if (subprog_is_global(env, subprog)) {
- if (err) {
- verbose(env, "Caller passes invalid args into func#%d\n",
- subprog);
- return err;
- } else {
- if (env->log.level & BPF_LOG_LEVEL)
- verbose(env,
- "Func#%d is global and valid. Skipping.\n",
- subprog);
- clear_caller_saved_regs(env, caller->regs);
+ caller = state->frame[state->curframe];
+ callee = kzalloc(sizeof(*callee), GFP_KERNEL);
+ if (!callee)
+ return -ENOMEM;
+ state->frame[state->curframe + 1] = callee;
+
+ /* callee cannot access r0, r6 - r9 for reading and has to write
+ * into its own stack before reading from it.
+ * callee can read/write into caller's stack
+ */
+ init_func_state(env, callee,
+ /* remember the callsite, it will be used by bpf_exit */
+ callsite,
+ state->curframe + 1 /* frameno within this callchain */,
+ subprog /* subprog number within this prog */);
+ /* Transfer references to the callee */
+ err = copy_reference_state(callee, caller);
+ err = err ?: set_callee_state_cb(env, caller, callee, callsite);
+ if (err)
+ goto err_out;
- /* All global functions return a 64-bit SCALAR_VALUE */
- mark_reg_unknown(env, caller->regs, BPF_REG_0);
- caller->regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG;
+ /* only increment it after check_reg_arg() finished */
+ state->curframe++;
- /* continue with next insn after call */
- return 0;
+ return 0;
+
+err_out:
+ free_func_state(callee);
+ state->frame[state->curframe + 1] = NULL;
+ return err;
+}
+
+static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog,
+ const struct btf *btf,
+ struct bpf_reg_state *regs)
+{
+ struct bpf_subprog_info *sub = subprog_info(env, subprog);
+ struct bpf_verifier_log *log = &env->log;
+ u32 i;
+ int ret;
+
+ ret = btf_prepare_func_args(env, subprog);
+ if (ret)
+ return ret;
+
+ /* check that BTF function arguments match actual types that the
+ * verifier sees.
+ */
+ for (i = 0; i < sub->arg_cnt; i++) {
+ u32 regno = i + 1;
+ struct bpf_reg_state *reg = &regs[regno];
+ struct bpf_subprog_arg_info *arg = &sub->args[i];
+
+ if (arg->arg_type == ARG_ANYTHING) {
+ if (reg->type != SCALAR_VALUE) {
+ bpf_log(log, "R%d is not a scalar\n", regno);
+ return -EINVAL;
+ }
+ } else if (arg->arg_type == ARG_PTR_TO_CTX) {
+ ret = check_func_arg_reg_off(env, reg, regno, ARG_DONTCARE);
+ if (ret < 0)
+ return ret;
+ /* If function expects ctx type in BTF check that caller
+ * is passing PTR_TO_CTX.
+ */
+ if (reg->type != PTR_TO_CTX) {
+ bpf_log(log, "arg#%d expects pointer to ctx\n", i);
+ return -EINVAL;
+ }
+ } else if (base_type(arg->arg_type) == ARG_PTR_TO_MEM) {
+ ret = check_func_arg_reg_off(env, reg, regno, ARG_DONTCARE);
+ if (ret < 0)
+ return ret;
+ if (check_mem_reg(env, reg, regno, arg->mem_size))
+ return -EINVAL;
+ if (!(arg->arg_type & PTR_MAYBE_NULL) && (reg->type & PTR_MAYBE_NULL)) {
+ bpf_log(log, "arg#%d is expected to be non-NULL\n", i);
+ return -EINVAL;
+ }
+ } else if (arg->arg_type == (ARG_PTR_TO_DYNPTR | MEM_RDONLY)) {
+ ret = process_dynptr_func(env, regno, -1, arg->arg_type, 0);
+ if (ret)
+ return ret;
+ } else {
+ bpf_log(log, "verifier bug: unrecognized arg#%d type %d\n",
+ i, arg->arg_type);
+ return -EFAULT;
}
}
+ return 0;
+}
+
+/* Compare BTF of a function call with given bpf_reg_state.
+ * Returns:
+ * EFAULT - there is a verifier bug. Abort verification.
+ * EINVAL - there is a type mismatch or BTF is not available.
+ * 0 - BTF matches with what bpf_reg_state expects.
+ * Only scalar, PTR_TO_CTX, pointer-to-memory and read-only dynptr arguments are recognized.
+ */
+static int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog,
+ struct bpf_reg_state *regs)
+{
+ struct bpf_prog *prog = env->prog;
+ struct btf *btf = prog->aux->btf;
+ u32 btf_id;
+ int err;
+
+ if (!prog->aux->func_info)
+ return -EINVAL;
+
+ btf_id = prog->aux->func_info[subprog].type_id;
+ if (!btf_id)
+ return -EFAULT;
+
+ if (prog->aux->func_info_aux[subprog].unreliable)
+ return -EINVAL;
+
+ err = btf_check_func_arg_match(env, subprog, btf, regs);
+ /* Compiler optimizations can remove arguments from static functions
+ * or mismatched type can be passed into a global function.
+ * In such cases mark the function as unreliable from BTF point of view.
+ */
+ if (err)
+ prog->aux->func_info_aux[subprog].unreliable = true;
+ return err;
+}
+
+static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
+ int insn_idx, int subprog,
+ set_callee_state_fn set_callee_state_cb)
+{
+ struct bpf_verifier_state *state = env->cur_state, *callback_state;
+ struct bpf_func_state *caller, *callee;
+ int err;
+
+ caller = state->frame[state->curframe];
+ err = btf_check_subprog_call(env, subprog, caller->regs);
+ if (err == -EFAULT)
+ return err;
+
/* set_callee_state is used for direct subprog calls, but we are
* interested in validating only BPF helpers that can call subprogs as
* callbacks
*/
- if (set_callee_state_cb != set_callee_state) {
- if (bpf_pseudo_kfunc_call(insn) &&
- !is_callback_calling_kfunc(insn->imm)) {
- verbose(env, "verifier bug: kfunc %s#%d not marked as callback-calling\n",
- func_id_name(insn->imm), insn->imm);
- return -EFAULT;
- } else if (!bpf_pseudo_kfunc_call(insn) &&
- !is_callback_calling_function(insn->imm)) { /* helper */
- verbose(env, "verifier bug: helper %s#%d not marked as callback-calling\n",
- func_id_name(insn->imm), insn->imm);
- return -EFAULT;
- }
+ env->subprog_info[subprog].is_cb = true;
+ if (bpf_pseudo_kfunc_call(insn) &&
+ !is_sync_callback_calling_kfunc(insn->imm)) {
+ verbose(env, "verifier bug: kfunc %s#%d not marked as callback-calling\n",
+ func_id_name(insn->imm), insn->imm);
+ return -EFAULT;
+ } else if (!bpf_pseudo_kfunc_call(insn) &&
+ !is_callback_calling_function(insn->imm)) { /* helper */
+ verbose(env, "verifier bug: helper %s#%d not marked as callback-calling\n",
+ func_id_name(insn->imm), insn->imm);
+ return -EFAULT;
}
if (insn->code == (BPF_JMP | BPF_CALL) &&
@@ -8923,53 +9383,88 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn
/* there is no real recursion here. timer callbacks are async */
env->subprog_info[subprog].is_async_cb = true;
async_cb = push_async_cb(env, env->subprog_info[subprog].start,
- *insn_idx, subprog);
+ insn_idx, subprog);
if (!async_cb)
return -EFAULT;
callee = async_cb->frame[0];
callee->async_entry_cnt = caller->async_entry_cnt + 1;
/* Convert bpf_timer_set_callback() args into timer callback args */
- err = set_callee_state_cb(env, caller, callee, *insn_idx);
+ err = set_callee_state_cb(env, caller, callee, insn_idx);
if (err)
return err;
+ return 0;
+ }
+
+ /* for callback functions enqueue entry to callback and
+ * proceed with next instruction within current frame.
+ */
+ callback_state = push_stack(env, env->subprog_info[subprog].start, insn_idx, false);
+ if (!callback_state)
+ return -ENOMEM;
+
+ err = setup_func_entry(env, subprog, insn_idx, set_callee_state_cb,
+ callback_state);
+ if (err)
+ return err;
+
+ callback_state->callback_unroll_depth++;
+ callback_state->frame[callback_state->curframe - 1]->callback_depth++;
+ caller->callback_depth = 0;
+ return 0;
+}
+
+static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
+ int *insn_idx)
+{
+ struct bpf_verifier_state *state = env->cur_state;
+ struct bpf_func_state *caller;
+ int err, subprog, target_insn;
+
+ target_insn = *insn_idx + insn->imm + 1;
+ subprog = find_subprog(env, target_insn);
+ if (subprog < 0) {
+ verbose(env, "verifier bug. No program starts at insn %d\n", target_insn);
+ return -EFAULT;
+ }
+
+ caller = state->frame[state->curframe];
+ err = btf_check_subprog_call(env, subprog, caller->regs);
+ if (err == -EFAULT)
+ return err;
+ if (subprog_is_global(env, subprog)) {
+ const char *sub_name = subprog_name(env, subprog);
+
+ if (err) {
+ verbose(env, "Caller passes invalid args into func#%d ('%s')\n",
+ subprog, sub_name);
+ return err;
+ }
+
+ verbose(env, "Func#%d ('%s') is global and assumed valid.\n",
+ subprog, sub_name);
+ /* mark global subprog for verifying after main prog */
+ subprog_aux(env, subprog)->called = true;
clear_caller_saved_regs(env, caller->regs);
+
+ /* All global functions return a 64-bit SCALAR_VALUE */
mark_reg_unknown(env, caller->regs, BPF_REG_0);
caller->regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG;
+
/* continue with next insn after call */
return 0;
}
- callee = kzalloc(sizeof(*callee), GFP_KERNEL);
- if (!callee)
- return -ENOMEM;
- state->frame[state->curframe + 1] = callee;
-
- /* callee cannot access r0, r6 - r9 for reading and has to write
- * into its own stack before reading from it.
- * callee can read/write into caller's stack
+ /* for regular function entry setup new frame and continue
+ * from that frame.
*/
- init_func_state(env, callee,
- /* remember the callsite, it will be used by bpf_exit */
- *insn_idx /* callsite */,
- state->curframe + 1 /* frameno within this callchain */,
- subprog /* subprog number within this prog */);
-
- /* Transfer references to the callee */
- err = copy_reference_state(callee, caller);
+ err = setup_func_entry(env, subprog, *insn_idx, set_callee_state, state);
if (err)
- goto err_out;
-
- err = set_callee_state_cb(env, caller, callee, *insn_idx);
- if (err)
- goto err_out;
+ return err;
clear_caller_saved_regs(env, caller->regs);
- /* only increment it after check_reg_arg() finished */
- state->curframe++;
-
/* and go analyze first insn of the callee */
*insn_idx = env->subprog_info[subprog].start - 1;
@@ -8977,14 +9472,10 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn
verbose(env, "caller:\n");
print_verifier_state(env, caller, true);
verbose(env, "callee:\n");
- print_verifier_state(env, callee, true);
+ print_verifier_state(env, state->frame[state->curframe], true);
}
- return 0;
-err_out:
- free_func_state(callee);
- state->frame[state->curframe + 1] = NULL;
- return err;
+ return 0;
}
int map_set_for_each_callback_args(struct bpf_verifier_env *env,
@@ -9028,22 +9519,6 @@ static int set_callee_state(struct bpf_verifier_env *env,
return 0;
}
-static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
- int *insn_idx)
-{
- int subprog, target_insn;
-
- target_insn = *insn_idx + insn->imm + 1;
- subprog = find_subprog(env, target_insn);
- if (subprog < 0) {
- verbose(env, "verifier bug. No program starts at insn %d\n",
- target_insn);
- return -EFAULT;
- }
-
- return __check_func_call(env, insn, insn_idx, subprog, set_callee_state);
-}
-
static int set_map_elem_callback_state(struct bpf_verifier_env *env,
struct bpf_func_state *caller,
struct bpf_func_state *callee,
@@ -9070,7 +9545,7 @@ static int set_map_elem_callback_state(struct bpf_verifier_env *env,
return err;
callee->in_callback_fn = true;
- callee->callback_ret_range = tnum_range(0, 1);
+ callee->callback_ret_range = retval_range(0, 1);
return 0;
}
@@ -9092,7 +9567,7 @@ static int set_loop_callback_state(struct bpf_verifier_env *env,
__mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
callee->in_callback_fn = true;
- callee->callback_ret_range = tnum_range(0, 1);
+ callee->callback_ret_range = retval_range(0, 1);
return 0;
}
@@ -9122,7 +9597,7 @@ static int set_timer_callback_state(struct bpf_verifier_env *env,
__mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
__mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
callee->in_async_callback_fn = true;
- callee->callback_ret_range = tnum_range(0, 1);
+ callee->callback_ret_range = retval_range(0, 1);
return 0;
}
@@ -9141,7 +9616,7 @@ static int set_find_vma_callback_state(struct bpf_verifier_env *env,
callee->regs[BPF_REG_2].type = PTR_TO_BTF_ID;
__mark_reg_known_zero(&callee->regs[BPF_REG_2]);
callee->regs[BPF_REG_2].btf = btf_vmlinux;
- callee->regs[BPF_REG_2].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_VMA],
+ callee->regs[BPF_REG_2].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_VMA];
/* pointer to stack or null */
callee->regs[BPF_REG_3] = caller->regs[BPF_REG_4];
@@ -9150,7 +9625,7 @@ static int set_find_vma_callback_state(struct bpf_verifier_env *env,
__mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
__mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
callee->in_callback_fn = true;
- callee->callback_ret_range = tnum_range(0, 1);
+ callee->callback_ret_range = retval_range(0, 1);
return 0;
}
@@ -9173,7 +9648,7 @@ static int set_user_ringbuf_callback_state(struct bpf_verifier_env *env,
__mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
callee->in_callback_fn = true;
- callee->callback_ret_range = tnum_range(0, 1);
+ callee->callback_ret_range = retval_range(0, 1);
return 0;
}
@@ -9205,7 +9680,7 @@ static int set_rbtree_add_callback_state(struct bpf_verifier_env *env,
__mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
__mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
callee->in_callback_fn = true;
- callee->callback_ret_range = tnum_range(0, 1);
+ callee->callback_ret_range = retval_range(0, 1);
return 0;
}
@@ -9234,11 +9709,17 @@ static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env)
return is_rbtree_lock_required_kfunc(kfunc_btf_id);
}
+static bool retval_range_within(struct bpf_retval_range range, const struct bpf_reg_state *reg)
+{
+ return range.minval <= reg->smin_value && reg->smax_value <= range.maxval;
+}
+
static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
{
- struct bpf_verifier_state *state = env->cur_state;
+ struct bpf_verifier_state *state = env->cur_state, *prev_st;
struct bpf_func_state *caller, *callee;
struct bpf_reg_state *r0;
+ bool in_callback_fn;
int err;
callee = state->frame[state->curframe];
@@ -9256,17 +9737,28 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
caller = state->frame[state->curframe - 1];
if (callee->in_callback_fn) {
- /* enforce R0 return value range [0, 1]. */
- struct tnum range = callee->callback_ret_range;
-
if (r0->type != SCALAR_VALUE) {
verbose(env, "R0 not a scalar value\n");
return -EACCES;
}
- if (!tnum_in(range, r0->var_off)) {
- verbose_invalid_scalar(env, r0, &range, "callback return", "R0");
+
+ /* we are going to rely on register's precise value */
+ err = mark_reg_read(env, r0, r0->parent, REG_LIVE_READ64);
+ err = err ?: mark_chain_precision(env, BPF_REG_0);
+ if (err)
+ return err;
+
+ /* enforce R0 return value range */
+ if (!retval_range_within(callee->callback_ret_range, r0)) {
+ verbose_invalid_scalar(env, r0, callee->callback_ret_range,
+ "At callback return", "R0");
return -EINVAL;
}
+ if (!calls_callback(env, callee->callsite)) {
+ verbose(env, "BUG: in callback at %d, callsite %d !calls_callback\n",
+ *insn_idx, callee->callsite);
+ return -EFAULT;
+ }
} else {
/* return to the caller whatever r0 had in the callee */
caller->regs[BPF_REG_0] = *r0;
@@ -9284,27 +9776,56 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
return err;
}
- *insn_idx = callee->callsite + 1;
+ /* for callbacks like bpf_loop or bpf_for_each_map_elem go back to callsite,
+ * there function call logic would reschedule callback visit. If iteration
+ * converges is_state_visited() would prune that visit eventually.
+ */
+ in_callback_fn = callee->in_callback_fn;
+ if (in_callback_fn)
+ *insn_idx = callee->callsite;
+ else
+ *insn_idx = callee->callsite + 1;
+
if (env->log.level & BPF_LOG_LEVEL) {
verbose(env, "returning from callee:\n");
print_verifier_state(env, callee, true);
verbose(env, "to caller at %d:\n", *insn_idx);
print_verifier_state(env, caller, true);
}
- /* clear everything in the callee */
+ /* clear everything in the callee. In case of exceptional exits using
+ * bpf_throw, this will be done by copy_verifier_state for extra frames. */
free_func_state(callee);
state->frame[state->curframe--] = NULL;
+
+ /* for callbacks widen imprecise scalars to make programs like below verify:
+ *
+ * struct ctx { int i; }
+ * void cb(int idx, struct ctx *ctx) { ctx->i++; ... }
+ * ...
+ * struct ctx = { .i = 0; }
+ * bpf_loop(100, cb, &ctx, 0);
+ *
+ * This is similar to what is done in process_iter_next_call() for open
+ * coded iterators.
+ */
+ prev_st = in_callback_fn ? find_prev_entry(env, state, *insn_idx) : NULL;
+ if (prev_st) {
+ err = widen_imprecise_scalars(env, prev_st, state);
+ if (err)
+ return err;
+ }
return 0;
}
-static void do_refine_retval_range(struct bpf_reg_state *regs, int ret_type,
- int func_id,
- struct bpf_call_arg_meta *meta)
+static int do_refine_retval_range(struct bpf_verifier_env *env,
+ struct bpf_reg_state *regs, int ret_type,
+ int func_id,
+ struct bpf_call_arg_meta *meta)
{
struct bpf_reg_state *ret_reg = &regs[BPF_REG_0];
if (ret_type != RET_INTEGER)
- return;
+ return 0;
switch (func_id) {
case BPF_FUNC_get_stack:
@@ -9330,6 +9851,8 @@ static void do_refine_retval_range(struct bpf_reg_state *regs, int ret_type,
reg_bounds_sync(ret_reg);
break;
}
+
+ return reg_bounds_sanity_check(env, ret_reg, "retval");
}
static int
@@ -9399,7 +9922,7 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
val = reg->var_off.value;
max = map->max_entries;
- if (!(register_is_const(reg) && val < max)) {
+ if (!(is_reg_const(reg, false) && val < max)) {
bpf_map_key_store(aux, BPF_MAP_KEY_POISON);
return 0;
}
@@ -9415,17 +9938,17 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
return 0;
}
-static int check_reference_leak(struct bpf_verifier_env *env)
+static int check_reference_leak(struct bpf_verifier_env *env, bool exception_exit)
{
struct bpf_func_state *state = cur_func(env);
bool refs_lingering = false;
int i;
- if (state->frameno && !state->in_callback_fn)
+ if (!exception_exit && state->frameno && !state->in_callback_fn)
return 0;
for (i = 0; i < state->acquired_refs; i++) {
- if (state->in_callback_fn && state->refs[i].callback_ref != state->frameno)
+ if (!exception_exit && state->in_callback_fn && state->refs[i].callback_ref != state->frameno)
continue;
verbose(env, "Unreleased reference id=%d alloc_insn=%d\n",
state->refs[i].id, state->refs[i].insn_idx);
@@ -9532,6 +10055,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
int *insn_idx_p)
{
enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
+ bool returns_cpu_specific_alloc_ptr = false;
const struct bpf_func_proto *fn = NULL;
enum bpf_return_type ret_type;
enum bpf_type_flag ret_flag;
@@ -9642,6 +10166,26 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
return -EFAULT;
}
err = unmark_stack_slots_dynptr(env, &regs[meta.release_regno]);
+ } else if (func_id == BPF_FUNC_kptr_xchg && meta.ref_obj_id) {
+ u32 ref_obj_id = meta.ref_obj_id;
+ bool in_rcu = in_rcu_cs(env);
+ struct bpf_func_state *state;
+ struct bpf_reg_state *reg;
+
+ err = release_reference_state(cur_func(env), ref_obj_id);
+ if (!err) {
+ bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({
+ if (reg->ref_obj_id == ref_obj_id) {
+ if (in_rcu && (reg->type & MEM_ALLOC) && (reg->type & MEM_PERCPU)) {
+ reg->ref_obj_id = 0;
+ reg->type &= ~MEM_ALLOC;
+ reg->type |= MEM_RCU;
+ } else {
+ mark_reg_invalid(env, reg);
+ }
+ }
+ }));
+ }
} else if (meta.ref_obj_id) {
err = release_reference(env, meta.ref_obj_id);
} else if (register_is_null(&regs[meta.release_regno])) {
@@ -9659,7 +10203,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
switch (func_id) {
case BPF_FUNC_tail_call:
- err = check_reference_leak(env);
+ err = check_reference_leak(env, false);
if (err) {
verbose(env, "tail_call would lead to reference leak\n");
return err;
@@ -9675,24 +10219,37 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
}
break;
case BPF_FUNC_for_each_map_elem:
- err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
- set_map_elem_callback_state);
+ err = push_callback_call(env, insn, insn_idx, meta.subprogno,
+ set_map_elem_callback_state);
break;
case BPF_FUNC_timer_set_callback:
- err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
- set_timer_callback_state);
+ err = push_callback_call(env, insn, insn_idx, meta.subprogno,
+ set_timer_callback_state);
break;
case BPF_FUNC_find_vma:
- err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
- set_find_vma_callback_state);
+ err = push_callback_call(env, insn, insn_idx, meta.subprogno,
+ set_find_vma_callback_state);
break;
case BPF_FUNC_snprintf:
err = check_bpf_snprintf_call(env, regs);
break;
case BPF_FUNC_loop:
update_loop_inline_state(env, meta.subprogno);
- err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
- set_loop_callback_state);
+ /* Verifier relies on R1 value to determine if bpf_loop() iteration
+ * is finished, thus mark it precise.
+ */
+ err = mark_chain_precision(env, BPF_REG_1);
+ if (err)
+ return err;
+ if (cur_func(env)->callback_depth < regs[BPF_REG_1].umax_value) {
+ err = push_callback_call(env, insn, insn_idx, meta.subprogno,
+ set_loop_callback_state);
+ } else {
+ cur_func(env)->callback_depth = 0;
+ if (env->log.level & BPF_LOG_LEVEL2)
+ verbose(env, "frame%d bpf_loop iteration limit reached\n",
+ env->cur_state->curframe);
+ }
break;
case BPF_FUNC_dynptr_from_mem:
if (regs[BPF_REG_1].type != PTR_TO_MAP_VALUE) {
@@ -9770,9 +10327,26 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
break;
}
+ case BPF_FUNC_per_cpu_ptr:
+ case BPF_FUNC_this_cpu_ptr:
+ {
+ struct bpf_reg_state *reg = &regs[BPF_REG_1];
+ const struct btf_type *type;
+
+ if (reg->type & MEM_RCU) {
+ type = btf_type_by_id(reg->btf, reg->btf_id);
+ if (!type || !btf_type_is_struct(type)) {
+ verbose(env, "Helper has invalid btf/btf_id in R1\n");
+ return -EFAULT;
+ }
+ returns_cpu_specific_alloc_ptr = true;
+ env->insn_aux_data[insn_idx].call_with_percpu_alloc_ptr = true;
+ }
+ break;
+ }
case BPF_FUNC_user_ringbuf_drain:
- err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
- set_user_ringbuf_callback_state);
+ err = push_callback_call(env, insn, insn_idx, meta.subprogno,
+ set_user_ringbuf_callback_state);
break;
}
@@ -9859,14 +10433,18 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag;
regs[BPF_REG_0].mem_size = tsize;
} else {
- /* MEM_RDONLY may be carried from ret_flag, but it
- * doesn't apply on PTR_TO_BTF_ID. Fold it, otherwise
- * it will confuse the check of PTR_TO_BTF_ID in
- * check_mem_access().
- */
- ret_flag &= ~MEM_RDONLY;
+ if (returns_cpu_specific_alloc_ptr) {
+ regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC | MEM_RCU;
+ } else {
+ /* MEM_RDONLY may be carried from ret_flag, but it
+ * doesn't apply on PTR_TO_BTF_ID. Fold it, otherwise
+ * it will confuse the check of PTR_TO_BTF_ID in
+ * check_mem_access().
+ */
+ ret_flag &= ~MEM_RDONLY;
+ regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag;
+ }
- regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag;
regs[BPF_REG_0].btf = meta.ret_btf;
regs[BPF_REG_0].btf_id = meta.ret_btf_id;
}
@@ -9882,8 +10460,11 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
if (func_id == BPF_FUNC_kptr_xchg) {
ret_btf = meta.kptr_field->kptr.btf;
ret_btf_id = meta.kptr_field->kptr.btf_id;
- if (!btf_is_kernel(ret_btf))
+ if (!btf_is_kernel(ret_btf)) {
regs[BPF_REG_0].type |= MEM_ALLOC;
+ if (meta.kptr_field->type == BPF_KPTR_PERCPU)
+ regs[BPF_REG_0].type |= MEM_PERCPU;
+ }
} else {
if (fn->ret_btf_id == BPF_PTR_POISON) {
verbose(env, "verifier internal error:");
@@ -9936,7 +10517,9 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
regs[BPF_REG_0].ref_obj_id = id;
}
- do_refine_retval_range(regs, fn->ret_type, func_id, &meta);
+ err = do_refine_retval_range(env, regs, fn->ret_type, func_id, &meta);
+ if (err)
+ return err;
err = check_map_func_compatibility(env, meta.map_ptr, func_id);
if (err)
@@ -10030,6 +10613,11 @@ static bool is_kfunc_rcu(struct bpf_kfunc_call_arg_meta *meta)
return meta->kfunc_flags & KF_RCU;
}
+static bool is_kfunc_rcu_protected(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & KF_RCU_PROTECTED;
+}
+
static bool __kfunc_param_match_suffix(const struct btf *btf,
const struct btf_param *arg,
const char *suffix)
@@ -10104,6 +10692,16 @@ static bool is_kfunc_arg_refcounted_kptr(const struct btf *btf, const struct btf
return __kfunc_param_match_suffix(btf, arg, "__refcounted_kptr");
}
+static bool is_kfunc_arg_nullable(const struct btf *btf, const struct btf_param *arg)
+{
+ return __kfunc_param_match_suffix(btf, arg, "__nullable");
+}
+
+static bool is_kfunc_arg_const_str(const struct btf *btf, const struct btf_param *arg)
+{
+ return __kfunc_param_match_suffix(btf, arg, "__str");
+}
+
static bool is_kfunc_arg_scalar_with_name(const struct btf *btf,
const struct btf_param *arg,
const char *name)
@@ -10246,6 +10844,8 @@ enum kfunc_ptr_arg_type {
KF_ARG_PTR_TO_CALLBACK,
KF_ARG_PTR_TO_RB_ROOT,
KF_ARG_PTR_TO_RB_NODE,
+ KF_ARG_PTR_TO_NULL,
+ KF_ARG_PTR_TO_CONST_STR,
};
enum special_kfunc_type {
@@ -10268,6 +10868,10 @@ enum special_kfunc_type {
KF_bpf_dynptr_slice,
KF_bpf_dynptr_slice_rdwr,
KF_bpf_dynptr_clone,
+ KF_bpf_percpu_obj_new_impl,
+ KF_bpf_percpu_obj_drop_impl,
+ KF_bpf_throw,
+ KF_bpf_iter_css_task_new,
};
BTF_SET_START(special_kfunc_set)
@@ -10288,6 +10892,12 @@ BTF_ID(func, bpf_dynptr_from_xdp)
BTF_ID(func, bpf_dynptr_slice)
BTF_ID(func, bpf_dynptr_slice_rdwr)
BTF_ID(func, bpf_dynptr_clone)
+BTF_ID(func, bpf_percpu_obj_new_impl)
+BTF_ID(func, bpf_percpu_obj_drop_impl)
+BTF_ID(func, bpf_throw)
+#ifdef CONFIG_CGROUPS
+BTF_ID(func, bpf_iter_css_task_new)
+#endif
BTF_SET_END(special_kfunc_set)
BTF_ID_LIST(special_kfunc_list)
@@ -10310,6 +10920,14 @@ BTF_ID(func, bpf_dynptr_from_xdp)
BTF_ID(func, bpf_dynptr_slice)
BTF_ID(func, bpf_dynptr_slice_rdwr)
BTF_ID(func, bpf_dynptr_clone)
+BTF_ID(func, bpf_percpu_obj_new_impl)
+BTF_ID(func, bpf_percpu_obj_drop_impl)
+BTF_ID(func, bpf_throw)
+#ifdef CONFIG_CGROUPS
+BTF_ID(func, bpf_iter_css_task_new)
+#else
+BTF_ID_UNUSED
+#endif
static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta)
{
@@ -10378,6 +10996,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
if (is_kfunc_arg_rbtree_node(meta->btf, &args[argno]))
return KF_ARG_PTR_TO_RB_NODE;
+ if (is_kfunc_arg_const_str(meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_CONST_STR;
+
if ((base_type(reg->type) == PTR_TO_BTF_ID || reg2btf_ids[base_type(reg->type)])) {
if (!btf_type_is_struct(ref_t)) {
verbose(env, "kernel function %s args#%d pointer type %s %s is not supported\n",
@@ -10390,6 +11011,8 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
if (is_kfunc_arg_callback(env, meta->btf, &args[argno]))
return KF_ARG_PTR_TO_CALLBACK;
+ if (is_kfunc_arg_nullable(meta->btf, &args[argno]) && register_is_null(reg))
+ return KF_ARG_PTR_TO_NULL;
if (argno + 1 < nargs &&
(is_kfunc_arg_mem_size(meta->btf, &args[argno + 1], &regs[regno + 1]) ||
@@ -10622,11 +11245,17 @@ static bool is_bpf_graph_api_kfunc(u32 btf_id)
btf_id == special_kfunc_list[KF_bpf_refcount_acquire_impl];
}
-static bool is_callback_calling_kfunc(u32 btf_id)
+static bool is_sync_callback_calling_kfunc(u32 btf_id)
{
return btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl];
}
+static bool is_bpf_throw_kfunc(struct bpf_insn *insn)
+{
+ return bpf_pseudo_kfunc_call(insn) && insn->off == 0 &&
+ insn->imm == special_kfunc_list[KF_bpf_throw];
+}
+
static bool is_rbtree_lock_required_kfunc(u32 btf_id)
{
return is_bpf_rbtree_api_kfunc(btf_id);
@@ -10834,6 +11463,28 @@ static int process_kf_arg_ptr_to_rbtree_node(struct bpf_verifier_env *env,
&meta->arg_rbtree_root.field);
}
+/*
+ * css_task iter allowlist is needed to avoid dead locking on css_set_lock.
+ * LSM hooks and iters (both sleepable and non-sleepable) are safe.
+ * Any sleepable progs are also safe since bpf_check_attach_target() enforce
+ * them can only be attached to some specific hook points.
+ */
+static bool check_css_task_iter_allowlist(struct bpf_verifier_env *env)
+{
+ enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
+
+ switch (prog_type) {
+ case BPF_PROG_TYPE_LSM:
+ return true;
+ case BPF_PROG_TYPE_TRACING:
+ if (env->prog->expected_attach_type == BPF_TRACE_ITER)
+ return true;
+ fallthrough;
+ default:
+ return env->prog->aux->sleepable;
+ }
+}
+
static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_arg_meta *meta,
int insn_idx)
{
@@ -10920,7 +11571,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
}
if ((is_kfunc_trusted_args(meta) || is_kfunc_rcu(meta)) &&
- (register_is_null(reg) || type_may_be_null(reg->type))) {
+ (register_is_null(reg) || type_may_be_null(reg->type)) &&
+ !is_kfunc_arg_nullable(meta->btf, &args[i])) {
verbose(env, "Possibly NULL pointer passed to trusted arg%d\n", i);
return -EACCES;
}
@@ -10945,6 +11597,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
return kf_arg_type;
switch (kf_arg_type) {
+ case KF_ARG_PTR_TO_NULL:
+ continue;
case KF_ARG_PTR_TO_ALLOC_BTF_ID:
case KF_ARG_PTR_TO_BTF_ID:
if (!is_kfunc_trusted_args(meta) && !is_kfunc_rcu(meta))
@@ -10976,6 +11630,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
case KF_ARG_PTR_TO_MEM_SIZE:
case KF_ARG_PTR_TO_CALLBACK:
case KF_ARG_PTR_TO_REFCOUNTED_KPTR:
+ case KF_ARG_PTR_TO_CONST_STR:
/* Trusted by default */
break;
default:
@@ -11004,7 +11659,17 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
}
break;
case KF_ARG_PTR_TO_ALLOC_BTF_ID:
- if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
+ if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC)) {
+ if (meta->func_id != special_kfunc_list[KF_bpf_obj_drop_impl]) {
+ verbose(env, "arg#%d expected for bpf_obj_drop_impl()\n", i);
+ return -EINVAL;
+ }
+ } else if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC | MEM_PERCPU)) {
+ if (meta->func_id != special_kfunc_list[KF_bpf_percpu_obj_drop_impl]) {
+ verbose(env, "arg#%d expected for bpf_percpu_obj_drop_impl()\n", i);
+ return -EINVAL;
+ }
+ } else {
verbose(env, "arg#%d expected pointer to allocated object\n", i);
return -EINVAL;
}
@@ -11012,8 +11677,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
verbose(env, "allocated object must be referenced\n");
return -EINVAL;
}
- if (meta->btf == btf_vmlinux &&
- meta->func_id == special_kfunc_list[KF_bpf_obj_drop_impl]) {
+ if (meta->btf == btf_vmlinux) {
meta->arg_btf = reg->btf;
meta->arg_btf_id = reg->btf_id;
}
@@ -11075,6 +11739,12 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
break;
}
case KF_ARG_PTR_TO_ITER:
+ if (meta->func_id == special_kfunc_list[KF_bpf_iter_css_task_new]) {
+ if (!check_css_task_iter_allowlist(env)) {
+ verbose(env, "css_task_iter is only allowed in bpf_lsm, bpf_iter and sleepable progs\n");
+ return -EINVAL;
+ }
+ }
ret = process_iter_arg(env, regno, insn_idx, meta);
if (ret < 0)
return ret;
@@ -11204,6 +11874,10 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
break;
}
case KF_ARG_PTR_TO_CALLBACK:
+ if (reg->type != PTR_TO_FUNC) {
+ verbose(env, "arg%d expected pointer to func\n", i);
+ return -EINVAL;
+ }
meta->subprogno = reg->subprogno;
break;
case KF_ARG_PTR_TO_REFCOUNTED_KPTR:
@@ -11228,6 +11902,15 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
meta->arg_btf = reg->btf;
meta->arg_btf_id = reg->btf_id;
break;
+ case KF_ARG_PTR_TO_CONST_STR:
+ if (reg->type != PTR_TO_MAP_VALUE) {
+ verbose(env, "arg#%d doesn't point to a const string\n", i);
+ return -EINVAL;
+ }
+ ret = check_reg_const_str(env, reg, regno);
+ if (ret)
+ return ret;
+ break;
}
}
@@ -11282,6 +11965,8 @@ static int fetch_kfunc_meta(struct bpf_verifier_env *env,
return 0;
}
+static int check_return_code(struct bpf_verifier_env *env, int regno, const char *reg_name);
+
static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
int *insn_idx_p)
{
@@ -11322,12 +12007,28 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
return -EACCES;
}
+ /* Check the arguments */
+ err = check_kfunc_args(env, &meta, insn_idx);
+ if (err < 0)
+ return err;
+
+ if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) {
+ err = push_callback_call(env, insn, insn_idx, meta.subprogno,
+ set_rbtree_add_callback_state);
+ if (err) {
+ verbose(env, "kfunc %s#%d failed callback verification\n",
+ func_name, meta.func_id);
+ return err;
+ }
+ }
+
rcu_lock = is_kfunc_bpf_rcu_read_lock(&meta);
rcu_unlock = is_kfunc_bpf_rcu_read_unlock(&meta);
if (env->cur_state->active_rcu_lock) {
struct bpf_func_state *state;
struct bpf_reg_state *reg;
+ u32 clear_mask = (1 << STACK_SPILL) | (1 << STACK_ITER);
if (in_rbtree_lock_required_cb(env) && (rcu_lock || rcu_unlock)) {
verbose(env, "Calling bpf_rcu_read_{lock,unlock} in unnecessary rbtree callback\n");
@@ -11338,7 +12039,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
verbose(env, "nested rcu read lock (kernel function %s)\n", func_name);
return -EINVAL;
} else if (rcu_unlock) {
- bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({
+ bpf_for_each_reg_in_vstate_mask(env->cur_state, state, reg, clear_mask, ({
if (reg->type & MEM_RCU) {
reg->type &= ~(MEM_RCU | PTR_MAYBE_NULL);
reg->type |= PTR_UNTRUSTED;
@@ -11356,10 +12057,6 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
return -EINVAL;
}
- /* Check the arguments */
- err = check_kfunc_args(env, &meta, insn_idx);
- if (err < 0)
- return err;
/* In case of release function, we get register number of refcounted
* PTR_TO_BTF_ID in bpf_kfunc_arg_meta, do the release now.
*/
@@ -11393,13 +12090,21 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
}
}
- if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) {
- err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
- set_rbtree_add_callback_state);
- if (err) {
- verbose(env, "kfunc %s#%d failed callback verification\n",
+ if (meta.func_id == special_kfunc_list[KF_bpf_throw]) {
+ if (!bpf_jit_supports_exceptions()) {
+ verbose(env, "JIT does not support calling kfunc %s#%d\n",
func_name, meta.func_id);
- return err;
+ return -ENOTSUPP;
+ }
+ env->seen_exception = true;
+
+ /* In the case of the default callback, the cookie value passed
+ * to bpf_throw becomes the return value of the program.
+ */
+ if (!env->exception_callback_subprog) {
+ err = check_return_code(env, BPF_REG_1, "R1");
+ if (err < 0)
+ return err;
}
}
@@ -11413,6 +12118,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
/* Only exception is bpf_obj_new_impl */
if (meta.btf != btf_vmlinux ||
(meta.func_id != special_kfunc_list[KF_bpf_obj_new_impl] &&
+ meta.func_id != special_kfunc_list[KF_bpf_percpu_obj_new_impl] &&
meta.func_id != special_kfunc_list[KF_bpf_refcount_acquire_impl])) {
verbose(env, "acquire kernel function does not return PTR_TO_BTF_ID\n");
return -EINVAL;
@@ -11426,11 +12132,13 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
ptr_type = btf_type_skip_modifiers(desc_btf, t->type, &ptr_type_id);
if (meta.btf == btf_vmlinux && btf_id_set_contains(&special_kfunc_set, meta.func_id)) {
- if (meta.func_id == special_kfunc_list[KF_bpf_obj_new_impl]) {
+ if (meta.func_id == special_kfunc_list[KF_bpf_obj_new_impl] ||
+ meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
+ struct btf_struct_meta *struct_meta;
struct btf *ret_btf;
u32 ret_btf_id;
- if (unlikely(!bpf_global_ma_set))
+ if (meta.func_id == special_kfunc_list[KF_bpf_obj_new_impl] && !bpf_global_ma_set)
return -ENOMEM;
if (((u64)(u32)meta.arg_constant.value) != meta.arg_constant.value) {
@@ -11443,24 +12151,67 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
/* This may be NULL due to user not supplying a BTF */
if (!ret_btf) {
- verbose(env, "bpf_obj_new requires prog BTF\n");
+ verbose(env, "bpf_obj_new/bpf_percpu_obj_new requires prog BTF\n");
return -EINVAL;
}
ret_t = btf_type_by_id(ret_btf, ret_btf_id);
if (!ret_t || !__btf_type_is_struct(ret_t)) {
- verbose(env, "bpf_obj_new type ID argument must be of a struct\n");
+ verbose(env, "bpf_obj_new/bpf_percpu_obj_new type ID argument must be of a struct\n");
return -EINVAL;
}
+ if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
+ if (ret_t->size > BPF_GLOBAL_PERCPU_MA_MAX_SIZE) {
+ verbose(env, "bpf_percpu_obj_new type size (%d) is greater than %d\n",
+ ret_t->size, BPF_GLOBAL_PERCPU_MA_MAX_SIZE);
+ return -EINVAL;
+ }
+
+ if (!bpf_global_percpu_ma_set) {
+ mutex_lock(&bpf_percpu_ma_lock);
+ if (!bpf_global_percpu_ma_set) {
+ /* Charge memory allocated with bpf_global_percpu_ma to
+ * root memcg. The obj_cgroup for root memcg is NULL.
+ */
+ err = bpf_mem_alloc_percpu_init(&bpf_global_percpu_ma, NULL);
+ if (!err)
+ bpf_global_percpu_ma_set = true;
+ }
+ mutex_unlock(&bpf_percpu_ma_lock);
+ if (err)
+ return err;
+ }
+
+ mutex_lock(&bpf_percpu_ma_lock);
+ err = bpf_mem_alloc_percpu_unit_init(&bpf_global_percpu_ma, ret_t->size);
+ mutex_unlock(&bpf_percpu_ma_lock);
+ if (err)
+ return err;
+ }
+
+ struct_meta = btf_find_struct_meta(ret_btf, ret_btf_id);
+ if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
+ if (!__btf_type_is_scalar_struct(env, ret_btf, ret_t, 0)) {
+ verbose(env, "bpf_percpu_obj_new type ID argument must be of a struct of scalars\n");
+ return -EINVAL;
+ }
+
+ if (struct_meta) {
+ verbose(env, "bpf_percpu_obj_new type ID argument must not contain special fields\n");
+ return -EINVAL;
+ }
+ }
+
mark_reg_known_zero(env, regs, BPF_REG_0);
regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC;
regs[BPF_REG_0].btf = ret_btf;
regs[BPF_REG_0].btf_id = ret_btf_id;
+ if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl])
+ regs[BPF_REG_0].type |= MEM_PERCPU;
insn_aux->obj_new_size = ret_t->size;
- insn_aux->kptr_struct_meta =
- btf_find_struct_meta(ret_btf, ret_btf_id);
+ insn_aux->kptr_struct_meta = struct_meta;
} else if (meta.func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]) {
mark_reg_known_zero(env, regs, BPF_REG_0);
regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC;
@@ -11597,7 +12348,8 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
regs[BPF_REG_0].id = ++env->id_gen;
} else if (btf_type_is_void(t)) {
if (meta.btf == btf_vmlinux && btf_id_set_contains(&special_kfunc_set, meta.func_id)) {
- if (meta.func_id == special_kfunc_list[KF_bpf_obj_drop_impl]) {
+ if (meta.func_id == special_kfunc_list[KF_bpf_obj_drop_impl] ||
+ meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_drop_impl]) {
insn_aux->kptr_struct_meta =
btf_find_struct_meta(meta.arg_btf,
meta.arg_btf_id);
@@ -12074,6 +12826,10 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
}
switch (base_type(ptr_reg->type)) {
+ case PTR_TO_FLOW_KEYS:
+ if (known)
+ break;
+ fallthrough;
case CONST_PTR_TO_MAP:
/* smin_val represents the known value */
if (known && smin_val == 0 && opcode == BPF_ADD)
@@ -13294,13 +14050,12 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
/* check dest operand */
err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK);
+ err = err ?: adjust_reg_min_max_vals(env, insn);
if (err)
return err;
-
- return adjust_reg_min_max_vals(env, insn);
}
- return 0;
+ return reg_bounds_sanity_check(env, &regs[insn->dst_reg], "alu");
}
static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,
@@ -13382,153 +14137,130 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,
}));
}
-static int is_branch32_taken(struct bpf_reg_state *reg, u32 val, u8 opcode)
-{
- struct tnum subreg = tnum_subreg(reg->var_off);
- s32 sval = (s32)val;
+/*
+ * <reg1> <op> <reg2>, currently assuming reg2 is a constant
+ */
+static int is_scalar_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_state *reg2,
+ u8 opcode, bool is_jmp32)
+{
+ struct tnum t1 = is_jmp32 ? tnum_subreg(reg1->var_off) : reg1->var_off;
+ struct tnum t2 = is_jmp32 ? tnum_subreg(reg2->var_off) : reg2->var_off;
+ u64 umin1 = is_jmp32 ? (u64)reg1->u32_min_value : reg1->umin_value;
+ u64 umax1 = is_jmp32 ? (u64)reg1->u32_max_value : reg1->umax_value;
+ s64 smin1 = is_jmp32 ? (s64)reg1->s32_min_value : reg1->smin_value;
+ s64 smax1 = is_jmp32 ? (s64)reg1->s32_max_value : reg1->smax_value;
+ u64 umin2 = is_jmp32 ? (u64)reg2->u32_min_value : reg2->umin_value;
+ u64 umax2 = is_jmp32 ? (u64)reg2->u32_max_value : reg2->umax_value;
+ s64 smin2 = is_jmp32 ? (s64)reg2->s32_min_value : reg2->smin_value;
+ s64 smax2 = is_jmp32 ? (s64)reg2->s32_max_value : reg2->smax_value;
switch (opcode) {
case BPF_JEQ:
- if (tnum_is_const(subreg))
- return !!tnum_equals_const(subreg, val);
- else if (val < reg->u32_min_value || val > reg->u32_max_value)
- return 0;
- break;
- case BPF_JNE:
- if (tnum_is_const(subreg))
- return !tnum_equals_const(subreg, val);
- else if (val < reg->u32_min_value || val > reg->u32_max_value)
- return 1;
- break;
- case BPF_JSET:
- if ((~subreg.mask & subreg.value) & val)
- return 1;
- if (!((subreg.mask | subreg.value) & val))
- return 0;
- break;
- case BPF_JGT:
- if (reg->u32_min_value > val)
- return 1;
- else if (reg->u32_max_value <= val)
- return 0;
- break;
- case BPF_JSGT:
- if (reg->s32_min_value > sval)
- return 1;
- else if (reg->s32_max_value <= sval)
- return 0;
- break;
- case BPF_JLT:
- if (reg->u32_max_value < val)
- return 1;
- else if (reg->u32_min_value >= val)
- return 0;
- break;
- case BPF_JSLT:
- if (reg->s32_max_value < sval)
- return 1;
- else if (reg->s32_min_value >= sval)
- return 0;
- break;
- case BPF_JGE:
- if (reg->u32_min_value >= val)
- return 1;
- else if (reg->u32_max_value < val)
- return 0;
- break;
- case BPF_JSGE:
- if (reg->s32_min_value >= sval)
- return 1;
- else if (reg->s32_max_value < sval)
+ /* constants, umin/umax and smin/smax checks would be
+ * redundant in this case because they all should match
+ */
+ if (tnum_is_const(t1) && tnum_is_const(t2))
+ return t1.value == t2.value;
+ /* non-overlapping ranges */
+ if (umin1 > umax2 || umax1 < umin2)
return 0;
- break;
- case BPF_JLE:
- if (reg->u32_max_value <= val)
- return 1;
- else if (reg->u32_min_value > val)
- return 0;
- break;
- case BPF_JSLE:
- if (reg->s32_max_value <= sval)
- return 1;
- else if (reg->s32_min_value > sval)
- return 0;
- break;
- }
-
- return -1;
-}
-
-
-static int is_branch64_taken(struct bpf_reg_state *reg, u64 val, u8 opcode)
-{
- s64 sval = (s64)val;
-
- switch (opcode) {
- case BPF_JEQ:
- if (tnum_is_const(reg->var_off))
- return !!tnum_equals_const(reg->var_off, val);
- else if (val < reg->umin_value || val > reg->umax_value)
+ if (smin1 > smax2 || smax1 < smin2)
return 0;
+ if (!is_jmp32) {
+ /* if 64-bit ranges are inconclusive, see if we can
+ * utilize 32-bit subrange knowledge to eliminate
+ * branches that can't be taken a priori
+ */
+ if (reg1->u32_min_value > reg2->u32_max_value ||
+ reg1->u32_max_value < reg2->u32_min_value)
+ return 0;
+ if (reg1->s32_min_value > reg2->s32_max_value ||
+ reg1->s32_max_value < reg2->s32_min_value)
+ return 0;
+ }
break;
case BPF_JNE:
- if (tnum_is_const(reg->var_off))
- return !tnum_equals_const(reg->var_off, val);
- else if (val < reg->umin_value || val > reg->umax_value)
+ /* constants, umin/umax and smin/smax checks would be
+ * redundant in this case because they all should match
+ */
+ if (tnum_is_const(t1) && tnum_is_const(t2))
+ return t1.value != t2.value;
+ /* non-overlapping ranges */
+ if (umin1 > umax2 || umax1 < umin2)
return 1;
+ if (smin1 > smax2 || smax1 < smin2)
+ return 1;
+ if (!is_jmp32) {
+ /* if 64-bit ranges are inconclusive, see if we can
+ * utilize 32-bit subrange knowledge to eliminate
+ * branches that can't be taken a priori
+ */
+ if (reg1->u32_min_value > reg2->u32_max_value ||
+ reg1->u32_max_value < reg2->u32_min_value)
+ return 1;
+ if (reg1->s32_min_value > reg2->s32_max_value ||
+ reg1->s32_max_value < reg2->s32_min_value)
+ return 1;
+ }
break;
case BPF_JSET:
- if ((~reg->var_off.mask & reg->var_off.value) & val)
+ if (!is_reg_const(reg2, is_jmp32)) {
+ swap(reg1, reg2);
+ swap(t1, t2);
+ }
+ if (!is_reg_const(reg2, is_jmp32))
+ return -1;
+ if ((~t1.mask & t1.value) & t2.value)
return 1;
- if (!((reg->var_off.mask | reg->var_off.value) & val))
+ if (!((t1.mask | t1.value) & t2.value))
return 0;
break;
case BPF_JGT:
- if (reg->umin_value > val)
+ if (umin1 > umax2)
return 1;
- else if (reg->umax_value <= val)
+ else if (umax1 <= umin2)
return 0;
break;
case BPF_JSGT:
- if (reg->smin_value > sval)
+ if (smin1 > smax2)
return 1;
- else if (reg->smax_value <= sval)
+ else if (smax1 <= smin2)
return 0;
break;
case BPF_JLT:
- if (reg->umax_value < val)
+ if (umax1 < umin2)
return 1;
- else if (reg->umin_value >= val)
+ else if (umin1 >= umax2)
return 0;
break;
case BPF_JSLT:
- if (reg->smax_value < sval)
+ if (smax1 < smin2)
return 1;
- else if (reg->smin_value >= sval)
+ else if (smin1 >= smax2)
return 0;
break;
case BPF_JGE:
- if (reg->umin_value >= val)
+ if (umin1 >= umax2)
return 1;
- else if (reg->umax_value < val)
+ else if (umax1 < umin2)
return 0;
break;
case BPF_JSGE:
- if (reg->smin_value >= sval)
+ if (smin1 >= smax2)
return 1;
- else if (reg->smax_value < sval)
+ else if (smax1 < smin2)
return 0;
break;
case BPF_JLE:
- if (reg->umax_value <= val)
+ if (umax1 <= umin2)
return 1;
- else if (reg->umin_value > val)
+ else if (umin1 > umax2)
return 0;
break;
case BPF_JSLE:
- if (reg->smax_value <= sval)
+ if (smax1 <= smin2)
return 1;
- else if (reg->smin_value > sval)
+ else if (smin1 > smax2)
return 0;
break;
}
@@ -13536,41 +14268,6 @@ static int is_branch64_taken(struct bpf_reg_state *reg, u64 val, u8 opcode)
return -1;
}
-/* compute branch direction of the expression "if (reg opcode val) goto target;"
- * and return:
- * 1 - branch will be taken and "goto target" will be executed
- * 0 - branch will not be taken and fall-through to next insn
- * -1 - unknown. Example: "if (reg < 5)" is unknown when register value
- * range [0,10]
- */
-static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode,
- bool is_jmp32)
-{
- if (__is_pointer_value(false, reg)) {
- if (!reg_not_null(reg))
- return -1;
-
- /* If pointer is valid tests against zero will fail so we can
- * use this to direct branch taken.
- */
- if (val != 0)
- return -1;
-
- switch (opcode) {
- case BPF_JEQ:
- return 0;
- case BPF_JNE:
- return 1;
- default:
- return -1;
- }
- }
-
- if (is_jmp32)
- return is_branch32_taken(reg, val, opcode);
- return is_branch64_taken(reg, val, opcode);
-}
-
static int flip_opcode(u32 opcode)
{
/* How can we transform "a <op> b" into "b <op> a"? */
@@ -13632,216 +14329,280 @@ static int is_pkt_ptr_branch_taken(struct bpf_reg_state *dst_reg,
return -1;
}
-/* Adjusts the register min/max values in the case that the dst_reg is the
- * variable register that we are working on, and src_reg is a constant or we're
- * simply doing a BPF_K check.
- * In JEQ/JNE cases we also adjust the var_off values.
+/* compute branch direction of the expression "if (<reg1> opcode <reg2>) goto target;"
+ * and return:
+ * 1 - branch will be taken and "goto target" will be executed
+ * 0 - branch will not be taken and fall-through to next insn
+ * -1 - unknown. Example: "if (reg1 < 5)" is unknown when register value
+ * range [0,10]
*/
-static void reg_set_min_max(struct bpf_reg_state *true_reg,
- struct bpf_reg_state *false_reg,
- u64 val, u32 val32,
- u8 opcode, bool is_jmp32)
-{
- struct tnum false_32off = tnum_subreg(false_reg->var_off);
- struct tnum false_64off = false_reg->var_off;
- struct tnum true_32off = tnum_subreg(true_reg->var_off);
- struct tnum true_64off = true_reg->var_off;
- s64 sval = (s64)val;
- s32 sval32 = (s32)val32;
-
- /* If the dst_reg is a pointer, we can't learn anything about its
- * variable offset from the compare (unless src_reg were a pointer into
- * the same object, but we don't bother with that.
- * Since false_reg and true_reg have the same type by construction, we
- * only need to check one of them for pointerness.
- */
- if (__is_pointer_value(false, false_reg))
- return;
+static int is_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_state *reg2,
+ u8 opcode, bool is_jmp32)
+{
+ if (reg_is_pkt_pointer_any(reg1) && reg_is_pkt_pointer_any(reg2) && !is_jmp32)
+ return is_pkt_ptr_branch_taken(reg1, reg2, opcode);
+
+ if (__is_pointer_value(false, reg1) || __is_pointer_value(false, reg2)) {
+ u64 val;
+
+ /* arrange that reg2 is a scalar, and reg1 is a pointer */
+ if (!is_reg_const(reg2, is_jmp32)) {
+ opcode = flip_opcode(opcode);
+ swap(reg1, reg2);
+ }
+ /* and ensure that reg2 is a constant */
+ if (!is_reg_const(reg2, is_jmp32))
+ return -1;
+
+ if (!reg_not_null(reg1))
+ return -1;
+
+ /* If pointer is valid tests against zero will fail so we can
+ * use this to direct branch taken.
+ */
+ val = reg_const_value(reg2, is_jmp32);
+ if (val != 0)
+ return -1;
+
+ switch (opcode) {
+ case BPF_JEQ:
+ return 0;
+ case BPF_JNE:
+ return 1;
+ default:
+ return -1;
+ }
+ }
+ /* now deal with two scalars, but not necessarily constants */
+ return is_scalar_branch_taken(reg1, reg2, opcode, is_jmp32);
+}
+
+/* Opcode that corresponds to a *false* branch condition.
+ * E.g., if r1 < r2, then reverse (false) condition is r1 >= r2
+ */
+static u8 rev_opcode(u8 opcode)
+{
switch (opcode) {
- /* JEQ/JNE comparison doesn't change the register equivalence.
- *
- * r1 = r2;
- * if (r1 == 42) goto label;
- * ...
- * label: // here both r1 and r2 are known to be 42.
- *
- * Hence when marking register as known preserve it's ID.
+ case BPF_JEQ: return BPF_JNE;
+ case BPF_JNE: return BPF_JEQ;
+ /* JSET doesn't have its reverse opcode in BPF, so add
+ * BPF_X flag to denote the reverse of that operation
*/
+ case BPF_JSET: return BPF_JSET | BPF_X;
+ case BPF_JSET | BPF_X: return BPF_JSET;
+ case BPF_JGE: return BPF_JLT;
+ case BPF_JGT: return BPF_JLE;
+ case BPF_JLE: return BPF_JGT;
+ case BPF_JLT: return BPF_JGE;
+ case BPF_JSGE: return BPF_JSLT;
+ case BPF_JSGT: return BPF_JSLE;
+ case BPF_JSLE: return BPF_JSGT;
+ case BPF_JSLT: return BPF_JSGE;
+ default: return 0;
+ }
+}
+
+/* Refine range knowledge for <reg1> <op> <reg2> conditional operation. */
+static void regs_refine_cond_op(struct bpf_reg_state *reg1, struct bpf_reg_state *reg2,
+ u8 opcode, bool is_jmp32)
+{
+ struct tnum t;
+ u64 val;
+
+again:
+ switch (opcode) {
case BPF_JEQ:
if (is_jmp32) {
- __mark_reg32_known(true_reg, val32);
- true_32off = tnum_subreg(true_reg->var_off);
+ reg1->u32_min_value = max(reg1->u32_min_value, reg2->u32_min_value);
+ reg1->u32_max_value = min(reg1->u32_max_value, reg2->u32_max_value);
+ reg1->s32_min_value = max(reg1->s32_min_value, reg2->s32_min_value);
+ reg1->s32_max_value = min(reg1->s32_max_value, reg2->s32_max_value);
+ reg2->u32_min_value = reg1->u32_min_value;
+ reg2->u32_max_value = reg1->u32_max_value;
+ reg2->s32_min_value = reg1->s32_min_value;
+ reg2->s32_max_value = reg1->s32_max_value;
+
+ t = tnum_intersect(tnum_subreg(reg1->var_off), tnum_subreg(reg2->var_off));
+ reg1->var_off = tnum_with_subreg(reg1->var_off, t);
+ reg2->var_off = tnum_with_subreg(reg2->var_off, t);
} else {
- ___mark_reg_known(true_reg, val);
- true_64off = true_reg->var_off;
+ reg1->umin_value = max(reg1->umin_value, reg2->umin_value);
+ reg1->umax_value = min(reg1->umax_value, reg2->umax_value);
+ reg1->smin_value = max(reg1->smin_value, reg2->smin_value);
+ reg1->smax_value = min(reg1->smax_value, reg2->smax_value);
+ reg2->umin_value = reg1->umin_value;
+ reg2->umax_value = reg1->umax_value;
+ reg2->smin_value = reg1->smin_value;
+ reg2->smax_value = reg1->smax_value;
+
+ reg1->var_off = tnum_intersect(reg1->var_off, reg2->var_off);
+ reg2->var_off = reg1->var_off;
}
break;
case BPF_JNE:
+ if (!is_reg_const(reg2, is_jmp32))
+ swap(reg1, reg2);
+ if (!is_reg_const(reg2, is_jmp32))
+ break;
+
+ /* try to recompute the bound of reg1 if reg2 is a const and
+ * is exactly the edge of reg1.
+ */
+ val = reg_const_value(reg2, is_jmp32);
if (is_jmp32) {
- __mark_reg32_known(false_reg, val32);
- false_32off = tnum_subreg(false_reg->var_off);
+ /* u32_min_value is not equal to 0xffffffff at this point,
+ * because otherwise u32_max_value is 0xffffffff as well,
+ * in such a case both reg1 and reg2 would be constants,
+ * jump would be predicted and reg_set_min_max() won't
+ * be called.
+ *
+ * Same reasoning works for all {u,s}{min,max}{32,64} cases
+ * below.
+ */
+ if (reg1->u32_min_value == (u32)val)
+ reg1->u32_min_value++;
+ if (reg1->u32_max_value == (u32)val)
+ reg1->u32_max_value--;
+ if (reg1->s32_min_value == (s32)val)
+ reg1->s32_min_value++;
+ if (reg1->s32_max_value == (s32)val)
+ reg1->s32_max_value--;
} else {
- ___mark_reg_known(false_reg, val);
- false_64off = false_reg->var_off;
+ if (reg1->umin_value == (u64)val)
+ reg1->umin_value++;
+ if (reg1->umax_value == (u64)val)
+ reg1->umax_value--;
+ if (reg1->smin_value == (s64)val)
+ reg1->smin_value++;
+ if (reg1->smax_value == (s64)val)
+ reg1->smax_value--;
}
break;
case BPF_JSET:
+ if (!is_reg_const(reg2, is_jmp32))
+ swap(reg1, reg2);
+ if (!is_reg_const(reg2, is_jmp32))
+ break;
+ val = reg_const_value(reg2, is_jmp32);
+ /* BPF_JSET (i.e., TRUE branch, *not* BPF_JSET | BPF_X)
+ * requires single bit to learn something useful. E.g., if we
+ * know that `r1 & 0x3` is true, then which bits (0, 1, or both)
+ * are actually set? We can learn something definite only if
+ * it's a single-bit value to begin with.
+ *
+ * BPF_JSET | BPF_X (i.e., negation of BPF_JSET) doesn't have
+ * this restriction. I.e., !(r1 & 0x3) means neither bit 0 nor
+ * bit 1 is set, which we can readily use in adjustments.
+ */
+ if (!is_power_of_2(val))
+ break;
if (is_jmp32) {
- false_32off = tnum_and(false_32off, tnum_const(~val32));
- if (is_power_of_2(val32))
- true_32off = tnum_or(true_32off,
- tnum_const(val32));
+ t = tnum_or(tnum_subreg(reg1->var_off), tnum_const(val));
+ reg1->var_off = tnum_with_subreg(reg1->var_off, t);
} else {
- false_64off = tnum_and(false_64off, tnum_const(~val));
- if (is_power_of_2(val))
- true_64off = tnum_or(true_64off,
- tnum_const(val));
+ reg1->var_off = tnum_or(reg1->var_off, tnum_const(val));
}
break;
- case BPF_JGE:
- case BPF_JGT:
- {
+ case BPF_JSET | BPF_X: /* reverse of BPF_JSET, see rev_opcode() */
+ if (!is_reg_const(reg2, is_jmp32))
+ swap(reg1, reg2);
+ if (!is_reg_const(reg2, is_jmp32))
+ break;
+ val = reg_const_value(reg2, is_jmp32);
if (is_jmp32) {
- u32 false_umax = opcode == BPF_JGT ? val32 : val32 - 1;
- u32 true_umin = opcode == BPF_JGT ? val32 + 1 : val32;
-
- false_reg->u32_max_value = min(false_reg->u32_max_value,
- false_umax);
- true_reg->u32_min_value = max(true_reg->u32_min_value,
- true_umin);
+ t = tnum_and(tnum_subreg(reg1->var_off), tnum_const(~val));
+ reg1->var_off = tnum_with_subreg(reg1->var_off, t);
} else {
- u64 false_umax = opcode == BPF_JGT ? val : val - 1;
- u64 true_umin = opcode == BPF_JGT ? val + 1 : val;
-
- false_reg->umax_value = min(false_reg->umax_value, false_umax);
- true_reg->umin_value = max(true_reg->umin_value, true_umin);
+ reg1->var_off = tnum_and(reg1->var_off, tnum_const(~val));
}
break;
- }
- case BPF_JSGE:
- case BPF_JSGT:
- {
+ case BPF_JLE:
if (is_jmp32) {
- s32 false_smax = opcode == BPF_JSGT ? sval32 : sval32 - 1;
- s32 true_smin = opcode == BPF_JSGT ? sval32 + 1 : sval32;
-
- false_reg->s32_max_value = min(false_reg->s32_max_value, false_smax);
- true_reg->s32_min_value = max(true_reg->s32_min_value, true_smin);
+ reg1->u32_max_value = min(reg1->u32_max_value, reg2->u32_max_value);
+ reg2->u32_min_value = max(reg1->u32_min_value, reg2->u32_min_value);
} else {
- s64 false_smax = opcode == BPF_JSGT ? sval : sval - 1;
- s64 true_smin = opcode == BPF_JSGT ? sval + 1 : sval;
-
- false_reg->smax_value = min(false_reg->smax_value, false_smax);
- true_reg->smin_value = max(true_reg->smin_value, true_smin);
+ reg1->umax_value = min(reg1->umax_value, reg2->umax_value);
+ reg2->umin_value = max(reg1->umin_value, reg2->umin_value);
}
break;
- }
- case BPF_JLE:
case BPF_JLT:
- {
if (is_jmp32) {
- u32 false_umin = opcode == BPF_JLT ? val32 : val32 + 1;
- u32 true_umax = opcode == BPF_JLT ? val32 - 1 : val32;
-
- false_reg->u32_min_value = max(false_reg->u32_min_value,
- false_umin);
- true_reg->u32_max_value = min(true_reg->u32_max_value,
- true_umax);
+ reg1->u32_max_value = min(reg1->u32_max_value, reg2->u32_max_value - 1);
+ reg2->u32_min_value = max(reg1->u32_min_value + 1, reg2->u32_min_value);
} else {
- u64 false_umin = opcode == BPF_JLT ? val : val + 1;
- u64 true_umax = opcode == BPF_JLT ? val - 1 : val;
-
- false_reg->umin_value = max(false_reg->umin_value, false_umin);
- true_reg->umax_value = min(true_reg->umax_value, true_umax);
+ reg1->umax_value = min(reg1->umax_value, reg2->umax_value - 1);
+ reg2->umin_value = max(reg1->umin_value + 1, reg2->umin_value);
}
break;
- }
case BPF_JSLE:
+ if (is_jmp32) {
+ reg1->s32_max_value = min(reg1->s32_max_value, reg2->s32_max_value);
+ reg2->s32_min_value = max(reg1->s32_min_value, reg2->s32_min_value);
+ } else {
+ reg1->smax_value = min(reg1->smax_value, reg2->smax_value);
+ reg2->smin_value = max(reg1->smin_value, reg2->smin_value);
+ }
+ break;
case BPF_JSLT:
- {
if (is_jmp32) {
- s32 false_smin = opcode == BPF_JSLT ? sval32 : sval32 + 1;
- s32 true_smax = opcode == BPF_JSLT ? sval32 - 1 : sval32;
-
- false_reg->s32_min_value = max(false_reg->s32_min_value, false_smin);
- true_reg->s32_max_value = min(true_reg->s32_max_value, true_smax);
+ reg1->s32_max_value = min(reg1->s32_max_value, reg2->s32_max_value - 1);
+ reg2->s32_min_value = max(reg1->s32_min_value + 1, reg2->s32_min_value);
} else {
- s64 false_smin = opcode == BPF_JSLT ? sval : sval + 1;
- s64 true_smax = opcode == BPF_JSLT ? sval - 1 : sval;
-
- false_reg->smin_value = max(false_reg->smin_value, false_smin);
- true_reg->smax_value = min(true_reg->smax_value, true_smax);
+ reg1->smax_value = min(reg1->smax_value, reg2->smax_value - 1);
+ reg2->smin_value = max(reg1->smin_value + 1, reg2->smin_value);
}
break;
- }
+ case BPF_JGE:
+ case BPF_JGT:
+ case BPF_JSGE:
+ case BPF_JSGT:
+ /* just reuse LE/LT logic above */
+ opcode = flip_opcode(opcode);
+ swap(reg1, reg2);
+ goto again;
default:
return;
}
-
- if (is_jmp32) {
- false_reg->var_off = tnum_or(tnum_clear_subreg(false_64off),
- tnum_subreg(false_32off));
- true_reg->var_off = tnum_or(tnum_clear_subreg(true_64off),
- tnum_subreg(true_32off));
- __reg_combine_32_into_64(false_reg);
- __reg_combine_32_into_64(true_reg);
- } else {
- false_reg->var_off = false_64off;
- true_reg->var_off = true_64off;
- __reg_combine_64_into_32(false_reg);
- __reg_combine_64_into_32(true_reg);
- }
}
-/* Same as above, but for the case that dst_reg holds a constant and src_reg is
- * the variable reg.
+/* Adjusts the register min/max values in the case that the dst_reg and
+ * src_reg are both SCALAR_VALUE registers (or we are simply doing a BPF_K
+ * check, in which case we have a fake SCALAR_VALUE representing insn->imm).
+ * Technically we can do similar adjustments for pointers to the same object,
+ * but we don't support that right now.
*/
-static void reg_set_min_max_inv(struct bpf_reg_state *true_reg,
- struct bpf_reg_state *false_reg,
- u64 val, u32 val32,
- u8 opcode, bool is_jmp32)
+static int reg_set_min_max(struct bpf_verifier_env *env,
+ struct bpf_reg_state *true_reg1,
+ struct bpf_reg_state *true_reg2,
+ struct bpf_reg_state *false_reg1,
+ struct bpf_reg_state *false_reg2,
+ u8 opcode, bool is_jmp32)
{
- opcode = flip_opcode(opcode);
- /* This uses zero as "not present in table"; luckily the zero opcode,
- * BPF_JA, can't get here.
+ int err;
+
+ /* If either register is a pointer, we can't learn anything about its
+ * variable offset from the compare (unless they were a pointer into
+ * the same object, but we don't bother with that).
*/
- if (opcode)
- reg_set_min_max(true_reg, false_reg, val, val32, opcode, is_jmp32);
-}
-
-/* Regs are known to be equal, so intersect their min/max/var_off */
-static void __reg_combine_min_max(struct bpf_reg_state *src_reg,
- struct bpf_reg_state *dst_reg)
-{
- src_reg->umin_value = dst_reg->umin_value = max(src_reg->umin_value,
- dst_reg->umin_value);
- src_reg->umax_value = dst_reg->umax_value = min(src_reg->umax_value,
- dst_reg->umax_value);
- src_reg->smin_value = dst_reg->smin_value = max(src_reg->smin_value,
- dst_reg->smin_value);
- src_reg->smax_value = dst_reg->smax_value = min(src_reg->smax_value,
- dst_reg->smax_value);
- src_reg->var_off = dst_reg->var_off = tnum_intersect(src_reg->var_off,
- dst_reg->var_off);
- reg_bounds_sync(src_reg);
- reg_bounds_sync(dst_reg);
-}
+ if (false_reg1->type != SCALAR_VALUE || false_reg2->type != SCALAR_VALUE)
+ return 0;
-static void reg_combine_min_max(struct bpf_reg_state *true_src,
- struct bpf_reg_state *true_dst,
- struct bpf_reg_state *false_src,
- struct bpf_reg_state *false_dst,
- u8 opcode)
-{
- switch (opcode) {
- case BPF_JEQ:
- __reg_combine_min_max(true_src, true_dst);
- break;
- case BPF_JNE:
- __reg_combine_min_max(false_src, false_dst);
- break;
- }
+ /* fallthrough (FALSE) branch */
+ regs_refine_cond_op(false_reg1, false_reg2, rev_opcode(opcode), is_jmp32);
+ reg_bounds_sync(false_reg1);
+ reg_bounds_sync(false_reg2);
+
+ /* jump (TRUE) branch */
+ regs_refine_cond_op(true_reg1, true_reg2, opcode, is_jmp32);
+ reg_bounds_sync(true_reg1);
+ reg_bounds_sync(true_reg2);
+
+ err = reg_bounds_sanity_check(env, true_reg1, "true_reg1");
+ err = err ?: reg_bounds_sanity_check(env, true_reg2, "true_reg2");
+ err = err ?: reg_bounds_sanity_check(env, false_reg1, "false_reg1");
+ err = err ?: reg_bounds_sanity_check(env, false_reg2, "false_reg2");
+ return err;
}
static void mark_ptr_or_null_reg(struct bpf_func_state *state,
@@ -14039,6 +14800,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
struct bpf_reg_state *regs = this_branch->frame[this_branch->curframe]->regs;
struct bpf_reg_state *dst_reg, *other_branch_regs, *src_reg = NULL;
struct bpf_reg_state *eq_branch_regs;
+ struct bpf_reg_state fake_reg = {};
u8 opcode = BPF_OP(insn->code);
bool is_jmp32;
int pred = -1;
@@ -14079,42 +14841,13 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
verbose(env, "BPF_JMP/JMP32 uses reserved fields\n");
return -EINVAL;
}
+ src_reg = &fake_reg;
+ src_reg->type = SCALAR_VALUE;
+ __mark_reg_known(src_reg, insn->imm);
}
is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32;
-
- if (BPF_SRC(insn->code) == BPF_K) {
- pred = is_branch_taken(dst_reg, insn->imm, opcode, is_jmp32);
- } else if (src_reg->type == SCALAR_VALUE &&
- is_jmp32 && tnum_is_const(tnum_subreg(src_reg->var_off))) {
- pred = is_branch_taken(dst_reg,
- tnum_subreg(src_reg->var_off).value,
- opcode,
- is_jmp32);
- } else if (src_reg->type == SCALAR_VALUE &&
- !is_jmp32 && tnum_is_const(src_reg->var_off)) {
- pred = is_branch_taken(dst_reg,
- src_reg->var_off.value,
- opcode,
- is_jmp32);
- } else if (dst_reg->type == SCALAR_VALUE &&
- is_jmp32 && tnum_is_const(tnum_subreg(dst_reg->var_off))) {
- pred = is_branch_taken(src_reg,
- tnum_subreg(dst_reg->var_off).value,
- flip_opcode(opcode),
- is_jmp32);
- } else if (dst_reg->type == SCALAR_VALUE &&
- !is_jmp32 && tnum_is_const(dst_reg->var_off)) {
- pred = is_branch_taken(src_reg,
- dst_reg->var_off.value,
- flip_opcode(opcode),
- is_jmp32);
- } else if (reg_is_pkt_pointer_any(dst_reg) &&
- reg_is_pkt_pointer_any(src_reg) &&
- !is_jmp32) {
- pred = is_pkt_ptr_branch_taken(dst_reg, src_reg, opcode);
- }
-
+ pred = is_branch_taken(dst_reg, src_reg, opcode, is_jmp32);
if (pred >= 0) {
/* If we get here with a dst_reg pointer type it is because
* above is_branch_taken() special cased the 0 comparison.
@@ -14137,6 +14870,8 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
!sanitize_speculative_path(env, insn, *insn_idx + 1,
*insn_idx))
return -EFAULT;
+ if (env->log.level & BPF_LOG_LEVEL)
+ print_insn_state(env, this_branch->frame[this_branch->curframe]);
*insn_idx += insn->off;
return 0;
} else if (pred == 0) {
@@ -14149,6 +14884,8 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
*insn_idx + insn->off + 1,
*insn_idx))
return -EFAULT;
+ if (env->log.level & BPF_LOG_LEVEL)
+ print_insn_state(env, this_branch->frame[this_branch->curframe]);
return 0;
}
@@ -14158,53 +14895,27 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
return -EFAULT;
other_branch_regs = other_branch->frame[other_branch->curframe]->regs;
- /* detect if we are comparing against a constant value so we can adjust
- * our min/max values for our dst register.
- * this is only legit if both are scalars (or pointers to the same
- * object, I suppose, see the PTR_MAYBE_NULL related if block below),
- * because otherwise the different base pointers mean the offsets aren't
- * comparable.
- */
if (BPF_SRC(insn->code) == BPF_X) {
- struct bpf_reg_state *src_reg = &regs[insn->src_reg];
-
- if (dst_reg->type == SCALAR_VALUE &&
- src_reg->type == SCALAR_VALUE) {
- if (tnum_is_const(src_reg->var_off) ||
- (is_jmp32 &&
- tnum_is_const(tnum_subreg(src_reg->var_off))))
- reg_set_min_max(&other_branch_regs[insn->dst_reg],
- dst_reg,
- src_reg->var_off.value,
- tnum_subreg(src_reg->var_off).value,
- opcode, is_jmp32);
- else if (tnum_is_const(dst_reg->var_off) ||
- (is_jmp32 &&
- tnum_is_const(tnum_subreg(dst_reg->var_off))))
- reg_set_min_max_inv(&other_branch_regs[insn->src_reg],
- src_reg,
- dst_reg->var_off.value,
- tnum_subreg(dst_reg->var_off).value,
- opcode, is_jmp32);
- else if (!is_jmp32 &&
- (opcode == BPF_JEQ || opcode == BPF_JNE))
- /* Comparing for equality, we can combine knowledge */
- reg_combine_min_max(&other_branch_regs[insn->src_reg],
- &other_branch_regs[insn->dst_reg],
- src_reg, dst_reg, opcode);
- if (src_reg->id &&
- !WARN_ON_ONCE(src_reg->id != other_branch_regs[insn->src_reg].id)) {
- find_equal_scalars(this_branch, src_reg);
- find_equal_scalars(other_branch, &other_branch_regs[insn->src_reg]);
- }
-
- }
- } else if (dst_reg->type == SCALAR_VALUE) {
- reg_set_min_max(&other_branch_regs[insn->dst_reg],
- dst_reg, insn->imm, (u32)insn->imm,
- opcode, is_jmp32);
+ err = reg_set_min_max(env,
+ &other_branch_regs[insn->dst_reg],
+ &other_branch_regs[insn->src_reg],
+ dst_reg, src_reg, opcode, is_jmp32);
+ } else /* BPF_SRC(insn->code) == BPF_K */ {
+ err = reg_set_min_max(env,
+ &other_branch_regs[insn->dst_reg],
+ src_reg /* fake one */,
+ dst_reg, src_reg /* same fake one */,
+ opcode, is_jmp32);
}
+ if (err)
+ return err;
+ if (BPF_SRC(insn->code) == BPF_X &&
+ src_reg->type == SCALAR_VALUE && src_reg->id &&
+ !WARN_ON_ONCE(src_reg->id != other_branch_regs[insn->src_reg].id)) {
+ find_equal_scalars(this_branch, src_reg);
+ find_equal_scalars(other_branch, &other_branch_regs[insn->src_reg]);
+ }
if (dst_reg->type == SCALAR_VALUE && dst_reg->id &&
!WARN_ON_ONCE(dst_reg->id != other_branch_regs[insn->dst_reg].id)) {
find_equal_scalars(this_branch, dst_reg);
@@ -14427,7 +15138,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
* gen_ld_abs() may terminate the program at runtime, leading to
* reference leak.
*/
- err = check_reference_leak(env);
+ err = check_reference_leak(env, false);
if (err) {
verbose(env, "BPF_LD_[ABS|IND] cannot be mixed with socket references\n");
return err;
@@ -14476,19 +15187,20 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
return 0;
}
-static int check_return_code(struct bpf_verifier_env *env)
+static int check_return_code(struct bpf_verifier_env *env, int regno, const char *reg_name)
{
+ const char *exit_ctx = "At program exit";
struct tnum enforce_attach_type_range = tnum_unknown;
const struct bpf_prog *prog = env->prog;
struct bpf_reg_state *reg;
- struct tnum range = tnum_range(0, 1);
+ struct bpf_retval_range range = retval_range(0, 1);
enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
int err;
struct bpf_func_state *frame = env->cur_state->frame[0];
const bool is_subprog = frame->subprogno;
/* LSM and struct_ops func-ptr's return type could be "void" */
- if (!is_subprog) {
+ if (!is_subprog || frame->in_exception_callback_fn) {
switch (prog_type) {
case BPF_PROG_TYPE_LSM:
if (prog->expected_attach_type == BPF_LSM_CGROUP)
@@ -14510,36 +15222,28 @@ static int check_return_code(struct bpf_verifier_env *env)
* of bpf_exit, which means that program wrote
* something into it earlier
*/
- err = check_reg_arg(env, BPF_REG_0, SRC_OP);
+ err = check_reg_arg(env, regno, SRC_OP);
if (err)
return err;
- if (is_pointer_value(env, BPF_REG_0)) {
- verbose(env, "R0 leaks addr as return value\n");
+ if (is_pointer_value(env, regno)) {
+ verbose(env, "R%d leaks addr as return value\n", regno);
return -EACCES;
}
- reg = cur_regs(env) + BPF_REG_0;
+ reg = cur_regs(env) + regno;
if (frame->in_async_callback_fn) {
/* enforce return zero from async callbacks like timer */
- if (reg->type != SCALAR_VALUE) {
- verbose(env, "In async callback the register R0 is not a known value (%s)\n",
- reg_type_str(env, reg->type));
- return -EINVAL;
- }
-
- if (!tnum_in(tnum_const(0), reg->var_off)) {
- verbose_invalid_scalar(env, reg, &range, "async callback", "R0");
- return -EINVAL;
- }
- return 0;
+ exit_ctx = "At async callback return";
+ range = retval_range(0, 0);
+ goto enforce_retval;
}
- if (is_subprog) {
+ if (is_subprog && !frame->in_exception_callback_fn) {
if (reg->type != SCALAR_VALUE) {
- verbose(env, "At subprogram exit the register R0 is not a scalar value (%s)\n",
- reg_type_str(env, reg->type));
+ verbose(env, "At subprogram exit the register R%d is not a scalar value (%s)\n",
+ regno, reg_type_str(env, reg->type));
return -EINVAL;
}
return 0;
@@ -14549,18 +15253,21 @@ static int check_return_code(struct bpf_verifier_env *env)
case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
if (env->prog->expected_attach_type == BPF_CGROUP_UDP4_RECVMSG ||
env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG ||
+ env->prog->expected_attach_type == BPF_CGROUP_UNIX_RECVMSG ||
env->prog->expected_attach_type == BPF_CGROUP_INET4_GETPEERNAME ||
env->prog->expected_attach_type == BPF_CGROUP_INET6_GETPEERNAME ||
+ env->prog->expected_attach_type == BPF_CGROUP_UNIX_GETPEERNAME ||
env->prog->expected_attach_type == BPF_CGROUP_INET4_GETSOCKNAME ||
- env->prog->expected_attach_type == BPF_CGROUP_INET6_GETSOCKNAME)
- range = tnum_range(1, 1);
+ env->prog->expected_attach_type == BPF_CGROUP_INET6_GETSOCKNAME ||
+ env->prog->expected_attach_type == BPF_CGROUP_UNIX_GETSOCKNAME)
+ range = retval_range(1, 1);
if (env->prog->expected_attach_type == BPF_CGROUP_INET4_BIND ||
env->prog->expected_attach_type == BPF_CGROUP_INET6_BIND)
- range = tnum_range(0, 3);
+ range = retval_range(0, 3);
break;
case BPF_PROG_TYPE_CGROUP_SKB:
if (env->prog->expected_attach_type == BPF_CGROUP_INET_EGRESS) {
- range = tnum_range(0, 3);
+ range = retval_range(0, 3);
enforce_attach_type_range = tnum_range(2, 3);
}
break;
@@ -14573,13 +15280,13 @@ static int check_return_code(struct bpf_verifier_env *env)
case BPF_PROG_TYPE_RAW_TRACEPOINT:
if (!env->prog->aux->attach_btf_id)
return 0;
- range = tnum_const(0);
+ range = retval_range(0, 0);
break;
case BPF_PROG_TYPE_TRACING:
switch (env->prog->expected_attach_type) {
case BPF_TRACE_FENTRY:
case BPF_TRACE_FEXIT:
- range = tnum_const(0);
+ range = retval_range(0, 0);
break;
case BPF_TRACE_RAW_TP:
case BPF_MODIFY_RETURN:
@@ -14591,7 +15298,7 @@ static int check_return_code(struct bpf_verifier_env *env)
}
break;
case BPF_PROG_TYPE_SK_LOOKUP:
- range = tnum_range(SK_DROP, SK_PASS);
+ range = retval_range(SK_DROP, SK_PASS);
break;
case BPF_PROG_TYPE_LSM:
@@ -14605,12 +15312,12 @@ static int check_return_code(struct bpf_verifier_env *env)
/* Make sure programs that attach to void
* hooks don't try to modify return value.
*/
- range = tnum_range(1, 1);
+ range = retval_range(1, 1);
}
break;
case BPF_PROG_TYPE_NETFILTER:
- range = tnum_range(NF_DROP, NF_ACCEPT);
+ range = retval_range(NF_DROP, NF_ACCEPT);
break;
case BPF_PROG_TYPE_EXT:
/* freplace program can return anything as its return value
@@ -14620,15 +15327,21 @@ static int check_return_code(struct bpf_verifier_env *env)
return 0;
}
+enforce_retval:
if (reg->type != SCALAR_VALUE) {
- verbose(env, "At program exit the register R0 is not a known value (%s)\n",
- reg_type_str(env, reg->type));
+ verbose(env, "%s the register R%d is not a known value (%s)\n",
+ exit_ctx, regno, reg_type_str(env, reg->type));
return -EINVAL;
}
- if (!tnum_in(range, reg->var_off)) {
- verbose_invalid_scalar(env, reg, &range, "program exit", "R0");
- if (prog->expected_attach_type == BPF_LSM_CGROUP &&
+ err = mark_chain_precision(env, regno);
+ if (err)
+ return err;
+
+ if (!retval_range_within(range, reg)) {
+ verbose_invalid_scalar(env, reg, range, exit_ctx, reg_name);
+ if (!is_subprog &&
+ prog->expected_attach_type == BPF_LSM_CGROUP &&
prog_type == BPF_PROG_TYPE_LSM &&
!prog->aux->attach_func_proto->type)
verbose(env, "Note, BPF_LSM_CGROUP that attach to void LSM hooks can't modify return value!\n");
@@ -14681,21 +15394,6 @@ enum {
BRANCH = 2,
};
-static u32 state_htab_size(struct bpf_verifier_env *env)
-{
- return env->prog->len;
-}
-
-static struct bpf_verifier_state_list **explored_state(
- struct bpf_verifier_env *env,
- int idx)
-{
- struct bpf_verifier_state *cur = env->cur_state;
- struct bpf_func_state *state = cur->frame[cur->curframe];
-
- return &env->explored_states[(idx ^ state->callsite) % state_htab_size(env)];
-}
-
static void mark_prune_point(struct bpf_verifier_env *env, int idx)
{
env->insn_aux_data[idx].prune_point = true;
@@ -14716,6 +15414,15 @@ static bool is_force_checkpoint(struct bpf_verifier_env *env, int insn_idx)
return env->insn_aux_data[insn_idx].force_checkpoint;
}
+static void mark_calls_callback(struct bpf_verifier_env *env, int idx)
+{
+ env->insn_aux_data[idx].calls_callback = true;
+}
+
+static bool calls_callback(struct bpf_verifier_env *env, int insn_idx)
+{
+ return env->insn_aux_data[insn_idx].calls_callback;
+}
enum {
DONE_EXPLORING = 0,
@@ -14727,8 +15434,7 @@ enum {
* w - next instruction
* e - edge
*/
-static int push_insn(int t, int w, int e, struct bpf_verifier_env *env,
- bool loop_ok)
+static int push_insn(int t, int w, int e, struct bpf_verifier_env *env)
{
int *insn_stack = env->cfg.insn_stack;
int *insn_state = env->cfg.insn_state;
@@ -14760,7 +15466,7 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env,
insn_stack[env->cfg.cur_stack++] = w;
return KEEP_EXPLORING;
} else if ((insn_state[w] & 0xF0) == DISCOVERED) {
- if (loop_ok && env->bpf_capable)
+ if (env->bpf_capable)
return DONE_EXPLORING;
verbose_linfo(env, t, "%d: ", t);
verbose_linfo(env, w, "%d: ", w);
@@ -14780,24 +15486,20 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns,
struct bpf_verifier_env *env,
bool visit_callee)
{
- int ret;
+ int ret, insn_sz;
- ret = push_insn(t, t + 1, FALLTHROUGH, env, false);
+ insn_sz = bpf_is_ldimm64(&insns[t]) ? 2 : 1;
+ ret = push_insn(t, t + insn_sz, FALLTHROUGH, env);
if (ret)
return ret;
- mark_prune_point(env, t + 1);
+ mark_prune_point(env, t + insn_sz);
/* when we exit from subprog, we need to record non-linear history */
- mark_jmp_point(env, t + 1);
+ mark_jmp_point(env, t + insn_sz);
if (visit_callee) {
mark_prune_point(env, t);
- ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env,
- /* It's ok to allow recursion from CFG point of
- * view. __check_func_call() will do the actual
- * check.
- */
- bpf_pseudo_func(insns + t));
+ ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env);
}
return ret;
}
@@ -14810,15 +15512,17 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns,
static int visit_insn(int t, struct bpf_verifier_env *env)
{
struct bpf_insn *insns = env->prog->insnsi, *insn = &insns[t];
- int ret, off;
+ int ret, off, insn_sz;
if (bpf_pseudo_func(insn))
return visit_func_call_insn(t, insns, env, true);
/* All non-branch instructions have a single fall-through edge. */
if (BPF_CLASS(insn->code) != BPF_JMP &&
- BPF_CLASS(insn->code) != BPF_JMP32)
- return push_insn(t, t + 1, FALLTHROUGH, env, false);
+ BPF_CLASS(insn->code) != BPF_JMP32) {
+ insn_sz = bpf_is_ldimm64(insn) ? 2 : 1;
+ return push_insn(t, t + insn_sz, FALLTHROUGH, env);
+ }
switch (BPF_OP(insn->code)) {
case BPF_EXIT:
@@ -14832,6 +15536,21 @@ static int visit_insn(int t, struct bpf_verifier_env *env)
* async state will be pushed for further exploration.
*/
mark_prune_point(env, t);
+ /* For functions that invoke callbacks it is not known how many times
+ * callback would be called. Verifier models callback calling functions
+ * by repeatedly visiting callback bodies and returning to origin call
+ * instruction.
+ * In order to stop such iteration verifier needs to identify when a
+ * state identical to some state from a previous iteration is reached.
+ * Check below forces creation of checkpoint before callback calling
+ * instruction to allow search for such identical states.
+ */
+ if (is_sync_callback_calling_insn(insn)) {
+ mark_calls_callback(env, t);
+ mark_force_checkpoint(env, t);
+ mark_prune_point(env, t);
+ mark_jmp_point(env, t);
+ }
if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
struct bpf_kfunc_call_arg_meta meta;
@@ -14864,8 +15583,7 @@ static int visit_insn(int t, struct bpf_verifier_env *env)
off = insn->imm;
/* unconditional jump with single edge */
- ret = push_insn(t, t + off + 1, FALLTHROUGH, env,
- true);
+ ret = push_insn(t, t + off + 1, FALLTHROUGH, env);
if (ret)
return ret;
@@ -14878,11 +15596,11 @@ static int visit_insn(int t, struct bpf_verifier_env *env)
/* conditional jump with two edges */
mark_prune_point(env, t);
- ret = push_insn(t, t + 1, FALLTHROUGH, env, true);
+ ret = push_insn(t, t + 1, FALLTHROUGH, env);
if (ret)
return ret;
- return push_insn(t, t + insn->off + 1, BRANCH, env, true);
+ return push_insn(t, t + insn->off + 1, BRANCH, env);
}
}
@@ -14893,8 +15611,8 @@ static int check_cfg(struct bpf_verifier_env *env)
{
int insn_cnt = env->prog->len;
int *insn_stack, *insn_state;
- int ret = 0;
- int i;
+ int ex_insn_beg, i, ret = 0;
+ bool ex_done = false;
insn_state = env->cfg.insn_state = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
if (!insn_state)
@@ -14910,6 +15628,7 @@ static int check_cfg(struct bpf_verifier_env *env)
insn_stack[0] = 0; /* 0 is the first instruction */
env->cfg.cur_stack = 1;
+walk_cfg:
while (env->cfg.cur_stack > 0) {
int t = insn_stack[env->cfg.cur_stack - 1];
@@ -14936,12 +15655,32 @@ static int check_cfg(struct bpf_verifier_env *env)
goto err_free;
}
+ if (env->exception_callback_subprog && !ex_done) {
+ ex_insn_beg = env->subprog_info[env->exception_callback_subprog].start;
+
+ insn_state[ex_insn_beg] = DISCOVERED;
+ insn_stack[0] = ex_insn_beg;
+ env->cfg.cur_stack = 1;
+ ex_done = true;
+ goto walk_cfg;
+ }
+
for (i = 0; i < insn_cnt; i++) {
+ struct bpf_insn *insn = &env->prog->insnsi[i];
+
if (insn_state[i] != EXPLORED) {
verbose(env, "unreachable insn %d\n", i);
ret = -EINVAL;
goto err_free;
}
+ if (bpf_is_ldimm64(insn)) {
+ if (insn_state[i + 1] != 0) {
+ verbose(env, "jump into the middle of ldimm64 insn %d\n", i);
+ ret = -EINVAL;
+ goto err_free;
+ }
+ i++; /* skip second half of ldimm64 */
+ }
}
ret = 0; /* cfg looks good */
@@ -14973,20 +15712,18 @@ static int check_abnormal_return(struct bpf_verifier_env *env)
#define MIN_BPF_FUNCINFO_SIZE 8
#define MAX_FUNCINFO_REC_SIZE 252
-static int check_btf_func(struct bpf_verifier_env *env,
- const union bpf_attr *attr,
- bpfptr_t uattr)
+static int check_btf_func_early(struct bpf_verifier_env *env,
+ const union bpf_attr *attr,
+ bpfptr_t uattr)
{
- const struct btf_type *type, *func_proto, *ret_type;
- u32 i, nfuncs, urec_size, min_size;
u32 krec_size = sizeof(struct bpf_func_info);
+ const struct btf_type *type, *func_proto;
+ u32 i, nfuncs, urec_size, min_size;
struct bpf_func_info *krecord;
- struct bpf_func_info_aux *info_aux = NULL;
struct bpf_prog *prog;
const struct btf *btf;
- bpfptr_t urecord;
u32 prev_offset = 0;
- bool scalar_return;
+ bpfptr_t urecord;
int ret = -ENOMEM;
nfuncs = attr->func_info_cnt;
@@ -14996,11 +15733,6 @@ static int check_btf_func(struct bpf_verifier_env *env,
return 0;
}
- if (nfuncs != env->subprog_cnt) {
- verbose(env, "number of funcs in func_info doesn't match number of subprogs\n");
- return -EINVAL;
- }
-
urec_size = attr->func_info_rec_size;
if (urec_size < MIN_BPF_FUNCINFO_SIZE ||
urec_size > MAX_FUNCINFO_REC_SIZE ||
@@ -15018,9 +15750,6 @@ static int check_btf_func(struct bpf_verifier_env *env,
krecord = kvcalloc(nfuncs, krec_size, GFP_KERNEL | __GFP_NOWARN);
if (!krecord)
return -ENOMEM;
- info_aux = kcalloc(nfuncs, sizeof(*info_aux), GFP_KERNEL | __GFP_NOWARN);
- if (!info_aux)
- goto err_free;
for (i = 0; i < nfuncs; i++) {
ret = bpf_check_uarg_tail_zero(urecord, krec_size, urec_size);
@@ -15059,11 +15788,6 @@ static int check_btf_func(struct bpf_verifier_env *env,
goto err_free;
}
- if (env->subprog_info[i].start != krecord[i].insn_off) {
- verbose(env, "func_info BTF section doesn't match subprog layout in BPF program\n");
- goto err_free;
- }
-
/* check type_id */
type = btf_type_by_id(btf, krecord[i].type_id);
if (!type || !btf_type_is_func(type)) {
@@ -15071,12 +15795,77 @@ static int check_btf_func(struct bpf_verifier_env *env,
krecord[i].type_id);
goto err_free;
}
- info_aux[i].linkage = BTF_INFO_VLEN(type->info);
func_proto = btf_type_by_id(btf, type->type);
if (unlikely(!func_proto || !btf_type_is_func_proto(func_proto)))
/* btf_func_check() already verified it during BTF load */
goto err_free;
+
+ prev_offset = krecord[i].insn_off;
+ bpfptr_add(&urecord, urec_size);
+ }
+
+ prog->aux->func_info = krecord;
+ prog->aux->func_info_cnt = nfuncs;
+ return 0;
+
+err_free:
+ kvfree(krecord);
+ return ret;
+}
+
+static int check_btf_func(struct bpf_verifier_env *env,
+ const union bpf_attr *attr,
+ bpfptr_t uattr)
+{
+ const struct btf_type *type, *func_proto, *ret_type;
+ u32 i, nfuncs, urec_size;
+ struct bpf_func_info *krecord;
+ struct bpf_func_info_aux *info_aux = NULL;
+ struct bpf_prog *prog;
+ const struct btf *btf;
+ bpfptr_t urecord;
+ bool scalar_return;
+ int ret = -ENOMEM;
+
+ nfuncs = attr->func_info_cnt;
+ if (!nfuncs) {
+ if (check_abnormal_return(env))
+ return -EINVAL;
+ return 0;
+ }
+ if (nfuncs != env->subprog_cnt) {
+ verbose(env, "number of funcs in func_info doesn't match number of subprogs\n");
+ return -EINVAL;
+ }
+
+ urec_size = attr->func_info_rec_size;
+
+ prog = env->prog;
+ btf = prog->aux->btf;
+
+ urecord = make_bpfptr(attr->func_info, uattr.is_kernel);
+
+ krecord = prog->aux->func_info;
+ info_aux = kcalloc(nfuncs, sizeof(*info_aux), GFP_KERNEL | __GFP_NOWARN);
+ if (!info_aux)
+ return -ENOMEM;
+
+ for (i = 0; i < nfuncs; i++) {
+ /* check insn_off */
+ ret = -EINVAL;
+
+ if (env->subprog_info[i].start != krecord[i].insn_off) {
+ verbose(env, "func_info BTF section doesn't match subprog layout in BPF program\n");
+ goto err_free;
+ }
+
+ /* Already checked type_id */
+ type = btf_type_by_id(btf, krecord[i].type_id);
+ info_aux[i].linkage = BTF_INFO_VLEN(type->info);
+ /* Already checked func_proto */
+ func_proto = btf_type_by_id(btf, type->type);
+
ret_type = btf_type_skip_modifiers(btf, func_proto->type, NULL);
scalar_return =
btf_type_is_small_int(ret_type) || btf_is_any_enum(ret_type);
@@ -15089,17 +15878,13 @@ static int check_btf_func(struct bpf_verifier_env *env,
goto err_free;
}
- prev_offset = krecord[i].insn_off;
bpfptr_add(&urecord, urec_size);
}
- prog->aux->func_info = krecord;
- prog->aux->func_info_cnt = nfuncs;
prog->aux->func_info_aux = info_aux;
return 0;
err_free:
- kvfree(krecord);
kfree(info_aux);
return ret;
}
@@ -15112,7 +15897,8 @@ static void adjust_btf_func(struct bpf_verifier_env *env)
if (!aux->func_info)
return;
- for (i = 0; i < env->subprog_cnt; i++)
+ /* func_info is not available for hidden subprogs */
+ for (i = 0; i < env->subprog_cnt - env->hidden_subprog_cnt; i++)
aux->func_info[i].insn_off = env->subprog_info[i].start;
}
@@ -15316,9 +16102,9 @@ static int check_core_relo(struct bpf_verifier_env *env,
return err;
}
-static int check_btf_info(struct bpf_verifier_env *env,
- const union bpf_attr *attr,
- bpfptr_t uattr)
+static int check_btf_info_early(struct bpf_verifier_env *env,
+ const union bpf_attr *attr,
+ bpfptr_t uattr)
{
struct btf *btf;
int err;
@@ -15338,6 +16124,24 @@ static int check_btf_info(struct bpf_verifier_env *env,
}
env->prog->aux->btf = btf;
+ err = check_btf_func_early(env, attr, uattr);
+ if (err)
+ return err;
+ return 0;
+}
+
+static int check_btf_info(struct bpf_verifier_env *env,
+ const union bpf_attr *attr,
+ bpfptr_t uattr)
+{
+ int err;
+
+ if (!attr->func_info_cnt && !attr->line_info_cnt) {
+ if (check_abnormal_return(env))
+ return -EINVAL;
+ return 0;
+ }
+
err = check_btf_func(env, attr, uattr);
if (err)
return err;
@@ -15496,18 +16300,14 @@ static void clean_live_states(struct bpf_verifier_env *env, int insn,
struct bpf_verifier_state *cur)
{
struct bpf_verifier_state_list *sl;
- int i;
sl = *explored_state(env, insn);
while (sl) {
if (sl->state.branches)
goto next;
if (sl->state.insn_idx != insn ||
- sl->state.curframe != cur->curframe)
+ !same_callsites(&sl->state, cur))
goto next;
- for (i = 0; i <= cur->curframe; i++)
- if (sl->state.frame[i]->callsite != cur->frame[i]->callsite)
- goto next;
clean_verifier_state(env, &sl->state);
next:
sl = sl->next;
@@ -15525,8 +16325,11 @@ static bool regs_exact(const struct bpf_reg_state *rold,
/* Returns true if (rold safe implies rcur safe) */
static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
- struct bpf_reg_state *rcur, struct bpf_idmap *idmap)
+ struct bpf_reg_state *rcur, struct bpf_idmap *idmap, bool exact)
{
+ if (exact)
+ return regs_exact(rold, rcur, idmap);
+
if (!(rold->live & REG_LIVE_READ))
/* explored state didn't use this */
return true;
@@ -15643,7 +16446,7 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
}
static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
- struct bpf_func_state *cur, struct bpf_idmap *idmap)
+ struct bpf_func_state *cur, struct bpf_idmap *idmap, bool exact)
{
int i, spi;
@@ -15656,7 +16459,12 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
spi = i / BPF_REG_SIZE;
- if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ)) {
+ if (exact &&
+ old->stack[spi].slot_type[i % BPF_REG_SIZE] !=
+ cur->stack[spi].slot_type[i % BPF_REG_SIZE])
+ return false;
+
+ if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ) && !exact) {
i += BPF_REG_SIZE - 1;
/* explored state didn't use this */
continue;
@@ -15706,7 +16514,7 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
* return false to continue verification of this path
*/
if (!regsafe(env, &old->stack[spi].spilled_ptr,
- &cur->stack[spi].spilled_ptr, idmap))
+ &cur->stack[spi].spilled_ptr, idmap, exact))
return false;
break;
case STACK_DYNPTR:
@@ -15788,16 +16596,16 @@ static bool refsafe(struct bpf_func_state *old, struct bpf_func_state *cur,
* the current state will reach 'bpf_exit' instruction safely
*/
static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_state *old,
- struct bpf_func_state *cur)
+ struct bpf_func_state *cur, bool exact)
{
int i;
for (i = 0; i < MAX_BPF_REG; i++)
if (!regsafe(env, &old->regs[i], &cur->regs[i],
- &env->idmap_scratch))
+ &env->idmap_scratch, exact))
return false;
- if (!stacksafe(env, old, cur, &env->idmap_scratch))
+ if (!stacksafe(env, old, cur, &env->idmap_scratch, exact))
return false;
if (!refsafe(old, cur, &env->idmap_scratch))
@@ -15806,17 +16614,23 @@ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_stat
return true;
}
+static void reset_idmap_scratch(struct bpf_verifier_env *env)
+{
+ env->idmap_scratch.tmp_id_gen = env->id_gen;
+ memset(&env->idmap_scratch.map, 0, sizeof(env->idmap_scratch.map));
+}
+
static bool states_equal(struct bpf_verifier_env *env,
struct bpf_verifier_state *old,
- struct bpf_verifier_state *cur)
+ struct bpf_verifier_state *cur,
+ bool exact)
{
int i;
if (old->curframe != cur->curframe)
return false;
- env->idmap_scratch.tmp_id_gen = env->id_gen;
- memset(&env->idmap_scratch.map, 0, sizeof(env->idmap_scratch.map));
+ reset_idmap_scratch(env);
/* Verification state from speculative execution simulation
* must never prune a non-speculative execution one.
@@ -15846,7 +16660,7 @@ static bool states_equal(struct bpf_verifier_env *env,
for (i = 0; i <= old->curframe; i++) {
if (old->frame[i]->callsite != cur->frame[i]->callsite)
return false;
- if (!func_states_equal(env, old->frame[i], cur->frame[i]))
+ if (!func_states_equal(env, old->frame[i], cur->frame[i], exact))
return false;
}
return true;
@@ -16100,10 +16914,11 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
{
struct bpf_verifier_state_list *new_sl;
struct bpf_verifier_state_list *sl, **pprev;
- struct bpf_verifier_state *cur = env->cur_state, *new;
- int i, j, err, states_cnt = 0;
+ struct bpf_verifier_state *cur = env->cur_state, *new, *loop_entry;
+ int i, j, n, err, states_cnt = 0;
bool force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx);
bool add_new_state = force_new_state;
+ bool force_exact;
/* bpf progs typically have pruning point every 4 instructions
* http://vger.kernel.org/bpfconf2019.html#session-1
@@ -16156,9 +16971,33 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
* It's safe to assume that iterator loop will finish, taking into
* account iter_next() contract of eventually returning
* sticky NULL result.
+ *
+ * Note that states have to be compared exactly in this case because
+ * read and precision marks might not be finalized inside the loop.
+ * E.g. as in the program below:
+ *
+ * 1. r7 = -16
+ * 2. r6 = bpf_get_prandom_u32()
+ * 3. while (bpf_iter_num_next(&fp[-8])) {
+ * 4. if (r6 != 42) {
+ * 5. r7 = -32
+ * 6. r6 = bpf_get_prandom_u32()
+ * 7. continue
+ * 8. }
+ * 9. r0 = r10
+ * 10. r0 += r7
+ * 11. r8 = *(u64 *)(r0 + 0)
+ * 12. r6 = bpf_get_prandom_u32()
+ * 13. }
+ *
+ * Here the verifier would first visit path 1-3, create a checkpoint at 3
+ * with r7=-16, then continue to 4-7,3. The existing checkpoint at 3 does
+ * not have a read or precision mark for r7 yet, thus an inexact state
+ * comparison would discard the current state with r7=-32
+ * => the unsafe memory access at 11 would not be caught.
*/
if (is_iter_next_insn(env, insn_idx)) {
- if (states_equal(env, &sl->state, cur)) {
+ if (states_equal(env, &sl->state, cur, true)) {
struct bpf_func_state *cur_frame;
struct bpf_reg_state *iter_state, *iter_reg;
int spi;
@@ -16174,17 +17013,29 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
*/
spi = __get_spi(iter_reg->off + iter_reg->var_off.value);
iter_state = &func(env, iter_reg)->stack[spi].spilled_ptr;
- if (iter_state->iter.state == BPF_ITER_STATE_ACTIVE)
+ if (iter_state->iter.state == BPF_ITER_STATE_ACTIVE) {
+ update_loop_entry(cur, &sl->state);
goto hit;
+ }
}
goto skip_inf_loop_check;
}
+ if (calls_callback(env, insn_idx)) {
+ if (states_equal(env, &sl->state, cur, true))
+ goto hit;
+ goto skip_inf_loop_check;
+ }
/* attempt to detect infinite loop to avoid unnecessary doomed work */
if (states_maybe_looping(&sl->state, cur) &&
- states_equal(env, &sl->state, cur) &&
- !iter_active_depths_differ(&sl->state, cur)) {
+ states_equal(env, &sl->state, cur, false) &&
+ !iter_active_depths_differ(&sl->state, cur) &&
+ sl->state.callback_unroll_depth == cur->callback_unroll_depth) {
verbose_linfo(env, insn_idx, "; ");
verbose(env, "infinite loop detected at insn %d\n", insn_idx);
+ verbose(env, "cur state:");
+ print_verifier_state(env, cur->frame[cur->curframe], true);
+ verbose(env, "old state:");
+ print_verifier_state(env, sl->state.frame[cur->curframe], true);
return -EINVAL;
}
/* if the verifier is processing a loop, avoid adding new state
@@ -16206,7 +17057,36 @@ skip_inf_loop_check:
add_new_state = false;
goto miss;
}
- if (states_equal(env, &sl->state, cur)) {
+ /* If sl->state is a part of a loop and this loop's entry is a part of
+ * current verification path then states have to be compared exactly.
+ * 'force_exact' is needed to catch the following case:
+ *
+ * initial Here state 'succ' was processed first,
+ * | it was eventually tracked to produce a
+ * V state identical to 'hdr'.
+ * .---------> hdr All branches from 'succ' had been explored
+ * | | and thus 'succ' has its .branches == 0.
+ * | V
+ * | .------... Suppose states 'cur' and 'succ' correspond
+ * | | | to the same instruction + callsites.
+ * | V V In such case it is necessary to check
+ * | ... ... if 'succ' and 'cur' are states_equal().
+ * | | | If 'succ' and 'cur' are a part of the
+ * | V V same loop exact flag has to be set.
+ * | succ <- cur To check if that is the case, verify
+ * | | if loop entry of 'succ' is in current
+ * | V DFS path.
+ * | ...
+ * | |
+ * '----'
+ *
+ * Additional details are in the comment before get_loop_entry().
+ */
+ loop_entry = get_loop_entry(&sl->state);
+ force_exact = loop_entry && loop_entry->branches > 0;
+ if (states_equal(env, &sl->state, cur, force_exact)) {
+ if (force_exact)
+ update_loop_entry(cur, loop_entry);
hit:
sl->hit_cnt++;
/* reached equivalent register/stack state,
@@ -16226,7 +17106,8 @@ hit:
* the precision needs to be propagated back in
* the current state.
*/
- err = err ? : push_jmp_history(env, cur);
+ if (is_jmp_point(env, env->insn_idx))
+ err = err ? : push_jmp_history(env, cur, 0);
err = err ? : propagate_precision(env, &sl->state);
if (err)
return err;
@@ -16245,13 +17126,18 @@ miss:
* to keep checking from state equivalence point of view.
* Higher numbers increase max_states_per_insn and verification time,
* but do not meaningfully decrease insn_processed.
+ * 'n' controls how many times a state may miss before eviction.
+ * Use a bigger 'n' for checkpoints because evicting checkpoint states
+ * too early would hinder iterator convergence.
*/
- if (sl->miss_cnt > sl->hit_cnt * 3 + 3) {
+ n = is_force_checkpoint(env, insn_idx) && sl->state.branches > 0 ? 64 : 3;
+ if (sl->miss_cnt > sl->hit_cnt * n + n) {
/* the state is unlikely to be useful. Remove it to
* speed up verification
*/
*pprev = sl->next;
- if (sl->state.frame[0]->regs[0].live & REG_LIVE_DONE) {
+ if (sl->state.frame[0]->regs[0].live & REG_LIVE_DONE &&
+ !sl->state.used_as_loop_entry) {
u32 br = sl->state.branches;
WARN_ONCE(br,
@@ -16320,6 +17206,7 @@ next:
cur->parent = new;
cur->first_insn_idx = insn_idx;
+ cur->dfs_depth = new->dfs_depth + 1;
clear_jmp_history(cur);
new_sl->next = *explored_state(env, insn_idx);
*explored_state(env, insn_idx) = new_sl;
@@ -16440,10 +17327,14 @@ static int do_check(struct bpf_verifier_env *env)
int prev_insn_idx = -1;
for (;;) {
+ bool exception_exit = false;
struct bpf_insn *insn;
u8 class;
int err;
+ /* reset current history entry on each new instruction */
+ env->cur_hist_ent = NULL;
+
env->prev_insn_idx = prev_insn_idx;
if (env->insn_idx >= insn_cnt) {
verbose(env, "invalid insn idx %d insn_cnt %d\n",
@@ -16483,7 +17374,7 @@ static int do_check(struct bpf_verifier_env *env)
}
if (is_jmp_point(env, env->insn_idx)) {
- err = push_jmp_history(env, state);
+ err = push_jmp_history(env, state, 0);
if (err)
return err;
}
@@ -16560,10 +17451,8 @@ static int do_check(struct bpf_verifier_env *env)
insn->off, BPF_SIZE(insn->code),
BPF_READ, insn->dst_reg, false,
BPF_MODE(insn->code) == BPF_MEMSX);
- if (err)
- return err;
-
- err = save_aux_ptr_type(env, src_reg_type, true);
+ err = err ?: save_aux_ptr_type(env, src_reg_type, true);
+ err = err ?: reg_bounds_sanity_check(env, &regs[insn->dst_reg], "ldx");
if (err)
return err;
} else if (class == BPF_STX) {
@@ -16654,12 +17543,17 @@ static int do_check(struct bpf_verifier_env *env)
return -EINVAL;
}
}
- if (insn->src_reg == BPF_PSEUDO_CALL)
+ if (insn->src_reg == BPF_PSEUDO_CALL) {
err = check_func_call(env, insn, &env->insn_idx);
- else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL)
+ } else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
err = check_kfunc_call(env, insn, &env->insn_idx);
- else
+ if (!err && is_bpf_throw_kfunc(insn)) {
+ exception_exit = true;
+ goto process_bpf_exit_full;
+ }
+ } else {
err = check_helper_call(env, insn, &env->insn_idx);
+ }
if (err)
return err;
@@ -16689,7 +17583,7 @@ static int do_check(struct bpf_verifier_env *env)
verbose(env, "BPF_EXIT uses reserved fields\n");
return -EINVAL;
}
-
+process_bpf_exit_full:
if (env->cur_state->active_lock.ptr &&
!in_rbtree_lock_required_cb(env)) {
verbose(env, "bpf_spin_unlock is missing\n");
@@ -16708,10 +17602,23 @@ static int do_check(struct bpf_verifier_env *env)
* function, for which reference_state must
* match caller reference state when it exits.
*/
- err = check_reference_leak(env);
+ err = check_reference_leak(env, exception_exit);
if (err)
return err;
+ /* The side effect of the prepare_func_exit
+ * which is being skipped is that it frees
+ * bpf_func_state. Typically, process_bpf_exit
+ * will only be hit with outermost exit.
+ * copy_verifier_state in pop_stack will handle
+ * freeing of any extra bpf_func_state left over
+ * from not processing all nested function
+ * exits. We also skip return code checks as
+ * they are not needed for exceptional exits.
+ */
+ if (exception_exit)
+ goto process_bpf_exit;
+
if (state->curframe) {
/* exit from nested function */
err = prepare_func_exit(env, &env->insn_idx);
@@ -16721,7 +17628,7 @@ static int do_check(struct bpf_verifier_env *env)
continue;
}
- err = check_return_code(env);
+ err = check_return_code(env, BPF_REG_0, "R0");
if (err)
return err;
process_bpf_exit:
@@ -17182,10 +18089,12 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
return -E2BIG;
}
+ if (env->prog->aux->sleepable)
+ atomic64_inc(&map->sleepable_refcnt);
/* hold the map. If the program is rejected by verifier,
* the map will be released by release_maps() or it
* will be used by the valid program until it's unloaded
- * and all maps are released in free_used_maps()
+ * and all maps are released in bpf_free_used_maps()
*/
bpf_map_inc(map);
@@ -18014,6 +18923,9 @@ static int jit_subprogs(struct bpf_verifier_env *env)
}
func[i]->aux->num_exentries = num_exentries;
func[i]->aux->tail_call_reachable = env->subprog_info[i].tail_call_reachable;
+ func[i]->aux->exception_cb = env->subprog_info[i].is_exception_cb;
+ if (!i)
+ func[i]->aux->exception_boundary = env->seen_exception;
func[i] = bpf_int_jit_compile(func[i]);
if (!func[i]->jited) {
err = -ENOTSUPP;
@@ -18053,7 +18965,8 @@ static int jit_subprogs(struct bpf_verifier_env *env)
* the call instruction, as an index for this list
*/
func[i]->aux->func = func;
- func[i]->aux->func_cnt = env->subprog_cnt;
+ func[i]->aux->func_cnt = env->subprog_cnt - env->hidden_subprog_cnt;
+ func[i]->aux->real_func_cnt = env->subprog_cnt;
}
for (i = 0; i < env->subprog_cnt; i++) {
old_bpf_func = func[i]->bpf_func;
@@ -18099,7 +19012,10 @@ static int jit_subprogs(struct bpf_verifier_env *env)
prog->aux->extable = func[0]->aux->extable;
prog->aux->num_exentries = func[0]->aux->num_exentries;
prog->aux->func = func;
- prog->aux->func_cnt = env->subprog_cnt;
+ prog->aux->func_cnt = env->subprog_cnt - env->hidden_subprog_cnt;
+ prog->aux->real_func_cnt = env->subprog_cnt;
+ prog->aux->bpf_exception_cb = (void *)func[env->exception_callback_subprog]->bpf_func;
+ prog->aux->exception_boundary = func[0]->aux->exception_boundary;
bpf_prog_jit_attempt_done(prog);
return 0;
out_free:
@@ -18266,21 +19182,35 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
insn->imm = BPF_CALL_IMM(desc->addr);
if (insn->off)
return 0;
- if (desc->func_id == special_kfunc_list[KF_bpf_obj_new_impl]) {
+ if (desc->func_id == special_kfunc_list[KF_bpf_obj_new_impl] ||
+ desc->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta;
struct bpf_insn addr[2] = { BPF_LD_IMM64(BPF_REG_2, (long)kptr_struct_meta) };
u64 obj_new_size = env->insn_aux_data[insn_idx].obj_new_size;
+ if (desc->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl] && kptr_struct_meta) {
+ verbose(env, "verifier internal error: NULL kptr_struct_meta expected at insn_idx %d\n",
+ insn_idx);
+ return -EFAULT;
+ }
+
insn_buf[0] = BPF_MOV64_IMM(BPF_REG_1, obj_new_size);
insn_buf[1] = addr[0];
insn_buf[2] = addr[1];
insn_buf[3] = *insn;
*cnt = 4;
} else if (desc->func_id == special_kfunc_list[KF_bpf_obj_drop_impl] ||
+ desc->func_id == special_kfunc_list[KF_bpf_percpu_obj_drop_impl] ||
desc->func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]) {
struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta;
struct bpf_insn addr[2] = { BPF_LD_IMM64(BPF_REG_2, (long)kptr_struct_meta) };
+ if (desc->func_id == special_kfunc_list[KF_bpf_percpu_obj_drop_impl] && kptr_struct_meta) {
+ verbose(env, "verifier internal error: NULL kptr_struct_meta expected at insn_idx %d\n",
+ insn_idx);
+ return -EFAULT;
+ }
+
if (desc->func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl] &&
!kptr_struct_meta) {
verbose(env, "verifier internal error: kptr_struct_meta expected at insn_idx %d\n",
@@ -18321,6 +19251,33 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
return 0;
}
+/* The function requires that the first instruction in 'patch' is insnsi[prog->len - 1] */
+static int add_hidden_subprog(struct bpf_verifier_env *env, struct bpf_insn *patch, int len)
+{
+ struct bpf_subprog_info *info = env->subprog_info;
+ int cnt = env->subprog_cnt;
+ struct bpf_prog *prog;
+
+ /* We only reserve one slot for hidden subprogs in subprog_info. */
+ if (env->hidden_subprog_cnt) {
+ verbose(env, "verifier internal error: only one hidden subprog supported\n");
+ return -EFAULT;
+ }
+ /* We're not patching any existing instruction, just appending the new
+ * ones for the hidden subprog. Hence all of the adjustment operations
+ * in bpf_patch_insn_data are no-ops.
+ */
+ prog = bpf_patch_insn_data(env, env->prog->len - 1, patch, len);
+ if (!prog)
+ return -ENOMEM;
+ env->prog = prog;
+ info[cnt + 1].start = info[cnt].start;
+ info[cnt].start = prog->len - len + 1;
+ env->subprog_cnt++;
+ env->hidden_subprog_cnt++;
+ return 0;
+}
+
/* Do various post-verification rewrites in a single program pass.
* These rewrites simplify JIT and interpreter implementations.
*/
@@ -18339,6 +19296,24 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
struct bpf_map *map_ptr;
int i, ret, cnt, delta = 0;
+ if (env->seen_exception && !env->exception_callback_subprog) {
+ struct bpf_insn patch[] = {
+ env->prog->insnsi[insn_cnt - 1],
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
+ BPF_EXIT_INSN(),
+ };
+
+ ret = add_hidden_subprog(env, patch, ARRAY_SIZE(patch));
+ if (ret < 0)
+ return ret;
+ prog = env->prog;
+ insn = prog->insnsi;
+
+ env->exception_callback_subprog = env->subprog_cnt - 1;
+ /* Don't update insn_cnt, as add_hidden_subprog always appends insns */
+ mark_subprog_exc_cb(env, env->exception_callback_subprog);
+ }
+
for (i = 0; i < insn_cnt; i++, insn++) {
/* Make divide-by-zero exceptions impossible. */
if (insn->code == (BPF_ALU64 | BPF_MOD | BPF_X) ||
@@ -18608,6 +19583,25 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
goto patch_call_imm;
}
+ /* bpf_per_cpu_ptr() and bpf_this_cpu_ptr() */
+ if (env->insn_aux_data[i + delta].call_with_percpu_alloc_ptr) {
+ /* patch with 'r1 = *(u64 *)(r1 + 0)' since for percpu data,
+ * bpf_mem_alloc() returns a ptr to the percpu data ptr.
+ */
+ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, 0);
+ insn_buf[1] = *insn;
+ cnt = 2;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto patch_call_imm;
+ }
+
/* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup
* and other inlining handlers are currently limited to 64 bit
* only.
@@ -19020,6 +20014,7 @@ static void free_states(struct bpf_verifier_env *env)
static int do_check_common(struct bpf_verifier_env *env, int subprog)
{
bool pop_log = !(env->log.level & BPF_LOG_LEVEL2);
+ struct bpf_subprog_info *sub = subprog_info(env, subprog);
struct bpf_verifier_state *state;
struct bpf_reg_state *regs;
int ret, i;
@@ -19046,40 +20041,71 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
state->first_insn_idx = env->subprog_info[subprog].start;
state->last_insn_idx = -1;
+
regs = state->frame[state->curframe]->regs;
if (subprog || env->prog->type == BPF_PROG_TYPE_EXT) {
- ret = btf_prepare_func_args(env, subprog, regs);
+ const char *sub_name = subprog_name(env, subprog);
+ struct bpf_subprog_arg_info *arg;
+ struct bpf_reg_state *reg;
+
+ verbose(env, "Validating %s() func#%d...\n", sub_name, subprog);
+ ret = btf_prepare_func_args(env, subprog);
if (ret)
goto out;
- for (i = BPF_REG_1; i <= BPF_REG_5; i++) {
- if (regs[i].type == PTR_TO_CTX)
+
+ if (subprog_is_exc_cb(env, subprog)) {
+ state->frame[0]->in_exception_callback_fn = true;
+ /* We have already ensured that the callback returns an integer, just
+ * like all global subprogs. We need to determine it only has a single
+ * scalar argument.
+ */
+ if (sub->arg_cnt != 1 || sub->args[0].arg_type != ARG_ANYTHING) {
+ verbose(env, "exception cb only supports single integer argument\n");
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+ for (i = BPF_REG_1; i <= sub->arg_cnt; i++) {
+ arg = &sub->args[i - BPF_REG_1];
+ reg = &regs[i];
+
+ if (arg->arg_type == ARG_PTR_TO_CTX) {
+ reg->type = PTR_TO_CTX;
mark_reg_known_zero(env, regs, i);
- else if (regs[i].type == SCALAR_VALUE)
+ } else if (arg->arg_type == ARG_ANYTHING) {
+ reg->type = SCALAR_VALUE;
mark_reg_unknown(env, regs, i);
- else if (base_type(regs[i].type) == PTR_TO_MEM) {
- const u32 mem_size = regs[i].mem_size;
-
+ } else if (arg->arg_type == (ARG_PTR_TO_DYNPTR | MEM_RDONLY)) {
+ /* assume unspecial LOCAL dynptr type */
+ __mark_dynptr_reg(reg, BPF_DYNPTR_TYPE_LOCAL, true, ++env->id_gen);
+ } else if (base_type(arg->arg_type) == ARG_PTR_TO_MEM) {
+ reg->type = PTR_TO_MEM;
+ if (arg->arg_type & PTR_MAYBE_NULL)
+ reg->type |= PTR_MAYBE_NULL;
mark_reg_known_zero(env, regs, i);
- regs[i].mem_size = mem_size;
- regs[i].id = ++env->id_gen;
+ reg->mem_size = arg->mem_size;
+ reg->id = ++env->id_gen;
+ } else {
+ WARN_ONCE(1, "BUG: unhandled arg#%d type %d\n",
+ i - BPF_REG_1, arg->arg_type);
+ ret = -EFAULT;
+ goto out;
}
}
} else {
+ /* if main BPF program has associated BTF info, validate that
+ * it's matching expected signature, and otherwise mark BTF
+ * info for main program as unreliable
+ */
+ if (env->prog->aux->func_info_aux) {
+ ret = btf_prepare_func_args(env, 0);
+ if (ret || sub->arg_cnt != 1 || sub->args[0].arg_type != ARG_PTR_TO_CTX)
+ env->prog->aux->func_info_aux[0].unreliable = true;
+ }
+
/* 1st arg to a function */
regs[BPF_REG_1].type = PTR_TO_CTX;
mark_reg_known_zero(env, regs, BPF_REG_1);
- ret = btf_check_subprog_arg_match(env, subprog, regs);
- if (ret == -EFAULT)
- /* unlikely verifier bug. abort.
- * ret == 0 and ret < 0 are sadly acceptable for
- * main() function due to backward compatibility.
- * Like socket filter program may be written as:
- * int bpf_prog(struct pt_regs *ctx)
- * and never dereference that ctx in the program.
- * 'struct pt_regs' is a type mismatch for socket
- * filter that should be using 'struct __sk_buff'.
- */
- goto out;
}
ret = do_check(env);
@@ -19098,8 +20124,11 @@ out:
return ret;
}
-/* Verify all global functions in a BPF program one by one based on their BTF.
- * All global functions must pass verification. Otherwise the whole program is rejected.
+/* Lazily verify all global functions based on their BTF, if they are called
+ * from main BPF program or any of subprograms transitively.
+ * BPF global subprogs called from dead code are not validated.
+ * All callable global functions must pass verification.
+ * Otherwise the whole program is rejected.
* Consider:
* int bar(int);
* int foo(int f)
@@ -19118,25 +20147,50 @@ out:
static int do_check_subprogs(struct bpf_verifier_env *env)
{
struct bpf_prog_aux *aux = env->prog->aux;
- int i, ret;
+ struct bpf_func_info_aux *sub_aux;
+ int i, ret, new_cnt;
if (!aux->func_info)
return 0;
+ /* exception callback is presumed to be always called */
+ if (env->exception_callback_subprog)
+ subprog_aux(env, env->exception_callback_subprog)->called = true;
+
+again:
+ new_cnt = 0;
for (i = 1; i < env->subprog_cnt; i++) {
- if (aux->func_info_aux[i].linkage != BTF_FUNC_GLOBAL)
+ if (!subprog_is_global(env, i))
+ continue;
+
+ sub_aux = subprog_aux(env, i);
+ if (!sub_aux->called || sub_aux->verified)
continue;
+
env->insn_idx = env->subprog_info[i].start;
WARN_ON_ONCE(env->insn_idx == 0);
ret = do_check_common(env, i);
if (ret) {
return ret;
} else if (env->log.level & BPF_LOG_LEVEL) {
- verbose(env,
- "Func#%d is safe for any args that match its prototype\n",
- i);
+ verbose(env, "Func#%d ('%s') is safe for any args that match its prototype\n",
+ i, subprog_name(env, i));
}
+
+ /* We verified a new global subprog; it might have called some
+ * more global subprogs that we haven't verified yet, so we
+ * need to do another pass over subprogs to verify those.
+ */
+ sub_aux->verified = true;
+ new_cnt++;
}
+
+ /* We can't loop forever as we verify at least one global subprog on
+ * each pass.
+ */
+ if (new_cnt)
+ goto again;
+
return 0;
}
@@ -19267,6 +20321,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
struct bpf_attach_target_info *tgt_info)
{
bool prog_extension = prog->type == BPF_PROG_TYPE_EXT;
+ bool prog_tracing = prog->type == BPF_PROG_TYPE_TRACING;
const char prefix[] = "btf_trace_";
int ret = 0, subprog = -1, i;
const struct btf_type *t;
@@ -19314,6 +20369,12 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
bpf_log(log, "Subprog %s doesn't exist\n", tname);
return -EINVAL;
}
+ if (aux->func && aux->func[subprog]->aux->exception_cb) {
+ bpf_log(log,
+ "%s programs cannot attach to exception callback\n",
+ prog_extension ? "Extension" : "FENTRY/FEXIT");
+ return -EINVAL;
+ }
conservative = aux->func_info_aux[subprog].unreliable;
if (prog_extension) {
if (conservative) {
@@ -19331,10 +20392,21 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
bpf_log(log, "Can attach to only JITed progs\n");
return -EINVAL;
}
- if (tgt_prog->type == prog->type) {
- /* Cannot fentry/fexit another fentry/fexit program.
- * Cannot attach program extension to another extension.
- * It's ok to attach fentry/fexit to extension program.
+ if (prog_tracing) {
+ if (aux->attach_tracing_prog) {
+ /*
+ * Target program is an fentry/fexit which is already attached
+ * to another tracing program. More levels of nesting
+ * attachment are not allowed.
+ */
+ bpf_log(log, "Cannot nest tracing program attach more than once\n");
+ return -EINVAL;
+ }
+ } else if (tgt_prog->type == prog->type) {
+ /*
+ * To avoid potential call chain cycles, prevent attaching of a
+ * program extension to another extension. It's ok to attach
+ * fentry/fexit to extension program.
*/
bpf_log(log, "Cannot recursively attach\n");
return -EINVAL;
@@ -19347,16 +20419,15 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
* except fentry/fexit. The reason is the following.
* The fentry/fexit programs are used for performance
* analysis, stats and can be attached to any program
- * type except themselves. When extension program is
- * replacing XDP function it is necessary to allow
- * performance analysis of all functions. Both original
- * XDP program and its program extension. Hence
- * attaching fentry/fexit to BPF_PROG_TYPE_EXT is
- * allowed. If extending of fentry/fexit was allowed it
- * would be possible to create long call chain
- * fentry->extension->fentry->extension beyond
- * reasonable stack size. Hence extending fentry is not
- * allowed.
+ * type. When extension program is replacing XDP function
+ * it is necessary to allow performance analysis of all
+ * functions. Both original XDP program and its program
+ * extension. Hence attaching fentry/fexit to
+ * BPF_PROG_TYPE_EXT is allowed. If extending of
+ * fentry/fexit was allowed it would be possible to create
+ * long call chain fentry->extension->fentry->extension
+ * beyond reasonable stack size. Hence extending fentry
+ * is not allowed.
*/
bpf_log(log, "Cannot extend fentry/fexit\n");
return -EINVAL;
@@ -19643,6 +20714,9 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
if (!tr)
return -ENOMEM;
+ if (tgt_prog && tgt_prog->aux->tail_call_reachable)
+ tr->flags = BPF_TRAMP_F_TAIL_CALL_CTX;
+
prog->aux->dst_trampoline = tr;
return 0;
}
@@ -19730,6 +20804,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
if (is_priv)
env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ;
+ env->test_reg_invariants = attr->prog_flags & BPF_F_TEST_REG_INVARIANTS;
env->explored_states = kvcalloc(state_htab_size(env),
sizeof(struct bpf_verifier_state_list *),
@@ -19738,6 +20813,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
if (!env->explored_states)
goto skip_full_check;
+ ret = check_btf_info_early(env, attr, uattr);
+ if (ret < 0)
+ goto skip_full_check;
+
ret = add_subprog_and_kfunc(env);
if (ret < 0)
goto skip_full_check;
@@ -19768,8 +20847,8 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
if (ret < 0)
goto skip_full_check;
- ret = do_check_subprogs(env);
- ret = ret ?: do_check_main(env);
+ ret = do_check_main(env);
+ ret = ret ?: do_check_subprogs(env);
if (ret == 0 && bpf_prog_is_offloaded(env->prog->aux))
ret = bpf_prog_offload_finalize(env);
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index c56071f150f2..520b90dd97ec 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -164,13 +164,13 @@ struct cgroup_mgctx {
#define DEFINE_CGROUP_MGCTX(name) \
struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name)
-extern spinlock_t css_set_lock;
extern struct cgroup_subsys *cgroup_subsys[];
extern struct list_head cgroup_roots;
/* iterate across the hierarchies */
#define for_each_root(root) \
- list_for_each_entry((root), &cgroup_roots, root_list)
+ list_for_each_entry_rcu((root), &cgroup_roots, root_list, \
+ lockdep_is_held(&cgroup_mutex))
/**
* for_each_subsys - iterate all enabled cgroup subsystems
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index c487ffef6652..520a11cb12f4 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -360,10 +360,9 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
}
css_task_iter_end(&it);
length = n;
- /* now sort & (if procs) strip out duplicates */
+ /* now sort & strip out duplicates (tgids or recycled thread PIDs) */
sort(array, length, sizeof(pid_t), cmppid, NULL);
- if (type == CGROUP_FILE_PROCS)
- length = pidlist_uniq(array, length);
+ length = pidlist_uniq(array, length);
l = cgroup_pidlist_find_create(cgrp, type);
if (!l) {
@@ -803,7 +802,7 @@ void cgroup1_release_agent(struct work_struct *work)
goto out_free;
ret = cgroup_path_ns(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
- if (ret < 0 || ret >= PATH_MAX)
+ if (ret < 0)
goto out_free;
argv[0] = agentbuf;
@@ -1263,6 +1262,40 @@ int cgroup1_get_tree(struct fs_context *fc)
return ret;
}
+/**
+ * task_get_cgroup1 - Acquires the associated cgroup of a task within a
+ * specific cgroup1 hierarchy. The cgroup1 hierarchy is identified by its
+ * hierarchy ID.
+ * @tsk: The target task
+ * @hierarchy_id: The ID of a cgroup1 hierarchy
+ *
+ * On success, the cgroup is returned. On failure, ERR_PTR is returned.
+ * We limit it to cgroup1 only.
+ */
+struct cgroup *task_get_cgroup1(struct task_struct *tsk, int hierarchy_id)
+{
+ struct cgroup *cgrp = ERR_PTR(-ENOENT);
+ struct cgroup_root *root;
+ unsigned long flags;
+
+ rcu_read_lock();
+ for_each_root(root) {
+ /* cgroup1 only */
+ if (root == &cgrp_dfl_root)
+ continue;
+ if (root->hierarchy_id != hierarchy_id)
+ continue;
+ spin_lock_irqsave(&css_set_lock, flags);
+ cgrp = task_cgroup_from_root(tsk, root);
+ if (!cgrp || !cgroup_tryget(cgrp))
+ cgrp = ERR_PTR(-ENOENT);
+ spin_unlock_irqrestore(&css_set_lock, flags);
+ break;
+ }
+ rcu_read_unlock();
+ return cgrp;
+}
+
static int __init cgroup1_wq_init(void)
{
/*
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 1fb7f562289d..a66c088c851c 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -207,6 +207,8 @@ static u16 have_exit_callback __read_mostly;
static u16 have_release_callback __read_mostly;
static u16 have_canfork_callback __read_mostly;
+static bool have_favordynmods __ro_after_init = IS_ENABLED(CONFIG_CGROUP_FAVOR_DYNMODS);
+
/* cgroup namespace for init task */
struct cgroup_namespace init_cgroup_ns = {
.ns.count = REFCOUNT_INIT(2),
@@ -1313,7 +1315,7 @@ static void cgroup_exit_root_id(struct cgroup_root *root)
void cgroup_free_root(struct cgroup_root *root)
{
- kfree(root);
+ kfree_rcu(root, rcu);
}
static void cgroup_destroy_root(struct cgroup_root *root)
@@ -1345,12 +1347,13 @@ static void cgroup_destroy_root(struct cgroup_root *root)
spin_unlock_irq(&css_set_lock);
- if (!list_empty(&root->root_list)) {
- list_del(&root->root_list);
- cgroup_root_count--;
- }
+ WARN_ON_ONCE(list_empty(&root->root_list));
+ list_del_rcu(&root->root_list);
+ cgroup_root_count--;
+
+ if (!have_favordynmods)
+ cgroup_favor_dynmods(root, false);
- cgroup_favor_dynmods(root, false);
cgroup_exit_root_id(root);
cgroup_unlock();
@@ -1386,7 +1389,15 @@ static inline struct cgroup *__cset_cgroup_from_root(struct css_set *cset,
}
}
- BUG_ON(!res_cgroup);
+ /*
+ * If cgroup_mutex is not held, the cgrp_cset_link will be freed
+ * before we remove the cgroup root from the root_list. Consequently,
+ * when accessing a cgroup root, the cset_link may have already been
+ * freed, resulting in a NULL res_cgroup. However, by holding the
+ * cgroup_mutex, we ensure that res_cgroup can't be NULL.
+ * If we don't hold cgroup_mutex in the caller, we must do the NULL
+ * check.
+ */
return res_cgroup;
}
@@ -1409,6 +1420,11 @@ current_cgns_cgroup_from_root(struct cgroup_root *root)
rcu_read_unlock();
+ /*
+ * The namespace_sem is held by current, so the root cgroup can't
+ * be umounted. Therefore, we can ensure that the res is non-NULL.
+ */
+ WARN_ON_ONCE(!res);
return res;
}
@@ -1445,7 +1461,6 @@ static struct cgroup *current_cgns_cgroup_dfl(void)
static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
struct cgroup_root *root)
{
- lockdep_assert_held(&cgroup_mutex);
lockdep_assert_held(&css_set_lock);
return __cset_cgroup_from_root(cset, root);
@@ -1453,7 +1468,9 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
/*
* Return the cgroup for "task" from the given hierarchy. Must be
- * called with cgroup_mutex and css_set_lock held.
+ * called with css_set_lock held to prevent task's groups from being modified.
+ * Must be called with either cgroup_mutex or rcu read lock to prevent the
+ * cgroup root from being destroyed.
*/
struct cgroup *task_cgroup_from_root(struct task_struct *task,
struct cgroup_root *root)
@@ -1719,20 +1736,22 @@ static int css_populate_dir(struct cgroup_subsys_state *css)
if (!css->ss) {
if (cgroup_on_dfl(cgrp)) {
- ret = cgroup_addrm_files(&cgrp->self, cgrp,
+ ret = cgroup_addrm_files(css, cgrp,
cgroup_base_files, true);
if (ret < 0)
return ret;
if (cgroup_psi_enabled()) {
- ret = cgroup_addrm_files(&cgrp->self, cgrp,
+ ret = cgroup_addrm_files(css, cgrp,
cgroup_psi_files, true);
if (ret < 0)
return ret;
}
} else {
- cgroup_addrm_files(css, cgrp,
- cgroup1_base_files, true);
+ ret = cgroup_addrm_files(css, cgrp,
+ cgroup1_base_files, true);
+ if (ret < 0)
+ return ret;
}
} else {
list_for_each_entry(cfts, &css->ss->cfts, node) {
@@ -1887,7 +1906,7 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
len = kernfs_path_from_node(kf_node, ns_cgroup->kn, buf, PATH_MAX);
spin_unlock_irq(&css_set_lock);
- if (len >= PATH_MAX)
+ if (len == -E2BIG)
len = -ERANGE;
else if (len > 0) {
seq_escape(sf, buf, " \t\n\\");
@@ -1902,6 +1921,7 @@ enum cgroup2_param {
Opt_favordynmods,
Opt_memory_localevents,
Opt_memory_recursiveprot,
+ Opt_memory_hugetlb_accounting,
nr__cgroup2_params
};
@@ -1910,6 +1930,7 @@ static const struct fs_parameter_spec cgroup2_fs_parameters[] = {
fsparam_flag("favordynmods", Opt_favordynmods),
fsparam_flag("memory_localevents", Opt_memory_localevents),
fsparam_flag("memory_recursiveprot", Opt_memory_recursiveprot),
+ fsparam_flag("memory_hugetlb_accounting", Opt_memory_hugetlb_accounting),
{}
};
@@ -1936,6 +1957,9 @@ static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param
case Opt_memory_recursiveprot:
ctx->flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT;
return 0;
+ case Opt_memory_hugetlb_accounting:
+ ctx->flags |= CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING;
+ return 0;
}
return -EINVAL;
}
@@ -1960,6 +1984,11 @@ static void apply_cgroup_root_flags(unsigned int root_flags)
cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT;
else
cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_RECURSIVE_PROT;
+
+ if (root_flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING)
+ cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING;
+ else
+ cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING;
}
}
@@ -1973,6 +2002,8 @@ static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root
seq_puts(seq, ",memory_localevents");
if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)
seq_puts(seq, ",memory_recursiveprot");
+ if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING)
+ seq_puts(seq, ",memory_hugetlb_accounting");
return 0;
}
@@ -2014,7 +2045,7 @@ void init_cgroup_root(struct cgroup_fs_context *ctx)
struct cgroup_root *root = ctx->root;
struct cgroup *cgrp = &root->cgrp;
- INIT_LIST_HEAD(&root->root_list);
+ INIT_LIST_HEAD_RCU(&root->root_list);
atomic_set(&root->nr_cgrps, 1);
cgrp->root = root;
init_cgroup_housekeeping(cgrp);
@@ -2097,7 +2128,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
* care of subsystems' refcounts, which are explicitly dropped in
* the failure exit path.
*/
- list_add(&root->root_list, &cgroup_roots);
+ list_add_rcu(&root->root_list, &cgroup_roots);
cgroup_root_count++;
/*
@@ -2243,9 +2274,9 @@ static int cgroup_init_fs_context(struct fs_context *fc)
fc->user_ns = get_user_ns(ctx->ns->user_ns);
fc->global = true;
-#ifdef CONFIG_CGROUP_FAVOR_DYNMODS
- ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS;
-#endif
+ if (have_favordynmods)
+ ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS;
+
return 0;
}
@@ -3867,14 +3898,6 @@ static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
return psi_trigger_poll(&ctx->psi.trigger, of->file, pt);
}
-static int cgroup_pressure_open(struct kernfs_open_file *of)
-{
- if (of->file->f_mode & FMODE_WRITE && !capable(CAP_SYS_RESOURCE))
- return -EPERM;
-
- return 0;
-}
-
static void cgroup_pressure_release(struct kernfs_open_file *of)
{
struct cgroup_file_ctx *ctx = of->priv;
@@ -4159,20 +4182,6 @@ static struct kernfs_ops cgroup_kf_ops = {
.seq_show = cgroup_seqfile_show,
};
-/* set uid and gid of cgroup dirs and files to that of the creator */
-static int cgroup_kn_set_ugid(struct kernfs_node *kn)
-{
- struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID,
- .ia_uid = current_fsuid(),
- .ia_gid = current_fsgid(), };
-
- if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) &&
- gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))
- return 0;
-
- return kernfs_setattr(kn, &iattr);
-}
-
static void cgroup_file_notify_timer(struct timer_list *timer)
{
cgroup_file_notify(container_of(timer, struct cgroup_file,
@@ -4185,25 +4194,18 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
char name[CGROUP_FILE_NAME_MAX];
struct kernfs_node *kn;
struct lock_class_key *key = NULL;
- int ret;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
key = &cft->lockdep_key;
#endif
kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name),
cgroup_file_mode(cft),
- GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
+ current_fsuid(), current_fsgid(),
0, cft->kf_ops, cft,
NULL, key);
if (IS_ERR(kn))
return PTR_ERR(kn);
- ret = cgroup_kn_set_ugid(kn);
- if (ret) {
- kernfs_remove(kn);
- return ret;
- }
-
if (cft->file_offset) {
struct cgroup_file *cfile = (void *)css + cft->file_offset;
@@ -4917,9 +4919,11 @@ repeat:
void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
struct css_task_iter *it)
{
+ unsigned long irqflags;
+
memset(it, 0, sizeof(*it));
- spin_lock_irq(&css_set_lock);
+ spin_lock_irqsave(&css_set_lock, irqflags);
it->ss = css->ss;
it->flags = flags;
@@ -4933,7 +4937,7 @@ void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
css_task_iter_advance(it);
- spin_unlock_irq(&css_set_lock);
+ spin_unlock_irqrestore(&css_set_lock, irqflags);
}
/**
@@ -4946,12 +4950,14 @@ void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
*/
struct task_struct *css_task_iter_next(struct css_task_iter *it)
{
+ unsigned long irqflags;
+
if (it->cur_task) {
put_task_struct(it->cur_task);
it->cur_task = NULL;
}
- spin_lock_irq(&css_set_lock);
+ spin_lock_irqsave(&css_set_lock, irqflags);
/* @it may be half-advanced by skips, finish advancing */
if (it->flags & CSS_TASK_ITER_SKIPPED)
@@ -4964,7 +4970,7 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
css_task_iter_advance(it);
}
- spin_unlock_irq(&css_set_lock);
+ spin_unlock_irqrestore(&css_set_lock, irqflags);
return it->cur_task;
}
@@ -4977,11 +4983,13 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
*/
void css_task_iter_end(struct css_task_iter *it)
{
+ unsigned long irqflags;
+
if (it->cur_cset) {
- spin_lock_irq(&css_set_lock);
+ spin_lock_irqsave(&css_set_lock, irqflags);
list_del(&it->iters_node);
put_css_set_locked(it->cur_cset);
- spin_unlock_irq(&css_set_lock);
+ spin_unlock_irqrestore(&css_set_lock, irqflags);
}
if (it->cur_dcset)
@@ -5275,7 +5283,6 @@ static struct cftype cgroup_psi_files[] = {
{
.name = "io.pressure",
.file_offset = offsetof(struct cgroup, psi_files[PSI_IO]),
- .open = cgroup_pressure_open,
.seq_show = cgroup_io_pressure_show,
.write = cgroup_io_pressure_write,
.poll = cgroup_pressure_poll,
@@ -5284,7 +5291,6 @@ static struct cftype cgroup_psi_files[] = {
{
.name = "memory.pressure",
.file_offset = offsetof(struct cgroup, psi_files[PSI_MEM]),
- .open = cgroup_pressure_open,
.seq_show = cgroup_memory_pressure_show,
.write = cgroup_memory_pressure_write,
.poll = cgroup_pressure_poll,
@@ -5293,7 +5299,6 @@ static struct cftype cgroup_psi_files[] = {
{
.name = "cpu.pressure",
.file_offset = offsetof(struct cgroup, psi_files[PSI_CPU]),
- .open = cgroup_pressure_open,
.seq_show = cgroup_cpu_pressure_show,
.write = cgroup_cpu_pressure_write,
.poll = cgroup_pressure_poll,
@@ -5303,7 +5308,6 @@ static struct cftype cgroup_psi_files[] = {
{
.name = "irq.pressure",
.file_offset = offsetof(struct cgroup, psi_files[PSI_IRQ]),
- .open = cgroup_pressure_open,
.seq_show = cgroup_irq_pressure_show,
.write = cgroup_irq_pressure_write,
.poll = cgroup_pressure_poll,
@@ -5604,7 +5608,9 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
goto out_cancel_ref;
/* create the directory */
- kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
+ kn = kernfs_create_dir_ns(parent->kn, name, mode,
+ current_fsuid(), current_fsgid(),
+ cgrp, NULL);
if (IS_ERR(kn)) {
ret = PTR_ERR(kn);
goto out_stat_exit;
@@ -5749,10 +5755,6 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
*/
kernfs_get(cgrp->kn);
- ret = cgroup_kn_set_ugid(cgrp->kn);
- if (ret)
- goto out_destroy;
-
ret = css_populate_dir(&cgrp->self);
if (ret)
goto out_destroy;
@@ -6121,7 +6123,7 @@ int __init cgroup_init(void)
if (cgroup1_ssid_disabled(ssid))
pr_info("Disabling %s control group subsystem in v1 mounts\n",
- ss->name);
+ ss->legacy_name);
cgrp_dfl_root.subsys_mask |= 1 << ss->id;
@@ -6253,7 +6255,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
if (!buf)
goto out;
- cgroup_lock();
+ rcu_read_lock();
spin_lock_irq(&css_set_lock);
for_each_root(root) {
@@ -6264,6 +6266,11 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
if (root == &cgrp_dfl_root && !READ_ONCE(cgrp_dfl_visible))
continue;
+ cgrp = task_cgroup_from_root(tsk, root);
+ /* The root has already been unmounted. */
+ if (!cgrp)
+ continue;
+
seq_printf(m, "%d:", root->hierarchy_id);
if (root != &cgrp_dfl_root)
for_each_subsys(ss, ssid)
@@ -6274,9 +6281,6 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
seq_printf(m, "%sname=%s", count ? "," : "",
root->name);
seq_putc(m, ':');
-
- cgrp = task_cgroup_from_root(tsk, root);
-
/*
* On traditional hierarchies, all zombie tasks show up as
* belonging to the root cgroup. On the default hierarchy,
@@ -6289,7 +6293,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) {
retval = cgroup_path_ns_locked(cgrp, buf, PATH_MAX,
current->nsproxy->cgroup_ns);
- if (retval >= PATH_MAX)
+ if (retval == -E2BIG)
retval = -ENAMETOOLONG;
if (retval < 0)
goto out_unlock;
@@ -6308,7 +6312,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
retval = 0;
out_unlock:
spin_unlock_irq(&css_set_lock);
- cgroup_unlock();
+ rcu_read_unlock();
kfree(buf);
out:
return retval;
@@ -6764,6 +6768,12 @@ static int __init enable_cgroup_debug(char *str)
}
__setup("cgroup_debug", enable_cgroup_debug);
+static int __init cgroup_favordynmods_setup(char *str)
+{
+ return (kstrtobool(str, &have_favordynmods) == 0);
+}
+__setup("cgroup_favordynmods=", cgroup_favordynmods_setup);
+
/**
* css_tryget_online_from_dir - get corresponding css from a cgroup dentry
* @dentry: directory dentry of interest
@@ -7050,7 +7060,8 @@ static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr,
"nsdelegate\n"
"favordynmods\n"
"memory_localevents\n"
- "memory_recursiveprot\n");
+ "memory_recursiveprot\n"
+ "memory_hugetlb_accounting\n");
}
static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features);
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 58ec88efa4f8..ba36c073304a 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -25,6 +25,7 @@
#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/cpuset.h>
+#include <linux/delay.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/kernel.h>
@@ -43,6 +44,7 @@
#include <linux/sched/isolation.h>
#include <linux/cgroup.h>
#include <linux/wait.h>
+#include <linux/workqueue.h>
DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);
@@ -75,16 +77,18 @@ enum prs_errcode {
PERR_NOCPUS,
PERR_HOTPLUG,
PERR_CPUSEMPTY,
+ PERR_HKEEPING,
};
static const char * const perr_strings[] = {
- [PERR_INVCPUS] = "Invalid cpu list in cpuset.cpus",
+ [PERR_INVCPUS] = "Invalid cpu list in cpuset.cpus.exclusive",
[PERR_INVPARENT] = "Parent is an invalid partition root",
[PERR_NOTPART] = "Parent is not a partition root",
[PERR_NOTEXCL] = "Cpu list in cpuset.cpus not exclusive",
[PERR_NOCPUS] = "Parent unable to distribute cpu downstream",
[PERR_HOTPLUG] = "No cpu available due to hotplug",
[PERR_CPUSEMPTY] = "cpuset.cpus is empty",
+ [PERR_HKEEPING] = "partition config conflicts with housekeeping setup",
};
struct cpuset {
@@ -121,14 +125,23 @@ struct cpuset {
nodemask_t effective_mems;
/*
- * CPUs allocated to child sub-partitions (default hierarchy only)
- * - CPUs granted by the parent = effective_cpus U subparts_cpus
- * - effective_cpus and subparts_cpus are mutually exclusive.
+ * Exclusive CPUs dedicated to current cgroup (default hierarchy only)
*
- * effective_cpus contains only onlined CPUs, but subparts_cpus
- * may have offlined ones.
+ * These exclusive CPUs must be a subset of cpus_allowed. A parent
+ * cgroup can only grant exclusive CPUs to one of its children.
+ *
+ * When the cgroup becomes a valid partition root, effective_xcpus
+ * defaults to cpus_allowed if not set. The effective_cpus of a valid
+ * partition root comes solely from its effective_xcpus and some of the
+ * effective_xcpus may be distributed to sub-partitions below & hence
+ * excluded from its effective_cpus.
+ */
+ cpumask_var_t effective_xcpus;
+
+ /*
+ * Exclusive CPUs as requested by the user (default hierarchy only)
*/
- cpumask_var_t subparts_cpus;
+ cpumask_var_t exclusive_cpus;
/*
* This is old Memory Nodes tasks took on.
@@ -156,8 +169,8 @@ struct cpuset {
/* for custom sched domain */
int relax_domain_level;
- /* number of CPUs in subparts_cpus */
- int nr_subparts_cpus;
+ /* number of valid sub-partitions */
+ int nr_subparts;
/* partition root state */
int partition_root_state;
@@ -183,9 +196,25 @@ struct cpuset {
/* Handle for cpuset.cpus.partition */
struct cgroup_file partition_file;
+
+ /* Remote partition sibling list anchored at remote_children */
+ struct list_head remote_sibling;
};
/*
+ * Exclusive CPUs distributed out to sub-partitions of top_cpuset
+ */
+static cpumask_var_t subpartitions_cpus;
+
+/*
+ * Exclusive CPUs in isolated partitions
+ */
+static cpumask_var_t isolated_cpus;
+
+/* List of remote partition root children */
+static struct list_head remote_children;
+
+/*
* Partition root states:
*
* 0 - member (not a partition root)
@@ -312,7 +341,7 @@ static inline int is_partition_invalid(const struct cpuset *cs)
*/
static inline void make_partition_invalid(struct cpuset *cs)
{
- if (is_partition_valid(cs))
+ if (cs->partition_root_state > 0)
cs->partition_root_state = -cs->partition_root_state;
}
@@ -334,6 +363,7 @@ static struct cpuset top_cpuset = {
.flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
(1 << CS_MEM_EXCLUSIVE)),
.partition_root_state = PRS_ROOT,
+ .remote_sibling = LIST_HEAD_INIT(top_cpuset.remote_sibling),
};
/**
@@ -469,7 +499,7 @@ static inline bool partition_is_populated(struct cpuset *cs,
if (cs->css.cgroup->nr_populated_csets)
return true;
- if (!excluded_child && !cs->nr_subparts_cpus)
+ if (!excluded_child && !cs->nr_subparts)
return cgroup_is_populated(cs->css.cgroup);
rcu_read_lock();
@@ -596,16 +626,18 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
*/
static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
{
- cpumask_var_t *pmask1, *pmask2, *pmask3;
+ cpumask_var_t *pmask1, *pmask2, *pmask3, *pmask4;
if (cs) {
pmask1 = &cs->cpus_allowed;
pmask2 = &cs->effective_cpus;
- pmask3 = &cs->subparts_cpus;
+ pmask3 = &cs->effective_xcpus;
+ pmask4 = &cs->exclusive_cpus;
} else {
pmask1 = &tmp->new_cpus;
pmask2 = &tmp->addmask;
pmask3 = &tmp->delmask;
+ pmask4 = NULL;
}
if (!zalloc_cpumask_var(pmask1, GFP_KERNEL))
@@ -617,8 +649,14 @@ static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
if (!zalloc_cpumask_var(pmask3, GFP_KERNEL))
goto free_two;
+ if (pmask4 && !zalloc_cpumask_var(pmask4, GFP_KERNEL))
+ goto free_three;
+
+
return 0;
+free_three:
+ free_cpumask_var(*pmask3);
free_two:
free_cpumask_var(*pmask2);
free_one:
@@ -636,7 +674,8 @@ static inline void free_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
if (cs) {
free_cpumask_var(cs->cpus_allowed);
free_cpumask_var(cs->effective_cpus);
- free_cpumask_var(cs->subparts_cpus);
+ free_cpumask_var(cs->effective_xcpus);
+ free_cpumask_var(cs->exclusive_cpus);
}
if (tmp) {
free_cpumask_var(tmp->new_cpus);
@@ -664,6 +703,8 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
cpumask_copy(trial->effective_cpus, cs->effective_cpus);
+ cpumask_copy(trial->effective_xcpus, cs->effective_xcpus);
+ cpumask_copy(trial->exclusive_cpus, cs->exclusive_cpus);
return trial;
}
@@ -677,6 +718,28 @@ static inline void free_cpuset(struct cpuset *cs)
kfree(cs);
}
+static inline struct cpumask *fetch_xcpus(struct cpuset *cs)
+{
+ return !cpumask_empty(cs->exclusive_cpus) ? cs->exclusive_cpus :
+ cpumask_empty(cs->effective_xcpus) ? cs->cpus_allowed
+ : cs->effective_xcpus;
+}
+
+/*
+ * cpusets_are_exclusive() - check if two cpusets are exclusive
+ *
+ * Return true if exclusive, false if not
+ */
+static inline bool cpusets_are_exclusive(struct cpuset *cs1, struct cpuset *cs2)
+{
+ struct cpumask *xcpus1 = fetch_xcpus(cs1);
+ struct cpumask *xcpus2 = fetch_xcpus(cs2);
+
+ if (cpumask_intersects(xcpus1, xcpus2))
+ return false;
+ return true;
+}
+
/*
* validate_change_legacy() - Validate conditions specific to legacy (v1)
* behavior.
@@ -776,9 +839,10 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
ret = -EINVAL;
cpuset_for_each_child(c, css, par) {
if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
- c != cur &&
- cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
- goto out;
+ c != cur) {
+ if (!cpusets_are_exclusive(trial, c))
+ goto out;
+ }
if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
c != cur &&
nodes_intersects(trial->mems_allowed, c->mems_allowed))
@@ -908,7 +972,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
csa = NULL;
/* Special case for the 99% of systems with one, full, sched domain */
- if (root_load_balance && !top_cpuset.nr_subparts_cpus) {
+ if (root_load_balance && !top_cpuset.nr_subparts) {
ndoms = 1;
doms = alloc_sched_domains(ndoms);
if (!doms)
@@ -1159,7 +1223,7 @@ static void rebuild_sched_domains_locked(void)
* should be the same as the active CPUs, so checking only top_cpuset
* is enough to detect racing CPU offlines.
*/
- if (!top_cpuset.nr_subparts_cpus &&
+ if (cpumask_empty(subpartitions_cpus) &&
!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
return;
@@ -1168,7 +1232,7 @@ static void rebuild_sched_domains_locked(void)
* root should be only a subset of the active CPUs. Since a CPU in any
* partition root could be offlined, all must be checked.
*/
- if (top_cpuset.nr_subparts_cpus) {
+ if (top_cpuset.nr_subparts) {
rcu_read_lock();
cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
if (!is_partition_valid(cs)) {
@@ -1232,7 +1296,7 @@ static void update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus)
*/
if (kthread_is_per_cpu(task))
continue;
- cpumask_andnot(new_cpus, possible_mask, cs->subparts_cpus);
+ cpumask_andnot(new_cpus, possible_mask, subpartitions_cpus);
} else {
cpumask_and(new_cpus, possible_mask, cs->effective_cpus);
}
@@ -1247,32 +1311,23 @@ static void update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus)
* @cs: the cpuset the need to recompute the new effective_cpus mask
* @parent: the parent cpuset
*
- * If the parent has subpartition CPUs, include them in the list of
- * allowable CPUs in computing the new effective_cpus mask. Since offlined
- * CPUs are not removed from subparts_cpus, we have to use cpu_active_mask
- * to mask those out.
+ * The result is valid only if the given cpuset isn't a partition root.
*/
static void compute_effective_cpumask(struct cpumask *new_cpus,
struct cpuset *cs, struct cpuset *parent)
{
- if (parent->nr_subparts_cpus && is_partition_valid(cs)) {
- cpumask_or(new_cpus, parent->effective_cpus,
- parent->subparts_cpus);
- cpumask_and(new_cpus, new_cpus, cs->cpus_allowed);
- cpumask_and(new_cpus, new_cpus, cpu_active_mask);
- } else {
- cpumask_and(new_cpus, cs->cpus_allowed, parent->effective_cpus);
- }
+ cpumask_and(new_cpus, cs->cpus_allowed, parent->effective_cpus);
}
/*
- * Commands for update_parent_subparts_cpumask
+ * Commands for update_parent_effective_cpumask
*/
-enum subparts_cmd {
- partcmd_enable, /* Enable partition root */
- partcmd_disable, /* Disable partition root */
- partcmd_update, /* Update parent's subparts_cpus */
- partcmd_invalidate, /* Make partition invalid */
+enum partition_cmd {
+ partcmd_enable, /* Enable partition root */
+ partcmd_enablei, /* Enable isolated partition root */
+ partcmd_disable, /* Disable partition root */
+ partcmd_update, /* Update parent's effective_cpus */
+ partcmd_invalidate, /* Make partition invalid */
};
static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
@@ -1304,13 +1359,23 @@ static int update_partition_exclusive(struct cpuset *cs, int new_prs)
*
* Changing load balance flag will automatically call
* rebuild_sched_domains_locked().
+ * This function is for cgroup v2 only.
*/
static void update_partition_sd_lb(struct cpuset *cs, int old_prs)
{
int new_prs = cs->partition_root_state;
- bool new_lb = (new_prs != PRS_ISOLATED);
bool rebuild_domains = (new_prs > 0) || (old_prs > 0);
+ bool new_lb;
+ /*
+ * If cs is not a valid partition root, the load balance state
+ * will follow its parent.
+ */
+ if (new_prs > 0) {
+ new_lb = (new_prs != PRS_ISOLATED);
+ } else {
+ new_lb = is_sched_load_balance(parent_cs(cs));
+ }
if (new_lb != !!is_sched_load_balance(cs)) {
rebuild_domains = true;
if (new_lb)
@@ -1323,35 +1388,417 @@ static void update_partition_sd_lb(struct cpuset *cs, int old_prs)
rebuild_sched_domains_locked();
}
+/*
+ * tasks_nocpu_error - Return true if tasks will have no effective_cpus
+ */
+static bool tasks_nocpu_error(struct cpuset *parent, struct cpuset *cs,
+ struct cpumask *xcpus)
+{
+ /*
+ * A populated partition (cs or parent) can't have empty effective_cpus
+ */
+ return (cpumask_subset(parent->effective_cpus, xcpus) &&
+ partition_is_populated(parent, cs)) ||
+ (!cpumask_intersects(xcpus, cpu_active_mask) &&
+ partition_is_populated(cs, NULL));
+}
+
+static void reset_partition_data(struct cpuset *cs)
+{
+ struct cpuset *parent = parent_cs(cs);
+
+ if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
+ return;
+
+ lockdep_assert_held(&callback_lock);
+
+ cs->nr_subparts = 0;
+ if (cpumask_empty(cs->exclusive_cpus)) {
+ cpumask_clear(cs->effective_xcpus);
+ if (is_cpu_exclusive(cs))
+ clear_bit(CS_CPU_EXCLUSIVE, &cs->flags);
+ }
+ if (!cpumask_and(cs->effective_cpus,
+ parent->effective_cpus, cs->cpus_allowed)) {
+ cs->use_parent_ecpus = true;
+ parent->child_ecpus_count++;
+ cpumask_copy(cs->effective_cpus, parent->effective_cpus);
+ }
+}
+
+/*
+ * partition_xcpus_newstate - Exclusive CPUs state change
+ * @old_prs: old partition_root_state
+ * @new_prs: new partition_root_state
+ * @xcpus: exclusive CPUs with state change
+ */
+static void partition_xcpus_newstate(int old_prs, int new_prs, struct cpumask *xcpus)
+{
+ WARN_ON_ONCE(old_prs == new_prs);
+ if (new_prs == PRS_ISOLATED)
+ cpumask_or(isolated_cpus, isolated_cpus, xcpus);
+ else
+ cpumask_andnot(isolated_cpus, isolated_cpus, xcpus);
+}
+
+/*
+ * partition_xcpus_add - Add new exclusive CPUs to partition
+ * @new_prs: new partition_root_state
+ * @parent: parent cpuset
+ * @xcpus: exclusive CPUs to be added
+ * Return: true if isolated_cpus modified, false otherwise
+ *
+ * Remote partition if parent == NULL
+ */
+static bool partition_xcpus_add(int new_prs, struct cpuset *parent,
+ struct cpumask *xcpus)
+{
+ bool isolcpus_updated;
+
+ WARN_ON_ONCE(new_prs < 0);
+ lockdep_assert_held(&callback_lock);
+ if (!parent)
+ parent = &top_cpuset;
+
+
+ if (parent == &top_cpuset)
+ cpumask_or(subpartitions_cpus, subpartitions_cpus, xcpus);
+
+ isolcpus_updated = (new_prs != parent->partition_root_state);
+ if (isolcpus_updated)
+ partition_xcpus_newstate(parent->partition_root_state, new_prs,
+ xcpus);
+
+ cpumask_andnot(parent->effective_cpus, parent->effective_cpus, xcpus);
+ return isolcpus_updated;
+}
+
+/*
+ * partition_xcpus_del - Remove exclusive CPUs from partition
+ * @old_prs: old partition_root_state
+ * @parent: parent cpuset
+ * @xcpus: exclusive CPUs to be removed
+ * Return: true if isolated_cpus modified, false otherwise
+ *
+ * Remote partition if parent == NULL
+ */
+static bool partition_xcpus_del(int old_prs, struct cpuset *parent,
+ struct cpumask *xcpus)
+{
+ bool isolcpus_updated;
+
+ WARN_ON_ONCE(old_prs < 0);
+ lockdep_assert_held(&callback_lock);
+ if (!parent)
+ parent = &top_cpuset;
+
+ if (parent == &top_cpuset)
+ cpumask_andnot(subpartitions_cpus, subpartitions_cpus, xcpus);
+
+ isolcpus_updated = (old_prs != parent->partition_root_state);
+ if (isolcpus_updated)
+ partition_xcpus_newstate(old_prs, parent->partition_root_state,
+ xcpus);
+
+ cpumask_and(xcpus, xcpus, cpu_active_mask);
+ cpumask_or(parent->effective_cpus, parent->effective_cpus, xcpus);
+ return isolcpus_updated;
+}
+
+static void update_unbound_workqueue_cpumask(bool isolcpus_updated)
+{
+ int ret;
+
+ lockdep_assert_cpus_held();
+
+ if (!isolcpus_updated)
+ return;
+
+ ret = workqueue_unbound_exclude_cpumask(isolated_cpus);
+ WARN_ON_ONCE(ret < 0);
+}
+
+/**
+ * cpuset_cpu_is_isolated - Check if the given CPU is isolated
+ * @cpu: the CPU number to be checked
+ * Return: true if CPU is used in an isolated partition, false otherwise
+ */
+bool cpuset_cpu_is_isolated(int cpu)
+{
+ return cpumask_test_cpu(cpu, isolated_cpus);
+}
+EXPORT_SYMBOL_GPL(cpuset_cpu_is_isolated);
+
+/*
+ * compute_effective_exclusive_cpumask - compute effective exclusive CPUs
+ * @cs: cpuset
+ * @xcpus: effective exclusive CPUs value to be set
+ * Return: true if xcpus is not empty, false otherwise.
+ *
+ * Starting with exclusive_cpus (cpus_allowed if exclusive_cpus is not set),
+ * it must be a subset of cpus_allowed and parent's effective_xcpus.
+ */
+static bool compute_effective_exclusive_cpumask(struct cpuset *cs,
+ struct cpumask *xcpus)
+{
+ struct cpuset *parent = parent_cs(cs);
+
+ if (!xcpus)
+ xcpus = cs->effective_xcpus;
+
+ if (!cpumask_empty(cs->exclusive_cpus))
+ cpumask_and(xcpus, cs->exclusive_cpus, cs->cpus_allowed);
+ else
+ cpumask_copy(xcpus, cs->cpus_allowed);
+
+ return cpumask_and(xcpus, xcpus, parent->effective_xcpus);
+}
+
+static inline bool is_remote_partition(struct cpuset *cs)
+{
+ return !list_empty(&cs->remote_sibling);
+}
+
+static inline bool is_local_partition(struct cpuset *cs)
+{
+ return is_partition_valid(cs) && !is_remote_partition(cs);
+}
+
+/*
+ * remote_partition_enable - Enable current cpuset as a remote partition root
+ * @cs: the cpuset to update
+ * @new_prs: new partition_root_state
+ * @tmp: temporary masks
+ * Return: 1 if successful, 0 if error
+ *
+ * Enable the current cpuset to become a remote partition root taking CPUs
+ * directly from the top cpuset. cpuset_mutex must be held by the caller.
+ */
+static int remote_partition_enable(struct cpuset *cs, int new_prs,
+ struct tmpmasks *tmp)
+{
+ bool isolcpus_updated;
+
+ /*
+ * The user must have sysadmin privilege.
+ */
+ if (!capable(CAP_SYS_ADMIN))
+ return 0;
+
+ /*
+ * The requested exclusive_cpus must not be allocated to other
+ * partitions and it can't use up all the root's effective_cpus.
+ *
+ * Note that if there is any local partition root above it or
+ * remote partition root underneath it, its exclusive_cpus must
+ * have overlapped with subpartitions_cpus.
+ */
+ compute_effective_exclusive_cpumask(cs, tmp->new_cpus);
+ if (cpumask_empty(tmp->new_cpus) ||
+ cpumask_intersects(tmp->new_cpus, subpartitions_cpus) ||
+ cpumask_subset(top_cpuset.effective_cpus, tmp->new_cpus))
+ return 0;
+
+ spin_lock_irq(&callback_lock);
+ isolcpus_updated = partition_xcpus_add(new_prs, NULL, tmp->new_cpus);
+ list_add(&cs->remote_sibling, &remote_children);
+ if (cs->use_parent_ecpus) {
+ struct cpuset *parent = parent_cs(cs);
+
+ cs->use_parent_ecpus = false;
+ parent->child_ecpus_count--;
+ }
+ spin_unlock_irq(&callback_lock);
+ update_unbound_workqueue_cpumask(isolcpus_updated);
+
+ /*
+ * Propagate changes in top_cpuset's effective_cpus down the hierarchy.
+ */
+ update_tasks_cpumask(&top_cpuset, tmp->new_cpus);
+ update_sibling_cpumasks(&top_cpuset, NULL, tmp);
+ return 1;
+}
+
+/*
+ * remote_partition_disable - Remove current cpuset from remote partition list
+ * @cs: the cpuset to update
+ * @tmp: temporary masks
+ *
+ * The effective_cpus is also updated.
+ *
+ * cpuset_mutex must be held by the caller.
+ */
+static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp)
+{
+ bool isolcpus_updated;
+
+ compute_effective_exclusive_cpumask(cs, tmp->new_cpus);
+ WARN_ON_ONCE(!is_remote_partition(cs));
+ WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, subpartitions_cpus));
+
+ spin_lock_irq(&callback_lock);
+ list_del_init(&cs->remote_sibling);
+ isolcpus_updated = partition_xcpus_del(cs->partition_root_state,
+ NULL, tmp->new_cpus);
+ cs->partition_root_state = -cs->partition_root_state;
+ if (!cs->prs_err)
+ cs->prs_err = PERR_INVCPUS;
+ reset_partition_data(cs);
+ spin_unlock_irq(&callback_lock);
+ update_unbound_workqueue_cpumask(isolcpus_updated);
+
+ /*
+ * Propagate changes in top_cpuset's effective_cpus down the hierarchy.
+ */
+ update_tasks_cpumask(&top_cpuset, tmp->new_cpus);
+ update_sibling_cpumasks(&top_cpuset, NULL, tmp);
+}
+
+/*
+ * remote_cpus_update - cpus_exclusive change of remote partition
+ * @cs: the cpuset to be updated
+ * @newmask: the new effective_xcpus mask
+ * @tmp: temporary masks
+ *
+ * top_cpuset and subpartitions_cpus will be updated or partition can be
+ * invalidated.
+ */
+static void remote_cpus_update(struct cpuset *cs, struct cpumask *newmask,
+ struct tmpmasks *tmp)
+{
+ bool adding, deleting;
+ int prs = cs->partition_root_state;
+ int isolcpus_updated = 0;
+
+ if (WARN_ON_ONCE(!is_remote_partition(cs)))
+ return;
+
+ WARN_ON_ONCE(!cpumask_subset(cs->effective_xcpus, subpartitions_cpus));
+
+ if (cpumask_empty(newmask))
+ goto invalidate;
+
+ adding = cpumask_andnot(tmp->addmask, newmask, cs->effective_xcpus);
+ deleting = cpumask_andnot(tmp->delmask, cs->effective_xcpus, newmask);
+
+ /*
+ * Addition of remote CPUs is only allowed if those CPUs are
+ * not allocated to other partitions and there are effective_cpus
+ * left in the top cpuset.
+ */
+ if (adding && (!capable(CAP_SYS_ADMIN) ||
+ cpumask_intersects(tmp->addmask, subpartitions_cpus) ||
+ cpumask_subset(top_cpuset.effective_cpus, tmp->addmask)))
+ goto invalidate;
+
+ spin_lock_irq(&callback_lock);
+ if (adding)
+ isolcpus_updated += partition_xcpus_add(prs, NULL, tmp->addmask);
+ if (deleting)
+ isolcpus_updated += partition_xcpus_del(prs, NULL, tmp->delmask);
+ spin_unlock_irq(&callback_lock);
+ update_unbound_workqueue_cpumask(isolcpus_updated);
+
+ /*
+ * Propagate changes in top_cpuset's effective_cpus down the hierarchy.
+ */
+ update_tasks_cpumask(&top_cpuset, tmp->new_cpus);
+ update_sibling_cpumasks(&top_cpuset, NULL, tmp);
+ return;
+
+invalidate:
+ remote_partition_disable(cs, tmp);
+}
+
+/*
+ * remote_partition_check - check if a child remote partition needs update
+ * @cs: the cpuset to be updated
+ * @newmask: the new effective_xcpus mask
+ * @delmask: temporary mask for deletion (not in tmp)
+ * @tmp: temporary masks
+ *
+ * This should be called before the given cs has updated its cpus_allowed
+ * and/or effective_xcpus.
+ */
+static void remote_partition_check(struct cpuset *cs, struct cpumask *newmask,
+ struct cpumask *delmask, struct tmpmasks *tmp)
+{
+ struct cpuset *child, *next;
+ int disable_cnt = 0;
+
+ /*
+ * Compute the effective exclusive CPUs that will be deleted.
+ */
+ if (!cpumask_andnot(delmask, cs->effective_xcpus, newmask) ||
+ !cpumask_intersects(delmask, subpartitions_cpus))
+ return; /* No deletion of exclusive CPUs in partitions */
+
+ /*
+ * Searching the remote children list to look for those that will
+ * be impacted by the deletion of exclusive CPUs.
+ *
+ * Since a cpuset must be removed from the remote children list
+ * before it can go offline, and holding cpuset_mutex prevents
+ * any change in cpuset status, the RCU read lock isn't needed.
+ */
+ lockdep_assert_held(&cpuset_mutex);
+ list_for_each_entry_safe(child, next, &remote_children, remote_sibling)
+ if (cpumask_intersects(child->effective_cpus, delmask)) {
+ remote_partition_disable(child, tmp);
+ disable_cnt++;
+ }
+ if (disable_cnt)
+ rebuild_sched_domains_locked();
+}
+
+/*
+ * prstate_housekeeping_conflict - check for partition & housekeeping conflicts
+ * @prstate: partition root state to be checked
+ * @new_cpus: cpu mask
+ * Return: true if there is conflict, false otherwise
+ *
+ * CPUs outside of housekeeping_cpumask(HK_TYPE_DOMAIN) can only be used in
+ * an isolated partition.
+ */
+static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus)
+{
+ const struct cpumask *hk_domain = housekeeping_cpumask(HK_TYPE_DOMAIN);
+ bool all_in_hk = cpumask_subset(new_cpus, hk_domain);
+
+ if (!all_in_hk && (prstate != PRS_ISOLATED))
+ return true;
+
+ return false;
+}
+
/**
- * update_parent_subparts_cpumask - update subparts_cpus mask of parent cpuset
+ * update_parent_effective_cpumask - update effective_cpus mask of parent cpuset
* @cs: The cpuset that requests change in partition root state
* @cmd: Partition root state change command
* @newmask: Optional new cpumask for partcmd_update
* @tmp: Temporary addmask and delmask
* Return: 0 or a partition root state error code
*
- * For partcmd_enable, the cpuset is being transformed from a non-partition
- * root to a partition root. The cpus_allowed mask of the given cpuset will
- * be put into parent's subparts_cpus and taken away from parent's
- * effective_cpus. The function will return 0 if all the CPUs listed in
- * cpus_allowed can be granted or an error code will be returned.
+ * For partcmd_enable*, the cpuset is being transformed from a non-partition
+ * root to a partition root. The effective_xcpus (cpus_allowed if
+ * effective_xcpus not set) mask of the given cpuset will be taken away from
+ * parent's effective_cpus. The function will return 0 if all the CPUs listed
+ * in effective_xcpus can be granted or an error code will be returned.
*
* For partcmd_disable, the cpuset is being transformed from a partition
- * root back to a non-partition root. Any CPUs in cpus_allowed that are in
- * parent's subparts_cpus will be taken away from that cpumask and put back
- * into parent's effective_cpus. 0 will always be returned.
+ * root back to a non-partition root. Any CPUs in effective_xcpus will be
+ * given back to parent's effective_cpus. 0 will always be returned.
*
* For partcmd_update, if the optional newmask is specified, the cpu list is
- * to be changed from cpus_allowed to newmask. Otherwise, cpus_allowed is
+ * to be changed from effective_xcpus to newmask. Otherwise, effective_xcpus is
* assumed to remain the same. The cpuset should either be a valid or invalid
* partition root. The partition root state may change from valid to invalid
- * or vice versa. An error code will only be returned if transitioning from
+ * or vice versa. An error code will be returned if transitioning from
* invalid to valid violates the exclusivity rule.
*
* For partcmd_invalidate, the current partition will be made invalid.
*
- * The partcmd_enable and partcmd_disable commands are used by
+ * The partcmd_enable* and partcmd_disable commands are used by
* update_prstate(). An error code may be returned and the caller will check
* for error.
*
@@ -1361,19 +1808,49 @@ static void update_partition_sd_lb(struct cpuset *cs, int old_prs)
* check for error and so partition_root_state and prs_error will be updated
* directly.
*/
-static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd,
- struct cpumask *newmask,
- struct tmpmasks *tmp)
+static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
+ struct cpumask *newmask,
+ struct tmpmasks *tmp)
{
struct cpuset *parent = parent_cs(cs);
- int adding; /* Moving cpus from effective_cpus to subparts_cpus */
- int deleting; /* Moving cpus from subparts_cpus to effective_cpus */
+ int adding; /* Adding cpus to parent's effective_cpus */
+ int deleting; /* Deleting cpus from parent's effective_cpus */
int old_prs, new_prs;
int part_error = PERR_NONE; /* Partition error? */
+ int subparts_delta = 0;
+ struct cpumask *xcpus; /* cs effective_xcpus */
+ int isolcpus_updated = 0;
+ bool nocpu;
lockdep_assert_held(&cpuset_mutex);
/*
+ * new_prs will only be changed for the partcmd_update and
+ * partcmd_invalidate commands.
+ */
+ adding = deleting = false;
+ old_prs = new_prs = cs->partition_root_state;
+ xcpus = !cpumask_empty(cs->exclusive_cpus)
+ ? cs->effective_xcpus : cs->cpus_allowed;
+
+ if (cmd == partcmd_invalidate) {
+ if (is_prs_invalid(old_prs))
+ return 0;
+
+ /*
+ * Make the current partition invalid.
+ */
+ if (is_partition_valid(parent))
+ adding = cpumask_and(tmp->addmask,
+ xcpus, parent->effective_xcpus);
+ if (old_prs > 0) {
+ new_prs = -old_prs;
+ subparts_delta--;
+ }
+ goto write_error;
+ }
+
+ /*
* The parent must be a partition root.
* The new cpumask, if present, or the current cpus_allowed must
* not be empty.
@@ -1385,124 +1862,140 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd,
if (!newmask && cpumask_empty(cs->cpus_allowed))
return PERR_CPUSEMPTY;
- /*
- * new_prs will only be changed for the partcmd_update and
- * partcmd_invalidate commands.
- */
- adding = deleting = false;
- old_prs = new_prs = cs->partition_root_state;
- if (cmd == partcmd_enable) {
+ nocpu = tasks_nocpu_error(parent, cs, xcpus);
+
+ if ((cmd == partcmd_enable) || (cmd == partcmd_enablei)) {
/*
- * Enabling partition root is not allowed if cpus_allowed
- * doesn't overlap parent's cpus_allowed.
+ * Enabling partition root is not allowed if its
+ * effective_xcpus is empty or doesn't overlap with
+ * parent's effective_xcpus.
*/
- if (!cpumask_intersects(cs->cpus_allowed, parent->cpus_allowed))
+ if (cpumask_empty(xcpus) ||
+ !cpumask_intersects(xcpus, parent->effective_xcpus))
return PERR_INVCPUS;
+ if (prstate_housekeeping_conflict(new_prs, xcpus))
+ return PERR_HKEEPING;
+
/*
* A parent can be left with no CPU as long as there is no
* task directly associated with the parent partition.
*/
- if (cpumask_subset(parent->effective_cpus, cs->cpus_allowed) &&
- partition_is_populated(parent, cs))
+ if (nocpu)
return PERR_NOCPUS;
- cpumask_copy(tmp->addmask, cs->cpus_allowed);
- adding = true;
+ cpumask_copy(tmp->delmask, xcpus);
+ deleting = true;
+ subparts_delta++;
+ new_prs = (cmd == partcmd_enable) ? PRS_ROOT : PRS_ISOLATED;
} else if (cmd == partcmd_disable) {
/*
- * Need to remove cpus from parent's subparts_cpus for valid
- * partition root.
+ * May need to add cpus to parent's effective_cpus for
+ * valid partition root.
*/
- deleting = !is_prs_invalid(old_prs) &&
- cpumask_and(tmp->delmask, cs->cpus_allowed,
- parent->subparts_cpus);
- } else if (cmd == partcmd_invalidate) {
- if (is_prs_invalid(old_prs))
- return 0;
-
+ adding = !is_prs_invalid(old_prs) &&
+ cpumask_and(tmp->addmask, xcpus, parent->effective_xcpus);
+ if (adding)
+ subparts_delta--;
+ new_prs = PRS_MEMBER;
+ } else if (newmask) {
/*
- * Make the current partition invalid. It is assumed that
- * invalidation is caused by violating cpu exclusivity rule.
+ * Empty cpumask is not allowed
*/
- deleting = cpumask_and(tmp->delmask, cs->cpus_allowed,
- parent->subparts_cpus);
- if (old_prs > 0) {
- new_prs = -old_prs;
- part_error = PERR_NOTEXCL;
+ if (cpumask_empty(newmask)) {
+ part_error = PERR_CPUSEMPTY;
+ goto write_error;
}
- } else if (newmask) {
+
/*
* partcmd_update with newmask:
*
- * Compute add/delete mask to/from subparts_cpus
+ * Compute add/delete mask to/from effective_cpus
*
- * delmask = cpus_allowed & ~newmask & parent->subparts_cpus
- * addmask = newmask & parent->cpus_allowed
- * & ~parent->subparts_cpus
+ * For valid partition:
+ * addmask = exclusive_cpus & ~newmask
+ * & parent->effective_xcpus
+ * delmask = newmask & ~exclusive_cpus
+ * & parent->effective_xcpus
+ *
+ * For invalid partition:
+ * delmask = newmask & parent->effective_xcpus
*/
- cpumask_andnot(tmp->delmask, cs->cpus_allowed, newmask);
- deleting = cpumask_and(tmp->delmask, tmp->delmask,
- parent->subparts_cpus);
+ if (is_prs_invalid(old_prs)) {
+ adding = false;
+ deleting = cpumask_and(tmp->delmask,
+ newmask, parent->effective_xcpus);
+ } else {
+ cpumask_andnot(tmp->addmask, xcpus, newmask);
+ adding = cpumask_and(tmp->addmask, tmp->addmask,
+ parent->effective_xcpus);
- cpumask_and(tmp->addmask, newmask, parent->cpus_allowed);
- adding = cpumask_andnot(tmp->addmask, tmp->addmask,
- parent->subparts_cpus);
- /*
- * Empty cpumask is not allowed
- */
- if (cpumask_empty(newmask)) {
- part_error = PERR_CPUSEMPTY;
+ cpumask_andnot(tmp->delmask, newmask, xcpus);
+ deleting = cpumask_and(tmp->delmask, tmp->delmask,
+ parent->effective_xcpus);
+ }
/*
* Make partition invalid if parent's effective_cpus could
* become empty and there are tasks in the parent.
*/
- } else if (adding &&
- cpumask_subset(parent->effective_cpus, tmp->addmask) &&
- !cpumask_intersects(tmp->delmask, cpu_active_mask) &&
- partition_is_populated(parent, cs)) {
+ if (nocpu && (!adding ||
+ !cpumask_intersects(tmp->addmask, cpu_active_mask))) {
part_error = PERR_NOCPUS;
- adding = false;
- deleting = cpumask_and(tmp->delmask, cs->cpus_allowed,
- parent->subparts_cpus);
+ deleting = false;
+ adding = cpumask_and(tmp->addmask,
+ xcpus, parent->effective_xcpus);
}
} else {
/*
- * partcmd_update w/o newmask:
+ * partcmd_update w/o newmask
+ *
+ * delmask = effective_xcpus & parent->effective_cpus
*
- * delmask = cpus_allowed & parent->subparts_cpus
- * addmask = cpus_allowed & parent->cpus_allowed
- * & ~parent->subparts_cpus
+ * This can be called from:
+ * 1) update_cpumasks_hier()
+ * 2) cpuset_hotplug_update_tasks()
*
- * This gets invoked either due to a hotplug event or from
- * update_cpumasks_hier(). This can cause the state of a
- * partition root to transition from valid to invalid or vice
- * versa. So we still need to compute the addmask and delmask.
-
- * A partition error happens when:
- * 1) Cpuset is valid partition, but parent does not distribute
- * out any CPUs.
- * 2) Parent has tasks and all its effective CPUs will have
- * to be distributed out.
+ * Check to see if it can be transitioned from valid to
+ * invalid partition or vice versa.
+ *
+ * A partition error happens when parent has tasks and all
+ * its effective CPUs will have to be distributed out.
*/
- cpumask_and(tmp->addmask, cs->cpus_allowed,
- parent->cpus_allowed);
- adding = cpumask_andnot(tmp->addmask, tmp->addmask,
- parent->subparts_cpus);
-
- if ((is_partition_valid(cs) && !parent->nr_subparts_cpus) ||
- (adding &&
- cpumask_subset(parent->effective_cpus, tmp->addmask) &&
- partition_is_populated(parent, cs))) {
+ WARN_ON_ONCE(!is_partition_valid(parent));
+ if (nocpu) {
part_error = PERR_NOCPUS;
- adding = false;
- }
+ if (is_partition_valid(cs))
+ adding = cpumask_and(tmp->addmask,
+ xcpus, parent->effective_xcpus);
+ } else if (is_partition_invalid(cs) &&
+ cpumask_subset(xcpus, parent->effective_xcpus)) {
+ struct cgroup_subsys_state *css;
+ struct cpuset *child;
+ bool exclusive = true;
- if (part_error && is_partition_valid(cs) &&
- parent->nr_subparts_cpus)
- deleting = cpumask_and(tmp->delmask, cs->cpus_allowed,
- parent->subparts_cpus);
+ /*
+ * Converting an invalid partition to a valid one
+ * has to pass the cpu exclusivity test.
+ */
+ rcu_read_lock();
+ cpuset_for_each_child(child, css, parent) {
+ if (child == cs)
+ continue;
+ if (!cpusets_are_exclusive(cs, child)) {
+ exclusive = false;
+ break;
+ }
+ }
+ rcu_read_unlock();
+ if (exclusive)
+ deleting = cpumask_and(tmp->delmask,
+ xcpus, parent->effective_cpus);
+ else
+ part_error = PERR_NOTEXCL;
+ }
}
+
+write_error:
if (part_error)
WRITE_ONCE(cs->prs_err, part_error);
@@ -1514,13 +2007,17 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd,
switch (cs->partition_root_state) {
case PRS_ROOT:
case PRS_ISOLATED:
- if (part_error)
+ if (part_error) {
new_prs = -old_prs;
+ subparts_delta--;
+ }
break;
case PRS_INVALID_ROOT:
case PRS_INVALID_ISOLATED:
- if (!part_error)
+ if (!part_error) {
new_prs = -old_prs;
+ subparts_delta++;
+ }
break;
}
}
@@ -1530,9 +2027,11 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd,
/*
* Transitioning between invalid to valid or vice versa may require
- * changing CS_CPU_EXCLUSIVE.
+ * changing CS_CPU_EXCLUSIVE. In the case of partcmd_update,
+ * validate_change() has already been successfully called and
+ * CPU lists in cs haven't been updated yet. So defer it to later.
*/
- if (old_prs != new_prs) {
+ if ((old_prs != new_prs) && (cmd != partcmd_update)) {
int err = update_partition_exclusive(cs, new_prs);
if (err)
@@ -1540,39 +2039,42 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd,
}
/*
- * Change the parent's subparts_cpus.
+ * Change the parent's effective_cpus & effective_xcpus (top cpuset
+ * only).
+ *
* Newly added CPUs will be removed from effective_cpus and
* newly deleted ones will be added back to effective_cpus.
*/
spin_lock_irq(&callback_lock);
- if (adding) {
- cpumask_or(parent->subparts_cpus,
- parent->subparts_cpus, tmp->addmask);
- cpumask_andnot(parent->effective_cpus,
- parent->effective_cpus, tmp->addmask);
+ if (old_prs != new_prs) {
+ cs->partition_root_state = new_prs;
+ if (new_prs <= 0)
+ cs->nr_subparts = 0;
}
- if (deleting) {
- cpumask_andnot(parent->subparts_cpus,
- parent->subparts_cpus, tmp->delmask);
- /*
- * Some of the CPUs in subparts_cpus might have been offlined.
- */
- cpumask_and(tmp->delmask, tmp->delmask, cpu_active_mask);
- cpumask_or(parent->effective_cpus,
- parent->effective_cpus, tmp->delmask);
+ /*
+ * Adding to parent's effective_cpus means deleting CPUs from cs
+ * and vice versa.
+ */
+ if (adding)
+ isolcpus_updated += partition_xcpus_del(old_prs, parent,
+ tmp->addmask);
+ if (deleting)
+ isolcpus_updated += partition_xcpus_add(new_prs, parent,
+ tmp->delmask);
+
+ if (is_partition_valid(parent)) {
+ parent->nr_subparts += subparts_delta;
+ WARN_ON_ONCE(parent->nr_subparts < 0);
}
-
- parent->nr_subparts_cpus = cpumask_weight(parent->subparts_cpus);
-
- if (old_prs != new_prs)
- cs->partition_root_state = new_prs;
-
spin_unlock_irq(&callback_lock);
+ update_unbound_workqueue_cpumask(isolcpus_updated);
+
+ if ((old_prs != new_prs) && (cmd == partcmd_update))
+ update_partition_exclusive(cs, new_prs);
if (adding || deleting) {
update_tasks_cpumask(parent, tmp->addmask);
- if (parent->child_ecpus_count)
- update_sibling_cpumasks(parent, cs, tmp);
+ update_sibling_cpumasks(parent, cs, tmp);
}
/*
@@ -1590,6 +2092,73 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd,
return 0;
}
+/**
+ * compute_partition_effective_cpumask - compute effective_cpus for partition
+ * @cs: partition root cpuset
+ * @new_ecpus: previously computed effective_cpus to be updated
+ *
+ * Compute the effective_cpus of a partition root by scanning effective_xcpus
+ * of child partition roots and excluding their effective_xcpus.
+ *
+ * This has the side effect of invalidating valid child partition roots,
+ * if necessary. Since it is called from either cpuset_hotplug_update_tasks()
+ * or update_cpumasks_hier() where parent and children are modified
+ * successively, we don't need to call update_parent_effective_cpumask()
+ * and the child's effective_cpus will be updated in later iterations.
+ *
+ * A child invalidated here gets prs_err set to PERR_INVCPUS or PERR_NOCPUS,
+ * has its nr_subparts reset to 0 and is subtracted from @cs->nr_subparts
+ * under callback_lock; notify_partition_change() is then called for it.
+ *
+ * Note that rcu_read_lock() is assumed to be held. The walk over the
+ * children below nevertheless takes rcu_read_lock() itself as well.
+ */
+static void compute_partition_effective_cpumask(struct cpuset *cs,
+ struct cpumask *new_ecpus)
+{
+ struct cgroup_subsys_state *css;
+ struct cpuset *child;
+ bool populated = partition_is_populated(cs, NULL);
+
+ /*
+ * Check child partition roots to see if they should be
+ * invalidated when
+ * 1) child effective_xcpus not a subset of
+ * cs->effective_xcpus
+ * 2) new_ecpus is a subset of the child's effective_xcpus
+ * (all the effective_cpus will be used up) and cs
+ * has tasks
+ */
+ compute_effective_exclusive_cpumask(cs, new_ecpus);
+ cpumask_and(new_ecpus, new_ecpus, cpu_active_mask);
+
+ rcu_read_lock();
+ cpuset_for_each_child(child, css, cs) {
+ if (!is_partition_valid(child))
+ continue;
+
+ child->prs_err = 0;
+ if (!cpumask_subset(child->effective_xcpus,
+ cs->effective_xcpus))
+ child->prs_err = PERR_INVCPUS;
+ else if (populated &&
+ cpumask_subset(new_ecpus, child->effective_xcpus))
+ child->prs_err = PERR_NOCPUS;
+
+ if (child->prs_err) {
+ int old_prs = child->partition_root_state;
+
+ /*
+ * Invalidate child partition and decrement
+ * cs->nr_subparts; its CPUs stay in new_ecpus.
+ */
+ spin_lock_irq(&callback_lock);
+ make_partition_invalid(child);
+ cs->nr_subparts--;
+ child->nr_subparts = 0;
+ spin_unlock_irq(&callback_lock);
+ notify_partition_change(child, old_prs);
+ continue;
+ }
+ cpumask_andnot(new_ecpus, new_ecpus,
+ child->effective_xcpus);
+ }
+ rcu_read_unlock();
+}
+
/*
* update_cpumasks_hier() flags
*/
@@ -1620,9 +2189,44 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
rcu_read_lock();
cpuset_for_each_descendant_pre(cp, pos_css, cs) {
struct cpuset *parent = parent_cs(cp);
+ bool remote = is_remote_partition(cp);
bool update_parent = false;
- compute_effective_cpumask(tmp->new_cpus, cp, parent);
+ /*
+ * Skip descendent remote partition that acquires CPUs
+ * directly from top cpuset unless it is cs.
+ */
+ if (remote && (cp != cs)) {
+ pos_css = css_rightmost_descendant(pos_css);
+ continue;
+ }
+
+ /*
+ * Update effective_xcpus if exclusive_cpus set.
+ * The case when exclusive_cpus isn't set is handled later.
+ */
+ if (!cpumask_empty(cp->exclusive_cpus) && (cp != cs)) {
+ spin_lock_irq(&callback_lock);
+ compute_effective_exclusive_cpumask(cp, NULL);
+ spin_unlock_irq(&callback_lock);
+ }
+
+ old_prs = new_prs = cp->partition_root_state;
+ if (remote || (is_partition_valid(parent) &&
+ is_partition_valid(cp)))
+ compute_partition_effective_cpumask(cp, tmp->new_cpus);
+ else
+ compute_effective_cpumask(tmp->new_cpus, cp, parent);
+
+ /*
+ * A partition with no effective_cpus is allowed as long as
+ * there is no task associated with it. Call
+ * update_parent_effective_cpumask() to check it.
+ */
+ if (is_partition_valid(cp) && cpumask_empty(tmp->new_cpus)) {
+ update_parent = true;
+ goto update_parent_effective;
+ }
/*
* If it becomes empty, inherit the effective mask of the
@@ -1630,11 +2234,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
* it is a partition root that has explicitly distributed
* out all its CPUs.
*/
- if (is_in_v2_mode() && cpumask_empty(tmp->new_cpus)) {
- if (is_partition_valid(cp) &&
- cpumask_equal(cp->cpus_allowed, cp->subparts_cpus))
- goto update_parent_subparts;
-
+ if (is_in_v2_mode() && !remote && cpumask_empty(tmp->new_cpus)) {
cpumask_copy(tmp->new_cpus, parent->effective_cpus);
if (!cp->use_parent_ecpus) {
cp->use_parent_ecpus = true;
@@ -1646,6 +2246,9 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
parent->child_ecpus_count--;
}
+ if (remote)
+ goto get_css;
+
/*
* Skip the whole subtree if
* 1) the cpumask remains the same,
@@ -1661,14 +2264,13 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
continue;
}
-update_parent_subparts:
+update_parent_effective:
/*
- * update_parent_subparts_cpumask() should have been called
+ * update_parent_effective_cpumask() should have been called
* for cs already in update_cpumask(). We should also call
* update_tasks_cpumask() again for tasks in the parent
- * cpuset if the parent's subparts_cpus changes.
+ * cpuset if the parent's effective_cpus changes.
*/
- old_prs = new_prs = cp->partition_root_state;
if ((cp != cs) && old_prs) {
switch (parent->partition_root_state) {
case PRS_ROOT:
@@ -1690,14 +2292,13 @@ update_parent_subparts:
break;
}
}
-
+get_css:
if (!css_tryget_online(&cp->css))
continue;
rcu_read_unlock();
if (update_parent) {
- update_parent_subparts_cpumask(cp, partcmd_update, NULL,
- tmp);
+ update_parent_effective_cpumask(cp, partcmd_update, NULL, tmp);
/*
* The cpuset partition_root_state may become
* invalid. Capture it.
@@ -1706,30 +2307,17 @@ update_parent_subparts:
}
spin_lock_irq(&callback_lock);
-
- if (cp->nr_subparts_cpus && !is_partition_valid(cp)) {
- /*
- * Put all active subparts_cpus back to effective_cpus.
- */
- cpumask_or(tmp->new_cpus, tmp->new_cpus,
- cp->subparts_cpus);
- cpumask_and(tmp->new_cpus, tmp->new_cpus,
- cpu_active_mask);
- cp->nr_subparts_cpus = 0;
- cpumask_clear(cp->subparts_cpus);
- }
-
cpumask_copy(cp->effective_cpus, tmp->new_cpus);
- if (cp->nr_subparts_cpus) {
- /*
- * Make sure that effective_cpus & subparts_cpus
- * are mutually exclusive.
- */
- cpumask_andnot(cp->effective_cpus, cp->effective_cpus,
- cp->subparts_cpus);
- }
-
cp->partition_root_state = new_prs;
+ /*
+ * Make sure effective_xcpus is properly set for a valid
+ * partition root.
+ */
+ if ((new_prs > 0) && cpumask_empty(cp->exclusive_cpus))
+ cpumask_and(cp->effective_xcpus,
+ cp->cpus_allowed, parent->effective_xcpus);
+ else if (new_prs < 0)
+ reset_partition_data(cp);
spin_unlock_irq(&callback_lock);
notify_partition_change(cp, old_prs);
@@ -1737,7 +2325,7 @@ update_parent_subparts:
WARN_ON(!is_in_v2_mode() &&
!cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
- update_tasks_cpumask(cp, tmp->new_cpus);
+ update_tasks_cpumask(cp, cp->effective_cpus);
/*
* On default hierarchy, inherit the CS_SCHED_LOAD_BALANCE
@@ -1790,8 +2378,13 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
/*
* Check all its siblings and call update_cpumasks_hier()
- * if their use_parent_ecpus flag is set in order for them
- * to use the right effective_cpus value.
+ * if their effective_cpus will need to be changed.
+ *
+ * With the addition of effective_xcpus, which is a subset of
+ * cpus_allowed, it is possible that a change in parent's effective_cpus
+ * due to a change in a child partition's effective_xcpus will impact
+ * its siblings even if they do not inherit parent's effective_cpus
+ * directly.
*
* The update_cpumasks_hier() function may sleep. So we have to
* release the RCU read lock before calling it. HIER_NO_SD_REBUILD
@@ -1802,8 +2395,13 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
cpuset_for_each_child(sibling, pos_css, parent) {
if (sibling == cs)
continue;
- if (!sibling->use_parent_ecpus)
- continue;
+ if (!sibling->use_parent_ecpus &&
+ !is_partition_valid(sibling)) {
+ compute_effective_cpumask(tmp->new_cpus, sibling,
+ parent);
+ if (cpumask_equal(tmp->new_cpus, sibling->effective_cpus))
+ continue;
+ }
if (!css_tryget_online(&sibling->css))
continue;
@@ -1826,7 +2424,9 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
{
int retval;
struct tmpmasks tmp;
+ struct cpuset *parent = parent_cs(cs);
bool invalidate = false;
+ int hier_flags = 0;
int old_prs = cs->partition_root_state;
/* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
@@ -1841,6 +2441,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
*/
if (!*buf) {
cpumask_clear(trialcs->cpus_allowed);
+ cpumask_clear(trialcs->effective_xcpus);
} else {
retval = cpulist_parse(buf, trialcs->cpus_allowed);
if (retval < 0)
@@ -1849,6 +2450,15 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
if (!cpumask_subset(trialcs->cpus_allowed,
top_cpuset.cpus_allowed))
return -EINVAL;
+
+ /*
+ * When exclusive_cpus isn't explicitly set, it is constrained
+ * by cpus_allowed and parent's effective_xcpus. Otherwise,
+ * trialcs->effective_xcpus is used as a temporary cpumask
+ * for checking validity of the partition root.
+ */
+ if (!cpumask_empty(trialcs->exclusive_cpus) || is_partition_valid(cs))
+ compute_effective_exclusive_cpumask(trialcs, NULL);
}
/* Nothing to do if the cpus didn't change */
@@ -1858,11 +2468,32 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
if (alloc_cpumasks(NULL, &tmp))
return -ENOMEM;
+ if (old_prs) {
+ if (is_partition_valid(cs) &&
+ cpumask_empty(trialcs->effective_xcpus)) {
+ invalidate = true;
+ cs->prs_err = PERR_INVCPUS;
+ } else if (prstate_housekeeping_conflict(old_prs, trialcs->effective_xcpus)) {
+ invalidate = true;
+ cs->prs_err = PERR_HKEEPING;
+ } else if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus)) {
+ invalidate = true;
+ cs->prs_err = PERR_NOCPUS;
+ }
+ }
+
+ /*
+ * Check all the descendants in update_cpumasks_hier() if
+ * effective_xcpus is to be changed.
+ */
+ if (!cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus))
+ hier_flags = HIER_CHECKALL;
+
retval = validate_change(cs, trialcs);
if ((retval == -EINVAL) && cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
- struct cpuset *cp, *parent;
struct cgroup_subsys_state *css;
+ struct cpuset *cp;
/*
* The -EINVAL error code indicates that partition sibling
@@ -1873,70 +2504,168 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
*/
invalidate = true;
rcu_read_lock();
- parent = parent_cs(cs);
- cpuset_for_each_child(cp, css, parent)
+ cpuset_for_each_child(cp, css, parent) {
+ struct cpumask *xcpus = fetch_xcpus(trialcs);
+
if (is_partition_valid(cp) &&
- cpumask_intersects(trialcs->cpus_allowed, cp->cpus_allowed)) {
+ cpumask_intersects(xcpus, cp->effective_xcpus)) {
rcu_read_unlock();
- update_parent_subparts_cpumask(cp, partcmd_invalidate, NULL, &tmp);
+ update_parent_effective_cpumask(cp, partcmd_invalidate, NULL, &tmp);
rcu_read_lock();
}
+ }
rcu_read_unlock();
retval = 0;
}
+
if (retval < 0)
goto out_free;
- if (cs->partition_root_state) {
- if (invalidate)
- update_parent_subparts_cpumask(cs, partcmd_invalidate,
- NULL, &tmp);
+ if (is_partition_valid(cs) ||
+ (is_partition_invalid(cs) && !invalidate)) {
+ struct cpumask *xcpus = trialcs->effective_xcpus;
+
+ if (cpumask_empty(xcpus) && is_partition_invalid(cs))
+ xcpus = trialcs->cpus_allowed;
+
+ /*
+ * Call remote_cpus_update() to handle valid remote partition
+ */
+ if (is_remote_partition(cs))
+ remote_cpus_update(cs, xcpus, &tmp);
+ else if (invalidate)
+ update_parent_effective_cpumask(cs, partcmd_invalidate,
+ NULL, &tmp);
else
- update_parent_subparts_cpumask(cs, partcmd_update,
- trialcs->cpus_allowed, &tmp);
+ update_parent_effective_cpumask(cs, partcmd_update,
+ xcpus, &tmp);
+ } else if (!cpumask_empty(cs->exclusive_cpus)) {
+ /*
+ * Use trialcs->effective_cpus as a temp cpumask
+ */
+ remote_partition_check(cs, trialcs->effective_xcpus,
+ trialcs->effective_cpus, &tmp);
}
- compute_effective_cpumask(trialcs->effective_cpus, trialcs,
- parent_cs(cs));
spin_lock_irq(&callback_lock);
cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
+ cpumask_copy(cs->effective_xcpus, trialcs->effective_xcpus);
+ if ((old_prs > 0) && !is_partition_valid(cs))
+ reset_partition_data(cs);
+ spin_unlock_irq(&callback_lock);
+
+ /* effective_cpus/effective_xcpus will be updated here */
+ update_cpumasks_hier(cs, &tmp, hier_flags);
+
+ /* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */
+ if (cs->partition_root_state)
+ update_partition_sd_lb(cs, old_prs);
+out_free:
+ free_cpumasks(NULL, &tmp);
+ return 0;
+}
+
+/**
+ * update_exclusive_cpumask - update the exclusive_cpus mask of a cpuset
+ * @cs: the cpuset to consider
+ * @trialcs: trial cpuset
+ * @buf: buffer of cpu numbers written to this cpuset
+ *
+ * The tasks' cpumask will be updated if cs is a valid partition root.
+ *
+ * An empty @buf clears exclusive_cpus and effective_xcpus of @trialcs.
+ *
+ * Return: 0 on success or if exclusive_cpus is unchanged, a negative value
+ * from cpulist_parse(), a non-zero value from validate_change(), or -ENOMEM
+ * if the temporary cpumasks cannot be allocated.
+ */
+static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
+ const char *buf)
+{
+ int retval;
+ struct tmpmasks tmp;
+ struct cpuset *parent = parent_cs(cs);
+ bool invalidate = false;
+ int hier_flags = 0;
+ int old_prs = cs->partition_root_state;
+
+ if (!*buf) {
+ cpumask_clear(trialcs->exclusive_cpus);
+ cpumask_clear(trialcs->effective_xcpus);
+ } else {
+ retval = cpulist_parse(buf, trialcs->exclusive_cpus);
+ if (retval < 0)
+ return retval;
+ if (!is_cpu_exclusive(cs))
+ set_bit(CS_CPU_EXCLUSIVE, &trialcs->flags);
+ }
+
+ /* Nothing to do if the CPUs didn't change */
+ if (cpumask_equal(cs->exclusive_cpus, trialcs->exclusive_cpus))
+ return 0;
+
+ if (*buf)
+ compute_effective_exclusive_cpumask(trialcs, NULL);
/*
- * Make sure that subparts_cpus, if not empty, is a subset of
- * cpus_allowed. Clear subparts_cpus if partition not valid or
- * empty effective cpus with tasks.
+ * Check all the descendants in update_cpumasks_hier() if
+ * effective_xcpus is to be changed.
*/
- if (cs->nr_subparts_cpus) {
- if (!is_partition_valid(cs) ||
- (cpumask_subset(trialcs->effective_cpus, cs->subparts_cpus) &&
- partition_is_populated(cs, NULL))) {
- cs->nr_subparts_cpus = 0;
- cpumask_clear(cs->subparts_cpus);
+ if (!cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus))
+ hier_flags = HIER_CHECKALL;
+
+ retval = validate_change(cs, trialcs);
+ if (retval)
+ return retval;
+
+ /*
+ * Allocate the temporary cpumasks only after all the early-return
+ * checks above have passed, so that a failing validate_change()
+ * cannot leak them. Nothing above this point uses tmp.
+ */
+ if (alloc_cpumasks(NULL, &tmp))
+ return -ENOMEM;
+
+ if (old_prs) {
+ if (cpumask_empty(trialcs->effective_xcpus)) {
+ invalidate = true;
+ cs->prs_err = PERR_INVCPUS;
+ } else if (prstate_housekeeping_conflict(old_prs, trialcs->effective_xcpus)) {
+ invalidate = true;
+ cs->prs_err = PERR_HKEEPING;
+ } else if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus)) {
+ invalidate = true;
+ cs->prs_err = PERR_NOCPUS;
+ }
+
+ if (is_remote_partition(cs)) {
+ if (invalidate)
+ remote_partition_disable(cs, &tmp);
+ else
+ remote_cpus_update(cs, trialcs->effective_xcpus,
+ &tmp);
+ } else if (invalidate) {
+ update_parent_effective_cpumask(cs, partcmd_invalidate,
+ NULL, &tmp);
} else {
- cpumask_and(cs->subparts_cpus, cs->subparts_cpus,
- cs->cpus_allowed);
- cs->nr_subparts_cpus = cpumask_weight(cs->subparts_cpus);
+ update_parent_effective_cpumask(cs, partcmd_update,
+ trialcs->effective_xcpus, &tmp);
}
+ } else if (!cpumask_empty(trialcs->exclusive_cpus)) {
+ /*
+ * Use trialcs->effective_cpus as a temp cpumask
+ */
+ remote_partition_check(cs, trialcs->effective_xcpus,
+ trialcs->effective_cpus, &tmp);
}
+ spin_lock_irq(&callback_lock);
+ cpumask_copy(cs->exclusive_cpus, trialcs->exclusive_cpus);
+ cpumask_copy(cs->effective_xcpus, trialcs->effective_xcpus);
+ if ((old_prs > 0) && !is_partition_valid(cs))
+ reset_partition_data(cs);
spin_unlock_irq(&callback_lock);
- /* effective_cpus will be updated here */
- update_cpumasks_hier(cs, &tmp, 0);
-
- if (cs->partition_root_state) {
- struct cpuset *parent = parent_cs(cs);
-
- /*
- * For partition root, update the cpumasks of sibling
- * cpusets if they use parent's effective_cpus.
- */
- if (parent->child_ecpus_count)
- update_sibling_cpumasks(parent, cs, &tmp);
+ /*
+ * Call update_cpumasks_hier() to update effective_cpus/effective_xcpus
+ * of the subtree when it is a valid partition root or effective_xcpus
+ * is updated.
+ */
+ if (is_partition_valid(cs) || hier_flags)
+ update_cpumasks_hier(cs, &tmp, hier_flags);
- /* Update CS_SCHED_LOAD_BALANCE and/or sched_domains */
+ /* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */
+ if (cs->partition_root_state)
update_partition_sd_lb(cs, old_prs);
- }
-out_free:
+
free_cpumasks(NULL, &tmp);
return 0;
}
@@ -2315,27 +3044,39 @@ static int update_prstate(struct cpuset *cs, int new_prs)
int err = PERR_NONE, old_prs = cs->partition_root_state;
struct cpuset *parent = parent_cs(cs);
struct tmpmasks tmpmask;
+ bool new_xcpus_state = false;
if (old_prs == new_prs)
return 0;
/*
- * For a previously invalid partition root, leave it at being
- * invalid if new_prs is not "member".
+ * Treat a previously invalid partition root as if it is a "member".
*/
- if (new_prs && is_prs_invalid(old_prs)) {
- cs->partition_root_state = -new_prs;
- return 0;
- }
+ if (new_prs && is_prs_invalid(old_prs))
+ old_prs = PRS_MEMBER;
if (alloc_cpumasks(NULL, &tmpmask))
return -ENOMEM;
+ /*
+ * Setup effective_xcpus if not properly set yet, it will be cleared
+ * later if partition becomes invalid.
+ */
+ if ((new_prs > 0) && cpumask_empty(cs->exclusive_cpus)) {
+ spin_lock_irq(&callback_lock);
+ cpumask_and(cs->effective_xcpus,
+ cs->cpus_allowed, parent->effective_xcpus);
+ spin_unlock_irq(&callback_lock);
+ }
+
err = update_partition_exclusive(cs, new_prs);
if (err)
goto out;
if (!old_prs) {
+ enum partition_cmd cmd = (new_prs == PRS_ROOT)
+ ? partcmd_enable : partcmd_enablei;
+
/*
* cpus_allowed cannot be empty.
*/
@@ -2344,31 +3085,33 @@ static int update_prstate(struct cpuset *cs, int new_prs)
goto out;
}
- err = update_parent_subparts_cpumask(cs, partcmd_enable,
- NULL, &tmpmask);
+ err = update_parent_effective_cpumask(cs, cmd, NULL, &tmpmask);
+ /*
+ * If an attempt to become local partition root fails,
+ * try to become a remote partition root instead.
+ */
+ if (err && remote_partition_enable(cs, new_prs, &tmpmask))
+ err = 0;
} else if (old_prs && new_prs) {
/*
* A change in load balance state only, no change in cpumasks.
*/
- ;
+ new_xcpus_state = true;
} else {
/*
* Switching back to member is always allowed even if it
* disables child partitions.
*/
- update_parent_subparts_cpumask(cs, partcmd_disable, NULL,
- &tmpmask);
+ if (is_remote_partition(cs))
+ remote_partition_disable(cs, &tmpmask);
+ else
+ update_parent_effective_cpumask(cs, partcmd_disable,
+ NULL, &tmpmask);
/*
- * If there are child partitions, they will all become invalid.
+ * Invalidation of child partitions will be done in
+ * update_cpumasks_hier().
*/
- if (unlikely(cs->nr_subparts_cpus)) {
- spin_lock_irq(&callback_lock);
- cs->nr_subparts_cpus = 0;
- cpumask_clear(cs->subparts_cpus);
- compute_effective_cpumask(cs->effective_cpus, cs, parent);
- spin_unlock_irq(&callback_lock);
- }
}
out:
/*
@@ -2383,14 +3126,15 @@ out:
spin_lock_irq(&callback_lock);
cs->partition_root_state = new_prs;
WRITE_ONCE(cs->prs_err, err);
+ if (!is_partition_valid(cs))
+ reset_partition_data(cs);
+ else if (new_xcpus_state)
+ partition_xcpus_newstate(old_prs, new_prs, cs->effective_xcpus);
spin_unlock_irq(&callback_lock);
+ update_unbound_workqueue_cpumask(new_xcpus_state);
- /*
- * Update child cpusets, if present.
- * Force update if switching back to member.
- */
- if (!list_empty(&cs->css.children))
- update_cpumasks_hier(cs, &tmpmask, !new_prs ? HIER_CHECKALL : 0);
+ /* Force update if switching back to member */
+ update_cpumasks_hier(cs, &tmpmask, !new_prs ? HIER_CHECKALL : 0);
/* Update sched domains and load balance flag */
update_partition_sd_lb(cs, old_prs);
@@ -2639,7 +3383,7 @@ static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task)
guarantee_online_cpus(task, cpus_attach);
else
cpumask_andnot(cpus_attach, task_cpu_possible_mask(task),
- cs->subparts_cpus);
+ subpartitions_cpus);
/*
* can_attach beforehand should guarantee that this doesn't
* fail. TODO: have a better way to handle failure here
@@ -2742,6 +3486,9 @@ typedef enum {
FILE_EFFECTIVE_CPULIST,
FILE_EFFECTIVE_MEMLIST,
FILE_SUBPARTS_CPULIST,
+ FILE_EXCLUSIVE_CPULIST,
+ FILE_EFFECTIVE_XCPULIST,
+ FILE_ISOLATED_CPULIST,
FILE_CPU_EXCLUSIVE,
FILE_MEM_EXCLUSIVE,
FILE_MEM_HARDWALL,
@@ -2879,6 +3626,9 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
case FILE_CPULIST:
retval = update_cpumask(cs, trialcs, buf);
break;
+ case FILE_EXCLUSIVE_CPULIST:
+ retval = update_exclusive_cpumask(cs, trialcs, buf);
+ break;
case FILE_MEMLIST:
retval = update_nodemask(cs, trialcs, buf);
break;
@@ -2926,8 +3676,17 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
case FILE_EFFECTIVE_MEMLIST:
seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems));
break;
+ case FILE_EXCLUSIVE_CPULIST:
+ seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->exclusive_cpus));
+ break;
+ case FILE_EFFECTIVE_XCPULIST:
+ seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_xcpus));
+ break;
case FILE_SUBPARTS_CPULIST:
- seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->subparts_cpus));
+ seq_printf(sf, "%*pbl\n", cpumask_pr_args(subpartitions_cpus));
+ break;
+ case FILE_ISOLATED_CPULIST:
+ seq_printf(sf, "%*pbl\n", cpumask_pr_args(isolated_cpus));
break;
default:
ret = -EINVAL;
@@ -3200,10 +3959,33 @@ static struct cftype dfl_files[] = {
},
{
+ .name = "cpus.exclusive",
+ .seq_show = cpuset_common_seq_show,
+ .write = cpuset_write_resmask,
+ .max_write_len = (100U + 6 * NR_CPUS),
+ .private = FILE_EXCLUSIVE_CPULIST,
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
+
+ {
+ .name = "cpus.exclusive.effective",
+ .seq_show = cpuset_common_seq_show,
+ .private = FILE_EFFECTIVE_XCPULIST,
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
+
+ {
.name = "cpus.subpartitions",
.seq_show = cpuset_common_seq_show,
.private = FILE_SUBPARTS_CPULIST,
- .flags = CFTYPE_DEBUG,
+ .flags = CFTYPE_ONLY_ON_ROOT | CFTYPE_DEBUG,
+ },
+
+ {
+ .name = "cpus.isolated",
+ .seq_show = cpuset_common_seq_show,
+ .private = FILE_ISOLATED_CPULIST,
+ .flags = CFTYPE_ONLY_ON_ROOT,
},
{ } /* terminate */
@@ -3241,6 +4023,7 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
nodes_clear(cs->effective_mems);
fmeter_init(&cs->fmeter);
cs->relax_domain_level = -1;
+ INIT_LIST_HEAD(&cs->remote_sibling);
/* Set CS_MEMORY_MIGRATE for default hierarchy */
if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
@@ -3276,6 +4059,11 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
cs->effective_mems = parent->effective_mems;
cs->use_parent_ecpus = true;
parent->child_ecpus_count++;
+ /*
+ * Clear CS_SCHED_LOAD_BALANCE if parent is isolated
+ */
+ if (!is_sched_load_balance(parent))
+ clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
}
/*
@@ -3377,6 +4165,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
if (is_in_v2_mode()) {
cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
+ cpumask_copy(top_cpuset.effective_xcpus, cpu_possible_mask);
top_cpuset.mems_allowed = node_possible_map;
} else {
cpumask_copy(top_cpuset.cpus_allowed,
@@ -3515,16 +4304,22 @@ int __init cpuset_init(void)
{
BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL));
BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL));
- BUG_ON(!zalloc_cpumask_var(&top_cpuset.subparts_cpus, GFP_KERNEL));
+ BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_xcpus, GFP_KERNEL));
+ BUG_ON(!alloc_cpumask_var(&top_cpuset.exclusive_cpus, GFP_KERNEL));
+ BUG_ON(!zalloc_cpumask_var(&subpartitions_cpus, GFP_KERNEL));
+ BUG_ON(!zalloc_cpumask_var(&isolated_cpus, GFP_KERNEL));
cpumask_setall(top_cpuset.cpus_allowed);
nodes_setall(top_cpuset.mems_allowed);
cpumask_setall(top_cpuset.effective_cpus);
+ cpumask_setall(top_cpuset.effective_xcpus);
+ cpumask_setall(top_cpuset.exclusive_cpus);
nodes_setall(top_cpuset.effective_mems);
fmeter_init(&top_cpuset.fmeter);
set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
top_cpuset.relax_domain_level = -1;
+ INIT_LIST_HEAD(&remote_children);
BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL));
@@ -3625,6 +4420,30 @@ void cpuset_force_rebuild(void)
force_rebuild = true;
}
+/*
+ * Attempt to acquire a cpus_read_lock while a hotplug operation may be in
+ * progress. The callers in this file release it with cpus_read_unlock()
+ * once this returns true.
+ * Return: true if successful, false otherwise
+ *
+ * To avoid circular lock dependency between cpuset_mutex and cpus_read_lock,
+ * cpus_read_trylock() is used here to acquire the lock.
+ */
+static bool cpuset_hotplug_cpus_read_trylock(void)
+{
+ int retries = 0;
+
+ while (!cpus_read_trylock()) {
+ /*
+ * CPU hotplug still in progress. Retry 5 times
+ * with a 10ms wait before bailing out, i.e. give up
+ * on the 6th failed cpus_read_trylock() attempt.
+ */
+ if (++retries > 5)
+ return false;
+ msleep(10);
+ }
+ return true;
+}
+
/**
* cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
* @cs: cpuset in interest
@@ -3640,6 +4459,8 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
static nodemask_t new_mems;
bool cpus_updated;
bool mems_updated;
+ bool remote;
+ int partcmd = -1;
struct cpuset *parent;
retry:
wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
@@ -3659,29 +4480,25 @@ retry:
compute_effective_cpumask(&new_cpus, cs, parent);
nodes_and(new_mems, cs->mems_allowed, parent->effective_mems);
- if (cs->nr_subparts_cpus)
- /*
- * Make sure that CPUs allocated to child partitions
- * do not show up in effective_cpus.
- */
- cpumask_andnot(&new_cpus, &new_cpus, cs->subparts_cpus);
-
if (!tmp || !cs->partition_root_state)
goto update_tasks;
/*
- * In the unlikely event that a partition root has empty
- * effective_cpus with tasks, we will have to invalidate child
- * partitions, if present, by setting nr_subparts_cpus to 0 to
- * reclaim their cpus.
+ * Compute effective_cpus for valid partition root, may invalidate
+ * child partition roots if necessary.
*/
- if (cs->nr_subparts_cpus && is_partition_valid(cs) &&
- cpumask_empty(&new_cpus) && partition_is_populated(cs, NULL)) {
- spin_lock_irq(&callback_lock);
- cs->nr_subparts_cpus = 0;
- cpumask_clear(cs->subparts_cpus);
- spin_unlock_irq(&callback_lock);
+ remote = is_remote_partition(cs);
+ if (remote || (is_partition_valid(cs) && is_partition_valid(parent)))
+ compute_partition_effective_cpumask(cs, &new_cpus);
+
+ if (remote && cpumask_empty(&new_cpus) &&
+ partition_is_populated(cs, NULL) &&
+ cpuset_hotplug_cpus_read_trylock()) {
+ remote_partition_disable(cs, tmp);
compute_effective_cpumask(&new_cpus, cs, parent);
+ remote = false;
+ cpuset_force_rebuild();
+ cpus_read_unlock();
}
/*
@@ -3691,44 +4508,32 @@ retry:
* 2) parent is invalid or doesn't grant any cpus to child
* partitions.
*/
- if (is_partition_valid(cs) && (!parent->nr_subparts_cpus ||
- (cpumask_empty(&new_cpus) && partition_is_populated(cs, NULL)))) {
- int old_prs, parent_prs;
-
- update_parent_subparts_cpumask(cs, partcmd_disable, NULL, tmp);
- if (cs->nr_subparts_cpus) {
- spin_lock_irq(&callback_lock);
- cs->nr_subparts_cpus = 0;
- cpumask_clear(cs->subparts_cpus);
- spin_unlock_irq(&callback_lock);
- compute_effective_cpumask(&new_cpus, cs, parent);
- }
-
- old_prs = cs->partition_root_state;
- parent_prs = parent->partition_root_state;
- if (is_partition_valid(cs)) {
- spin_lock_irq(&callback_lock);
- make_partition_invalid(cs);
- spin_unlock_irq(&callback_lock);
- if (is_prs_invalid(parent_prs))
- WRITE_ONCE(cs->prs_err, PERR_INVPARENT);
- else if (!parent_prs)
- WRITE_ONCE(cs->prs_err, PERR_NOTPART);
- else
- WRITE_ONCE(cs->prs_err, PERR_HOTPLUG);
- notify_partition_change(cs, old_prs);
- }
- cpuset_force_rebuild();
- }
-
+ if (is_local_partition(cs) && (!is_partition_valid(parent) ||
+ tasks_nocpu_error(parent, cs, &new_cpus)))
+ partcmd = partcmd_invalidate;
/*
* On the other hand, an invalid partition root may be transitioned
* back to a regular one.
*/
- else if (is_partition_valid(parent) && is_partition_invalid(cs)) {
- update_parent_subparts_cpumask(cs, partcmd_update, NULL, tmp);
- if (is_partition_valid(cs))
+ else if (is_partition_valid(parent) && is_partition_invalid(cs))
+ partcmd = partcmd_update;
+
+ /*
+ * cpus_read_lock needs to be held before calling
+ * update_parent_effective_cpumask(). To avoid circular lock
+ * dependency between cpuset_mutex and cpus_read_lock,
+ * cpus_read_trylock() is used here to acquire the lock.
+ */
+ if (partcmd >= 0) {
+ if (!cpuset_hotplug_cpus_read_trylock())
+ goto update_tasks;
+
+ update_parent_effective_cpumask(cs, partcmd, NULL, tmp);
+ cpus_read_unlock();
+ if ((partcmd == partcmd_invalidate) || is_partition_valid(cs)) {
+ compute_partition_effective_cpumask(cs, &new_cpus);
cpuset_force_rebuild();
+ }
}
update_tasks:
@@ -3786,21 +4591,22 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
new_mems = node_states[N_MEMORY];
/*
- * If subparts_cpus is populated, it is likely that the check below
- * will produce a false positive on cpus_updated when the cpu list
- * isn't changed. It is extra work, but it is better to be safe.
+ * If subpartitions_cpus is populated, it is likely that the check
+ * below will produce a false positive on cpus_updated when the cpu
+ * list isn't changed. It is extra work, but it is better to be safe.
*/
- cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus);
+ cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus) ||
+ !cpumask_empty(subpartitions_cpus);
mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);
/*
- * In the rare case that hotplug removes all the cpus in subparts_cpus,
- * we assumed that cpus are updated.
+ * In the rare case that hotplug removes all the cpus in
+ * subpartitions_cpus, we assumed that cpus are updated.
*/
- if (!cpus_updated && top_cpuset.nr_subparts_cpus)
+ if (!cpus_updated && top_cpuset.nr_subparts)
cpus_updated = true;
- /* synchronize cpus_allowed to cpu_active_mask */
+ /* For v1, synchronize cpus_allowed to cpu_active_mask */
if (cpus_updated) {
spin_lock_irq(&callback_lock);
if (!on_dfl)
@@ -3808,17 +4614,16 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
/*
* Make sure that CPUs allocated to child partitions
* do not show up in effective_cpus. If no CPU is left,
- * we clear the subparts_cpus & let the child partitions
+ * we clear the subpartitions_cpus & let the child partitions
* fight for the CPUs again.
*/
- if (top_cpuset.nr_subparts_cpus) {
- if (cpumask_subset(&new_cpus,
- top_cpuset.subparts_cpus)) {
- top_cpuset.nr_subparts_cpus = 0;
- cpumask_clear(top_cpuset.subparts_cpus);
+ if (!cpumask_empty(subpartitions_cpus)) {
+ if (cpumask_subset(&new_cpus, subpartitions_cpus)) {
+ top_cpuset.nr_subparts = 0;
+ cpumask_clear(subpartitions_cpus);
} else {
cpumask_andnot(&new_cpus, &new_cpus,
- top_cpuset.subparts_cpus);
+ subpartitions_cpus);
}
}
cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
@@ -3950,7 +4755,7 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
* We first exclude cpus allocated to partitions. If there is no
* allowable online cpu left, we fall back to all possible cpus.
*/
- cpumask_andnot(pmask, possible_mask, top_cpuset.subparts_cpus);
+ cpumask_andnot(pmask, possible_mask, subpartitions_cpus);
if (!cpumask_intersects(pmask, cpu_online_mask))
cpumask_copy(pmask, possible_mask);
}
@@ -4287,7 +5092,7 @@ int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX,
current->nsproxy->cgroup_ns);
css_put(css);
- if (retval >= PATH_MAX)
+ if (retval == -E2BIG)
retval = -ENAMETOOLONG;
if (retval < 0)
goto out_free;
diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c
index 122dacb3a443..66d1708042a7 100644
--- a/kernel/cgroup/legacy_freezer.c
+++ b/kernel/cgroup/legacy_freezer.c
@@ -66,9 +66,15 @@ static struct freezer *parent_freezer(struct freezer *freezer)
bool cgroup_freezing(struct task_struct *task)
{
bool ret;
+ unsigned int state;
rcu_read_lock();
- ret = task_freezer(task)->state & CGROUP_FREEZING;
+ /* Check if the cgroup is still FREEZING, but not FROZEN. The extra
+ * !FROZEN check is required, because the FREEZING bit is not cleared
+ * when the state FROZEN is reached.
+ */
+ state = task_freezer(task)->state;
+ ret = (state & CGROUP_FREEZING) && !(state & CGROUP_FROZEN);
rcu_read_unlock();
return ret;
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index d80d7a608141..a8350d2d63e6 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -74,64 +74,109 @@ __bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
}
/**
- * cgroup_rstat_cpu_pop_updated - iterate and dismantle rstat_cpu updated tree
- * @pos: current position
- * @root: root of the tree to traversal
+ * cgroup_rstat_push_children - push children cgroups into the given list
+ * @head: current head of the list (= subtree root)
+ * @child: first child of the root
* @cpu: target cpu
+ * Return: A new singly linked list of cgroups to be flushed
*
- * Walks the updated rstat_cpu tree on @cpu from @root. %NULL @pos starts
- * the traversal and %NULL return indicates the end. During traversal,
- * each returned cgroup is unlinked from the tree. Must be called with the
- * matching cgroup_rstat_cpu_lock held.
+ * Iteratively traverse down the cgroup_rstat_cpu updated tree level by
+ * level and push all the parents first before their next level children
+ * into a singly linked list built from the tail backward like "pushing"
+ * cgroups into a stack. The root is pushed by the caller.
+ */
+static struct cgroup *cgroup_rstat_push_children(struct cgroup *head,
+ struct cgroup *child, int cpu)
+{
+ struct cgroup *chead = child; /* Head of child cgroup level */
+ struct cgroup *ghead = NULL; /* Head of grandchild cgroup level */
+ struct cgroup *parent, *grandchild;
+ struct cgroup_rstat_cpu *crstatc;
+
+ child->rstat_flush_next = NULL;
+
+next_level:
+ while (chead) {
+ child = chead;
+ chead = child->rstat_flush_next;
+ parent = cgroup_parent(child);
+
+ /* The updated_next list is terminated by the parent cgroup */
+ while (child != parent) {
+ child->rstat_flush_next = head;
+ head = child;
+ crstatc = cgroup_rstat_cpu(child, cpu);
+ grandchild = crstatc->updated_children;
+ if (grandchild != child) {
+ /* Push the grand child to the next level */
+ crstatc->updated_children = child;
+ grandchild->rstat_flush_next = ghead;
+ ghead = grandchild;
+ }
+ child = crstatc->updated_next;
+ crstatc->updated_next = NULL;
+ }
+ }
+
+ if (ghead) {
+ chead = ghead;
+ ghead = NULL;
+ goto next_level;
+ }
+ return head;
+}
+
+/**
+ * cgroup_rstat_updated_list - return a list of updated cgroups to be flushed
+ * @root: root of the cgroup subtree to traverse
+ * @cpu: target cpu
+ * Return: A singly linked list of cgroups to be flushed
+ *
+ * Walks the updated rstat_cpu tree on @cpu from @root. During traversal,
+ * each returned cgroup is unlinked from the updated tree.
*
* The only ordering guarantee is that, for a parent and a child pair
- * covered by a given traversal, if a child is visited, its parent is
- * guaranteed to be visited afterwards.
+ * covered by a given traversal, the child is before its parent in
+ * the list.
+ *
+ * Note that updated_children is self terminated and points to a list of
+ * child cgroups if not empty. Whereas updated_next is like a sibling link
+ * within the children list and terminated by the parent cgroup. An exception
+ * here is the cgroup root whose updated_next can be self terminated.
*/
-static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
- struct cgroup *root, int cpu)
+static struct cgroup *cgroup_rstat_updated_list(struct cgroup *root, int cpu)
{
- struct cgroup_rstat_cpu *rstatc;
- struct cgroup *parent;
-
- if (pos == root)
- return NULL;
+ raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
+ struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(root, cpu);
+ struct cgroup *head = NULL, *parent, *child;
+ unsigned long flags;
/*
- * We're gonna walk down to the first leaf and visit/remove it. We
- * can pick whatever unvisited node as the starting point.
+ * The _irqsave() is needed because cgroup_rstat_lock is
+ * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
+ * this lock with the _irq() suffix only disables interrupts on
+ * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
+ * interrupts on both configurations. The _irqsave() ensures
+ * that interrupts are always disabled and later restored.
*/
- if (!pos) {
- pos = root;
- /* return NULL if this subtree is not on-list */
- if (!cgroup_rstat_cpu(pos, cpu)->updated_next)
- return NULL;
- } else {
- pos = cgroup_parent(pos);
- }
+ raw_spin_lock_irqsave(cpu_lock, flags);
- /* walk down to the first leaf */
- while (true) {
- rstatc = cgroup_rstat_cpu(pos, cpu);
- if (rstatc->updated_children == pos)
- break;
- pos = rstatc->updated_children;
- }
+ /* Return NULL if this subtree is not on-list */
+ if (!rstatc->updated_next)
+ goto unlock_ret;
/*
- * Unlink @pos from the tree. As the updated_children list is
+ * Unlink @root from its parent. As the updated_children list is
* singly linked, we have to walk it to find the removal point.
- * However, due to the way we traverse, @pos will be the first
- * child in most cases. The only exception is @root.
*/
- parent = cgroup_parent(pos);
+ parent = cgroup_parent(root);
if (parent) {
struct cgroup_rstat_cpu *prstatc;
struct cgroup **nextp;
prstatc = cgroup_rstat_cpu(parent, cpu);
nextp = &prstatc->updated_children;
- while (*nextp != pos) {
+ while (*nextp != root) {
struct cgroup_rstat_cpu *nrstatc;
nrstatc = cgroup_rstat_cpu(*nextp, cpu);
@@ -142,7 +187,17 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
}
rstatc->updated_next = NULL;
- return pos;
+
+ /* Push @root to the list first before pushing the children */
+ head = root;
+ root->rstat_flush_next = NULL;
+ child = rstatc->updated_children;
+ rstatc->updated_children = root;
+ if (child != root)
+ head = cgroup_rstat_push_children(head, child, cpu);
+unlock_ret:
+ raw_spin_unlock_irqrestore(cpu_lock, flags);
+ return head;
}
/*
@@ -156,19 +211,16 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
* optimize away the callsite. Therefore, __weak is needed to ensure that the
* call is still emitted, by telling the compiler that we don't know what the
* function might eventually be.
- *
- * __diag_* below are needed to dismiss the missing prototype warning.
*/
-__diag_push();
-__diag_ignore_all("-Wmissing-prototypes",
- "kfuncs which will be used in BPF programs");
+
+__bpf_hook_start();
__weak noinline void bpf_rstat_flush(struct cgroup *cgrp,
struct cgroup *parent, int cpu)
{
}
-__diag_pop();
+__bpf_hook_end();
/* see cgroup_rstat_flush() */
static void cgroup_rstat_flush_locked(struct cgroup *cgrp)
@@ -179,21 +231,9 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp)
lockdep_assert_held(&cgroup_rstat_lock);
for_each_possible_cpu(cpu) {
- raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock,
- cpu);
- struct cgroup *pos = NULL;
- unsigned long flags;
+ struct cgroup *pos = cgroup_rstat_updated_list(cgrp, cpu);
- /*
- * The _irqsave() is needed because cgroup_rstat_lock is
- * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
- * this lock with the _irq() suffix only disables interrupts on
- * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
- * interrupts on both configurations. The _irqsave() ensures
- * that interrupts are always disabled and later restored.
- */
- raw_spin_lock_irqsave(cpu_lock, flags);
- while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) {
+ for (; pos; pos = pos->rstat_flush_next) {
struct cgroup_subsys_state *css;
cgroup_base_stat_flush(pos, cpu);
@@ -205,7 +245,6 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp)
css->ss->css_rstat_flush(css, cpu);
rcu_read_unlock();
}
- raw_spin_unlock_irqrestore(cpu_lock, flags);
/* play nice and yield if necessary */
if (need_resched() || spin_needbreak(&cgroup_rstat_lock)) {
diff --git a/kernel/configs/debug.config b/kernel/configs/debug.config
index e8db8d938661..4722b998a324 100644
--- a/kernel/configs/debug.config
+++ b/kernel/configs/debug.config
@@ -1,3 +1,5 @@
+# Help: Debugging for CI systems and finding regressions
+#
# The config is based on running daily CI for enterprise Linux distros to
# seek regressions on linux-next builds on different bare-metal and virtual
# platforms. It can be used for example,
diff --git a/kernel/configs/hardening.config b/kernel/configs/hardening.config
new file mode 100644
index 000000000000..95a400f042b1
--- /dev/null
+++ b/kernel/configs/hardening.config
@@ -0,0 +1,98 @@
+# Help: Basic kernel hardening options
+#
+# These are considered the basic kernel hardening, self-protection, and
+# attack surface reduction options. They are expected to have low (or
+# no) performance impact on most workloads, and have a reasonable level
+# of legacy API removals.
+
+# Make sure reporting of various hardening actions is possible.
+CONFIG_BUG=y
+
+# Basic kernel memory permission enforcement.
+CONFIG_STRICT_KERNEL_RWX=y
+CONFIG_STRICT_MODULE_RWX=y
+CONFIG_VMAP_STACK=y
+
+# Kernel image and memory ASLR.
+CONFIG_RANDOMIZE_BASE=y
+CONFIG_RANDOMIZE_MEMORY=y
+
+# Randomize allocator freelists, harden metadata.
+CONFIG_SLAB_FREELIST_RANDOM=y
+CONFIG_SLAB_FREELIST_HARDENED=y
+CONFIG_SHUFFLE_PAGE_ALLOCATOR=y
+CONFIG_RANDOM_KMALLOC_CACHES=y
+
+# Randomize kernel stack offset on syscall entry.
+CONFIG_RANDOMIZE_KSTACK_OFFSET_DEFAULT=y
+
+# Basic stack frame overflow protection.
+CONFIG_STACKPROTECTOR=y
+CONFIG_STACKPROTECTOR_STRONG=y
+
+# Basic buffer length bounds checking.
+CONFIG_HARDENED_USERCOPY=y
+CONFIG_FORTIFY_SOURCE=y
+
+# Basic array index bounds checking.
+CONFIG_UBSAN=y
+CONFIG_UBSAN_TRAP=y
+CONFIG_UBSAN_BOUNDS=y
+# CONFIG_UBSAN_SHIFT is not set
+# CONFIG_UBSAN_DIV_ZERO is not set
+# CONFIG_UBSAN_UNREACHABLE is not set
+# CONFIG_UBSAN_BOOL is not set
+# CONFIG_UBSAN_ENUM is not set
+# CONFIG_UBSAN_ALIGNMENT is not set
+CONFIG_UBSAN_SANITIZE_ALL=y
+
+# Linked list integrity checking.
+CONFIG_LIST_HARDENED=y
+
+# Initialize all heap variables to zero on allocation.
+CONFIG_INIT_ON_ALLOC_DEFAULT_ON=y
+
+# Initialize all stack variables to zero on function entry.
+CONFIG_INIT_STACK_ALL_ZERO=y
+
+# Wipe RAM at reboot via EFI. For more details, see:
+# https://trustedcomputinggroup.org/resource/pc-client-work-group-platform-reset-attack-mitigation-specification/
+# https://bugzilla.redhat.com/show_bug.cgi?id=1532058
+CONFIG_RESET_ATTACK_MITIGATION=y
+
+# Disable DMA between EFI hand-off and the kernel's IOMMU setup.
+CONFIG_EFI_DISABLE_PCI_DMA=y
+
+# Force IOMMU TLB invalidation so devices will never be able to access stale
+# data content.
+CONFIG_IOMMU_SUPPORT=y
+CONFIG_IOMMU_DEFAULT_DMA_STRICT=y
+
+# Do not allow direct physical memory access to non-device memory.
+CONFIG_STRICT_DEVMEM=y
+CONFIG_IO_STRICT_DEVMEM=y
+
+# Provide userspace with seccomp BPF API for syscall attack surface reduction.
+CONFIG_SECCOMP=y
+CONFIG_SECCOMP_FILTER=y
+
+# Provides some protections against SYN flooding.
+CONFIG_SYN_COOKIES=y
+
+# Attack surface reduction: do not autoload TTY line disciplines.
+# CONFIG_LDISC_AUTOLOAD is not set
+
+# Dangerous; enabling this disables userspace brk ASLR.
+# CONFIG_COMPAT_BRK is not set
+
+# Dangerous; exposes kernel text image layout.
+# CONFIG_PROC_KCORE is not set
+
+# Dangerous; enabling this disables userspace VDSO ASLR.
+# CONFIG_COMPAT_VDSO is not set
+
+# Attack surface reduction: Use the modern PTY interface (devpts) only.
+# CONFIG_LEGACY_PTYS is not set
+
+# Attack surface reduction: Use only modesetting video drivers.
+# CONFIG_DRM_LEGACY is not set
diff --git a/kernel/configs/kvm_guest.config b/kernel/configs/kvm_guest.config
index 208481d91090..d0877063d925 100644
--- a/kernel/configs/kvm_guest.config
+++ b/kernel/configs/kvm_guest.config
@@ -1,3 +1,4 @@
+# Help: Bootable as a KVM guest
CONFIG_NET=y
CONFIG_NET_CORE=y
CONFIG_NETDEVICES=y
diff --git a/kernel/configs/nopm.config b/kernel/configs/nopm.config
index 81ff07863576..ebfdc3d8aa9a 100644
--- a/kernel/configs/nopm.config
+++ b/kernel/configs/nopm.config
@@ -1,3 +1,5 @@
+# Help: Disable Power Management
+
CONFIG_PM=n
CONFIG_SUSPEND=n
CONFIG_HIBERNATION=n
diff --git a/kernel/configs/rust.config b/kernel/configs/rust.config
index 38a7c5362c9c..2c6e001a7284 100644
--- a/kernel/configs/rust.config
+++ b/kernel/configs/rust.config
@@ -1 +1,2 @@
+# Help: Enable Rust
CONFIG_RUST=y
diff --git a/kernel/configs/x86_debug.config b/kernel/configs/x86_debug.config
index 6fac5b405334..35f48671b8d5 100644
--- a/kernel/configs/x86_debug.config
+++ b/kernel/configs/x86_debug.config
@@ -1,3 +1,4 @@
+# Help: Debugging options for tip tree testing
CONFIG_X86_DEBUG_FPU=y
CONFIG_LOCK_STAT=y
CONFIG_DEBUG_VM=y
diff --git a/kernel/configs/xen.config b/kernel/configs/xen.config
index 436f806aa1ed..6878b9a49be8 100644
--- a/kernel/configs/xen.config
+++ b/kernel/configs/xen.config
@@ -1,3 +1,5 @@
+# Help: Bootable as a Xen guest
+#
# global stuff - these enable us to allow some
# of the not so generic stuff below for xen
CONFIG_PARAVIRT=y
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 6de7c6bb74ee..e6ec3ba4950b 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -659,11 +659,19 @@ static inline bool cpu_smt_thread_allowed(unsigned int cpu)
#endif
}
-static inline bool cpu_smt_allowed(unsigned int cpu)
+static inline bool cpu_bootable(unsigned int cpu)
{
if (cpu_smt_control == CPU_SMT_ENABLED && cpu_smt_thread_allowed(cpu))
return true;
+ /* All CPUs are bootable if controls are not configured */
+ if (cpu_smt_control == CPU_SMT_NOT_IMPLEMENTED)
+ return true;
+
+ /* All CPUs are bootable if CPU is not SMT capable */
+ if (cpu_smt_control == CPU_SMT_NOT_SUPPORTED)
+ return true;
+
if (topology_is_primary_thread(cpu))
return true;
@@ -685,7 +693,7 @@ bool cpu_smt_possible(void)
EXPORT_SYMBOL_GPL(cpu_smt_possible);
#else
-static inline bool cpu_smt_allowed(unsigned int cpu) { return true; }
+static inline bool cpu_bootable(unsigned int cpu) { return true; }
#endif
static inline enum cpuhp_state
@@ -788,10 +796,10 @@ static int bringup_wait_for_ap_online(unsigned int cpu)
* SMT soft disabling on X86 requires to bring the CPU out of the
* BIOS 'wait for SIPI' state in order to set the CR4.MCE bit. The
* CPU marked itself as booted_once in notify_cpu_starting() so the
- * cpu_smt_allowed() check will now return false if this is not the
+ * cpu_bootable() check will now return false if this is not the
* primary sibling.
*/
- if (!cpu_smt_allowed(cpu))
+ if (!cpu_bootable(cpu))
return -ECANCELED;
return 0;
}
@@ -1372,7 +1380,14 @@ static int takedown_cpu(unsigned int cpu)
cpuhp_bp_sync_dead(cpu);
tick_cleanup_dead_cpu(cpu);
+
+ /*
+ * Callbacks must be re-integrated right away to the RCU state machine.
+ * Otherwise an RCU callback could block a further teardown function
+ * waiting for its completion.
+ */
rcutree_migrate_callbacks(cpu);
+
return 0;
}
@@ -1388,10 +1403,10 @@ void cpuhp_report_idle_dead(void)
struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
BUG_ON(st->state != CPUHP_AP_OFFLINE);
- rcu_report_dead(smp_processor_id());
+ rcutree_report_cpu_dead();
st->state = CPUHP_AP_IDLE_DEAD;
/*
- * We cannot call complete after rcu_report_dead() so we delegate it
+ * We cannot call complete after rcutree_report_cpu_dead() so we delegate it
* to an online cpu.
*/
smp_call_function_single(cpumask_first(cpu_online_mask),
@@ -1515,11 +1530,14 @@ static int cpu_down_maps_locked(unsigned int cpu, enum cpuhp_state target)
/*
* Ensure that the control task does not run on the to be offlined
* CPU to prevent a deadlock against cfs_b->period_timer.
+ * Also keep at least one housekeeping cpu onlined to avoid generating
+ * an empty sched_domain span.
*/
- cpu = cpumask_any_but(cpu_online_mask, cpu);
- if (cpu >= nr_cpu_ids)
- return -EBUSY;
- return work_on_cpu(cpu, __cpu_down_maps_locked, &work);
+ for_each_cpu_and(cpu, cpu_online_mask, housekeeping_cpumask(HK_TYPE_DOMAIN)) {
+ if (cpu != work.cpu)
+ return work_on_cpu(cpu, __cpu_down_maps_locked, &work);
+ }
+ return -EBUSY;
}
static int cpu_down(unsigned int cpu, enum cpuhp_state target)
@@ -1617,7 +1635,7 @@ void notify_cpu_starting(unsigned int cpu)
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE);
- rcu_cpu_starting(cpu); /* Enables RCU usage on this CPU. */
+ rcutree_report_cpu_starting(cpu); /* Enables RCU usage on this CPU. */
cpumask_set_cpu(cpu, &cpus_booted_once_mask);
/*
@@ -1725,9 +1743,6 @@ static int cpu_up(unsigned int cpu, enum cpuhp_state target)
if (!cpu_possible(cpu)) {
pr_err("can't online cpu %d because it is not configured as may-hotadd at boot time\n",
cpu);
-#if defined(CONFIG_IA64)
- pr_err("please check additional_cpus= boot parameter\n");
-#endif
return -EINVAL;
}
@@ -1741,7 +1756,7 @@ static int cpu_up(unsigned int cpu, enum cpuhp_state target)
err = -EBUSY;
goto out;
}
- if (!cpu_smt_allowed(cpu)) {
+ if (!cpu_bootable(cpu)) {
err = -EPERM;
goto out;
}
@@ -2098,7 +2113,7 @@ static struct cpuhp_step cpuhp_hp_states[] = {
[CPUHP_HRTIMERS_PREPARE] = {
.name = "hrtimers:prepare",
.startup.single = hrtimers_prepare_cpu,
- .teardown.single = hrtimers_dead_cpu,
+ .teardown.single = NULL,
},
[CPUHP_SMPCFD_PREPARE] = {
.name = "smpcfd:prepare",
@@ -2110,11 +2125,6 @@ static struct cpuhp_step cpuhp_hp_states[] = {
.startup.single = relay_prepare_cpu,
.teardown.single = NULL,
},
- [CPUHP_SLAB_PREPARE] = {
- .name = "slab:prepare",
- .startup.single = slab_prepare_cpu,
- .teardown.single = slab_dead_cpu,
- },
[CPUHP_RCUTREE_PREP] = {
.name = "RCU/tree:prepare",
.startup.single = rcutree_prepare_cpu,
@@ -2190,6 +2200,12 @@ static struct cpuhp_step cpuhp_hp_states[] = {
.startup.single = NULL,
.teardown.single = smpcfd_dying_cpu,
},
+ [CPUHP_AP_HRTIMERS_DYING] = {
+ .name = "hrtimers:dying",
+ .startup.single = NULL,
+ .teardown.single = hrtimers_cpu_dying,
+ },
+
/* Entry state on starting. Interrupts enabled from here on. Transient
* state for synchronization */
[CPUHP_AP_ONLINE] = {
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 03a7932cde0a..75cd6a736d03 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -5,7 +5,6 @@
*/
#include <linux/buildid.h>
-#include <linux/crash_core.h>
#include <linux/init.h>
#include <linux/utsname.h>
#include <linux/vmalloc.h>
@@ -13,6 +12,8 @@
#include <linux/kexec.h>
#include <linux/memory.h>
#include <linux/cpuhotplug.h>
+#include <linux/memblock.h>
+#include <linux/kmemleak.h>
#include <asm/page.h>
#include <asm/sections.h>
@@ -33,6 +34,22 @@ u32 *vmcoreinfo_note;
/* trusted vmcoreinfo, e.g. we can make a copy in the crash memory */
static unsigned char *vmcoreinfo_data_safecopy;
+/* Location of the reserved area for the crash kernel */
+struct resource crashk_res = {
+ .name = "Crash kernel",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
+ .desc = IORES_DESC_CRASH_KERNEL
+};
+struct resource crashk_low_res = {
+ .name = "Crash kernel",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
+ .desc = IORES_DESC_CRASH_KERNEL
+};
+
/*
* parsing the "crashkernel" commandline
*
@@ -181,7 +198,7 @@ static __initdata char *suffix_tbl[] = {
* It returns 0 on success and -EINVAL on failure.
*/
static int __init parse_crashkernel_suffix(char *cmdline,
- unsigned long long *crash_size,
+ unsigned long long *crash_size,
const char *suffix)
{
char *cur = cmdline;
@@ -248,11 +265,11 @@ static int __init __parse_crashkernel(char *cmdline,
unsigned long long system_ram,
unsigned long long *crash_size,
unsigned long long *crash_base,
- const char *name,
const char *suffix)
{
- char *first_colon, *first_space;
- char *ck_cmdline;
+ char *first_colon, *first_space;
+ char *ck_cmdline;
+ char *name = "crashkernel=";
BUG_ON(!crash_size || !crash_base);
*crash_size = 0;
@@ -283,32 +300,53 @@ static int __init __parse_crashkernel(char *cmdline,
/*
* That function is the entry point for command line parsing and should be
* called from the arch-specific code.
+ *
+ * If crashkernel=,high|low is supported on architecture, non-NULL values
+ * should be passed to parameters 'low_size' and 'high'.
*/
int __init parse_crashkernel(char *cmdline,
unsigned long long system_ram,
unsigned long long *crash_size,
- unsigned long long *crash_base)
+ unsigned long long *crash_base,
+ unsigned long long *low_size,
+ bool *high)
{
- return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
- "crashkernel=", NULL);
-}
+ int ret;
-int __init parse_crashkernel_high(char *cmdline,
- unsigned long long system_ram,
- unsigned long long *crash_size,
- unsigned long long *crash_base)
-{
- return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
- "crashkernel=", suffix_tbl[SUFFIX_HIGH]);
-}
+ /* crashkernel=X[@offset] */
+ ret = __parse_crashkernel(cmdline, system_ram, crash_size,
+ crash_base, NULL);
+#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
+ /*
+ * If non-NULL 'high' passed in and no normal crashkernel
+ * setting detected, try parsing crashkernel=,high|low.
+ */
+ if (high && ret == -ENOENT) {
+ ret = __parse_crashkernel(cmdline, 0, crash_size,
+ crash_base, suffix_tbl[SUFFIX_HIGH]);
+ if (ret || !*crash_size)
+ return -EINVAL;
-int __init parse_crashkernel_low(char *cmdline,
- unsigned long long system_ram,
- unsigned long long *crash_size,
- unsigned long long *crash_base)
-{
- return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
- "crashkernel=", suffix_tbl[SUFFIX_LOW]);
+ /*
+ * crashkernel=Y,low can be specified or not, but invalid value
+ * is not allowed.
+ */
+ ret = __parse_crashkernel(cmdline, 0, low_size,
+ crash_base, suffix_tbl[SUFFIX_LOW]);
+ if (ret == -ENOENT) {
+ *low_size = DEFAULT_CRASH_KERNEL_LOW_SIZE;
+ ret = 0;
+ } else if (ret) {
+ return ret;
+ }
+
+ *high = true;
+ }
+#endif
+ if (!*crash_size)
+ ret = -EINVAL;
+
+ return ret;
}
/*
@@ -321,6 +359,119 @@ static int __init parse_crashkernel_dummy(char *arg)
}
early_param("crashkernel", parse_crashkernel_dummy);
+#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
+static int __init reserve_crashkernel_low(unsigned long long low_size)
+{
+#ifdef CONFIG_64BIT
+ unsigned long long low_base;
+
+ low_base = memblock_phys_alloc_range(low_size, CRASH_ALIGN, 0, CRASH_ADDR_LOW_MAX);
+ if (!low_base) {
+ pr_err("cannot allocate crashkernel low memory (size:0x%llx).\n", low_size);
+ return -ENOMEM;
+ }
+
+ pr_info("crashkernel low memory reserved: 0x%08llx - 0x%08llx (%lld MB)\n",
+ low_base, low_base + low_size, low_size >> 20);
+
+ crashk_low_res.start = low_base;
+ crashk_low_res.end = low_base + low_size - 1;
+#endif
+ return 0;
+}
+
+void __init reserve_crashkernel_generic(char *cmdline,
+ unsigned long long crash_size,
+ unsigned long long crash_base,
+ unsigned long long crash_low_size,
+ bool high)
+{
+ unsigned long long search_end = CRASH_ADDR_LOW_MAX, search_base = 0;
+ bool fixed_base = false;
+
+ /* User specifies base address explicitly. */
+ if (crash_base) {
+ fixed_base = true;
+ search_base = crash_base;
+ search_end = crash_base + crash_size;
+ } else if (high) {
+ search_base = CRASH_ADDR_LOW_MAX;
+ search_end = CRASH_ADDR_HIGH_MAX;
+ }
+
+retry:
+ crash_base = memblock_phys_alloc_range(crash_size, CRASH_ALIGN,
+ search_base, search_end);
+ if (!crash_base) {
+ /*
+ * For crashkernel=size[KMG]@offset[KMG], print out failure
+ * message if can't reserve the specified region.
+ */
+ if (fixed_base) {
+ pr_warn("crashkernel reservation failed - memory is in use.\n");
+ return;
+ }
+
+ /*
+ * For crashkernel=size[KMG], if the first attempt was for
+ * low memory, fall back to high memory, the minimum required
+ * low memory will be reserved later.
+ */
+ if (!high && search_end == CRASH_ADDR_LOW_MAX) {
+ search_end = CRASH_ADDR_HIGH_MAX;
+ search_base = CRASH_ADDR_LOW_MAX;
+ crash_low_size = DEFAULT_CRASH_KERNEL_LOW_SIZE;
+ goto retry;
+ }
+
+ /*
+ * For crashkernel=size[KMG],high, if the first attempt was
+ * for high memory, fall back to low memory.
+ */
+ if (high && search_end == CRASH_ADDR_HIGH_MAX) {
+ search_end = CRASH_ADDR_LOW_MAX;
+ search_base = 0;
+ goto retry;
+ }
+ pr_warn("cannot allocate crashkernel (size:0x%llx)\n",
+ crash_size);
+ return;
+ }
+
+ if ((crash_base >= CRASH_ADDR_LOW_MAX) &&
+ crash_low_size && reserve_crashkernel_low(crash_low_size)) {
+ memblock_phys_free(crash_base, crash_size);
+ return;
+ }
+
+ pr_info("crashkernel reserved: 0x%016llx - 0x%016llx (%lld MB)\n",
+ crash_base, crash_base + crash_size, crash_size >> 20);
+
+ /*
+ * The crashkernel memory will be removed from the kernel linear
+ * map. Inform kmemleak so that it won't try to access it.
+ */
+ kmemleak_ignore_phys(crash_base);
+ if (crashk_low_res.end)
+ kmemleak_ignore_phys(crashk_low_res.start);
+
+ crashk_res.start = crash_base;
+ crashk_res.end = crash_base + crash_size - 1;
+}
+
+static __init int insert_crashkernel_resources(void)
+{
+ if (crashk_res.start < crashk_res.end)
+ insert_resource(&iomem_resource, &crashk_res);
+
+ if (crashk_low_res.start < crashk_low_res.end)
+ insert_resource(&iomem_resource, &crashk_low_res);
+
+ return 0;
+}
+early_initcall(insert_crashkernel_resources);
+#endif
+
int crash_prepare_elf64_headers(struct crash_mem *mem, int need_kernel_map,
void **addr, unsigned long *sz)
{
@@ -409,9 +560,11 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int need_kernel_map,
phdr->p_filesz = phdr->p_memsz = mend - mstart + 1;
phdr->p_align = 0;
ehdr->e_phnum++;
- pr_debug("Crash PT_LOAD ELF header. phdr=%p vaddr=0x%llx, paddr=0x%llx, sz=0x%llx e_phnum=%d p_offset=0x%llx\n",
- phdr, phdr->p_vaddr, phdr->p_paddr, phdr->p_filesz,
- ehdr->e_phnum, phdr->p_offset);
+#ifdef CONFIG_KEXEC_FILE
+ kexec_dprintk("Crash PT_LOAD ELF header. phdr=%p vaddr=0x%llx, paddr=0x%llx, sz=0x%llx e_phnum=%d p_offset=0x%llx\n",
+ phdr, phdr->p_vaddr, phdr->p_paddr, phdr->p_filesz,
+ ehdr->e_phnum, phdr->p_offset);
+#endif
phdr++;
}
@@ -423,9 +576,8 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int need_kernel_map,
int crash_exclude_mem_range(struct crash_mem *mem,
unsigned long long mstart, unsigned long long mend)
{
- int i, j;
+ int i;
unsigned long long start, end, p_start, p_end;
- struct range temp_range = {0, 0};
for (i = 0; i < mem->nr_ranges; i++) {
start = mem->ranges[i].start;
@@ -433,72 +585,51 @@ int crash_exclude_mem_range(struct crash_mem *mem,
p_start = mstart;
p_end = mend;
- if (mstart > end || mend < start)
+ if (p_start > end)
continue;
+ /*
+ * Because the memory ranges in mem->ranges are stored in
+ * ascending order, when we detect `p_end < start`, we can
+ * immediately exit the for loop, as the subsequent memory
+ * ranges will definitely be outside the range we are looking
+ * for.
+ */
+ if (p_end < start)
+ break;
+
/* Truncate any area outside of range */
- if (mstart < start)
+ if (p_start < start)
p_start = start;
- if (mend > end)
+ if (p_end > end)
p_end = end;
/* Found completely overlapping range */
if (p_start == start && p_end == end) {
- mem->ranges[i].start = 0;
- mem->ranges[i].end = 0;
- if (i < mem->nr_ranges - 1) {
- /* Shift rest of the ranges to left */
- for (j = i; j < mem->nr_ranges - 1; j++) {
- mem->ranges[j].start =
- mem->ranges[j+1].start;
- mem->ranges[j].end =
- mem->ranges[j+1].end;
- }
-
- /*
- * Continue to check if there are another overlapping ranges
- * from the current position because of shifting the above
- * mem ranges.
- */
- i--;
- mem->nr_ranges--;
- continue;
- }
+ memmove(&mem->ranges[i], &mem->ranges[i + 1],
+ (mem->nr_ranges - (i + 1)) * sizeof(mem->ranges[i]));
+ i--;
mem->nr_ranges--;
- return 0;
- }
-
- if (p_start > start && p_end < end) {
+ } else if (p_start > start && p_end < end) {
/* Split original range */
+ if (mem->nr_ranges >= mem->max_nr_ranges)
+ return -ENOMEM;
+
+ memmove(&mem->ranges[i + 2], &mem->ranges[i + 1],
+ (mem->nr_ranges - (i + 1)) * sizeof(mem->ranges[i]));
+
mem->ranges[i].end = p_start - 1;
- temp_range.start = p_end + 1;
- temp_range.end = end;
+ mem->ranges[i + 1].start = p_end + 1;
+ mem->ranges[i + 1].end = end;
+
+ i++;
+ mem->nr_ranges++;
} else if (p_start != start)
mem->ranges[i].end = p_start - 1;
else
mem->ranges[i].start = p_end + 1;
- break;
}
- /* If a split happened, add the split to array */
- if (!temp_range.end)
- return 0;
-
- /* Split happened */
- if (i == mem->max_nr_ranges - 1)
- return -ENOMEM;
-
- /* Location where new range should go */
- j = i + 1;
- if (j < mem->nr_ranges) {
- /* Move over all ranges one slot towards the end */
- for (i = mem->nr_ranges - 1; i >= j; i--)
- mem->ranges[i + 1] = mem->ranges[i];
- }
-
- mem->ranges[j].start = temp_range.start;
- mem->ranges[j].end = temp_range.end;
- mem->nr_ranges++;
return 0;
}
@@ -660,7 +791,7 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_OFFSET(list_head, prev);
VMCOREINFO_OFFSET(vmap_area, va_start);
VMCOREINFO_OFFSET(vmap_area, list);
- VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER + 1);
+ VMCOREINFO_LENGTH(zone.free_area, NR_PAGE_ORDERS);
log_buf_vmcoreinfo_setup();
VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
VMCOREINFO_NUMBER(NR_FREE_PAGES);
@@ -740,6 +871,17 @@ subsys_initcall(crash_notes_memory_init);
#define pr_fmt(fmt) "crash hp: " fmt
/*
+ * Unlike kexec/kdump loading/unloading/jumping/shrinking, which rarely
+ * happen, many crash hotplug events may be notified within one short
+ * period, e.g. when a memory board is hot-added and its memory regions
+ * are onlined. So the mutex __crash_hotplug_lock is used to serialize
+ * crash hotplug handling specifically.
+ */
+static DEFINE_MUTEX(__crash_hotplug_lock);
+#define crash_hotplug_lock() mutex_lock(&__crash_hotplug_lock)
+#define crash_hotplug_unlock() mutex_unlock(&__crash_hotplug_lock)
+
+/*
* This routine is utilized when the crash_hotplug sysfs node is read.
* It reflects the kernel's ability/permission to update the crash
* elfcorehdr directly.
@@ -748,9 +890,11 @@ int crash_check_update_elfcorehdr(void)
{
int rc = 0;
+ crash_hotplug_lock();
/* Obtain lock while reading crash information */
if (!kexec_trylock()) {
pr_info("kexec_trylock() failed, elfcorehdr may be inaccurate\n");
+ crash_hotplug_unlock();
return 0;
}
if (kexec_crash_image) {
@@ -761,6 +905,7 @@ int crash_check_update_elfcorehdr(void)
}
/* Release lock now that update complete */
kexec_unlock();
+ crash_hotplug_unlock();
return rc;
}
@@ -783,9 +928,11 @@ static void crash_handle_hotplug_event(unsigned int hp_action, unsigned int cpu)
{
struct kimage *image;
+ crash_hotplug_lock();
/* Obtain lock while changing crash information */
if (!kexec_trylock()) {
pr_info("kexec_trylock() failed, elfcorehdr may be inaccurate\n");
+ crash_hotplug_unlock();
return;
}
@@ -852,6 +999,7 @@ static void crash_handle_hotplug_event(unsigned int hp_action, unsigned int cpu)
out:
/* Release lock now that update complete */
kexec_unlock();
+ crash_hotplug_unlock();
}
static int crash_memhp_notifier(struct notifier_block *nb, unsigned long val, void *v)
diff --git a/kernel/cred.c b/kernel/cred.c
index 98cb4eca23fb..c033a201c808 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -36,17 +36,13 @@ do { \
static struct kmem_cache *cred_jar;
/* init to 2 - one for init_task, one to ensure it is never freed */
-static struct group_info init_groups = { .usage = ATOMIC_INIT(2) };
+static struct group_info init_groups = { .usage = REFCOUNT_INIT(2) };
/*
* The initial credentials for the initial task
*/
struct cred init_cred = {
.usage = ATOMIC_INIT(4),
-#ifdef CONFIG_DEBUG_CREDENTIALS
- .subscribers = ATOMIC_INIT(2),
- .magic = CRED_MAGIC,
-#endif
.uid = GLOBAL_ROOT_UID,
.gid = GLOBAL_ROOT_GID,
.suid = GLOBAL_ROOT_UID,
@@ -66,31 +62,6 @@ struct cred init_cred = {
.ucounts = &init_ucounts,
};
-static inline void set_cred_subscribers(struct cred *cred, int n)
-{
-#ifdef CONFIG_DEBUG_CREDENTIALS
- atomic_set(&cred->subscribers, n);
-#endif
-}
-
-static inline int read_cred_subscribers(const struct cred *cred)
-{
-#ifdef CONFIG_DEBUG_CREDENTIALS
- return atomic_read(&cred->subscribers);
-#else
- return 0;
-#endif
-}
-
-static inline void alter_cred_subscribers(const struct cred *_cred, int n)
-{
-#ifdef CONFIG_DEBUG_CREDENTIALS
- struct cred *cred = (struct cred *) _cred;
-
- atomic_add(n, &cred->subscribers);
-#endif
-}
-
/*
* The RCU callback to actually dispose of a set of credentials
*/
@@ -100,20 +71,9 @@ static void put_cred_rcu(struct rcu_head *rcu)
kdebug("put_cred_rcu(%p)", cred);
-#ifdef CONFIG_DEBUG_CREDENTIALS
- if (cred->magic != CRED_MAGIC_DEAD ||
- atomic_read(&cred->usage) != 0 ||
- read_cred_subscribers(cred) != 0)
- panic("CRED: put_cred_rcu() sees %p with"
- " mag %x, put %p, usage %d, subscr %d\n",
- cred, cred->magic, cred->put_addr,
- atomic_read(&cred->usage),
- read_cred_subscribers(cred));
-#else
- if (atomic_read(&cred->usage) != 0)
- panic("CRED: put_cred_rcu() sees %p with usage %d\n",
- cred, atomic_read(&cred->usage));
-#endif
+ if (atomic_long_read(&cred->usage) != 0)
+ panic("CRED: put_cred_rcu() sees %p with usage %ld\n",
+ cred, atomic_long_read(&cred->usage));
security_cred_free(cred);
key_put(cred->session_keyring);
@@ -137,16 +97,10 @@ static void put_cred_rcu(struct rcu_head *rcu)
*/
void __put_cred(struct cred *cred)
{
- kdebug("__put_cred(%p{%d,%d})", cred,
- atomic_read(&cred->usage),
- read_cred_subscribers(cred));
-
- BUG_ON(atomic_read(&cred->usage) != 0);
-#ifdef CONFIG_DEBUG_CREDENTIALS
- BUG_ON(read_cred_subscribers(cred) != 0);
- cred->magic = CRED_MAGIC_DEAD;
- cred->put_addr = __builtin_return_address(0);
-#endif
+ kdebug("__put_cred(%p{%ld})", cred,
+ atomic_long_read(&cred->usage));
+
+ BUG_ON(atomic_long_read(&cred->usage) != 0);
BUG_ON(cred == current->cred);
BUG_ON(cred == current->real_cred);
@@ -162,23 +116,23 @@ EXPORT_SYMBOL(__put_cred);
*/
void exit_creds(struct task_struct *tsk)
{
- struct cred *cred;
+ struct cred *real_cred, *cred;
- kdebug("exit_creds(%u,%p,%p,{%d,%d})", tsk->pid, tsk->real_cred, tsk->cred,
- atomic_read(&tsk->cred->usage),
- read_cred_subscribers(tsk->cred));
+ kdebug("exit_creds(%u,%p,%p,{%ld})", tsk->pid, tsk->real_cred, tsk->cred,
+ atomic_long_read(&tsk->cred->usage));
- cred = (struct cred *) tsk->real_cred;
+ real_cred = (struct cred *) tsk->real_cred;
tsk->real_cred = NULL;
- validate_creds(cred);
- alter_cred_subscribers(cred, -1);
- put_cred(cred);
cred = (struct cred *) tsk->cred;
tsk->cred = NULL;
- validate_creds(cred);
- alter_cred_subscribers(cred, -1);
- put_cred(cred);
+
+ if (real_cred == cred) {
+ put_cred_many(cred, 2);
+ } else {
+ put_cred(real_cred);
+ put_cred(cred);
+ }
#ifdef CONFIG_KEYS_REQUEST_CACHE
key_put(tsk->cached_requested_key);
@@ -224,10 +178,7 @@ struct cred *cred_alloc_blank(void)
if (!new)
return NULL;
- atomic_set(&new->usage, 1);
-#ifdef CONFIG_DEBUG_CREDENTIALS
- new->magic = CRED_MAGIC;
-#endif
+ atomic_long_set(&new->usage, 1);
if (security_cred_alloc_blank(new, GFP_KERNEL_ACCOUNT) < 0)
goto error;
@@ -258,8 +209,6 @@ struct cred *prepare_creds(void)
const struct cred *old;
struct cred *new;
- validate_process_creds();
-
new = kmem_cache_alloc(cred_jar, GFP_KERNEL);
if (!new)
return NULL;
@@ -270,8 +219,7 @@ struct cred *prepare_creds(void)
memcpy(new, old, sizeof(struct cred));
new->non_rcu = 0;
- atomic_set(&new->usage, 1);
- set_cred_subscribers(new, 0);
+ atomic_long_set(&new->usage, 1);
get_group_info(new->group_info);
get_uid(new->user);
get_user_ns(new->user_ns);
@@ -294,7 +242,6 @@ struct cred *prepare_creds(void)
if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0)
goto error;
- validate_creds(new);
return new;
error:
@@ -355,12 +302,9 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
#endif
clone_flags & CLONE_THREAD
) {
- p->real_cred = get_cred(p->cred);
- get_cred(p->cred);
- alter_cred_subscribers(p->cred, 2);
- kdebug("share_creds(%p{%d,%d})",
- p->cred, atomic_read(&p->cred->usage),
- read_cred_subscribers(p->cred));
+ p->real_cred = get_cred_many(p->cred, 2);
+ kdebug("share_creds(%p{%ld})",
+ p->cred, atomic_long_read(&p->cred->usage));
inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
return 0;
}
@@ -399,8 +343,6 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
p->cred = p->real_cred = get_cred(new);
inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
- alter_cred_subscribers(new, 2);
- validate_creds(new);
return 0;
error_put:
@@ -452,17 +394,11 @@ int commit_creds(struct cred *new)
struct task_struct *task = current;
const struct cred *old = task->real_cred;
- kdebug("commit_creds(%p{%d,%d})", new,
- atomic_read(&new->usage),
- read_cred_subscribers(new));
+ kdebug("commit_creds(%p{%ld})", new,
+ atomic_long_read(&new->usage));
BUG_ON(task->cred != old);
-#ifdef CONFIG_DEBUG_CREDENTIALS
- BUG_ON(read_cred_subscribers(old) < 2);
- validate_creds(old);
- validate_creds(new);
-#endif
- BUG_ON(atomic_read(&new->usage) < 1);
+ BUG_ON(atomic_long_read(&new->usage) < 1);
get_cred(new); /* we will require a ref for the subj creds too */
@@ -497,14 +433,12 @@ int commit_creds(struct cred *new)
* RLIMIT_NPROC limits on user->processes have already been checked
* in set_user().
*/
- alter_cred_subscribers(new, 2);
if (new->user != old->user || new->user_ns != old->user_ns)
inc_rlimit_ucounts(new->ucounts, UCOUNT_RLIMIT_NPROC, 1);
rcu_assign_pointer(task->real_cred, new);
rcu_assign_pointer(task->cred, new);
if (new->user != old->user || new->user_ns != old->user_ns)
dec_rlimit_ucounts(old->ucounts, UCOUNT_RLIMIT_NPROC, 1);
- alter_cred_subscribers(old, -2);
/* send notifications */
if (!uid_eq(new->uid, old->uid) ||
@@ -520,8 +454,7 @@ int commit_creds(struct cred *new)
proc_id_connector(task, PROC_EVENT_GID);
/* release the old obj and subj refs both */
- put_cred(old);
- put_cred(old);
+ put_cred_many(old, 2);
return 0;
}
EXPORT_SYMBOL(commit_creds);
@@ -535,14 +468,10 @@ EXPORT_SYMBOL(commit_creds);
*/
void abort_creds(struct cred *new)
{
- kdebug("abort_creds(%p{%d,%d})", new,
- atomic_read(&new->usage),
- read_cred_subscribers(new));
+ kdebug("abort_creds(%p{%ld})", new,
+ atomic_long_read(&new->usage));
-#ifdef CONFIG_DEBUG_CREDENTIALS
- BUG_ON(read_cred_subscribers(new) != 0);
-#endif
- BUG_ON(atomic_read(&new->usage) < 1);
+ BUG_ON(atomic_long_read(&new->usage) < 1);
put_cred(new);
}
EXPORT_SYMBOL(abort_creds);
@@ -558,12 +487,8 @@ const struct cred *override_creds(const struct cred *new)
{
const struct cred *old = current->cred;
- kdebug("override_creds(%p{%d,%d})", new,
- atomic_read(&new->usage),
- read_cred_subscribers(new));
-
- validate_creds(old);
- validate_creds(new);
+ kdebug("override_creds(%p{%ld})", new,
+ atomic_long_read(&new->usage));
/*
* NOTE! This uses 'get_new_cred()' rather than 'get_cred()'.
@@ -572,18 +497,12 @@ const struct cred *override_creds(const struct cred *new)
* we are only installing the cred into the thread-synchronous
* '->cred' pointer, not the '->real_cred' pointer that is
* visible to other threads under RCU.
- *
- * Also note that we did validate_creds() manually, not depending
- * on the validation in 'get_cred()'.
*/
get_new_cred((struct cred *)new);
- alter_cred_subscribers(new, 1);
rcu_assign_pointer(current->cred, new);
- alter_cred_subscribers(old, -1);
- kdebug("override_creds() = %p{%d,%d}", old,
- atomic_read(&old->usage),
- read_cred_subscribers(old));
+ kdebug("override_creds() = %p{%ld}", old,
+ atomic_long_read(&old->usage));
return old;
}
EXPORT_SYMBOL(override_creds);
@@ -599,15 +518,10 @@ void revert_creds(const struct cred *old)
{
const struct cred *override = current->cred;
- kdebug("revert_creds(%p{%d,%d})", old,
- atomic_read(&old->usage),
- read_cred_subscribers(old));
+ kdebug("revert_creds(%p{%ld})", old,
+ atomic_long_read(&old->usage));
- validate_creds(old);
- validate_creds(override);
- alter_cred_subscribers(old, 1);
rcu_assign_pointer(current->cred, old);
- alter_cred_subscribers(override, -1);
put_cred(override);
}
EXPORT_SYMBOL(revert_creds);
@@ -727,12 +641,10 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
kdebug("prepare_kernel_cred() alloc %p", new);
old = get_task_cred(daemon);
- validate_creds(old);
*new = *old;
new->non_rcu = 0;
- atomic_set(&new->usage, 1);
- set_cred_subscribers(new, 0);
+ atomic_long_set(&new->usage, 1);
get_uid(new->user);
get_user_ns(new->user_ns);
get_group_info(new->group_info);
@@ -756,7 +668,6 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
goto error;
put_cred(old);
- validate_creds(new);
return new;
error:
@@ -821,109 +732,3 @@ int set_create_files_as(struct cred *new, struct inode *inode)
return security_kernel_create_files_as(new, inode);
}
EXPORT_SYMBOL(set_create_files_as);
-
-#ifdef CONFIG_DEBUG_CREDENTIALS
-
-bool creds_are_invalid(const struct cred *cred)
-{
- if (cred->magic != CRED_MAGIC)
- return true;
- return false;
-}
-EXPORT_SYMBOL(creds_are_invalid);
-
-/*
- * dump invalid credentials
- */
-static void dump_invalid_creds(const struct cred *cred, const char *label,
- const struct task_struct *tsk)
-{
- pr_err("%s credentials: %p %s%s%s\n",
- label, cred,
- cred == &init_cred ? "[init]" : "",
- cred == tsk->real_cred ? "[real]" : "",
- cred == tsk->cred ? "[eff]" : "");
- pr_err("->magic=%x, put_addr=%p\n",
- cred->magic, cred->put_addr);
- pr_err("->usage=%d, subscr=%d\n",
- atomic_read(&cred->usage),
- read_cred_subscribers(cred));
- pr_err("->*uid = { %d,%d,%d,%d }\n",
- from_kuid_munged(&init_user_ns, cred->uid),
- from_kuid_munged(&init_user_ns, cred->euid),
- from_kuid_munged(&init_user_ns, cred->suid),
- from_kuid_munged(&init_user_ns, cred->fsuid));
- pr_err("->*gid = { %d,%d,%d,%d }\n",
- from_kgid_munged(&init_user_ns, cred->gid),
- from_kgid_munged(&init_user_ns, cred->egid),
- from_kgid_munged(&init_user_ns, cred->sgid),
- from_kgid_munged(&init_user_ns, cred->fsgid));
-#ifdef CONFIG_SECURITY
- pr_err("->security is %p\n", cred->security);
- if ((unsigned long) cred->security >= PAGE_SIZE &&
- (((unsigned long) cred->security & 0xffffff00) !=
- (POISON_FREE << 24 | POISON_FREE << 16 | POISON_FREE << 8)))
- pr_err("->security {%x, %x}\n",
- ((u32*)cred->security)[0],
- ((u32*)cred->security)[1]);
-#endif
-}
-
-/*
- * report use of invalid credentials
- */
-void __noreturn __invalid_creds(const struct cred *cred, const char *file, unsigned line)
-{
- pr_err("Invalid credentials\n");
- pr_err("At %s:%u\n", file, line);
- dump_invalid_creds(cred, "Specified", current);
- BUG();
-}
-EXPORT_SYMBOL(__invalid_creds);
-
-/*
- * check the credentials on a process
- */
-void __validate_process_creds(struct task_struct *tsk,
- const char *file, unsigned line)
-{
- if (tsk->cred == tsk->real_cred) {
- if (unlikely(read_cred_subscribers(tsk->cred) < 2 ||
- creds_are_invalid(tsk->cred)))
- goto invalid_creds;
- } else {
- if (unlikely(read_cred_subscribers(tsk->real_cred) < 1 ||
- read_cred_subscribers(tsk->cred) < 1 ||
- creds_are_invalid(tsk->real_cred) ||
- creds_are_invalid(tsk->cred)))
- goto invalid_creds;
- }
- return;
-
-invalid_creds:
- pr_err("Invalid process credentials\n");
- pr_err("At %s:%u\n", file, line);
-
- dump_invalid_creds(tsk->real_cred, "Real", tsk);
- if (tsk->cred != tsk->real_cred)
- dump_invalid_creds(tsk->cred, "Effective", tsk);
- else
- pr_err("Effective creds == Real creds\n");
- BUG();
-}
-EXPORT_SYMBOL(__validate_process_creds);
-
-/*
- * check creds for do_exit()
- */
-void validate_creds_for_do_exit(struct task_struct *tsk)
-{
- kdebug("validate_creds_for_do_exit(%p,%p{%d,%d})",
- tsk->real_cred, tsk->cred,
- atomic_read(&tsk->cred->usage),
- read_cred_subscribers(tsk->cred));
-
- __validate_process_creds(tsk, __FILE__, __LINE__);
-}
-
-#endif /* CONFIG_DEBUG_CREDENTIALS */
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 621037a0aa87..ce1bb2301c06 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -1006,6 +1006,9 @@ void kgdb_panic(const char *msg)
if (panic_timeout)
return;
+ debug_locks_off();
+ console_flush_on_panic(CONSOLE_FLUSH_PENDING);
+
if (dbg_kdb_mode)
kdb_printf("PANIC: %s\n", msg);
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index 813cb6cf72d6..9443bc63c5a2 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -590,6 +590,8 @@ static void kdb_msg_write(const char *msg, int msg_len)
continue;
if (c == dbg_io_ops->cons)
continue;
+ if (!c->write)
+ continue;
/*
* Set oops_in_progress to encourage the console drivers to
* disregard their internal spin locks: in the current calling
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 438b868cbfa9..d05066cb40b2 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -272,11 +272,10 @@ char *kdbgetenv(const char *match)
* kdballocenv - This function is used to allocate bytes for
* environment entries.
* Parameters:
- * match A character string representing a numeric value
- * Outputs:
- * *value the unsigned long representation of the env variable 'match'
+ * bytes The number of bytes to allocate in the static buffer.
* Returns:
- * Zero on success, a kdb diagnostic on failure.
+ * A pointer to the allocated space in the buffer on success.
+ * NULL if bytes > size available in the envbuffer.
* Remarks:
* We use a static environment buffer (envbuffer) to hold the values
* of dynamically generated environment variables (see kdb_set). Buffer
@@ -1349,8 +1348,6 @@ do_full_getstr:
/* PROMPT can only be set if we have MEM_READ permission. */
snprintf(kdb_prompt_str, CMD_BUFLEN, kdbgetenv("PROMPT"),
raw_smp_processor_id());
- if (defcmd_in_progress)
- strncat(kdb_prompt_str, "[defcmd]", CMD_BUFLEN);
/*
* Fetch command from keyboard
diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index 4c1e9a3c0ab6..d62f5957f36b 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -135,6 +135,8 @@ config DMA_COHERENT_POOL
config DMA_GLOBAL_POOL
select DMA_DECLARE_COHERENT
+ depends on !ARCH_HAS_DMA_SET_UNCACHED
+ depends on !DMA_DIRECT_REMAP
bool
config DMA_DIRECT_REMAP
@@ -142,6 +144,15 @@ config DMA_DIRECT_REMAP
select DMA_COHERENT_POOL
select DMA_NONCOHERENT_MMAP
+#
+# Fallback to arch code for DMA allocations. This should eventually go away.
+#
+config ARCH_HAS_DMA_ALLOC
+ depends on !ARCH_HAS_DMA_SET_UNCACHED
+ depends on !DMA_DIRECT_REMAP
+ depends on !DMA_GLOBAL_POOL
+ bool
+
config DMA_CMA
bool "DMA Contiguous Memory Allocator"
depends on HAVE_DMA_CONTIGUOUS && CMA
@@ -160,7 +171,7 @@ if DMA_CMA
config DMA_NUMA_CMA
bool "Enable separate DMA Contiguous Memory Area for NUMA Node"
- default NUMA
+ depends on NUMA
help
Enable this option to get numa CMA areas so that NUMA devices
can get local memory by DMA coherent APIs.
diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c
index c21abc77c53e..ff5683a57f77 100644
--- a/kernel/dma/coherent.c
+++ b/kernel/dma/coherent.c
@@ -132,8 +132,10 @@ int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr,
void dma_release_coherent_memory(struct device *dev)
{
- if (dev)
+ if (dev) {
_dma_release_coherent_memory(dev->dma_mem);
+ dev->dma_mem = NULL;
+ }
}
static void *__dma_alloc_from_coherent(struct device *dev,
diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c
index 88c595e49e34..f005c66f378c 100644
--- a/kernel/dma/contiguous.c
+++ b/kernel/dma/contiguous.c
@@ -473,11 +473,6 @@ static int __init rmem_cma_setup(struct reserved_mem *rmem)
return -EBUSY;
}
- if (memblock_is_region_reserved(rmem->base, rmem->size)) {
- pr_info("Reserved memory: overlap with other memblock reserved region\n");
- return -EBUSY;
- }
-
if (!of_get_flat_dt_prop(node, "reusable", NULL) ||
of_get_flat_dt_prop(node, "no-map", NULL))
return -EINVAL;
diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index f190651bcadd..a6e3792b15f8 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -62,7 +62,8 @@ enum map_err_types {
* @pfn: page frame of the start address
* @offset: offset of mapping relative to pfn
* @map_err_type: track whether dma_mapping_error() was checked
- * @stacktrace: support backtraces when a violation is detected
+ * @stack_len: number of backtrace entries in @stack_entries
+ * @stack_entries: stack of backtrace history
*/
struct dma_debug_entry {
struct list_head list;
@@ -139,7 +140,7 @@ static const char *const maperr2str[] = {
static const char *type2name[] = {
[dma_debug_single] = "single",
- [dma_debug_sg] = "scather-gather",
+ [dma_debug_sg] = "scatter-gather",
[dma_debug_coherent] = "coherent",
[dma_debug_resource] = "resource",
};
@@ -637,15 +638,19 @@ static struct dma_debug_entry *__dma_entry_alloc(void)
return entry;
}
-static void __dma_entry_alloc_check_leak(void)
+/*
+ * This should be called outside of free_entries_lock scope to avoid potential
+ * deadlocks with serial consoles that use DMA.
+ */
+static void __dma_entry_alloc_check_leak(u32 nr_entries)
{
- u32 tmp = nr_total_entries % nr_prealloc_entries;
+ u32 tmp = nr_entries % nr_prealloc_entries;
/* Shout each time we tick over some multiple of the initial pool */
if (tmp < DMA_DEBUG_DYNAMIC_ENTRIES) {
pr_info("dma_debug_entry pool grown to %u (%u00%%)\n",
- nr_total_entries,
- (nr_total_entries / nr_prealloc_entries));
+ nr_entries,
+ (nr_entries / nr_prealloc_entries));
}
}
@@ -656,8 +661,10 @@ static void __dma_entry_alloc_check_leak(void)
*/
static struct dma_debug_entry *dma_entry_alloc(void)
{
+ bool alloc_check_leak = false;
struct dma_debug_entry *entry;
unsigned long flags;
+ u32 nr_entries;
spin_lock_irqsave(&free_entries_lock, flags);
if (num_free_entries == 0) {
@@ -667,13 +674,17 @@ static struct dma_debug_entry *dma_entry_alloc(void)
pr_err("debugging out of memory - disabling\n");
return NULL;
}
- __dma_entry_alloc_check_leak();
+ alloc_check_leak = true;
+ nr_entries = nr_total_entries;
}
entry = __dma_entry_alloc();
spin_unlock_irqrestore(&free_entries_lock, flags);
+ if (alloc_check_leak)
+ __dma_entry_alloc_check_leak(nr_entries);
+
#ifdef CONFIG_STACKTRACE
entry->stack_len = stack_trace_save(entry->stack_entries,
ARRAY_SIZE(entry->stack_entries),
@@ -866,7 +877,7 @@ static int dma_debug_device_change(struct notifier_block *nb, unsigned long acti
return 0;
}
-void dma_debug_add_bus(struct bus_type *bus)
+void dma_debug_add_bus(const struct bus_type *bus)
{
struct notifier_block *nb;
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 9596ae1aa0da..98b2e192fd69 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -220,13 +220,7 @@ void *dma_direct_alloc(struct device *dev, size_t size,
return dma_direct_alloc_no_mapping(dev, size, dma_handle, gfp);
if (!dev_is_dma_coherent(dev)) {
- /*
- * Fallback to the arch handler if it exists. This should
- * eventually go away.
- */
- if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) &&
- !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
- !IS_ENABLED(CONFIG_DMA_GLOBAL_POOL) &&
+ if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_ALLOC) &&
!is_swiotlb_for_alloc(dev))
return arch_dma_alloc(dev, size, dma_handle, gfp,
attrs);
@@ -240,27 +234,24 @@ void *dma_direct_alloc(struct device *dev, size_t size,
dma_handle);
/*
- * Otherwise remap if the architecture is asking for it. But
- * given that remapping memory is a blocking operation we'll
- * instead have to dip into the atomic pools.
+ * Otherwise we require the architecture to either be able to
+ * mark arbitrary parts of the kernel direct mapping uncached,
+ * or remapped it uncached.
*/
+ set_uncached = IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED);
remap = IS_ENABLED(CONFIG_DMA_DIRECT_REMAP);
- if (remap) {
- if (dma_direct_use_pool(dev, gfp))
- return dma_direct_alloc_from_pool(dev, size,
- dma_handle, gfp);
- } else {
- if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED))
- return NULL;
- set_uncached = true;
+ if (!set_uncached && !remap) {
+ pr_warn_once("coherent DMA allocations not supported on this platform.\n");
+ return NULL;
}
}
/*
- * Decrypting memory may block, so allocate the memory from the atomic
- * pools if we can't block.
+ * Remapping or decrypting memory may block, allocate the memory from
+ * the atomic pools instead if we aren't allowed block.
*/
- if (force_dma_unencrypted(dev) && dma_direct_use_pool(dev, gfp))
+ if ((remap || force_dma_unencrypted(dev)) &&
+ dma_direct_use_pool(dev, gfp))
return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp);
/* we always manually zero the memory once we are done */
@@ -330,9 +321,7 @@ void dma_direct_free(struct device *dev, size_t size,
return;
}
- if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) &&
- !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
- !IS_ENABLED(CONFIG_DMA_GLOBAL_POOL) &&
+ if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_ALLOC) &&
!dev_is_dma_coherent(dev) &&
!is_swiotlb_for_alloc(dev)) {
arch_dma_free(dev, size, cpu_addr, dma_addr, attrs);
@@ -598,6 +587,46 @@ int dma_direct_supported(struct device *dev, u64 mask)
return mask >= phys_to_dma_unencrypted(dev, min_mask);
}
+/*
+ * To check whether all ram resource ranges are covered by dma range map
+ * Returns 0 when further check is needed
+ * Returns 1 if there is some RAM range can't be covered by dma_range_map
+ */
+static int check_ram_in_range_map(unsigned long start_pfn,
+ unsigned long nr_pages, void *data)
+{
+ unsigned long end_pfn = start_pfn + nr_pages;
+ const struct bus_dma_region *bdr = NULL;
+ const struct bus_dma_region *m;
+ struct device *dev = data;
+
+ while (start_pfn < end_pfn) {
+ for (m = dev->dma_range_map; PFN_DOWN(m->size); m++) {
+ unsigned long cpu_start_pfn = PFN_DOWN(m->cpu_start);
+
+ if (start_pfn >= cpu_start_pfn &&
+ start_pfn - cpu_start_pfn < PFN_DOWN(m->size)) {
+ bdr = m;
+ break;
+ }
+ }
+ if (!bdr)
+ return 1;
+
+ start_pfn = PFN_DOWN(bdr->cpu_start) + PFN_DOWN(bdr->size);
+ }
+
+ return 0;
+}
+
+bool dma_direct_all_ram_mapped(struct device *dev)
+{
+ if (!dev->dma_range_map)
+ return true;
+ return !walk_system_ram_range(0, PFN_DOWN(ULONG_MAX) + 1, dev,
+ check_ram_in_range_map);
+}
+
size_t dma_direct_max_mapping_size(struct device *dev)
{
/* If SWIOTLB is active, use its maximum mapping size */
@@ -648,7 +677,6 @@ int dma_direct_set_offset(struct device *dev, phys_addr_t cpu_start,
return -ENOMEM;
map[0].cpu_start = cpu_start;
map[0].dma_start = dma_start;
- map[0].offset = offset;
map[0].size = size;
dev->dma_range_map = map;
return 0;
diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h
index 97ec892ea0b5..18d346118fe8 100644
--- a/kernel/dma/direct.h
+++ b/kernel/dma/direct.h
@@ -20,6 +20,7 @@ int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma,
bool dma_direct_need_sync(struct device *dev, dma_addr_t dma_addr);
int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
enum dma_data_direction dir, unsigned long attrs);
+bool dma_direct_all_ram_mapped(struct device *dev);
size_t dma_direct_max_mapping_size(struct device *dev);
#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index e323ca48f7f2..58db8fd70471 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -793,6 +793,28 @@ int dma_set_coherent_mask(struct device *dev, u64 mask)
}
EXPORT_SYMBOL(dma_set_coherent_mask);
+/**
+ * dma_addressing_limited - return if the device is addressing limited
+ * @dev: device to check
+ *
+ * Return %true if the devices DMA mask is too small to address all memory in
+ * the system, else %false. Lack of addressing bits is the prime reason for
+ * bounce buffering, but might not be the only one.
+ */
+bool dma_addressing_limited(struct device *dev)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+
+ if (min_not_zero(dma_get_mask(dev), dev->bus_dma_limit) <
+ dma_get_required_mask(dev))
+ return true;
+
+ if (unlikely(ops))
+ return false;
+ return !dma_direct_all_ram_mapped(dev);
+}
+EXPORT_SYMBOL_GPL(dma_addressing_limited);
+
size_t dma_max_mapping_size(struct device *dev)
{
const struct dma_map_ops *ops = get_dma_ops(dev);
diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c
index 1acec2e22827..d10613eb0f63 100644
--- a/kernel/dma/pool.c
+++ b/kernel/dma/pool.c
@@ -84,8 +84,8 @@ static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size,
void *addr;
int ret = -ENOMEM;
- /* Cannot allocate larger than MAX_ORDER */
- order = min(get_order(pool_size), MAX_ORDER);
+ /* Cannot allocate larger than MAX_PAGE_ORDER */
+ order = min(get_order(pool_size), MAX_PAGE_ORDER);
do {
pool_size = 1 << (PAGE_SHIFT + order);
@@ -135,9 +135,9 @@ encrypt_mapping:
remove_mapping:
#ifdef CONFIG_DMA_DIRECT_REMAP
dma_common_free_remap(addr, pool_size);
-#endif
-free_page: __maybe_unused
+free_page:
__free_pages(page, order);
+#endif
out:
return ret;
}
@@ -190,7 +190,7 @@ static int __init dma_atomic_pool_init(void)
/*
* If coherent_pool was not used on the command line, default the pool
- * sizes to 128KB per 1GB of memory, min 128KB, max MAX_ORDER.
+ * sizes to 128KB per 1GB of memory, min 128KB, max MAX_PAGE_ORDER.
*/
if (!atomic_pool_size) {
unsigned long pages = totalram_pages() / (SZ_1G / SZ_128K);
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 394494a6b1f3..b079a9a8e087 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -283,7 +283,8 @@ static void swiotlb_init_io_tlb_pool(struct io_tlb_pool *mem, phys_addr_t start,
}
for (i = 0; i < mem->nslabs; i++) {
- mem->slots[i].list = IO_TLB_SEGSIZE - io_tlb_offset(i);
+ mem->slots[i].list = min(IO_TLB_SEGSIZE - io_tlb_offset(i),
+ mem->nslabs - i);
mem->slots[i].orig_addr = INVALID_PHYS_ADDR;
mem->slots[i].alloc_size = 0;
}
@@ -399,14 +400,13 @@ void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags,
}
mem->areas = memblock_alloc(array_size(sizeof(struct io_tlb_area),
- default_nareas), SMP_CACHE_BYTES);
+ nareas), SMP_CACHE_BYTES);
if (!mem->areas) {
pr_warn("%s: Failed to allocate mem->areas.\n", __func__);
return;
}
- swiotlb_init_io_tlb_pool(mem, __pa(tlb), nslabs, false,
- default_nareas);
+ swiotlb_init_io_tlb_pool(mem, __pa(tlb), nslabs, false, nareas);
add_mem_pool(&io_tlb_default_mem, mem);
if (flags & SWIOTLB_VERBOSE)
@@ -559,29 +559,40 @@ void __init swiotlb_exit(void)
* alloc_dma_pages() - allocate pages to be used for DMA
* @gfp: GFP flags for the allocation.
* @bytes: Size of the buffer.
+ * @phys_limit: Maximum allowed physical address of the buffer.
*
* Allocate pages from the buddy allocator. If successful, make the allocated
* pages decrypted that they can be used for DMA.
*
- * Return: Decrypted pages, or %NULL on failure.
+ * Return: Decrypted pages, %NULL on allocation failure, or ERR_PTR(-EAGAIN)
+ * if the allocated physical address was above @phys_limit.
*/
-static struct page *alloc_dma_pages(gfp_t gfp, size_t bytes)
+static struct page *alloc_dma_pages(gfp_t gfp, size_t bytes, u64 phys_limit)
{
unsigned int order = get_order(bytes);
struct page *page;
+ phys_addr_t paddr;
void *vaddr;
page = alloc_pages(gfp, order);
if (!page)
return NULL;
- vaddr = page_address(page);
+ paddr = page_to_phys(page);
+ if (paddr + bytes - 1 > phys_limit) {
+ __free_pages(page, order);
+ return ERR_PTR(-EAGAIN);
+ }
+
+ vaddr = phys_to_virt(paddr);
if (set_memory_decrypted((unsigned long)vaddr, PFN_UP(bytes)))
goto error;
return page;
error:
- __free_pages(page, order);
+ /* Intentional leak if pages cannot be encrypted again. */
+ if (!set_memory_encrypted((unsigned long)vaddr, PFN_UP(bytes)))
+ __free_pages(page, order);
return NULL;
}
@@ -619,11 +630,7 @@ static struct page *swiotlb_alloc_tlb(struct device *dev, size_t bytes,
else if (phys_limit <= DMA_BIT_MASK(32))
gfp |= __GFP_DMA32;
- while ((page = alloc_dma_pages(gfp, bytes)) &&
- page_to_phys(page) + bytes - 1 > phys_limit) {
- /* allocated, but too high */
- __free_pages(page, get_order(bytes));
-
+ while (IS_ERR(page = alloc_dma_pages(gfp, bytes, phys_limit))) {
if (IS_ENABLED(CONFIG_ZONE_DMA32) &&
phys_limit < DMA_BIT_MASK(64) &&
!(gfp & (__GFP_DMA32 | __GFP_DMA)))
@@ -679,6 +686,11 @@ static struct io_tlb_pool *swiotlb_alloc_pool(struct device *dev,
size_t pool_size;
size_t tlb_size;
+ if (nslabs > SLABS_PER_PAGE << MAX_PAGE_ORDER) {
+ nslabs = SLABS_PER_PAGE << MAX_PAGE_ORDER;
+ nareas = limit_nareas(nareas, nslabs);
+ }
+
pool_size = sizeof(*pool) + array_size(sizeof(*pool->areas), nareas);
pool = kzalloc(pool_size, gfp);
if (!pool)
@@ -729,9 +741,6 @@ static void swiotlb_dyn_alloc(struct work_struct *work)
}
add_mem_pool(mem, pool);
-
- /* Pairs with smp_rmb() in is_swiotlb_buffer(). */
- smp_wmb();
}
/**
@@ -948,7 +957,7 @@ static void dec_used(struct io_tlb_mem *mem, unsigned int nslots)
#endif /* CONFIG_DEBUG_FS */
/**
- * swiotlb_area_find_slots() - search for slots in one IO TLB memory area
+ * swiotlb_search_pool_area() - search one memory area in one pool
* @dev: Device which maps the buffer.
* @pool: Memory pool to be searched.
* @area_index: Index of the IO TLB memory area to be searched.
@@ -963,7 +972,7 @@ static void dec_used(struct io_tlb_mem *mem, unsigned int nslots)
*
* Return: Index of the first allocated slot, or -1 on error.
*/
-static int swiotlb_area_find_slots(struct device *dev, struct io_tlb_pool *pool,
+static int swiotlb_search_pool_area(struct device *dev, struct io_tlb_pool *pool,
int area_index, phys_addr_t orig_addr, size_t alloc_size,
unsigned int alloc_align_mask)
{
@@ -1057,41 +1066,50 @@ found:
return slot_index;
}
+#ifdef CONFIG_SWIOTLB_DYNAMIC
+
/**
- * swiotlb_pool_find_slots() - search for slots in one memory pool
+ * swiotlb_search_area() - search one memory area in all pools
* @dev: Device which maps the buffer.
- * @pool: Memory pool to be searched.
+ * @start_cpu: Start CPU number.
+ * @cpu_offset: Offset from @start_cpu.
* @orig_addr: Original (non-bounced) IO buffer address.
* @alloc_size: Total requested size of the bounce buffer,
* including initial alignment padding.
* @alloc_align_mask: Required alignment of the allocated buffer.
+ * @retpool: Used memory pool, updated on return.
*
- * Search through one memory pool to find a sequence of slots that match the
+ * Search one memory area in all pools for a sequence of slots that match the
* allocation constraints.
*
* Return: Index of the first allocated slot, or -1 on error.
*/
-static int swiotlb_pool_find_slots(struct device *dev, struct io_tlb_pool *pool,
- phys_addr_t orig_addr, size_t alloc_size,
- unsigned int alloc_align_mask)
+static int swiotlb_search_area(struct device *dev, int start_cpu,
+ int cpu_offset, phys_addr_t orig_addr, size_t alloc_size,
+ unsigned int alloc_align_mask, struct io_tlb_pool **retpool)
{
- int start = raw_smp_processor_id() & (pool->nareas - 1);
- int i = start, index;
-
- do {
- index = swiotlb_area_find_slots(dev, pool, i, orig_addr,
- alloc_size, alloc_align_mask);
- if (index >= 0)
- return index;
- if (++i >= pool->nareas)
- i = 0;
- } while (i != start);
+ struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
+ struct io_tlb_pool *pool;
+ int area_index;
+ int index = -1;
- return -1;
+ rcu_read_lock();
+ list_for_each_entry_rcu(pool, &mem->pools, node) {
+ if (cpu_offset >= pool->nareas)
+ continue;
+ area_index = (start_cpu + cpu_offset) & (pool->nareas - 1);
+ index = swiotlb_search_pool_area(dev, pool, area_index,
+ orig_addr, alloc_size,
+ alloc_align_mask);
+ if (index >= 0) {
+ *retpool = pool;
+ break;
+ }
+ }
+ rcu_read_unlock();
+ return index;
}
-#ifdef CONFIG_SWIOTLB_DYNAMIC
-
/**
* swiotlb_find_slots() - search for slots in the whole swiotlb
* @dev: Device which maps the buffer.
@@ -1115,18 +1133,20 @@ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr,
unsigned long nslabs;
unsigned long flags;
u64 phys_limit;
+ int cpu, i;
int index;
- rcu_read_lock();
- list_for_each_entry_rcu(pool, &mem->pools, node) {
- index = swiotlb_pool_find_slots(dev, pool, orig_addr,
- alloc_size, alloc_align_mask);
- if (index >= 0) {
- rcu_read_unlock();
+ if (alloc_size > IO_TLB_SEGSIZE * IO_TLB_SIZE)
+ return -1;
+
+ cpu = raw_smp_processor_id();
+ for (i = 0; i < default_nareas; ++i) {
+ index = swiotlb_search_area(dev, cpu, i, orig_addr, alloc_size,
+ alloc_align_mask, &pool);
+ if (index >= 0)
goto found;
- }
}
- rcu_read_unlock();
+
if (!mem->can_grow)
return -1;
@@ -1139,8 +1159,8 @@ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr,
if (!pool)
return -1;
- index = swiotlb_pool_find_slots(dev, pool, orig_addr,
- alloc_size, alloc_align_mask);
+ index = swiotlb_search_pool_area(dev, pool, 0, orig_addr,
+ alloc_size, alloc_align_mask);
if (index < 0) {
swiotlb_dyn_free(&pool->rcu);
return -1;
@@ -1152,9 +1172,26 @@ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr,
spin_unlock_irqrestore(&dev->dma_io_tlb_lock, flags);
found:
- dev->dma_uses_io_tlb = true;
- /* Pairs with smp_rmb() in is_swiotlb_buffer() */
- smp_wmb();
+ WRITE_ONCE(dev->dma_uses_io_tlb, true);
+
+ /*
+ * The general barrier orders reads and writes against a presumed store
+ * of the SWIOTLB buffer address by a device driver (to a driver private
+ * data structure). It serves two purposes.
+ *
+ * First, the store to dev->dma_uses_io_tlb must be ordered before the
+ * presumed store. This guarantees that the returned buffer address
+ * cannot be passed to another CPU before updating dev->dma_uses_io_tlb.
+ *
+ * Second, the load from mem->pools must be ordered before the same
+ * presumed store. This guarantees that the returned buffer address
+ * cannot be observed by another CPU before an update of the RCU list
+ * that was made by swiotlb_dyn_alloc() on a third CPU (cf. multicopy
+ * atomicity).
+ *
+ * See also the comment in is_swiotlb_buffer().
+ */
+ smp_mb();
*retpool = pool;
return index;
@@ -1166,9 +1203,21 @@ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr,
size_t alloc_size, unsigned int alloc_align_mask,
struct io_tlb_pool **retpool)
{
- *retpool = &dev->dma_io_tlb_mem->defpool;
- return swiotlb_pool_find_slots(dev, *retpool,
- orig_addr, alloc_size, alloc_align_mask);
+ struct io_tlb_pool *pool;
+ int start, i;
+ int index;
+
+ *retpool = pool = &dev->dma_io_tlb_mem->defpool;
+ i = start = raw_smp_processor_id() & (pool->nareas - 1);
+ do {
+ index = swiotlb_search_pool_area(dev, pool, i, orig_addr,
+ alloc_size, alloc_align_mask);
+ if (index >= 0)
+ return index;
+ if (++i >= pool->nareas)
+ i = 0;
+ } while (i != start);
+ return -1;
}
#endif /* CONFIG_SWIOTLB_DYNAMIC */
@@ -1283,11 +1332,13 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
pool->slots[index + i].orig_addr = slot_addr(orig_addr, i);
tlb_addr = slot_addr(pool->start, index) + offset;
/*
- * When dir == DMA_FROM_DEVICE we could omit the copy from the orig
- * to the tlb buffer, if we knew for sure the device will
- * overwrite the entire current content. But we don't. Thus
- * unconditional bounce may prevent leaking swiotlb content (i.e.
- * kernel memory) to user-space.
+ * When the device is writing memory, i.e. dir == DMA_FROM_DEVICE, copy
+ * the original buffer to the TLB buffer before initiating DMA in order
+ * to preserve the original's data if the device does a partial write,
+ * i.e. if the device doesn't overwrite the entire buffer. Preserving
+ * the original data, even if it's garbage, is necessary to match
+ * hardware behavior. Use of swiotlb is supposed to be transparent,
+ * i.e. swiotlb must not corrupt memory by clobbering unwritten bytes.
*/
swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_TO_DEVICE);
return tlb_addr;
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index d7ee4bc3f2ba..88cb3c88aaa5 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -15,26 +15,6 @@
#define CREATE_TRACE_POINTS
#include <trace/events/syscalls.h>
-/* See comment for enter_from_user_mode() in entry-common.h */
-static __always_inline void __enter_from_user_mode(struct pt_regs *regs)
-{
- arch_enter_from_user_mode(regs);
- lockdep_hardirqs_off(CALLER_ADDR0);
-
- CT_WARN_ON(__ct_state() != CONTEXT_USER);
- user_exit_irqoff();
-
- instrumentation_begin();
- kmsan_unpoison_entry_regs(regs);
- trace_hardirqs_off_finish();
- instrumentation_end();
-}
-
-void noinstr enter_from_user_mode(struct pt_regs *regs)
-{
- __enter_from_user_mode(regs);
-}
-
static inline void syscall_enter_audit(struct pt_regs *regs, long syscall)
{
if (unlikely(audit_context())) {
@@ -45,7 +25,7 @@ static inline void syscall_enter_audit(struct pt_regs *regs, long syscall)
}
}
-static long syscall_trace_enter(struct pt_regs *regs, long syscall,
+long syscall_trace_enter(struct pt_regs *regs, long syscall,
unsigned long work)
{
long ret = 0;
@@ -85,67 +65,24 @@ static long syscall_trace_enter(struct pt_regs *regs, long syscall,
return ret ? : syscall;
}
-static __always_inline long
-__syscall_enter_from_user_work(struct pt_regs *regs, long syscall)
-{
- unsigned long work = READ_ONCE(current_thread_info()->syscall_work);
-
- if (work & SYSCALL_WORK_ENTER)
- syscall = syscall_trace_enter(regs, syscall, work);
-
- return syscall;
-}
-
-long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall)
-{
- return __syscall_enter_from_user_work(regs, syscall);
-}
-
-noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall)
-{
- long ret;
-
- __enter_from_user_mode(regs);
-
- instrumentation_begin();
- local_irq_enable();
- ret = __syscall_enter_from_user_work(regs, syscall);
- instrumentation_end();
-
- return ret;
-}
-
noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs)
{
- __enter_from_user_mode(regs);
+ enter_from_user_mode(regs);
instrumentation_begin();
local_irq_enable();
instrumentation_end();
}
-/* See comment for exit_to_user_mode() in entry-common.h */
-static __always_inline void __exit_to_user_mode(void)
-{
- instrumentation_begin();
- trace_hardirqs_on_prepare();
- lockdep_hardirqs_on_prepare();
- instrumentation_end();
-
- user_enter_irqoff();
- arch_exit_to_user_mode();
- lockdep_hardirqs_on(CALLER_ADDR0);
-}
-
-void noinstr exit_to_user_mode(void)
-{
- __exit_to_user_mode();
-}
-
/* Workaround to allow gradual conversion of architecture code */
void __weak arch_do_signal_or_restart(struct pt_regs *regs) { }
-static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
- unsigned long ti_work)
+/**
+ * exit_to_user_mode_loop - do any pending work before leaving to user space
+ * @regs: Pointer to pt_regs on entry stack
+ * @ti_work: TIF work flags as read by the caller
+ */
+__always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
+ unsigned long ti_work)
{
/*
* Before returning to user space ensure that all pending work
@@ -190,27 +127,6 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
return ti_work;
}
-static void exit_to_user_mode_prepare(struct pt_regs *regs)
-{
- unsigned long ti_work;
-
- lockdep_assert_irqs_disabled();
-
- /* Flush pending rcuog wakeup before the last need_resched() check */
- tick_nohz_user_enter_prepare();
-
- ti_work = read_thread_flags();
- if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
- ti_work = exit_to_user_mode_loop(regs, ti_work);
-
- arch_exit_to_user_mode_prepare(regs, ti_work);
-
- /* Ensure that kernel state is sane for a return to userspace */
- kmap_assert_nomap();
- lockdep_assert_irqs_disabled();
- lockdep_sys_exit();
-}
-
/*
* If SYSCALL_EMU is set, then the only reason to report is when
* SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall
@@ -295,12 +211,12 @@ __visible noinstr void syscall_exit_to_user_mode(struct pt_regs *regs)
instrumentation_begin();
__syscall_exit_to_user_mode_work(regs);
instrumentation_end();
- __exit_to_user_mode();
+ exit_to_user_mode();
}
noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs)
{
- __enter_from_user_mode(regs);
+ enter_from_user_mode(regs);
}
noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs)
@@ -308,7 +224,7 @@ noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs)
instrumentation_begin();
exit_to_user_mode_prepare(regs);
instrumentation_end();
- __exit_to_user_mode();
+ exit_to_user_mode();
}
noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 4c72a41f11af..f0f0f71213a1 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -375,6 +375,7 @@ enum event_type_t {
EVENT_TIME = 0x4,
/* see ctx_resched() for details */
EVENT_CPU = 0x8,
+ EVENT_CGROUP = 0x10,
EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
};
@@ -449,8 +450,8 @@ static void update_perf_cpu_limits(void)
static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc);
-int perf_proc_update_handler(struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos)
+int perf_event_max_sample_rate_handler(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int ret;
int perf_cpu = sysctl_perf_cpu_time_max_percent;
@@ -684,20 +685,26 @@ do { \
___p; \
})
-static void perf_ctx_disable(struct perf_event_context *ctx)
+static void perf_ctx_disable(struct perf_event_context *ctx, bool cgroup)
{
struct perf_event_pmu_context *pmu_ctx;
- list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry)
+ list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+ if (cgroup && !pmu_ctx->nr_cgroups)
+ continue;
perf_pmu_disable(pmu_ctx->pmu);
+ }
}
-static void perf_ctx_enable(struct perf_event_context *ctx)
+static void perf_ctx_enable(struct perf_event_context *ctx, bool cgroup)
{
struct perf_event_pmu_context *pmu_ctx;
- list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry)
+ list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+ if (cgroup && !pmu_ctx->nr_cgroups)
+ continue;
perf_pmu_enable(pmu_ctx->pmu);
+ }
}
static void ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type);
@@ -856,9 +863,9 @@ static void perf_cgroup_switch(struct task_struct *task)
return;
perf_ctx_lock(cpuctx, cpuctx->task_ctx);
- perf_ctx_disable(&cpuctx->ctx);
+ perf_ctx_disable(&cpuctx->ctx, true);
- ctx_sched_out(&cpuctx->ctx, EVENT_ALL);
+ ctx_sched_out(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP);
/*
* must not be done before ctxswout due
* to update_cgrp_time_from_cpuctx() in
@@ -870,9 +877,9 @@ static void perf_cgroup_switch(struct task_struct *task)
* perf_cgroup_set_timestamp() in ctx_sched_in()
* to not have to pass task around
*/
- ctx_sched_in(&cpuctx->ctx, EVENT_ALL);
+ ctx_sched_in(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP);
- perf_ctx_enable(&cpuctx->ctx);
+ perf_ctx_enable(&cpuctx->ctx, true);
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
}
@@ -965,6 +972,8 @@ perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ct
if (!is_cgroup_event(event))
return;
+ event->pmu_ctx->nr_cgroups++;
+
/*
* Because cgroup events are always per-cpu events,
* @ctx == &cpuctx->ctx.
@@ -985,6 +994,8 @@ perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *c
if (!is_cgroup_event(event))
return;
+ event->pmu_ctx->nr_cgroups--;
+
/*
* Because cgroup events are always per-cpu events,
* @ctx == &cpuctx->ctx.
@@ -1803,31 +1814,34 @@ static inline void perf_event__state_init(struct perf_event *event)
PERF_EVENT_STATE_INACTIVE;
}
-static void __perf_event_read_size(struct perf_event *event, int nr_siblings)
+static int __perf_event_read_size(u64 read_format, int nr_siblings)
{
int entry = sizeof(u64); /* value */
int size = 0;
int nr = 1;
- if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+ if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
size += sizeof(u64);
- if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+ if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
size += sizeof(u64);
- if (event->attr.read_format & PERF_FORMAT_ID)
+ if (read_format & PERF_FORMAT_ID)
entry += sizeof(u64);
- if (event->attr.read_format & PERF_FORMAT_LOST)
+ if (read_format & PERF_FORMAT_LOST)
entry += sizeof(u64);
- if (event->attr.read_format & PERF_FORMAT_GROUP) {
+ if (read_format & PERF_FORMAT_GROUP) {
nr += nr_siblings;
size += sizeof(u64);
}
- size += entry * nr;
- event->read_size = size;
+ /*
+ * Since perf_event_validate_size() limits this to 16k and inhibits
+ * adding more siblings, this will never overflow.
+ */
+ return size + nr * entry;
}
static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
@@ -1877,8 +1891,9 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
*/
static void perf_event__header_size(struct perf_event *event)
{
- __perf_event_read_size(event,
- event->group_leader->nr_siblings);
+ event->read_size =
+ __perf_event_read_size(event->attr.read_format,
+ event->group_leader->nr_siblings);
__perf_event_header_size(event, event->attr.sample_type);
}
@@ -1909,23 +1924,44 @@ static void perf_event__id_header_size(struct perf_event *event)
event->id_header_size = size;
}
+/*
+ * Check that adding an event to the group does not result in anybody
+ * overflowing the 64k event limit imposed by the output buffer.
+ *
+ * Specifically, check that the read_size for the event does not exceed 16k,
+ * read_size being the one term that grows with groups size. Since read_size
+ * depends on per-event read_format, also (re)check the existing events.
+ *
+ * This leaves 48k for the constant size fields and things like callchains,
+ * branch stacks and register sets.
+ */
static bool perf_event_validate_size(struct perf_event *event)
{
- /*
- * The values computed here will be over-written when we actually
- * attach the event.
- */
- __perf_event_read_size(event, event->group_leader->nr_siblings + 1);
- __perf_event_header_size(event, event->attr.sample_type & ~PERF_SAMPLE_READ);
- perf_event__id_header_size(event);
+ struct perf_event *sibling, *group_leader = event->group_leader;
+
+ if (__perf_event_read_size(event->attr.read_format,
+ group_leader->nr_siblings + 1) > 16*1024)
+ return false;
+
+ if (__perf_event_read_size(group_leader->attr.read_format,
+ group_leader->nr_siblings + 1) > 16*1024)
+ return false;
/*
- * Sum the lot; should not exceed the 64k limit we have on records.
- * Conservative limit to allow for callchains and other variable fields.
+ * When creating a new group leader, group_leader->ctx is initialized
+ * after the size has been validated, but we cannot safely use
+ * for_each_sibling_event() until group_leader->ctx is set. A new group
+ * leader cannot have any siblings yet, so we can safely skip checking
+ * the non-existent siblings.
*/
- if (event->read_size + event->header_size +
- event->id_header_size + sizeof(struct perf_event_header) >= 16*1024)
- return false;
+ if (event == group_leader)
+ return true;
+
+ for_each_sibling_event(sibling, group_leader) {
+ if (__perf_event_read_size(sibling->attr.read_format,
+ group_leader->nr_siblings + 1) > 16*1024)
+ return false;
+ }
return true;
}
@@ -1954,6 +1990,7 @@ static void perf_group_attach(struct perf_event *event)
list_add_tail(&event->sibling_list, &group_leader->sibling_list);
group_leader->nr_siblings++;
+ group_leader->group_generation++;
perf_event__header_size(group_leader);
@@ -2144,6 +2181,7 @@ static void perf_group_detach(struct perf_event *event)
if (leader != event) {
list_del_init(&event->sibling_list);
event->group_leader->nr_siblings--;
+ event->group_leader->group_generation++;
goto out;
}
@@ -2677,9 +2715,9 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
event_type &= EVENT_ALL;
- perf_ctx_disable(&cpuctx->ctx);
+ perf_ctx_disable(&cpuctx->ctx, false);
if (task_ctx) {
- perf_ctx_disable(task_ctx);
+ perf_ctx_disable(task_ctx, false);
task_ctx_sched_out(task_ctx, event_type);
}
@@ -2697,9 +2735,9 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
perf_event_sched_in(cpuctx, task_ctx);
- perf_ctx_enable(&cpuctx->ctx);
+ perf_ctx_enable(&cpuctx->ctx, false);
if (task_ctx)
- perf_ctx_enable(task_ctx);
+ perf_ctx_enable(task_ctx, false);
}
void perf_pmu_resched(struct pmu *pmu)
@@ -3244,6 +3282,9 @@ ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type)
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct perf_event_pmu_context *pmu_ctx;
int is_active = ctx->is_active;
+ bool cgroup = event_type & EVENT_CGROUP;
+
+ event_type &= ~EVENT_CGROUP;
lockdep_assert_held(&ctx->lock);
@@ -3290,8 +3331,11 @@ ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type)
is_active ^= ctx->is_active; /* changed bits */
- list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry)
+ list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+ if (cgroup && !pmu_ctx->nr_cgroups)
+ continue;
__pmu_ctx_sched_out(pmu_ctx, is_active);
+ }
}
/*
@@ -3482,7 +3526,7 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next)
raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
if (context_equiv(ctx, next_ctx)) {
- perf_ctx_disable(ctx);
+ perf_ctx_disable(ctx, false);
/* PMIs are disabled; ctx->nr_pending is stable. */
if (local_read(&ctx->nr_pending) ||
@@ -3502,7 +3546,7 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next)
perf_ctx_sched_task_cb(ctx, false);
perf_event_swap_task_ctx_data(ctx, next_ctx);
- perf_ctx_enable(ctx);
+ perf_ctx_enable(ctx, false);
/*
* RCU_INIT_POINTER here is safe because we've not
@@ -3526,13 +3570,13 @@ unlock:
if (do_switch) {
raw_spin_lock(&ctx->lock);
- perf_ctx_disable(ctx);
+ perf_ctx_disable(ctx, false);
inside_switch:
perf_ctx_sched_task_cb(ctx, false);
task_ctx_sched_out(ctx, EVENT_ALL);
- perf_ctx_enable(ctx);
+ perf_ctx_enable(ctx, false);
raw_spin_unlock(&ctx->lock);
}
}
@@ -3818,47 +3862,32 @@ static int merge_sched_in(struct perf_event *event, void *data)
return 0;
}
-static void ctx_pinned_sched_in(struct perf_event_context *ctx, struct pmu *pmu)
+static void pmu_groups_sched_in(struct perf_event_context *ctx,
+ struct perf_event_groups *groups,
+ struct pmu *pmu)
{
- struct perf_event_pmu_context *pmu_ctx;
int can_add_hw = 1;
-
- if (pmu) {
- visit_groups_merge(ctx, &ctx->pinned_groups,
- smp_processor_id(), pmu,
- merge_sched_in, &can_add_hw);
- } else {
- list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
- can_add_hw = 1;
- visit_groups_merge(ctx, &ctx->pinned_groups,
- smp_processor_id(), pmu_ctx->pmu,
- merge_sched_in, &can_add_hw);
- }
- }
+ visit_groups_merge(ctx, groups, smp_processor_id(), pmu,
+ merge_sched_in, &can_add_hw);
}
-static void ctx_flexible_sched_in(struct perf_event_context *ctx, struct pmu *pmu)
+static void ctx_groups_sched_in(struct perf_event_context *ctx,
+ struct perf_event_groups *groups,
+ bool cgroup)
{
struct perf_event_pmu_context *pmu_ctx;
- int can_add_hw = 1;
- if (pmu) {
- visit_groups_merge(ctx, &ctx->flexible_groups,
- smp_processor_id(), pmu,
- merge_sched_in, &can_add_hw);
- } else {
- list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
- can_add_hw = 1;
- visit_groups_merge(ctx, &ctx->flexible_groups,
- smp_processor_id(), pmu_ctx->pmu,
- merge_sched_in, &can_add_hw);
- }
+ list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+ if (cgroup && !pmu_ctx->nr_cgroups)
+ continue;
+ pmu_groups_sched_in(ctx, groups, pmu_ctx->pmu);
}
}
-static void __pmu_ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu)
+static void __pmu_ctx_sched_in(struct perf_event_context *ctx,
+ struct pmu *pmu)
{
- ctx_flexible_sched_in(ctx, pmu);
+ pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu);
}
static void
@@ -3866,6 +3895,9 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
{
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
int is_active = ctx->is_active;
+ bool cgroup = event_type & EVENT_CGROUP;
+
+ event_type &= ~EVENT_CGROUP;
lockdep_assert_held(&ctx->lock);
@@ -3898,11 +3930,11 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
* in order to give them the best chance of going on.
*/
if (is_active & EVENT_PINNED)
- ctx_pinned_sched_in(ctx, NULL);
+ ctx_groups_sched_in(ctx, &ctx->pinned_groups, cgroup);
/* Then walk through the lower prio flexible groups */
if (is_active & EVENT_FLEXIBLE)
- ctx_flexible_sched_in(ctx, NULL);
+ ctx_groups_sched_in(ctx, &ctx->flexible_groups, cgroup);
}
static void perf_event_context_sched_in(struct task_struct *task)
@@ -3917,11 +3949,11 @@ static void perf_event_context_sched_in(struct task_struct *task)
if (cpuctx->task_ctx == ctx) {
perf_ctx_lock(cpuctx, ctx);
- perf_ctx_disable(ctx);
+ perf_ctx_disable(ctx, false);
perf_ctx_sched_task_cb(ctx, true);
- perf_ctx_enable(ctx);
+ perf_ctx_enable(ctx, false);
perf_ctx_unlock(cpuctx, ctx);
goto rcu_unlock;
}
@@ -3934,7 +3966,7 @@ static void perf_event_context_sched_in(struct task_struct *task)
if (!ctx->nr_events)
goto unlock;
- perf_ctx_disable(ctx);
+ perf_ctx_disable(ctx, false);
/*
* We want to keep the following priority order:
* cpu pinned (that don't need to move), task pinned,
@@ -3944,7 +3976,7 @@ static void perf_event_context_sched_in(struct task_struct *task)
* events, no need to flip the cpuctx's events around.
*/
if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) {
- perf_ctx_disable(&cpuctx->ctx);
+ perf_ctx_disable(&cpuctx->ctx, false);
ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);
}
@@ -3953,9 +3985,9 @@ static void perf_event_context_sched_in(struct task_struct *task)
perf_ctx_sched_task_cb(cpuctx->task_ctx, true);
if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
- perf_ctx_enable(&cpuctx->ctx);
+ perf_ctx_enable(&cpuctx->ctx, false);
- perf_ctx_enable(ctx);
+ perf_ctx_enable(ctx, false);
unlock:
perf_ctx_unlock(cpuctx, ctx);
@@ -4425,6 +4457,9 @@ static int __perf_event_read_cpu(struct perf_event *event, int event_cpu)
{
u16 local_pkg, event_pkg;
+ if ((unsigned)event_cpu >= nr_cpu_ids)
+ return event_cpu;
+
if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) {
int local_cpu = smp_processor_id();
@@ -4527,6 +4562,8 @@ int perf_event_read_local(struct perf_event *event, u64 *value,
u64 *enabled, u64 *running)
{
unsigned long flags;
+ int event_oncpu;
+ int event_cpu;
int ret = 0;
/*
@@ -4551,15 +4588,22 @@ int perf_event_read_local(struct perf_event *event, u64 *value,
goto out;
}
+ /*
+ * Get the event CPU numbers, and adjust them to local if the event is
+ * a per-package event that can be read locally
+ */
+ event_oncpu = __perf_event_read_cpu(event, event->oncpu);
+ event_cpu = __perf_event_read_cpu(event, event->cpu);
+
/* If this is a per-CPU event, it must be for this CPU */
if (!(event->attach_state & PERF_ATTACH_TASK) &&
- event->cpu != smp_processor_id()) {
+ event_cpu != smp_processor_id()) {
ret = -EINVAL;
goto out;
}
/* If this is a pinned event it must be running on this CPU */
- if (event->attr.pinned && event->oncpu != smp_processor_id()) {
+ if (event->attr.pinned && event_oncpu != smp_processor_id()) {
ret = -EBUSY;
goto out;
}
@@ -4569,7 +4613,7 @@ int perf_event_read_local(struct perf_event *event, u64 *value,
* or local to this CPU. Furthermore it means its ACTIVE (otherwise
* oncpu == -1).
*/
- if (event->oncpu == smp_processor_id())
+ if (event_oncpu == smp_processor_id())
event->pmu->read(event);
*value = local64_read(&event->count);
@@ -4809,6 +4853,11 @@ find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx,
void *task_ctx_data = NULL;
if (!ctx->task) {
+ /*
+ * perf_pmu_migrate_context() / __perf_pmu_install_event()
+ * relies on the fact that find_get_pmu_context() cannot fail
+ * for CPU contexts.
+ */
struct perf_cpu_pmu_context *cpc;
cpc = per_cpu_ptr(pmu->cpu_pmu_context, event->cpu);
@@ -5440,7 +5489,7 @@ static int __perf_read_group_add(struct perf_event *leader,
u64 read_format, u64 *values)
{
struct perf_event_context *ctx = leader->ctx;
- struct perf_event *sub;
+ struct perf_event *sub, *parent;
unsigned long flags;
int n = 1; /* skip @nr */
int ret;
@@ -5450,6 +5499,33 @@ static int __perf_read_group_add(struct perf_event *leader,
return ret;
raw_spin_lock_irqsave(&ctx->lock, flags);
+ /*
+ * Verify the grouping between the parent and child (inherited)
+ * events is still in tact.
+ *
+ * Specifically:
+ * - leader->ctx->lock pins leader->sibling_list
+ * - parent->child_mutex pins parent->child_list
+ * - parent->ctx->mutex pins parent->sibling_list
+ *
+ * Because parent->ctx != leader->ctx (and child_list nests inside
+ * ctx->mutex), group destruction is not atomic between children, also
+ * see perf_event_release_kernel(). Additionally, parent can grow the
+ * group.
+ *
+ * Therefore it is possible to have parent and child groups in a
+ * different configuration and summing over such a beast makes no sense
+ * what so ever.
+ *
+ * Reject this.
+ */
+ parent = leader->parent;
+ if (parent &&
+ (parent->group_generation != leader->group_generation ||
+ parent->nr_siblings != leader->nr_siblings)) {
+ ret = -ECHILD;
+ goto unlock;
+ }
/*
* Since we co-schedule groups, {enabled,running} times of siblings
@@ -5483,8 +5559,9 @@ static int __perf_read_group_add(struct perf_event *leader,
values[n++] = atomic64_read(&sub->lost_samples);
}
+unlock:
raw_spin_unlock_irqrestore(&ctx->lock, flags);
- return 0;
+ return ret;
}
static int perf_read_group(struct perf_event *event,
@@ -5503,10 +5580,6 @@ static int perf_read_group(struct perf_event *event,
values[0] = 1 + leader->nr_siblings;
- /*
- * By locking the child_mutex of the leader we effectively
- * lock the child list of all siblings.. XXX explain how.
- */
mutex_lock(&leader->child_mutex);
ret = __perf_read_group_add(leader, read_format, values);
@@ -7324,6 +7397,14 @@ void perf_output_sample(struct perf_output_handle *handle,
if (branch_sample_hw_index(event))
perf_output_put(handle, data->br_stack->hw_idx);
perf_output_copy(handle, data->br_stack->entries, size);
+ /*
+ * Add the extension space which is appended
+ * right after the struct perf_branch_stack.
+ */
+ if (data->br_stack_cntr) {
+ size = data->br_stack->nr * sizeof(u64);
+ perf_output_copy(handle, data->br_stack_cntr, size);
+ }
} else {
/*
* we always store at least the value of nr
@@ -11352,9 +11433,30 @@ static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
static struct attribute *pmu_dev_attrs[] = {
&dev_attr_type.attr,
&dev_attr_perf_event_mux_interval_ms.attr,
+ &dev_attr_nr_addr_filters.attr,
+ NULL,
+};
+
+static umode_t pmu_dev_is_visible(struct kobject *kobj, struct attribute *a, int n)
+{
+ struct device *dev = kobj_to_dev(kobj);
+ struct pmu *pmu = dev_get_drvdata(dev);
+
+ if (n == 2 && !pmu->nr_addr_filters)
+ return 0;
+
+ return a->mode;
+}
+
+static struct attribute_group pmu_dev_attr_group = {
+ .is_visible = pmu_dev_is_visible,
+ .attrs = pmu_dev_attrs,
+};
+
+static const struct attribute_group *pmu_dev_groups[] = {
+ &pmu_dev_attr_group,
NULL,
};
-ATTRIBUTE_GROUPS(pmu_dev);
static int pmu_bus_running;
static struct bus_type pmu_bus = {
@@ -11391,18 +11493,11 @@ static int pmu_dev_alloc(struct pmu *pmu)
if (ret)
goto free_dev;
- /* For PMUs with address filters, throw in an extra attribute: */
- if (pmu->nr_addr_filters)
- ret = device_create_file(pmu->dev, &dev_attr_nr_addr_filters);
-
- if (ret)
- goto del_dev;
-
- if (pmu->attr_update)
+ if (pmu->attr_update) {
ret = sysfs_update_groups(&pmu->dev->kobj, pmu->attr_update);
-
- if (ret)
- goto del_dev;
+ if (ret)
+ goto del_dev;
+ }
out:
return ret;
@@ -12846,6 +12941,9 @@ static void __perf_pmu_install_event(struct pmu *pmu,
int cpu, struct perf_event *event)
{
struct perf_event_pmu_context *epc;
+ struct perf_event_context *old_ctx = event->ctx;
+
+ get_ctx(ctx); /* normally find_get_context() */
event->cpu = cpu;
epc = find_get_pmu_context(pmu, ctx, event);
@@ -12854,6 +12952,11 @@ static void __perf_pmu_install_event(struct pmu *pmu,
if (event->state >= PERF_EVENT_STATE_OFF)
event->state = PERF_EVENT_STATE_INACTIVE;
perf_install_in_context(ctx, event, cpu);
+
+ /*
+ * Now that event->ctx is updated and visible, put the old ctx.
+ */
+ put_ctx(old_ctx);
}
static void __perf_pmu_install(struct perf_event_context *ctx,
@@ -12892,6 +12995,10 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
struct perf_event_context *src_ctx, *dst_ctx;
LIST_HEAD(events);
+ /*
+ * Since per-cpu context is persistent, no need to grab an extra
+ * reference.
+ */
src_ctx = &per_cpu_ptr(&perf_cpu_context, src_cpu)->ctx;
dst_ctx = &per_cpu_ptr(&perf_cpu_context, dst_cpu)->ctx;
@@ -13346,6 +13453,8 @@ static int inherit_group(struct perf_event *parent_event,
!perf_get_aux_event(child_ctr, leader))
return -EINVAL;
}
+ if (leader)
+ leader->group_generation = parent_event->group_generation;
return 0;
}
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index fb1e180b5f0a..60ed43d1c29e 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -610,8 +610,8 @@ static struct page *rb_alloc_aux_page(int node, int order)
{
struct page *page;
- if (order > MAX_ORDER)
- order = MAX_ORDER;
+ if (order > MAX_PAGE_ORDER)
+ order = MAX_PAGE_ORDER;
do {
page = alloc_pages_node(node, PERF_AUX_GFP, order);
@@ -700,6 +700,12 @@ int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event,
watermark = 0;
}
+ /*
+ * kcalloc_node() is unable to allocate buffer if the size is larger
+ * than: PAGE_SIZE << MAX_PAGE_ORDER; directly bail out in this case.
+ */
+ if (get_order((unsigned long)nr_pages * sizeof(void *)) > MAX_PAGE_ORDER)
+ return -ENOMEM;
rb->aux_pages = kcalloc_node(nr_pages, sizeof(void *), GFP_KERNEL,
node);
if (!rb->aux_pages)
@@ -815,7 +821,7 @@ struct perf_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
size = sizeof(struct perf_buffer);
size += nr_pages * sizeof(void *);
- if (order_base_2(size) > PAGE_SHIFT+MAX_ORDER)
+ if (order_base_2(size) > PAGE_SHIFT+MAX_PAGE_ORDER)
goto fail;
node = (cpu == -1) ? cpu : cpu_to_node(cpu);
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 3048589e2e85..929e98c62965 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -181,7 +181,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
if (new_page) {
folio_get(new_folio);
- page_add_new_anon_rmap(new_page, vma, addr);
+ folio_add_new_anon_rmap(new_folio, vma, addr);
folio_add_lru_vma(new_folio, vma);
} else
/* no new page, just dec_mm_counter for old_page */
@@ -198,7 +198,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
set_pte_at_notify(mm, addr, pvmw.pte,
mk_pte(new_page, vma->vm_page_prot));
- page_remove_rmap(old_page, vma, false);
+ folio_remove_rmap_pte(old_folio, old_page, vma);
if (!folio_mapped(old_folio))
folio_free_swap(old_folio);
page_vma_mapped_walk_done(&pvmw);
@@ -474,8 +474,8 @@ retry:
gup_flags |= FOLL_SPLIT_PMD;
/* Read the page with vaddr into memory */
old_page = get_user_page_vma_remote(mm, vaddr, gup_flags, &vma);
- if (IS_ERR_OR_NULL(old_page))
- return old_page ? PTR_ERR(old_page) : 0;
+ if (IS_ERR(old_page))
+ return PTR_ERR(old_page);
ret = verify_opcode(old_page, vaddr, &opcode);
if (ret <= 0)
@@ -537,7 +537,7 @@ retry:
}
}
- ret = __replace_page(vma, vaddr, old_page, new_page);
+ ret = __replace_page(vma, vaddr & PAGE_MASK, old_page, new_page);
if (new_page)
put_page(new_page);
put_old:
diff --git a/kernel/exit.c b/kernel/exit.c
index edb50b4c9972..dfb963d2f862 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -69,11 +69,15 @@
#include <linux/rethook.h>
#include <linux/sysfs.h>
#include <linux/user_events.h>
-
#include <linux/uaccess.h>
+
+#include <uapi/linux/wait.h>
+
#include <asm/unistd.h>
#include <asm/mmu_context.h>
+#include "exit.h"
+
/*
* The default value should be high enough to not crash a system that randomly
* crashes its kernel from time to time, but low enough to at least not permit
@@ -133,7 +137,6 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
list_del_init(&p->sibling);
__this_cpu_dec(process_counts);
}
- list_del_rcu(&p->thread_group);
list_del_rcu(&p->thread_node);
}
@@ -539,7 +542,6 @@ static void exit_mm(void)
exit_mm_release(current, mm);
if (!mm)
return;
- sync_mm_rss(mm);
mmap_read_lock(mm);
mmgrab_lazy_tlb(mm);
BUG_ON(mm != current->active_mm);
@@ -824,14 +826,9 @@ void __noreturn do_exit(long code)
ptrace_event(PTRACE_EVENT_EXIT, code);
user_events_exit(tsk);
- validate_creds_for_do_exit(tsk);
-
io_uring_files_cancel();
exit_signals(tsk); /* sets PF_EXITING */
- /* sync mm's RSS info before statistics gathering */
- if (tsk->mm)
- sync_mm_rss(tsk->mm);
acct_update_integrals(tsk);
group_dead = atomic_dec_and_test(&tsk->signal->live);
if (group_dead) {
@@ -912,7 +909,6 @@ void __noreturn do_exit(long code)
if (tsk->task_frag.page)
put_page(tsk->task_frag.page);
- validate_creds_for_do_exit(tsk);
exit_task_stack_account(tsk);
check_stack_usage();
@@ -1037,26 +1033,6 @@ SYSCALL_DEFINE1(exit_group, int, error_code)
return 0;
}
-struct waitid_info {
- pid_t pid;
- uid_t uid;
- int status;
- int cause;
-};
-
-struct wait_opts {
- enum pid_type wo_type;
- int wo_flags;
- struct pid *wo_pid;
-
- struct waitid_info *wo_info;
- int wo_stat;
- struct rusage *wo_rusage;
-
- wait_queue_entry_t child_wait;
- int notask_error;
-};
-
static int eligible_pid(struct wait_opts *wo, struct task_struct *p)
{
return wo->wo_type == PIDTYPE_MAX ||
@@ -1151,17 +1127,14 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
* and nobody can change them.
*
* psig->stats_lock also protects us from our sub-threads
- * which can reap other children at the same time. Until
- * we change k_getrusage()-like users to rely on this lock
- * we have to take ->siglock as well.
+ * which can reap other children at the same time.
*
* We use thread_group_cputime_adjusted() to get times for
* the thread group, which consolidates times for all threads
* in the group including the group leader.
*/
thread_group_cputime_adjusted(p, &tgutime, &tgstime);
- spin_lock_irq(&current->sighand->siglock);
- write_seqlock(&psig->stats_lock);
+ write_seqlock_irq(&psig->stats_lock);
psig->cutime += tgutime + sig->cutime;
psig->cstime += tgstime + sig->cstime;
psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime;
@@ -1184,8 +1157,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
psig->cmaxrss = maxrss;
task_io_accounting_add(&psig->ioac, &p->ioac);
task_io_accounting_add(&psig->ioac, &sig->ioac);
- write_sequnlock(&psig->stats_lock);
- spin_unlock_irq(&current->sighand->siglock);
+ write_sequnlock_irq(&psig->stats_lock);
}
if (wo->wo_rusage)
@@ -1520,6 +1492,17 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
return 0;
}
+bool pid_child_should_wake(struct wait_opts *wo, struct task_struct *p)
+{
+ if (!eligible_pid(wo, p))
+ return false;
+
+ if ((wo->wo_flags & __WNOTHREAD) && wo->child_wait.private != p->parent)
+ return false;
+
+ return true;
+}
+
static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode,
int sync, void *key)
{
@@ -1527,13 +1510,10 @@ static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode,
child_wait);
struct task_struct *p = key;
- if (!eligible_pid(wo, p))
- return 0;
+ if (pid_child_should_wake(wo, p))
+ return default_wake_function(wait, mode, sync, key);
- if ((wo->wo_flags & __WNOTHREAD) && wait->private != p->parent)
- return 0;
-
- return default_wake_function(wait, mode, sync, key);
+ return 0;
}
void __wake_up_parent(struct task_struct *p, struct task_struct *parent)
@@ -1582,16 +1562,10 @@ static int do_wait_pid(struct wait_opts *wo)
return 0;
}
-static long do_wait(struct wait_opts *wo)
+long __do_wait(struct wait_opts *wo)
{
- int retval;
-
- trace_sched_process_wait(wo->wo_pid);
+ long retval;
- init_waitqueue_func_entry(&wo->child_wait, child_wait_callback);
- wo->child_wait.private = current;
- add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
-repeat:
/*
* If there is nothing that can match our criteria, just get out.
* We will clear ->notask_error to zero if we see any child that
@@ -1603,24 +1577,23 @@ repeat:
(!wo->wo_pid || !pid_has_task(wo->wo_pid, wo->wo_type)))
goto notask;
- set_current_state(TASK_INTERRUPTIBLE);
read_lock(&tasklist_lock);
if (wo->wo_type == PIDTYPE_PID) {
retval = do_wait_pid(wo);
if (retval)
- goto end;
+ return retval;
} else {
struct task_struct *tsk = current;
do {
retval = do_wait_thread(wo, tsk);
if (retval)
- goto end;
+ return retval;
retval = ptrace_do_wait(wo, tsk);
if (retval)
- goto end;
+ return retval;
if (wo->wo_flags & __WNOTHREAD)
break;
@@ -1630,27 +1603,44 @@ repeat:
notask:
retval = wo->notask_error;
- if (!retval && !(wo->wo_flags & WNOHANG)) {
- retval = -ERESTARTSYS;
- if (!signal_pending(current)) {
- schedule();
- goto repeat;
- }
- }
-end:
+ if (!retval && !(wo->wo_flags & WNOHANG))
+ return -ERESTARTSYS;
+
+ return retval;
+}
+
+static long do_wait(struct wait_opts *wo)
+{
+ int retval;
+
+ trace_sched_process_wait(wo->wo_pid);
+
+ init_waitqueue_func_entry(&wo->child_wait, child_wait_callback);
+ wo->child_wait.private = current;
+ add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
+
+ do {
+ set_current_state(TASK_INTERRUPTIBLE);
+ retval = __do_wait(wo);
+ if (retval != -ERESTARTSYS)
+ break;
+ if (signal_pending(current))
+ break;
+ schedule();
+ } while (1);
+
__set_current_state(TASK_RUNNING);
remove_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
return retval;
}
-static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
- int options, struct rusage *ru)
+int kernel_waitid_prepare(struct wait_opts *wo, int which, pid_t upid,
+ struct waitid_info *infop, int options,
+ struct rusage *ru)
{
- struct wait_opts wo;
+ unsigned int f_flags = 0;
struct pid *pid = NULL;
enum pid_type type;
- long ret;
- unsigned int f_flags = 0;
if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED|
__WNOTHREAD|__WCLONE|__WALL))
@@ -1693,19 +1683,32 @@ static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
return -EINVAL;
}
- wo.wo_type = type;
- wo.wo_pid = pid;
- wo.wo_flags = options;
- wo.wo_info = infop;
- wo.wo_rusage = ru;
+ wo->wo_type = type;
+ wo->wo_pid = pid;
+ wo->wo_flags = options;
+ wo->wo_info = infop;
+ wo->wo_rusage = ru;
if (f_flags & O_NONBLOCK)
- wo.wo_flags |= WNOHANG;
+ wo->wo_flags |= WNOHANG;
+
+ return 0;
+}
+
+static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
+ int options, struct rusage *ru)
+{
+ struct wait_opts wo;
+ long ret;
+
+ ret = kernel_waitid_prepare(&wo, which, upid, infop, options, ru);
+ if (ret)
+ return ret;
ret = do_wait(&wo);
- if (!ret && !(options & WNOHANG) && (f_flags & O_NONBLOCK))
+ if (!ret && !(options & WNOHANG) && (wo.wo_flags & WNOHANG))
ret = -EAGAIN;
- put_pid(pid);
+ put_pid(wo.wo_pid);
return ret;
}
diff --git a/kernel/exit.h b/kernel/exit.h
new file mode 100644
index 000000000000..278faa26a653
--- /dev/null
+++ b/kernel/exit.h
@@ -0,0 +1,30 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#ifndef LINUX_WAITID_H
+#define LINUX_WAITID_H
+
+struct waitid_info {
+ pid_t pid;
+ uid_t uid;
+ int status;
+ int cause;
+};
+
+struct wait_opts {
+ enum pid_type wo_type;
+ int wo_flags;
+ struct pid *wo_pid;
+
+ struct waitid_info *wo_info;
+ int wo_stat;
+ struct rusage *wo_rusage;
+
+ wait_queue_entry_t child_wait;
+ int notask_error;
+};
+
+bool pid_child_should_wake(struct wait_opts *wo, struct task_struct *p);
+long __do_wait(struct wait_opts *wo);
+int kernel_waitid_prepare(struct wait_opts *wo, int which, pid_t upid,
+ struct waitid_info *infop, int options,
+ struct rusage *ru);
+#endif
diff --git a/kernel/fork.c b/kernel/fork.c
index 3b6d20dfb9a8..0d944e92a43f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -53,6 +53,7 @@
#include <linux/seccomp.h>
#include <linux/swap.h>
#include <linux/syscalls.h>
+#include <linux/syscall_user_dispatch.h>
#include <linux/jiffies.h>
#include <linux/futex.h>
#include <linux/compat.h>
@@ -99,6 +100,7 @@
#include <linux/stackprotector.h>
#include <linux/user_events.h>
#include <linux/iommu.h>
+#include <linux/rseq.h>
#include <asm/pgalloc.h>
#include <linux/uaccess.h>
@@ -165,7 +167,6 @@ void __weak arch_release_task_struct(struct task_struct *tsk)
{
}
-#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
static struct kmem_cache *task_struct_cachep;
static inline struct task_struct *alloc_task_struct_node(int node)
@@ -177,9 +178,6 @@ static inline void free_task_struct(struct task_struct *tsk)
{
kmem_cache_free(task_struct_cachep, tsk);
}
-#endif
-
-#ifndef CONFIG_ARCH_THREAD_STACK_ALLOCATOR
/*
* Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
@@ -412,24 +410,6 @@ void thread_stack_cache_init(void)
}
# endif /* THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) */
-#else /* CONFIG_ARCH_THREAD_STACK_ALLOCATOR */
-
-static int alloc_thread_stack_node(struct task_struct *tsk, int node)
-{
- unsigned long *stack;
-
- stack = arch_alloc_thread_stack_node(tsk, node);
- tsk->stack = stack;
- return stack ? 0 : -ENOMEM;
-}
-
-static void free_thread_stack(struct task_struct *tsk)
-{
- arch_free_thread_stack(tsk);
- tsk->stack = NULL;
-}
-
-#endif /* !CONFIG_ARCH_THREAD_STACK_ALLOCATOR */
/* SLAB cache for signal_struct structures (tsk->signal) */
static struct kmem_cache *signal_cachep;
@@ -650,7 +630,6 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
int retval;
unsigned long charge = 0;
LIST_HEAD(uf);
- VMA_ITERATOR(old_vmi, oldmm, 0);
VMA_ITERATOR(vmi, mm, 0);
uprobe_start_dup_mmap();
@@ -678,16 +657,22 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
goto out;
khugepaged_fork(mm, oldmm);
- retval = vma_iter_bulk_alloc(&vmi, oldmm->map_count);
- if (retval)
+ /* Use __mt_dup() to efficiently build an identical maple tree. */
+ retval = __mt_dup(&oldmm->mm_mt, &mm->mm_mt, GFP_KERNEL);
+ if (unlikely(retval))
goto out;
mt_clear_in_rcu(vmi.mas.tree);
- for_each_vma(old_vmi, mpnt) {
+ for_each_vma(vmi, mpnt) {
struct file *file;
vma_start_write(mpnt);
if (mpnt->vm_flags & VM_DONTCOPY) {
+ retval = vma_iter_clear_gfp(&vmi, mpnt->vm_start,
+ mpnt->vm_end, GFP_KERNEL);
+ if (retval)
+ goto loop_out;
+
vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt));
continue;
}
@@ -733,7 +718,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
get_file(file);
i_mmap_lock_write(mapping);
- if (tmp->vm_flags & VM_SHARED)
+ if (vma_is_shared_maywrite(tmp))
mapping_allow_writable(mapping);
flush_dcache_mmap_lock(mapping);
/* insert tmp into the share list, just after mpnt */
@@ -749,9 +734,11 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
if (is_vm_hugetlb_page(tmp))
hugetlb_dup_vma_private(tmp);
- /* Link the vma into the MT */
- if (vma_iter_bulk_store(&vmi, tmp))
- goto fail_nomem_vmi_store;
+ /*
+ * Link the vma into the MT. After using __mt_dup(), memory
+ * allocation is not necessary here, so it cannot fail.
+ */
+ vma_iter_bulk_store(&vmi, tmp);
mm->map_count++;
if (!(tmp->vm_flags & VM_WIPEONFORK))
@@ -760,15 +747,28 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
if (tmp->vm_ops && tmp->vm_ops->open)
tmp->vm_ops->open(tmp);
- if (retval)
+ if (retval) {
+ mpnt = vma_next(&vmi);
goto loop_out;
+ }
}
/* a new mm has just been created */
retval = arch_dup_mmap(oldmm, mm);
loop_out:
vma_iter_free(&vmi);
- if (!retval)
+ if (!retval) {
mt_set_in_rcu(vmi.mas.tree);
+ } else if (mpnt) {
+ /*
+ * The entire maple tree has already been duplicated. If the
+ * mmap duplication fails, mark the failure point with
+ * XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered,
+ * stop releasing VMAs that have not been duplicated after this
+ * point.
+ */
+ mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1);
+ mas_store(&vmi.mas, XA_ZERO_ENTRY);
+ }
out:
mmap_write_unlock(mm);
flush_tlb_mm(oldmm);
@@ -778,8 +778,6 @@ fail_uprobe_end:
uprobe_end_dup_mmap();
return retval;
-fail_nomem_vmi_store:
- unlink_anon_vmas(tmp);
fail_nomem_anon_vma_fork:
mpol_put(vma_policy(tmp));
fail_nomem_policy:
@@ -1021,7 +1019,6 @@ static void set_max_threads(unsigned int max_threads_suggested)
int arch_task_struct_size __read_mostly;
#endif
-#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
static void task_struct_whitelist(unsigned long *offset, unsigned long *size)
{
/* Fetch thread_struct whitelist for the architecture. */
@@ -1036,12 +1033,10 @@ static void task_struct_whitelist(unsigned long *offset, unsigned long *size)
else
*offset += offsetof(struct task_struct, thread);
}
-#endif /* CONFIG_ARCH_TASK_STRUCT_ALLOCATOR */
void __init fork_init(void)
{
int i;
-#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
#ifndef ARCH_MIN_TASKALIGN
#define ARCH_MIN_TASKALIGN 0
#endif
@@ -1054,7 +1049,6 @@ void __init fork_init(void)
arch_task_struct_size, align,
SLAB_PANIC|SLAB_ACCOUNT,
useroffset, usersize, NULL);
-#endif
/* do the arch specific task caches init */
arch_task_cache_init();
@@ -1179,7 +1173,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
tsk->use_memdelay = 0;
#endif
-#ifdef CONFIG_IOMMU_SVA
+#ifdef CONFIG_ARCH_HAS_CPU_PASID
tsk->pasid_activated = 0;
#endif
@@ -1288,7 +1282,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
hugetlb_count_init(mm);
if (current->mm) {
- mm->flags = current->mm->flags & MMF_INIT_MASK;
+ mm->flags = mmf_init_flags(current->mm->flags);
mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK;
} else {
mm->flags = default_dump_filter;
@@ -1393,6 +1387,8 @@ EXPORT_SYMBOL_GPL(mmput_async);
/**
* set_mm_exe_file - change a reference to the mm's executable file
+ * @mm: The mm to change.
+ * @new_exe_file: The new file to use.
*
* This changes mm's executable file (shown as symlink /proc/[pid]/exe).
*
@@ -1432,6 +1428,8 @@ int set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
/**
* replace_mm_exe_file - replace a reference to the mm's executable file
+ * @mm: The mm to change.
+ * @new_exe_file: The new file to use.
*
* This changes mm's executable file (shown as symlink /proc/[pid]/exe).
*
@@ -1483,6 +1481,7 @@ int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
/**
* get_mm_exe_file - acquire a reference to the mm's executable file
+ * @mm: The mm of interest.
*
* Returns %NULL if mm has no associated executable file.
* User must release file via fput().
@@ -1492,15 +1491,14 @@ struct file *get_mm_exe_file(struct mm_struct *mm)
struct file *exe_file;
rcu_read_lock();
- exe_file = rcu_dereference(mm->exe_file);
- if (exe_file && !get_file_rcu(exe_file))
- exe_file = NULL;
+ exe_file = get_file_rcu(&mm->exe_file);
rcu_read_unlock();
return exe_file;
}
/**
* get_task_exe_file - acquire a reference to the task's executable file
+ * @task: The task.
*
* Returns %NULL if task's mm (if any) has no associated executable file or
* this is a kernel thread with borrowed mm (see the comment above get_task_mm).
@@ -1523,6 +1521,7 @@ struct file *get_task_exe_file(struct task_struct *task)
/**
* get_task_mm - acquire a reference to the task's mm
+ * @task: The task.
*
* Returns %NULL if the task has no mm. Checks PF_KTHREAD (meaning
* this kernel workthread has transiently adopted a user mm with use_mm,
@@ -1583,7 +1582,7 @@ static void complete_vfork_done(struct task_struct *tsk)
static int wait_for_vfork_done(struct task_struct *child,
struct completion *vfork)
{
- unsigned int state = TASK_UNINTERRUPTIBLE|TASK_KILLABLE|TASK_FREEZABLE;
+ unsigned int state = TASK_KILLABLE|TASK_FREEZABLE;
int killed;
cgroup_enter_frozen();
@@ -1749,6 +1748,7 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
if (clone_flags & CLONE_FS) {
/* tsk->fs is already what we want */
spin_lock(&fs->lock);
+ /* "users" and "in_exec" locked for check_unsafe_exec() */
if (fs->in_exec) {
spin_unlock(&fs->lock);
return -EAGAIN;
@@ -2102,11 +2102,11 @@ const struct file_operations pidfd_fops = {
* __pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
* @pid: the struct pid for which to create a pidfd
* @flags: flags of the new @pidfd
- * @pidfd: the pidfd to return
+ * @ret: Where to return the file for the pidfd.
*
* Allocate a new file that stashes @pid and reserve a new pidfd number in the
* caller's file descriptor table. The pidfd is reserved but not installed yet.
-
+ *
* The helper doesn't perform checks on @pid which makes it useful for pidfds
* created via CLONE_PIDFD where @pid has no task attached when the pidfd and
* pidfd file are prepared.
@@ -2153,7 +2153,7 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re
* pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
* @pid: the struct pid for which to create a pidfd
* @flags: flags of the new @pidfd
- * @pidfd: the pidfd to return
+ * @ret: Where to return the pidfd.
*
* Allocate a new file that stashes @pid and reserve a new pidfd number in the
* caller's file descriptor table. The pidfd is reserved but not installed yet.
@@ -2406,10 +2406,6 @@ __latent_entropy struct task_struct *copy_process(
p->io_uring = NULL;
#endif
-#if defined(SPLIT_RSS_COUNTING)
- memset(&p->rss_stat, 0, sizeof(p->rss_stat));
-#endif
-
p->default_timer_slack_ns = current->timer_slack_ns;
#ifdef CONFIG_PSI
@@ -2576,7 +2572,6 @@ __latent_entropy struct task_struct *copy_process(
p->dirty_paused_when = 0;
p->pdeath_signal = 0;
- INIT_LIST_HEAD(&p->thread_group);
p->task_works = NULL;
clear_posix_cputimers_work(p);
@@ -2704,8 +2699,6 @@ __latent_entropy struct task_struct *copy_process(
atomic_inc(&current->signal->live);
refcount_inc(&current->signal->sigcnt);
task_join_group_stop(p);
- list_add_tail_rcu(&p->thread_group,
- &p->group_leader->thread_group);
list_add_tail_rcu(&p->thread_node,
&p->signal->thread_head);
}
@@ -2930,7 +2923,7 @@ pid_t kernel_clone(struct kernel_clone_args *args)
get_task_struct(p);
}
- if (IS_ENABLED(CONFIG_LRU_GEN) && !(clone_flags & CLONE_VM)) {
+ if (IS_ENABLED(CONFIG_LRU_GEN_WALKS_MMU) && !(clone_flags & CLONE_VM)) {
/* lock the task to synchronize with memcg migration */
task_lock(p);
lru_gen_add_mm(p->mm);
@@ -3144,7 +3137,7 @@ static inline bool clone3_stack_valid(struct kernel_clone_args *kargs)
if (!access_ok((void __user *)kargs->stack, kargs->stack_size))
return false;
-#if !defined(CONFIG_STACK_GROWSUP) && !defined(CONFIG_IA64)
+#if !defined(CONFIG_STACK_GROWSUP)
kargs->stack += kargs->stack_size;
#endif
}
@@ -3181,7 +3174,7 @@ static bool clone3_args_valid(struct kernel_clone_args *kargs)
}
/**
- * clone3 - create a new process with specific properties
+ * sys_clone3 - create a new process with specific properties
* @uargs: argument structure
* @size: size of @uargs
*
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 4fad0e6fca64..f57aaf96b829 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -71,7 +71,11 @@ bool __refrigerator(bool check_kthr_stop)
for (;;) {
bool freeze;
+ raw_spin_lock_irq(&current->pi_lock);
set_current_state(TASK_FROZEN);
+ /* unstale saved_state so that __thaw_task() will wake us up */
+ current->saved_state = TASK_RUNNING;
+ raw_spin_unlock_irq(&current->pi_lock);
spin_lock_irq(&freezer_lock);
freeze = freezing(current) && !(check_kthr_stop && kthread_should_stop());
@@ -129,6 +133,7 @@ static int __set_task_frozen(struct task_struct *p, void *arg)
WARN_ON_ONCE(debug_locks && p->lockdep_depth);
#endif
+ p->saved_state = p->__state;
WRITE_ONCE(p->__state, TASK_FROZEN);
return TASK_FROZEN;
}
@@ -170,42 +175,35 @@ bool freeze_task(struct task_struct *p)
}
/*
- * The special task states (TASK_STOPPED, TASK_TRACED) keep their canonical
- * state in p->jobctl. If either of them got a wakeup that was missed because
- * TASK_FROZEN, then their canonical state reflects that and the below will
- * refuse to restore the special state and instead issue the wakeup.
+ * Restore the saved_state before the task entered freezer. For typical task
+ * in the __refrigerator(), saved_state == TASK_RUNNING so nothing happens
+ * here. For tasks which were TASK_NORMAL | TASK_FREEZABLE, their initial state
+ * is restored unless they got an expected wakeup (see ttwu_state_match()).
+ * Returns 1 if the task state was restored.
*/
-static int __set_task_special(struct task_struct *p, void *arg)
+static int __restore_freezer_state(struct task_struct *p, void *arg)
{
- unsigned int state = 0;
+ unsigned int state = p->saved_state;
- if (p->jobctl & JOBCTL_TRACED)
- state = TASK_TRACED;
-
- else if (p->jobctl & JOBCTL_STOPPED)
- state = TASK_STOPPED;
-
- if (state)
+ if (state != TASK_RUNNING) {
WRITE_ONCE(p->__state, state);
+ p->saved_state = TASK_RUNNING;
+ return 1;
+ }
- return state;
+ return 0;
}
void __thaw_task(struct task_struct *p)
{
- unsigned long flags, flags2;
+ unsigned long flags;
spin_lock_irqsave(&freezer_lock, flags);
if (WARN_ON_ONCE(freezing(p)))
goto unlock;
- if (lock_task_sighand(p, &flags2)) {
- /* TASK_FROZEN -> TASK_{STOPPED,TRACED} */
- bool ret = task_call_func(p, __set_task_special, NULL);
- unlock_task_sighand(p, &flags2);
- if (ret)
- goto unlock;
- }
+ if (!frozen(p) || task_call_func(p, __restore_freezer_state, NULL))
+ goto unlock;
wake_up_state(p, TASK_FROZEN);
unlock:
diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index f10587d1d481..1e78ef24321e 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -34,6 +34,7 @@
#include <linux/compat.h>
#include <linux/jhash.h>
#include <linux/pagemap.h>
+#include <linux/plist.h>
#include <linux/memblock.h>
#include <linux/fault-inject.h>
#include <linux/slab.h>
@@ -193,7 +194,7 @@ static u64 get_inode_sequence_number(struct inode *inode)
/**
* get_futex_key() - Get parameters which are the keys for a futex
* @uaddr: virtual address of the futex
- * @fshared: false for a PROCESS_PRIVATE futex, true for PROCESS_SHARED
+ * @flags: FLAGS_*
* @key: address where result is stored.
* @rw: mapping needs to be read/write (values: FUTEX_READ,
* FUTEX_WRITE)
@@ -217,14 +218,18 @@ static u64 get_inode_sequence_number(struct inode *inode)
*
* lock_page() might sleep, the caller should not hold a spinlock.
*/
-int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key,
+int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key,
enum futex_access rw)
{
unsigned long address = (unsigned long)uaddr;
struct mm_struct *mm = current->mm;
- struct page *page, *tail;
+ struct page *page;
+ struct folio *folio;
struct address_space *mapping;
int err, ro = 0;
+ bool fshared;
+
+ fshared = flags & FLAGS_SHARED;
/*
* The futex address must be "naturally" aligned.
@@ -248,7 +253,17 @@ int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key,
* but access_ok() should be faster than find_vma()
*/
if (!fshared) {
- key->private.mm = mm;
+ /*
+ * On no-MMU, shared futexes are treated as private, therefore
+ * we must not include the current process in the key. Since
+ * there is only one address space, the address is a unique key
+ * on its own.
+ */
+ if (IS_ENABLED(CONFIG_MMU))
+ key->private.mm = mm;
+ else
+ key->private.mm = NULL;
+
key->private.address = address;
return 0;
}
@@ -273,54 +288,52 @@ again:
err = 0;
/*
- * The treatment of mapping from this point on is critical. The page
- * lock protects many things but in this context the page lock
+ * The treatment of mapping from this point on is critical. The folio
+ * lock protects many things but in this context the folio lock
* stabilizes mapping, prevents inode freeing in the shared
* file-backed region case and guards against movement to swap cache.
*
- * Strictly speaking the page lock is not needed in all cases being
- * considered here and page lock forces unnecessarily serialization
+ * Strictly speaking the folio lock is not needed in all cases being
+ * considered here and folio lock forces unnecessarily serialization.
* From this point on, mapping will be re-verified if necessary and
- * page lock will be acquired only if it is unavoidable
+ * folio lock will be acquired only if it is unavoidable
*
- * Mapping checks require the head page for any compound page so the
- * head page and mapping is looked up now. For anonymous pages, it
- * does not matter if the page splits in the future as the key is
- * based on the address. For filesystem-backed pages, the tail is
- * required as the index of the page determines the key. For
- * base pages, there is no tail page and tail == page.
+ * Mapping checks require the folio so it is looked up now. For
+ * anonymous pages, it does not matter if the folio is split
+ * in the future as the key is based on the address. For
+ * filesystem-backed pages, the precise page is required as the
+ * index of the page determines the key.
*/
- tail = page;
- page = compound_head(page);
- mapping = READ_ONCE(page->mapping);
+ folio = page_folio(page);
+ mapping = READ_ONCE(folio->mapping);
/*
- * If page->mapping is NULL, then it cannot be a PageAnon
+ * If folio->mapping is NULL, then it cannot be an anonymous
* page; but it might be the ZERO_PAGE or in the gate area or
* in a special mapping (all cases which we are happy to fail);
* or it may have been a good file page when get_user_pages_fast
* found it, but truncated or holepunched or subjected to
- * invalidate_complete_page2 before we got the page lock (also
+ * invalidate_complete_page2 before we got the folio lock (also
* cases which we are happy to fail). And we hold a reference,
* so refcount care in invalidate_inode_page's remove_mapping
* prevents drop_caches from setting mapping to NULL beneath us.
*
* The case we do have to guard against is when memory pressure made
* shmem_writepage move it from filecache to swapcache beneath us:
- * an unlikely race, but we do need to retry for page->mapping.
+ * an unlikely race, but we do need to retry for folio->mapping.
*/
if (unlikely(!mapping)) {
int shmem_swizzled;
/*
- * Page lock is required to identify which special case above
- * applies. If this is really a shmem page then the page lock
+ * Folio lock is required to identify which special case above
+ * applies. If this is really a shmem page then the folio lock
* will prevent unexpected transitions.
*/
- lock_page(page);
- shmem_swizzled = PageSwapCache(page) || page->mapping;
- unlock_page(page);
- put_page(page);
+ folio_lock(folio);
+ shmem_swizzled = folio_test_swapcache(folio) || folio->mapping;
+ folio_unlock(folio);
+ folio_put(folio);
if (shmem_swizzled)
goto again;
@@ -331,14 +344,14 @@ again:
/*
* Private mappings are handled in a simple way.
*
- * If the futex key is stored on an anonymous page, then the associated
+ * If the futex key is stored in anonymous memory, then the associated
* object is the mm which is implicitly pinned by the calling process.
*
* NOTE: When userspace waits on a MAP_SHARED mapping, even if
* it's a read-only handle, it's expected that futexes attach to
* the object not the particular process.
*/
- if (PageAnon(page)) {
+ if (folio_test_anon(folio)) {
/*
* A RO anonymous page will never change and thus doesn't make
* sense for futex operations.
@@ -357,10 +370,10 @@ again:
/*
* The associated futex object in this case is the inode and
- * the page->mapping must be traversed. Ordinarily this should
- * be stabilised under page lock but it's not strictly
+ * the folio->mapping must be traversed. Ordinarily this should
+ * be stabilised under folio lock but it's not strictly
* necessary in this case as we just want to pin the inode, not
- * update the radix tree or anything like that.
+ * update i_pages or anything like that.
*
* The RCU read lock is taken as the inode is finally freed
* under RCU. If the mapping still matches expectations then the
@@ -368,9 +381,9 @@ again:
*/
rcu_read_lock();
- if (READ_ONCE(page->mapping) != mapping) {
+ if (READ_ONCE(folio->mapping) != mapping) {
rcu_read_unlock();
- put_page(page);
+ folio_put(folio);
goto again;
}
@@ -378,19 +391,19 @@ again:
inode = READ_ONCE(mapping->host);
if (!inode) {
rcu_read_unlock();
- put_page(page);
+ folio_put(folio);
goto again;
}
key->both.offset |= FUT_OFF_INODE; /* inode-based key */
key->shared.i_seq = get_inode_sequence_number(inode);
- key->shared.pgoff = page_to_pgoff(tail);
+ key->shared.pgoff = folio->index + folio_page_idx(folio, page);
rcu_read_unlock();
}
out:
- put_page(page);
+ folio_put(folio);
return err;
}
@@ -614,12 +627,21 @@ retry:
}
/*
- * PI futexes can not be requeued and must remove themselves from the
- * hash bucket. The hash bucket lock (i.e. lock_ptr) is held.
+ * PI futexes can not be requeued and must remove themselves from the hash
+ * bucket. The hash bucket lock (i.e. lock_ptr) is held.
*/
void futex_unqueue_pi(struct futex_q *q)
{
- __futex_unqueue(q);
+ /*
+ * If the lock was not acquired (due to timeout or signal) then the
+ * rt_waiter is removed before futex_q is. If this is observed by
+ * an unlocker after dropping the rtmutex wait lock and before
+ * acquiring the hash bucket lock, then the unlocker dequeues the
+ * futex_q from the hash bucket list to guarantee consistent state
+ * vs. userspace. Therefore the dequeue here must be conditional.
+ */
+ if (!plist_node_empty(&q->list))
+ __futex_unqueue(q);
BUG_ON(!q->pi_state);
put_pi_state(q->pi_state);
@@ -688,7 +710,8 @@ retry:
owner = uval & FUTEX_TID_MASK;
if (pending_op && !pi && !owner) {
- futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
+ futex_wake(uaddr, FLAGS_SIZE_32 | FLAGS_SHARED, 1,
+ FUTEX_BITSET_MATCH_ANY);
return 0;
}
@@ -740,8 +763,10 @@ retry:
* Wake robust non-PI futexes here. The wakeup of
* PI futexes happens in exit_pi_state():
*/
- if (!pi && (uval & FUTEX_WAITERS))
- futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
+ if (!pi && (uval & FUTEX_WAITERS)) {
+ futex_wake(uaddr, FLAGS_SIZE_32 | FLAGS_SHARED, 1,
+ FUTEX_BITSET_MATCH_ANY);
+ }
return 0;
}
diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h
index b5379c0e6d6d..8b195d06f4e8 100644
--- a/kernel/futex/futex.h
+++ b/kernel/futex/futex.h
@@ -5,6 +5,7 @@
#include <linux/futex.h>
#include <linux/rtmutex.h>
#include <linux/sched/wake_q.h>
+#include <linux/compat.h>
#ifdef CONFIG_PREEMPT_RT
#include <linux/rcuwait.h>
@@ -16,17 +17,86 @@
* Futex flags used to encode options to functions and preserve them across
* restarts.
*/
+#define FLAGS_SIZE_8 0x0000
+#define FLAGS_SIZE_16 0x0001
+#define FLAGS_SIZE_32 0x0002
+#define FLAGS_SIZE_64 0x0003
+
+#define FLAGS_SIZE_MASK 0x0003
+
#ifdef CONFIG_MMU
-# define FLAGS_SHARED 0x01
+# define FLAGS_SHARED 0x0010
#else
/*
* NOMMU does not have per process address space. Let the compiler optimize
* code away.
*/
-# define FLAGS_SHARED 0x00
+# define FLAGS_SHARED 0x0000
#endif
-#define FLAGS_CLOCKRT 0x02
-#define FLAGS_HAS_TIMEOUT 0x04
+#define FLAGS_CLOCKRT 0x0020
+#define FLAGS_HAS_TIMEOUT 0x0040
+#define FLAGS_NUMA 0x0080
+#define FLAGS_STRICT 0x0100
+
+/* FUTEX_ to FLAGS_ */
+static inline unsigned int futex_to_flags(unsigned int op)
+{
+ unsigned int flags = FLAGS_SIZE_32;
+
+ if (!(op & FUTEX_PRIVATE_FLAG))
+ flags |= FLAGS_SHARED;
+
+ if (op & FUTEX_CLOCK_REALTIME)
+ flags |= FLAGS_CLOCKRT;
+
+ return flags;
+}
+
+#define FUTEX2_VALID_MASK (FUTEX2_SIZE_MASK | FUTEX2_PRIVATE)
+
+/* FUTEX2_ to FLAGS_ */
+static inline unsigned int futex2_to_flags(unsigned int flags2)
+{
+ unsigned int flags = flags2 & FUTEX2_SIZE_MASK;
+
+ if (!(flags2 & FUTEX2_PRIVATE))
+ flags |= FLAGS_SHARED;
+
+ if (flags2 & FUTEX2_NUMA)
+ flags |= FLAGS_NUMA;
+
+ return flags;
+}
+
+static inline unsigned int futex_size(unsigned int flags)
+{
+ return 1 << (flags & FLAGS_SIZE_MASK);
+}
+
+static inline bool futex_flags_valid(unsigned int flags)
+{
+ /* Only 64bit futexes for 64bit code */
+ if (!IS_ENABLED(CONFIG_64BIT) || in_compat_syscall()) {
+ if ((flags & FLAGS_SIZE_MASK) == FLAGS_SIZE_64)
+ return false;
+ }
+
+ /* Only 32bit futexes are implemented -- for now */
+ if ((flags & FLAGS_SIZE_MASK) != FLAGS_SIZE_32)
+ return false;
+
+ return true;
+}
+
+static inline bool futex_validate_input(unsigned int flags, u64 val)
+{
+ int bits = 8 * futex_size(flags);
+
+ if (bits < 64 && (val >> bits))
+ return false;
+
+ return true;
+}
#ifdef CONFIG_FAIL_FUTEX
extern bool should_fail_futex(bool fshared);
@@ -69,11 +139,16 @@ struct futex_pi_state {
union futex_key key;
} __randomize_layout;
+struct futex_q;
+typedef void (futex_wake_fn)(struct wake_q_head *wake_q, struct futex_q *q);
+
/**
* struct futex_q - The hashed futex queue entry, one per waiting task
* @list: priority-sorted list of tasks waiting on this futex
* @task: the task waiting on the futex
* @lock_ptr: the hash bucket lock
+ * @wake: the wake handler for this queue
+ * @wake_data: data associated with the wake handler
* @key: the key the futex is hashed on
* @pi_state: optional priority inheritance state
* @rt_waiter: rt_waiter storage for use with requeue_pi
@@ -98,6 +173,8 @@ struct futex_q {
struct task_struct *task;
spinlock_t *lock_ptr;
+ futex_wake_fn *wake;
+ void *wake_data;
union futex_key key;
struct futex_pi_state *pi_state;
struct rt_mutex_waiter *rt_waiter;
@@ -116,7 +193,7 @@ enum futex_access {
FUTEX_WRITE
};
-extern int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key,
+extern int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key,
enum futex_access rw);
extern struct hrtimer_sleeper *
@@ -144,6 +221,7 @@ extern int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
struct futex_q *q, struct futex_hash_bucket **hb);
extern void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q,
struct hrtimer_sleeper *timeout);
+extern bool __futex_wake_mark(struct futex_q *q);
extern void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q);
extern int fault_in_user_writeable(u32 __user *uaddr);
@@ -260,10 +338,14 @@ extern int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, u32
val, ktime_t *abs_time, u32 bitset, u32 __user
*uaddr2);
-extern int futex_requeue(u32 __user *uaddr1, unsigned int flags,
- u32 __user *uaddr2, int nr_wake, int nr_requeue,
+extern int futex_requeue(u32 __user *uaddr1, unsigned int flags1,
+ u32 __user *uaddr2, unsigned int flags2,
+ int nr_wake, int nr_requeue,
u32 *cmpval, int requeue_pi);
+extern int __futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
+ struct hrtimer_sleeper *to, u32 bitset);
+
extern int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
ktime_t *abs_time, u32 bitset);
@@ -279,6 +361,16 @@ struct futex_vector {
struct futex_q q;
};
+extern int futex_parse_waitv(struct futex_vector *futexv,
+ struct futex_waitv __user *uwaitv,
+ unsigned int nr_futexes, futex_wake_fn *wake,
+ void *wake_data);
+
+extern int futex_wait_multiple_setup(struct futex_vector *vs, int count,
+ int *woken);
+
+extern int futex_unqueue_multiple(struct futex_vector *v, int count);
+
extern int futex_wait_multiple(struct futex_vector *vs, unsigned int count,
struct hrtimer_sleeper *to);
diff --git a/kernel/futex/pi.c b/kernel/futex/pi.c
index ce2889f12375..5722467f2737 100644
--- a/kernel/futex/pi.c
+++ b/kernel/futex/pi.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0-or-later
#include <linux/slab.h>
+#include <linux/sched/rt.h>
#include <linux/sched/task.h>
#include "futex.h"
@@ -610,29 +611,16 @@ int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
/*
* Caller must hold a reference on @pi_state.
*/
-static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state)
+static int wake_futex_pi(u32 __user *uaddr, u32 uval,
+ struct futex_pi_state *pi_state,
+ struct rt_mutex_waiter *top_waiter)
{
- struct rt_mutex_waiter *top_waiter;
struct task_struct *new_owner;
bool postunlock = false;
DEFINE_RT_WAKE_Q(wqh);
u32 curval, newval;
int ret = 0;
- top_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex);
- if (WARN_ON_ONCE(!top_waiter)) {
- /*
- * As per the comment in futex_unlock_pi() this should not happen.
- *
- * When this happens, give up our locks and try again, giving
- * the futex_lock_pi() instance time to complete, either by
- * waiting on the rtmutex or removing itself from the futex
- * queue.
- */
- ret = -EAGAIN;
- goto out_unlock;
- }
-
new_owner = top_waiter->task;
/*
@@ -945,7 +933,7 @@ int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int tryl
to = futex_setup_timer(time, &timeout, flags, 0);
retry:
- ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, FUTEX_WRITE);
+ ret = get_futex_key(uaddr, flags, &q.key, FUTEX_WRITE);
if (unlikely(ret != 0))
goto out;
@@ -1002,6 +990,12 @@ retry_private:
goto no_block;
}
+ /*
+ * Must be done before we enqueue the waiter, here is unfortunately
+ * under the hb lock, but that *should* work because it does nothing.
+ */
+ rt_mutex_pre_schedule();
+
rt_mutex_init_waiter(&rt_waiter);
/*
@@ -1039,19 +1033,37 @@ retry_private:
ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
cleanup:
- spin_lock(q.lock_ptr);
/*
* If we failed to acquire the lock (deadlock/signal/timeout), we must
- * first acquire the hb->lock before removing the lock from the
- * rt_mutex waitqueue, such that we can keep the hb and rt_mutex wait
- * lists consistent.
+ * unwind the above, however we cannot lock hb->lock because
+ * rt_mutex already has a waiter enqueued and hb->lock can itself try
+ * and enqueue an rt_waiter through rtlock.
+ *
+ * Doing the cleanup without holding hb->lock can cause inconsistent
+ * state between hb and pi_state, but only in the direction of not
+ * seeing a waiter that is leaving.
+ *
+ * See futex_unlock_pi(), it deals with this inconsistency.
+ *
+ * There be dragons here, since we must deal with the inconsistency on
+ * the way out (here), it is impossible to detect/warn about the race
+ * the other way around (missing an incoming waiter).
*
- * In particular; it is important that futex_unlock_pi() can not
- * observe this inconsistency.
+ * What could possibly go wrong...
*/
if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
ret = 0;
+ /*
+ * Now that the rt_waiter has been dequeued, it is safe to use
+ * spinlock/rtlock (which might enqueue its own rt_waiter) and fix up
+ * the pi_state.
+ */
+ spin_lock(q.lock_ptr);
+ /*
+ * Waiter is unqueued.
+ */
+ rt_mutex_post_schedule();
no_block:
/*
* Fixup the pi_state owner and possibly acquire the lock if we
@@ -1117,12 +1129,13 @@ retry:
if ((uval & FUTEX_TID_MASK) != vpid)
return -EPERM;
- ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_WRITE);
+ ret = get_futex_key(uaddr, flags, &key, FUTEX_WRITE);
if (ret)
return ret;
hb = futex_hash(&key);
spin_lock(&hb->lock);
+retry_hb:
/*
* Check waiters first. We do not trust user space values at
@@ -1132,6 +1145,7 @@ retry:
top_waiter = futex_top_waiter(hb, &key);
if (top_waiter) {
struct futex_pi_state *pi_state = top_waiter->pi_state;
+ struct rt_mutex_waiter *rt_waiter;
ret = -EINVAL;
if (!pi_state)
@@ -1144,22 +1158,44 @@ retry:
if (pi_state->owner != current)
goto out_unlock;
- get_pi_state(pi_state);
/*
* By taking wait_lock while still holding hb->lock, we ensure
- * there is no point where we hold neither; and therefore
- * wake_futex_p() must observe a state consistent with what we
- * observed.
+ * there is no point where we hold neither; and thereby
+ * wake_futex_pi() must observe any new waiters.
+ *
+ * Since the cleanup: case in futex_lock_pi() removes the
+ * rt_waiter without holding hb->lock, it is possible for
+ * wake_futex_pi() to not find a waiter while the above does,
+ * in this case the waiter is on the way out and it can be
+ * ignored.
*
* In particular; this forces __rt_mutex_start_proxy() to
* complete such that we're guaranteed to observe the
- * rt_waiter. Also see the WARN in wake_futex_pi().
+ * rt_waiter.
*/
raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+
+ /*
+ * Futex vs rt_mutex waiter state -- if there are no rt_mutex
+ * waiters even though futex thinks there are, then the waiter
+ * is leaving. The entry needs to be removed from the list so a
+ * new futex_lock_pi() is not using this stale PI-state while
+ * the futex is available in user space again.
+ * There can be more than one task on its way out so it needs
+ * to retry.
+ */
+ rt_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex);
+ if (!rt_waiter) {
+ __futex_unqueue(top_waiter);
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+ goto retry_hb;
+ }
+
+ get_pi_state(pi_state);
spin_unlock(&hb->lock);
/* drops pi_state->pi_mutex.wait_lock */
- ret = wake_futex_pi(uaddr, uval, pi_state);
+ ret = wake_futex_pi(uaddr, uval, pi_state, rt_waiter);
put_pi_state(pi_state);
diff --git a/kernel/futex/requeue.c b/kernel/futex/requeue.c
index cba8b1a6a4cc..b47bb764b352 100644
--- a/kernel/futex/requeue.c
+++ b/kernel/futex/requeue.c
@@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0-or-later
+#include <linux/plist.h>
#include <linux/sched/signal.h>
#include "futex.h"
@@ -58,6 +59,7 @@ enum {
const struct futex_q futex_q_init = {
/* list gets initialized in futex_queue()*/
+ .wake = futex_wake_mark,
.key = FUTEX_KEY_INIT,
.bitset = FUTEX_BITSET_MATCH_ANY,
.requeue_state = ATOMIC_INIT(Q_REQUEUE_PI_NONE),
@@ -269,7 +271,7 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1,
union futex_key *key2, struct futex_pi_state **ps,
struct task_struct **exiting, int set_waiters)
{
- struct futex_q *top_waiter = NULL;
+ struct futex_q *top_waiter;
u32 curval;
int ret;
@@ -346,8 +348,9 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1,
/**
* futex_requeue() - Requeue waiters from uaddr1 to uaddr2
* @uaddr1: source futex user address
- * @flags: futex flags (FLAGS_SHARED, etc.)
+ * @flags1: futex flags (FLAGS_SHARED, etc.)
* @uaddr2: target futex user address
+ * @flags2: futex flags (FLAGS_SHARED, etc.)
* @nr_wake: number of waiters to wake (must be 1 for requeue_pi)
* @nr_requeue: number of waiters to requeue (0-INT_MAX)
* @cmpval: @uaddr1 expected value (or %NULL)
@@ -361,7 +364,8 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1,
* - >=0 - on success, the number of tasks requeued or woken;
* - <0 - on error
*/
-int futex_requeue(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
+int futex_requeue(u32 __user *uaddr1, unsigned int flags1,
+ u32 __user *uaddr2, unsigned int flags2,
int nr_wake, int nr_requeue, u32 *cmpval, int requeue_pi)
{
union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
@@ -424,10 +428,10 @@ int futex_requeue(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
}
retry:
- ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ);
+ ret = get_futex_key(uaddr1, flags1, &key1, FUTEX_READ);
if (unlikely(ret != 0))
return ret;
- ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2,
+ ret = get_futex_key(uaddr2, flags2, &key2,
requeue_pi ? FUTEX_WRITE : FUTEX_READ);
if (unlikely(ret != 0))
return ret;
@@ -459,7 +463,7 @@ retry_private:
if (ret)
return ret;
- if (!(flags & FLAGS_SHARED))
+ if (!(flags1 & FLAGS_SHARED))
goto retry_private;
goto retry;
@@ -591,7 +595,7 @@ retry_private:
/* Plain futexes just wake or requeue and are done */
if (!requeue_pi) {
if (++task_count <= nr_wake)
- futex_wake_mark(&wake_q, this);
+ this->wake(&wake_q, this);
else
requeue_futex(this, hb1, hb2, &key2);
continue;
@@ -789,7 +793,7 @@ int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
*/
rt_mutex_init_waiter(&rt_waiter);
- ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE);
+ ret = get_futex_key(uaddr2, flags, &key2, FUTEX_WRITE);
if (unlikely(ret != 0))
goto out;
@@ -850,11 +854,13 @@ int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
pi_mutex = &q.pi_state->pi_mutex;
ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter);
- /* Current is not longer pi_blocked_on */
- spin_lock(q.lock_ptr);
+ /*
+ * See futex_lock_pi()'s cleanup: comment.
+ */
if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter))
ret = 0;
+ spin_lock(q.lock_ptr);
debug_rt_mutex_free_waiter(&rt_waiter);
/*
* Fixup the pi_state owner and possibly acquire the lock if we
diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c
index a8074079b09e..4b6da9116aa6 100644
--- a/kernel/futex/syscalls.c
+++ b/kernel/futex/syscalls.c
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-or-later
-#include <linux/compat.h>
#include <linux/syscalls.h>
#include <linux/time_namespace.h>
@@ -85,15 +84,12 @@ err_unlock:
long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
u32 __user *uaddr2, u32 val2, u32 val3)
{
+ unsigned int flags = futex_to_flags(op);
int cmd = op & FUTEX_CMD_MASK;
- unsigned int flags = 0;
- if (!(op & FUTEX_PRIVATE_FLAG))
- flags |= FLAGS_SHARED;
-
- if (op & FUTEX_CLOCK_REALTIME) {
- flags |= FLAGS_CLOCKRT;
- if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI &&
+ if (flags & FLAGS_CLOCKRT) {
+ if (cmd != FUTEX_WAIT_BITSET &&
+ cmd != FUTEX_WAIT_REQUEUE_PI &&
cmd != FUTEX_LOCK_PI2)
return -ENOSYS;
}
@@ -110,9 +106,9 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
case FUTEX_WAKE_BITSET:
return futex_wake(uaddr, flags, val, val3);
case FUTEX_REQUEUE:
- return futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0);
+ return futex_requeue(uaddr, flags, uaddr2, flags, val, val2, NULL, 0);
case FUTEX_CMP_REQUEUE:
- return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0);
+ return futex_requeue(uaddr, flags, uaddr2, flags, val, val2, &val3, 0);
case FUTEX_WAKE_OP:
return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
case FUTEX_LOCK_PI:
@@ -129,7 +125,7 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
uaddr2);
case FUTEX_CMP_REQUEUE_PI:
- return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);
+ return futex_requeue(uaddr, flags, uaddr2, flags, val, val2, &val3, 1);
}
return -ENOSYS;
}
@@ -183,43 +179,91 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3);
}
-/* Mask of available flags for each futex in futex_waitv list */
-#define FUTEXV_WAITER_MASK (FUTEX_32 | FUTEX_PRIVATE_FLAG)
-
/**
* futex_parse_waitv - Parse a waitv array from userspace
* @futexv: Kernel side list of waiters to be filled
* @uwaitv: Userspace list to be parsed
* @nr_futexes: Length of futexv
+ * @wake: Wake to call when futex is woken
+ * @wake_data: Data for the wake handler
*
* Return: Error code on failure, 0 on success
*/
-static int futex_parse_waitv(struct futex_vector *futexv,
- struct futex_waitv __user *uwaitv,
- unsigned int nr_futexes)
+int futex_parse_waitv(struct futex_vector *futexv,
+ struct futex_waitv __user *uwaitv,
+ unsigned int nr_futexes, futex_wake_fn *wake,
+ void *wake_data)
{
struct futex_waitv aux;
unsigned int i;
for (i = 0; i < nr_futexes; i++) {
+ unsigned int flags;
+
if (copy_from_user(&aux, &uwaitv[i], sizeof(aux)))
return -EFAULT;
- if ((aux.flags & ~FUTEXV_WAITER_MASK) || aux.__reserved)
+ if ((aux.flags & ~FUTEX2_VALID_MASK) || aux.__reserved)
+ return -EINVAL;
+
+ flags = futex2_to_flags(aux.flags);
+ if (!futex_flags_valid(flags))
return -EINVAL;
- if (!(aux.flags & FUTEX_32))
+ if (!futex_validate_input(flags, aux.val))
return -EINVAL;
- futexv[i].w.flags = aux.flags;
+ futexv[i].w.flags = flags;
futexv[i].w.val = aux.val;
futexv[i].w.uaddr = aux.uaddr;
futexv[i].q = futex_q_init;
+ futexv[i].q.wake = wake;
+ futexv[i].q.wake_data = wake_data;
}
return 0;
}
+static int futex2_setup_timeout(struct __kernel_timespec __user *timeout,
+ clockid_t clockid, struct hrtimer_sleeper *to)
+{
+ int flag_clkid = 0, flag_init = 0;
+ struct timespec64 ts;
+ ktime_t time;
+ int ret;
+
+ if (!timeout)
+ return 0;
+
+ if (clockid == CLOCK_REALTIME) {
+ flag_clkid = FLAGS_CLOCKRT;
+ flag_init = FUTEX_CLOCK_REALTIME;
+ }
+
+ if (clockid != CLOCK_REALTIME && clockid != CLOCK_MONOTONIC)
+ return -EINVAL;
+
+ if (get_timespec64(&ts, timeout))
+ return -EFAULT;
+
+ /*
+ * Since there's no opcode for futex_waitv, use
+ * FUTEX_WAIT_BITSET that uses absolute timeout as well
+ */
+ ret = futex_init_timeout(FUTEX_WAIT_BITSET, flag_init, &ts, &time);
+ if (ret)
+ return ret;
+
+ futex_setup_timer(&time, to, flag_clkid, 0);
+ return 0;
+}
+
+static inline void futex2_destroy_timeout(struct hrtimer_sleeper *to)
+{
+ hrtimer_cancel(&to->timer);
+ destroy_hrtimer_on_stack(&to->timer);
+}
+
/**
* sys_futex_waitv - Wait on a list of futexes
* @waiters: List of futexes to wait on
@@ -249,8 +293,6 @@ SYSCALL_DEFINE5(futex_waitv, struct futex_waitv __user *, waiters,
{
struct hrtimer_sleeper to;
struct futex_vector *futexv;
- struct timespec64 ts;
- ktime_t time;
int ret;
/* This syscall supports no flags for now */
@@ -260,30 +302,8 @@ SYSCALL_DEFINE5(futex_waitv, struct futex_waitv __user *, waiters,
if (!nr_futexes || nr_futexes > FUTEX_WAITV_MAX || !waiters)
return -EINVAL;
- if (timeout) {
- int flag_clkid = 0, flag_init = 0;
-
- if (clockid == CLOCK_REALTIME) {
- flag_clkid = FLAGS_CLOCKRT;
- flag_init = FUTEX_CLOCK_REALTIME;
- }
-
- if (clockid != CLOCK_REALTIME && clockid != CLOCK_MONOTONIC)
- return -EINVAL;
-
- if (get_timespec64(&ts, timeout))
- return -EFAULT;
-
- /*
- * Since there's no opcode for futex_waitv, use
- * FUTEX_WAIT_BITSET that uses absolute timeout as well
- */
- ret = futex_init_timeout(FUTEX_WAIT_BITSET, flag_init, &ts, &time);
- if (ret)
- return ret;
-
- futex_setup_timer(&time, &to, flag_clkid, 0);
- }
+ if (timeout && (ret = futex2_setup_timeout(timeout, clockid, &to)))
+ return ret;
futexv = kcalloc(nr_futexes, sizeof(*futexv), GFP_KERNEL);
if (!futexv) {
@@ -291,20 +311,133 @@ SYSCALL_DEFINE5(futex_waitv, struct futex_waitv __user *, waiters,
goto destroy_timer;
}
- ret = futex_parse_waitv(futexv, waiters, nr_futexes);
+ ret = futex_parse_waitv(futexv, waiters, nr_futexes, futex_wake_mark,
+ NULL);
if (!ret)
ret = futex_wait_multiple(futexv, nr_futexes, timeout ? &to : NULL);
kfree(futexv);
destroy_timer:
- if (timeout) {
- hrtimer_cancel(&to.timer);
- destroy_hrtimer_on_stack(&to.timer);
- }
+ if (timeout)
+ futex2_destroy_timeout(&to);
return ret;
}
+/*
+ * sys_futex_wake - Wake a number of futexes
+ * @uaddr: Address of the futex(es) to wake
+ * @mask: bitmask
+ * @nr: Number of the futexes to wake
+ * @flags: FUTEX2 flags
+ *
+ * Identical to the traditional FUTEX_WAKE_BITSET op, except it is part of the
+ * futex2 family of calls.
+ */
+
+SYSCALL_DEFINE4(futex_wake,
+ void __user *, uaddr,
+ unsigned long, mask,
+ int, nr,
+ unsigned int, flags)
+{
+ if (flags & ~FUTEX2_VALID_MASK)
+ return -EINVAL;
+
+ flags = futex2_to_flags(flags);
+ if (!futex_flags_valid(flags))
+ return -EINVAL;
+
+ if (!futex_validate_input(flags, mask))
+ return -EINVAL;
+
+ return futex_wake(uaddr, FLAGS_STRICT | flags, nr, mask);
+}
+
+/*
+ * sys_futex_wait - Wait on a futex
+ * @uaddr: Address of the futex to wait on
+ * @val: Value of @uaddr
+ * @mask: bitmask
+ * @flags: FUTEX2 flags
+ * @timeout: Optional absolute timeout
+ * @clockid: Clock to be used for the timeout, realtime or monotonic
+ *
+ * Identical to the traditional FUTEX_WAIT_BITSET op, except it is part of the
+ * futex2 family of calls.
+ */
+
+SYSCALL_DEFINE6(futex_wait,
+ void __user *, uaddr,
+ unsigned long, val,
+ unsigned long, mask,
+ unsigned int, flags,
+ struct __kernel_timespec __user *, timeout,
+ clockid_t, clockid)
+{
+ struct hrtimer_sleeper to;
+ int ret;
+
+ if (flags & ~FUTEX2_VALID_MASK)
+ return -EINVAL;
+
+ flags = futex2_to_flags(flags);
+ if (!futex_flags_valid(flags))
+ return -EINVAL;
+
+ if (!futex_validate_input(flags, val) ||
+ !futex_validate_input(flags, mask))
+ return -EINVAL;
+
+ if (timeout && (ret = futex2_setup_timeout(timeout, clockid, &to)))
+ return ret;
+
+ ret = __futex_wait(uaddr, flags, val, timeout ? &to : NULL, mask);
+
+ if (timeout)
+ futex2_destroy_timeout(&to);
+
+ return ret;
+}
+
+/*
+ * sys_futex_requeue - Requeue a waiter from one futex to another
+ * @waiters: array describing the source and destination futex
+ * @flags: unused
+ * @nr_wake: number of futexes to wake
+ * @nr_requeue: number of futexes to requeue
+ *
+ * Identical to the traditional FUTEX_CMP_REQUEUE op, except it is part of the
+ * futex2 family of calls.
+ */
+
+SYSCALL_DEFINE4(futex_requeue,
+ struct futex_waitv __user *, waiters,
+ unsigned int, flags,
+ int, nr_wake,
+ int, nr_requeue)
+{
+ struct futex_vector futexes[2];
+ u32 cmpval;
+ int ret;
+
+ if (flags)
+ return -EINVAL;
+
+ if (!waiters)
+ return -EINVAL;
+
+ ret = futex_parse_waitv(futexes, waiters, 2, futex_wake_mark, NULL);
+ if (ret)
+ return ret;
+
+ cmpval = futexes[0].w.val;
+
+ return futex_requeue(u64_to_user_ptr(futexes[0].w.uaddr), futexes[0].w.flags,
+ u64_to_user_ptr(futexes[1].w.uaddr), futexes[1].w.flags,
+ nr_wake, nr_requeue, &cmpval, 0);
+}
+
#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE2(set_robust_list,
struct compat_robust_list_head __user *, head,
diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c
index ba01b9408203..3a10375d9521 100644
--- a/kernel/futex/waitwake.c
+++ b/kernel/futex/waitwake.c
@@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0-or-later
+#include <linux/plist.h>
#include <linux/sched/task.h>
#include <linux/sched/signal.h>
#include <linux/freezer.h>
@@ -106,20 +107,11 @@
* double_lock_hb() and double_unlock_hb(), respectively.
*/
-/*
- * The hash bucket lock must be held when this is called.
- * Afterwards, the futex_q must not be accessed. Callers
- * must ensure to later call wake_up_q() for the actual
- * wakeups to occur.
- */
-void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q)
+bool __futex_wake_mark(struct futex_q *q)
{
- struct task_struct *p = q->task;
-
if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n"))
- return;
+ return false;
- get_task_struct(p);
__futex_unqueue(q);
/*
* The waiting task can free the futex_q as soon as q->lock_ptr = NULL
@@ -130,6 +122,26 @@ void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q)
*/
smp_store_release(&q->lock_ptr, NULL);
+ return true;
+}
+
+/*
+ * The hash bucket lock must be held when this is called.
+ * Afterwards, the futex_q must not be accessed. Callers
+ * must ensure to later call wake_up_q() for the actual
+ * wakeups to occur.
+ */
+void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q)
+{
+ struct task_struct *p = q->task;
+
+ get_task_struct(p);
+
+ if (!__futex_wake_mark(q)) {
+ put_task_struct(p);
+ return;
+ }
+
/*
* Queue the task for later wakeup for after we've released
* the hb->lock.
@@ -145,16 +157,19 @@ int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
struct futex_hash_bucket *hb;
struct futex_q *this, *next;
union futex_key key = FUTEX_KEY_INIT;
- int ret;
DEFINE_WAKE_Q(wake_q);
+ int ret;
if (!bitset)
return -EINVAL;
- ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_READ);
+ ret = get_futex_key(uaddr, flags, &key, FUTEX_READ);
if (unlikely(ret != 0))
return ret;
+ if ((flags & FLAGS_STRICT) && !nr_wake)
+ return 0;
+
hb = futex_hash(&key);
/* Make sure we really have tasks to wakeup */
@@ -174,7 +189,7 @@ int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
if (!(this->bitset & bitset))
continue;
- futex_wake_mark(&wake_q, this);
+ this->wake(&wake_q, this);
if (++ret >= nr_wake)
break;
}
@@ -245,10 +260,10 @@ int futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
DEFINE_WAKE_Q(wake_q);
retry:
- ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ);
+ ret = get_futex_key(uaddr1, flags, &key1, FUTEX_READ);
if (unlikely(ret != 0))
return ret;
- ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE);
+ ret = get_futex_key(uaddr2, flags, &key2, FUTEX_WRITE);
if (unlikely(ret != 0))
return ret;
@@ -289,7 +304,7 @@ retry_private:
ret = -EINVAL;
goto out_unlock;
}
- futex_wake_mark(&wake_q, this);
+ this->wake(&wake_q, this);
if (++ret >= nr_wake)
break;
}
@@ -303,7 +318,7 @@ retry_private:
ret = -EINVAL;
goto out_unlock;
}
- futex_wake_mark(&wake_q, this);
+ this->wake(&wake_q, this);
if (++op_ret >= nr_wake2)
break;
}
@@ -358,7 +373,7 @@ void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q,
}
/**
- * unqueue_multiple - Remove various futexes from their hash bucket
+ * futex_unqueue_multiple - Remove various futexes from their hash bucket
* @v: The list of futexes to unqueue
* @count: Number of futexes in the list
*
@@ -368,7 +383,7 @@ void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q,
* - >=0 - Index of the last futex that was awoken;
* - -1 - No futex was awoken
*/
-static int unqueue_multiple(struct futex_vector *v, int count)
+int futex_unqueue_multiple(struct futex_vector *v, int count)
{
int ret = -1, i;
@@ -396,7 +411,7 @@ static int unqueue_multiple(struct futex_vector *v, int count)
* - 0 - Success
* - <0 - -EFAULT, -EWOULDBLOCK or -EINVAL
*/
-static int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *woken)
+int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *woken)
{
struct futex_hash_bucket *hb;
bool retry = false;
@@ -419,11 +434,11 @@ static int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *wo
*/
retry:
for (i = 0; i < count; i++) {
- if ((vs[i].w.flags & FUTEX_PRIVATE_FLAG) && retry)
+ if (!(vs[i].w.flags & FLAGS_SHARED) && retry)
continue;
ret = get_futex_key(u64_to_user_ptr(vs[i].w.uaddr),
- !(vs[i].w.flags & FUTEX_PRIVATE_FLAG),
+ vs[i].w.flags,
&vs[i].q.key, FUTEX_READ);
if (unlikely(ret))
@@ -435,7 +450,7 @@ retry:
for (i = 0; i < count; i++) {
u32 __user *uaddr = (u32 __user *)(unsigned long)vs[i].w.uaddr;
struct futex_q *q = &vs[i].q;
- u32 val = (u32)vs[i].w.val;
+ u32 val = vs[i].w.val;
hb = futex_q_lock(q);
ret = futex_get_value_locked(&uval, uaddr);
@@ -458,7 +473,7 @@ retry:
* was woken, we don't return error and return this index to
* userspace
*/
- *woken = unqueue_multiple(vs, i);
+ *woken = futex_unqueue_multiple(vs, i);
if (*woken >= 0)
return 1;
@@ -543,7 +558,7 @@ int futex_wait_multiple(struct futex_vector *vs, unsigned int count,
__set_current_state(TASK_RUNNING);
- ret = unqueue_multiple(vs, count);
+ ret = futex_unqueue_multiple(vs, count);
if (ret >= 0)
return ret;
@@ -599,7 +614,7 @@ int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
* while the syscall executes.
*/
retry:
- ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, FUTEX_READ);
+ ret = get_futex_key(uaddr, flags, &q->key, FUTEX_READ);
if (unlikely(ret != 0))
return ret;
@@ -629,20 +644,18 @@ retry_private:
return ret;
}
-int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset)
+int __futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
+ struct hrtimer_sleeper *to, u32 bitset)
{
- struct hrtimer_sleeper timeout, *to;
- struct restart_block *restart;
- struct futex_hash_bucket *hb;
struct futex_q q = futex_q_init;
+ struct futex_hash_bucket *hb;
int ret;
if (!bitset)
return -EINVAL;
+
q.bitset = bitset;
- to = futex_setup_timer(abs_time, &timeout, flags,
- current->timer_slack_ns);
retry:
/*
* Prepare to wait on uaddr. On success, it holds hb->lock and q
@@ -650,18 +663,17 @@ retry:
*/
ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
if (ret)
- goto out;
+ return ret;
/* futex_queue and wait for wakeup, timeout, or a signal. */
futex_wait_queue(hb, &q, to);
/* If we were woken (and unqueued), we succeeded, whatever. */
- ret = 0;
if (!futex_unqueue(&q))
- goto out;
- ret = -ETIMEDOUT;
+ return 0;
+
if (to && !to->task)
- goto out;
+ return -ETIMEDOUT;
/*
* We expect signal_pending(current), but we might be the
@@ -670,24 +682,38 @@ retry:
if (!signal_pending(current))
goto retry;
- ret = -ERESTARTSYS;
- if (!abs_time)
- goto out;
+ return -ERESTARTSYS;
+}
- restart = &current->restart_block;
- restart->futex.uaddr = uaddr;
- restart->futex.val = val;
- restart->futex.time = *abs_time;
- restart->futex.bitset = bitset;
- restart->futex.flags = flags | FLAGS_HAS_TIMEOUT;
+int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset)
+{
+ struct hrtimer_sleeper timeout, *to;
+ struct restart_block *restart;
+ int ret;
- ret = set_restart_fn(restart, futex_wait_restart);
+ to = futex_setup_timer(abs_time, &timeout, flags,
+ current->timer_slack_ns);
-out:
- if (to) {
- hrtimer_cancel(&to->timer);
- destroy_hrtimer_on_stack(&to->timer);
+ ret = __futex_wait(uaddr, flags, val, to, bitset);
+
+ /* No timeout, nothing to clean up. */
+ if (!to)
+ return ret;
+
+ hrtimer_cancel(&to->timer);
+ destroy_hrtimer_on_stack(&to->timer);
+
+ if (ret == -ERESTARTSYS) {
+ restart = &current->restart_block;
+ restart->futex.uaddr = uaddr;
+ restart->futex.val = val;
+ restart->futex.time = *abs_time;
+ restart->futex.bitset = bitset;
+ restart->futex.flags = flags | FLAGS_HAS_TIMEOUT;
+
+ return set_restart_fn(restart, futex_wait_restart);
}
+
return ret;
}
diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c
index 5c3086cad8f9..01520689b57c 100644
--- a/kernel/gcov/fs.c
+++ b/kernel/gcov/fs.c
@@ -99,7 +99,7 @@ struct gcov_iterator {
struct gcov_info *info;
size_t size;
loff_t pos;
- char buffer[];
+ char buffer[] __counted_by(size);
};
/**
diff --git a/kernel/groups.c b/kernel/groups.c
index 9aaed2a31073..9b43da22647d 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -19,7 +19,7 @@ struct group_info *groups_alloc(int gidsetsize)
if (!gi)
return NULL;
- atomic_set(&gi->usage, 1);
+ refcount_set(&gi->usage, 1);
gi->ngroups = gidsetsize;
return gi;
}
diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c
index 5971a66be034..aae0402507ed 100644
--- a/kernel/irq/debugfs.c
+++ b/kernel/irq/debugfs.c
@@ -121,7 +121,6 @@ static const struct irq_bit_descr irqdata_states[] = {
BIT_MASK_DESCR(IRQD_AFFINITY_ON_ACTIVATE),
BIT_MASK_DESCR(IRQD_MANAGED_SHUTDOWN),
BIT_MASK_DESCR(IRQD_CAN_RESERVE),
- BIT_MASK_DESCR(IRQD_MSI_NOMASK_QUIRK),
BIT_MASK_DESCR(IRQD_FORWARDED_TO_VCPU),
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index c653cd31548d..d39a40bc542b 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -219,11 +219,15 @@ void irq_init_generic_chip(struct irq_chip_generic *gc, const char *name,
int num_ct, unsigned int irq_base,
void __iomem *reg_base, irq_flow_handler_t handler)
{
+ struct irq_chip_type *ct = gc->chip_types;
+ int i;
+
raw_spin_lock_init(&gc->lock);
gc->num_ct = num_ct;
gc->irq_base = irq_base;
gc->reg_base = reg_base;
- gc->chip_types->chip.name = name;
+ for (i = 0; i < num_ct; i++)
+ ct[i].chip.name = name;
gc->chip_types->handler = handler;
}
@@ -544,21 +548,34 @@ EXPORT_SYMBOL_GPL(irq_setup_alt_chip);
void irq_remove_generic_chip(struct irq_chip_generic *gc, u32 msk,
unsigned int clr, unsigned int set)
{
- unsigned int i = gc->irq_base;
+ unsigned int i, virq;
raw_spin_lock(&gc_lock);
list_del(&gc->list);
raw_spin_unlock(&gc_lock);
- for (; msk; msk >>= 1, i++) {
+ for (i = 0; msk; msk >>= 1, i++) {
if (!(msk & 0x01))
continue;
+ /*
+ * Interrupt domain based chips store the base hardware
+ * interrupt number in gc::irq_base. Otherwise gc::irq_base
+ * contains the base Linux interrupt number.
+ */
+ if (gc->domain) {
+ virq = irq_find_mapping(gc->domain, gc->irq_base + i);
+ if (!virq)
+ continue;
+ } else {
+ virq = gc->irq_base + i;
+ }
+
/* Remove handler first. That will mask the irq line */
- irq_set_handler(i, NULL);
- irq_set_chip(i, &no_irq_chip);
- irq_set_chip_data(i, NULL);
- irq_modify_status(i, clr, set);
+ irq_set_handler(virq, NULL);
+ irq_set_chip(virq, &no_irq_chip);
+ irq_set_chip_data(virq, NULL);
+ irq_modify_status(virq, clr, set);
}
}
EXPORT_SYMBOL_GPL(irq_remove_generic_chip);
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 27ca1c866f29..371eb1711d34 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -600,7 +600,7 @@ int __init early_irq_init(void)
mutex_init(&desc[i].request_mutex);
init_waitqueue_head(&desc[i].wait_for_threads);
desc_set_defaults(i, &desc[i], node, NULL, NULL);
- irq_resend_init(desc);
+ irq_resend_init(&desc[i]);
}
return arch_early_irq_init();
}
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index d309ba84e08a..1782f90cd8c6 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1852,15 +1852,13 @@ out_thread:
struct task_struct *t = new->thread;
new->thread = NULL;
- kthread_stop(t);
- put_task_struct(t);
+ kthread_stop_put(t);
}
if (new->secondary && new->secondary->thread) {
struct task_struct *t = new->secondary->thread;
new->secondary->thread = NULL;
- kthread_stop(t);
- put_task_struct(t);
+ kthread_stop_put(t);
}
out_mput:
module_put(desc->owner);
@@ -1971,12 +1969,9 @@ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id)
* the same bit to a newly requested action.
*/
if (action->thread) {
- kthread_stop(action->thread);
- put_task_struct(action->thread);
- if (action->secondary && action->secondary->thread) {
- kthread_stop(action->secondary->thread);
- put_task_struct(action->secondary->thread);
- }
+ kthread_stop_put(action->thread);
+ if (action->secondary && action->secondary->thread)
+ kthread_stop_put(action->secondary->thread);
}
/* Last action releases resources */
diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c
index 1698e77645ac..75d0ae490e29 100644
--- a/kernel/irq/matrix.c
+++ b/kernel/irq/matrix.c
@@ -466,16 +466,16 @@ unsigned int irq_matrix_reserved(struct irq_matrix *m)
}
/**
- * irq_matrix_allocated - Get the number of allocated irqs on the local cpu
+ * irq_matrix_allocated - Get the number of allocated non-managed irqs on the local CPU
* @m: Pointer to the matrix to search
*
- * This returns number of allocated irqs
+ * This returns the number of allocated non-managed interrupts.
*/
unsigned int irq_matrix_allocated(struct irq_matrix *m)
{
struct cpumap *cm = this_cpu_ptr(m->maps);
- return cm->allocated;
+ return cm->allocated - cm->managed_allocated;
}
#ifdef CONFIG_GENERIC_IRQ_DEBUGFS
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index b4c31a5c1147..79b4a58ba9c3 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -1204,7 +1204,6 @@ static int msi_handle_pci_fail(struct irq_domain *domain, struct msi_desc *desc,
#define VIRQ_CAN_RESERVE 0x01
#define VIRQ_ACTIVATE 0x02
-#define VIRQ_NOMASK_QUIRK 0x04
static int msi_init_virq(struct irq_domain *domain, int virq, unsigned int vflags)
{
@@ -1213,8 +1212,6 @@ static int msi_init_virq(struct irq_domain *domain, int virq, unsigned int vflag
if (!(vflags & VIRQ_CAN_RESERVE)) {
irqd_clr_can_reserve(irqd);
- if (vflags & VIRQ_NOMASK_QUIRK)
- irqd_set_msi_nomask_quirk(irqd);
/*
* If the interrupt is managed but no CPU is available to
@@ -1275,15 +1272,8 @@ static int __msi_domain_alloc_irqs(struct device *dev, struct irq_domain *domain
* Interrupt can use a reserved vector and will not occupy
* a real device vector until the interrupt is requested.
*/
- if (msi_check_reservation_mode(domain, info, dev)) {
+ if (msi_check_reservation_mode(domain, info, dev))
vflags |= VIRQ_CAN_RESERVE;
- /*
- * MSI affinity setting requires a special quirk (X86) when
- * reservation mode is active.
- */
- if (info->flags & MSI_FLAG_NOMASK_QUIRK)
- vflags |= VIRQ_NOMASK_QUIRK;
- }
xa_for_each_range(xa, idx, desc, ctrl->first, ctrl->last) {
if (!msi_desc_match(desc, MSI_DESC_NOTASSOCIATED))
diff --git a/kernel/kcmp.c b/kernel/kcmp.c
index 5353edfad8e1..b0639f21041f 100644
--- a/kernel/kcmp.c
+++ b/kernel/kcmp.c
@@ -64,8 +64,10 @@ get_file_raw_ptr(struct task_struct *task, unsigned int idx)
struct file *file;
rcu_read_lock();
- file = task_lookup_fd_rcu(task, idx);
+ file = task_lookup_fdget_rcu(task, idx);
rcu_read_unlock();
+ if (file)
+ fput(file);
return file;
}
diff --git a/kernel/kcsan/kcsan_test.c b/kernel/kcsan/kcsan_test.c
index 0ddbdab5903d..015586217875 100644
--- a/kernel/kcsan/kcsan_test.c
+++ b/kernel/kcsan/kcsan_test.c
@@ -699,12 +699,9 @@ static void test_barrier_nothreads(struct kunit *test)
KCSAN_EXPECT_RW_BARRIER(spin_unlock(&test_spinlock), true);
KCSAN_EXPECT_RW_BARRIER(mutex_lock(&test_mutex), false);
KCSAN_EXPECT_RW_BARRIER(mutex_unlock(&test_mutex), true);
-
-#ifdef clear_bit_unlock_is_negative_byte
- KCSAN_EXPECT_READ_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var), true);
- KCSAN_EXPECT_WRITE_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var), true);
- KCSAN_EXPECT_RW_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var), true);
-#endif
+ KCSAN_EXPECT_READ_BARRIER(xor_unlock_is_negative_byte(1, &test_var), true);
+ KCSAN_EXPECT_WRITE_BARRIER(xor_unlock_is_negative_byte(1, &test_var), true);
+ KCSAN_EXPECT_RW_BARRIER(xor_unlock_is_negative_byte(1, &test_var), true);
kcsan_nestable_atomic_end();
}
diff --git a/kernel/kcsan/selftest.c b/kernel/kcsan/selftest.c
index 8679322450f2..84a1200271af 100644
--- a/kernel/kcsan/selftest.c
+++ b/kernel/kcsan/selftest.c
@@ -227,12 +227,9 @@ static bool __init test_barrier(void)
KCSAN_CHECK_RW_BARRIER(arch_spin_unlock(&arch_spinlock));
spin_lock(&test_spinlock);
KCSAN_CHECK_RW_BARRIER(spin_unlock(&test_spinlock));
-
-#ifdef clear_bit_unlock_is_negative_byte
- KCSAN_CHECK_RW_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var));
- KCSAN_CHECK_READ_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var));
- KCSAN_CHECK_WRITE_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var));
-#endif
+ KCSAN_CHECK_RW_BARRIER(xor_unlock_is_negative_byte(1, &test_var));
+ KCSAN_CHECK_READ_BARRIER(xor_unlock_is_negative_byte(1, &test_var));
+ KCSAN_CHECK_WRITE_BARRIER(xor_unlock_is_negative_byte(1, &test_var));
kcsan_nestable_atomic_end();
return ret;
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 107f355eac10..8f35a5a42af8 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -247,7 +247,7 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT))
return -EINVAL;
- ksegments = memdup_user(segments, nr_segments * sizeof(ksegments[0]));
+ ksegments = memdup_array_user(segments, nr_segments, sizeof(ksegments[0]));
if (IS_ERR(ksegments))
return PTR_ERR(ksegments);
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 9dc728982d79..d08fc7b5db97 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -52,22 +52,7 @@ atomic_t __kexec_lock = ATOMIC_INIT(0);
/* Flag to indicate we are going to kexec a new kernel */
bool kexec_in_progress = false;
-
-/* Location of the reserved area for the crash kernel */
-struct resource crashk_res = {
- .name = "Crash kernel",
- .start = 0,
- .end = 0,
- .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
- .desc = IORES_DESC_CRASH_KERNEL
-};
-struct resource crashk_low_res = {
- .name = "Crash kernel",
- .start = 0,
- .end = 0,
- .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
- .desc = IORES_DESC_CRASH_KERNEL
-};
+bool kexec_file_dbg_print;
int kexec_should_crash(struct task_struct *p)
{
@@ -293,8 +278,8 @@ int kimage_is_destination_range(struct kimage *image,
unsigned long mstart, mend;
mstart = image->segment[i].mem;
- mend = mstart + image->segment[i].memsz;
- if ((end > mstart) && (start < mend))
+ mend = mstart + image->segment[i].memsz - 1;
+ if ((end >= mstart) && (start <= mend))
return 1;
}
@@ -387,7 +372,7 @@ static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
pfn = page_to_boot_pfn(pages);
epfn = pfn + count;
addr = pfn << PAGE_SHIFT;
- eaddr = epfn << PAGE_SHIFT;
+ eaddr = (epfn << PAGE_SHIFT) - 1;
if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
kimage_is_destination_range(image, addr, eaddr)) {
list_add(&pages->lru, &extra_pages);
@@ -447,7 +432,7 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
pages = NULL;
size = (1 << order) << PAGE_SHIFT;
- hole_start = (image->control_page + (size - 1)) & ~(size - 1);
+ hole_start = ALIGN(image->control_page, size);
hole_end = hole_start + size - 1;
while (hole_end <= crashk_res.end) {
unsigned long i;
@@ -464,7 +449,7 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
mend = mstart + image->segment[i].memsz - 1;
if ((hole_end >= mstart) && (hole_start <= mend)) {
/* Advance the hole to the end of the segment */
- hole_start = (mend + (size - 1)) & ~(size - 1);
+ hole_start = ALIGN(mend, size);
hole_end = hole_start + size - 1;
break;
}
@@ -472,7 +457,7 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
/* If I don't overlap any segments I have found my hole! */
if (i == image->nr_segments) {
pages = pfn_to_page(hole_start >> PAGE_SHIFT);
- image->control_page = hole_end;
+ image->control_page = hole_end + 1;
break;
}
}
@@ -733,7 +718,7 @@ static struct page *kimage_alloc_page(struct kimage *image,
/* If the page is not a destination page use it */
if (!kimage_is_destination_range(image, addr,
- addr + PAGE_SIZE))
+ addr + PAGE_SIZE - 1))
break;
/*
@@ -1080,9 +1065,10 @@ __bpf_kfunc void crash_kexec(struct pt_regs *regs)
* panic(). Otherwise parallel calls of panic() and crash_kexec()
* may stop each other. To exclude them, we use panic_cpu here too.
*/
+ old_cpu = PANIC_CPU_INVALID;
this_cpu = raw_smp_processor_id();
- old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu);
- if (old_cpu == PANIC_CPU_INVALID) {
+
+ if (atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu)) {
/* This is the 1st CPU which comes here, so go ahead. */
__crash_kexec(regs);
@@ -1271,6 +1257,7 @@ int kernel_kexec(void)
kexec_in_progress = true;
kernel_restart_prepare("kexec reboot");
migrate_to_reboot_cpu();
+ syscore_shutdown();
/*
* migrate_to_reboot_cpu() disables CPU hotplug assuming that
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index f9a419cd22d4..bef2f6f2571b 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -123,6 +123,8 @@ void kimage_file_post_load_cleanup(struct kimage *image)
*/
kfree(image->image_loader_data);
image->image_loader_data = NULL;
+
+ kexec_file_dbg_print = false;
}
#ifdef CONFIG_KEXEC_SIG
@@ -202,6 +204,8 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
if (ret < 0)
return ret;
image->kernel_buf_len = ret;
+ kexec_dprintk("kernel: %p kernel_size: %#lx\n",
+ image->kernel_buf, image->kernel_buf_len);
/* Call arch image probe handlers */
ret = arch_kexec_kernel_image_probe(image, image->kernel_buf,
@@ -278,6 +282,7 @@ kimage_file_alloc_init(struct kimage **rimage, int kernel_fd,
if (!image)
return -ENOMEM;
+ kexec_file_dbg_print = !!(flags & KEXEC_FILE_DEBUG);
image->file_mode = 1;
if (kexec_on_panic) {
@@ -384,13 +389,14 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
if (ret)
goto out;
+ kexec_dprintk("nr_segments = %lu\n", image->nr_segments);
for (i = 0; i < image->nr_segments; i++) {
struct kexec_segment *ksegment;
ksegment = &image->segment[i];
- pr_debug("Loading segment %d: buf=0x%p bufsz=0x%zx mem=0x%lx memsz=0x%zx\n",
- i, ksegment->buf, ksegment->bufsz, ksegment->mem,
- ksegment->memsz);
+ kexec_dprintk("segment[%d]: buf=0x%p bufsz=0x%zx mem=0x%lx memsz=0x%zx\n",
+ i, ksegment->buf, ksegment->bufsz, ksegment->mem,
+ ksegment->memsz);
ret = kimage_load_segment(image, &image->segment[i]);
if (ret)
@@ -403,6 +409,8 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
if (ret)
goto out;
+ kexec_dprintk("kexec_file_load: type:%u, start:0x%lx head:0x%lx flags:0x%lx\n",
+ image->type, image->start, image->head, flags);
/*
* Free up any temporary buffers allocated which are not needed
* after image has been loaded
@@ -426,11 +434,11 @@ static int locate_mem_hole_top_down(unsigned long start, unsigned long end,
unsigned long temp_start, temp_end;
temp_end = min(end, kbuf->buf_max);
- temp_start = temp_end - kbuf->memsz;
+ temp_start = temp_end - kbuf->memsz + 1;
do {
/* align down start */
- temp_start = temp_start & (~(kbuf->buf_align - 1));
+ temp_start = ALIGN_DOWN(temp_start, kbuf->buf_align);
if (temp_start < start || temp_start < kbuf->buf_min)
return 0;
@@ -592,6 +600,8 @@ static int kexec_walk_resources(struct kexec_buf *kbuf,
IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY,
crashk_res.start, crashk_res.end,
kbuf, func);
+ else if (kbuf->top_down)
+ return walk_system_ram_res_rev(0, ULONG_MAX, kbuf, func);
else
return walk_system_ram_res(0, ULONG_MAX, kbuf, func);
}
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 0c6185aefaef..9d9095e81792 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1877,13 +1877,27 @@ static struct notifier_block kprobe_exceptions_nb = {
#ifdef CONFIG_KRETPROBES
#if !defined(CONFIG_KRETPROBE_ON_RETHOOK)
+
+/* callbacks for objpool of kretprobe instances */
+static int kretprobe_init_inst(void *nod, void *context)
+{
+ struct kretprobe_instance *ri = nod;
+
+ ri->rph = context;
+ return 0;
+}
+static int kretprobe_fini_pool(struct objpool_head *head, void *context)
+{
+ kfree(context);
+ return 0;
+}
+
static void free_rp_inst_rcu(struct rcu_head *head)
{
struct kretprobe_instance *ri = container_of(head, struct kretprobe_instance, rcu);
+ struct kretprobe_holder *rph = ri->rph;
- if (refcount_dec_and_test(&ri->rph->ref))
- kfree(ri->rph);
- kfree(ri);
+ objpool_drop(ri, &rph->pool);
}
NOKPROBE_SYMBOL(free_rp_inst_rcu);
@@ -1892,7 +1906,7 @@ static void recycle_rp_inst(struct kretprobe_instance *ri)
struct kretprobe *rp = get_kretprobe(ri);
if (likely(rp))
- freelist_add(&ri->freelist, &rp->freelist);
+ objpool_push(ri, &rp->rph->pool);
else
call_rcu(&ri->rcu, free_rp_inst_rcu);
}
@@ -1929,23 +1943,12 @@ NOKPROBE_SYMBOL(kprobe_flush_task);
static inline void free_rp_inst(struct kretprobe *rp)
{
- struct kretprobe_instance *ri;
- struct freelist_node *node;
- int count = 0;
-
- node = rp->freelist.head;
- while (node) {
- ri = container_of(node, struct kretprobe_instance, freelist);
- node = node->next;
-
- kfree(ri);
- count++;
- }
+ struct kretprobe_holder *rph = rp->rph;
- if (refcount_sub_and_test(count, &rp->rph->ref)) {
- kfree(rp->rph);
- rp->rph = NULL;
- }
+ if (!rph)
+ return;
+ rp->rph = NULL;
+ objpool_fini(&rph->pool);
}
/* This assumes the 'tsk' is the current task or the is not running. */
@@ -1990,7 +1993,7 @@ NOKPROBE_SYMBOL(__kretprobe_find_ret_addr);
unsigned long kretprobe_find_ret_addr(struct task_struct *tsk, void *fp,
struct llist_node **cur)
{
- struct kretprobe_instance *ri = NULL;
+ struct kretprobe_instance *ri;
kprobe_opcode_t *ret;
if (WARN_ON_ONCE(!cur))
@@ -2087,19 +2090,17 @@ NOKPROBE_SYMBOL(__kretprobe_trampoline_handler)
static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
{
struct kretprobe *rp = container_of(p, struct kretprobe, kp);
+ struct kretprobe_holder *rph = rp->rph;
struct kretprobe_instance *ri;
- struct freelist_node *fn;
- fn = freelist_try_get(&rp->freelist);
- if (!fn) {
+ ri = objpool_pop(&rph->pool);
+ if (!ri) {
rp->nmissed++;
return 0;
}
- ri = container_of(fn, struct kretprobe_instance, freelist);
-
if (rp->entry_handler && rp->entry_handler(ri, regs)) {
- freelist_add(&ri->freelist, &rp->freelist);
+ objpool_push(ri, &rph->pool);
return 0;
}
@@ -2193,7 +2194,6 @@ int kprobe_on_func_entry(kprobe_opcode_t *addr, const char *sym, unsigned long o
int register_kretprobe(struct kretprobe *rp)
{
int ret;
- struct kretprobe_instance *inst;
int i;
void *addr;
@@ -2227,19 +2227,12 @@ int register_kretprobe(struct kretprobe *rp)
rp->maxactive = max_t(unsigned int, 10, 2*num_possible_cpus());
#ifdef CONFIG_KRETPROBE_ON_RETHOOK
- rp->rh = rethook_alloc((void *)rp, kretprobe_rethook_handler);
- if (!rp->rh)
- return -ENOMEM;
+ rp->rh = rethook_alloc((void *)rp, kretprobe_rethook_handler,
+ sizeof(struct kretprobe_instance) +
+ rp->data_size, rp->maxactive);
+ if (IS_ERR(rp->rh))
+ return PTR_ERR(rp->rh);
- for (i = 0; i < rp->maxactive; i++) {
- inst = kzalloc(struct_size(inst, data, rp->data_size), GFP_KERNEL);
- if (inst == NULL) {
- rethook_free(rp->rh);
- rp->rh = NULL;
- return -ENOMEM;
- }
- rethook_add_node(rp->rh, &inst->node);
- }
rp->nmissed = 0;
/* Establish function entry probe point */
ret = register_kprobe(&rp->kp);
@@ -2248,24 +2241,18 @@ int register_kretprobe(struct kretprobe *rp)
rp->rh = NULL;
}
#else /* !CONFIG_KRETPROBE_ON_RETHOOK */
- rp->freelist.head = NULL;
rp->rph = kzalloc(sizeof(struct kretprobe_holder), GFP_KERNEL);
if (!rp->rph)
return -ENOMEM;
- rp->rph->rp = rp;
- for (i = 0; i < rp->maxactive; i++) {
- inst = kzalloc(struct_size(inst, data, rp->data_size), GFP_KERNEL);
- if (inst == NULL) {
- refcount_set(&rp->rph->ref, i);
- free_rp_inst(rp);
- return -ENOMEM;
- }
- inst->rph = rp->rph;
- freelist_add(&inst->freelist, &rp->freelist);
+ if (objpool_init(&rp->rph->pool, rp->maxactive, rp->data_size +
+ sizeof(struct kretprobe_instance), GFP_KERNEL,
+ rp->rph, kretprobe_init_inst, kretprobe_fini_pool)) {
+ kfree(rp->rph);
+ rp->rph = NULL;
+ return -ENOMEM;
}
- refcount_set(&rp->rph->ref, i);
-
+ rcu_assign_pointer(rp->rph->rp, rp);
rp->nmissed = 0;
/* Establish function entry probe point */
ret = register_kprobe(&rp->kp);
@@ -2313,7 +2300,7 @@ void unregister_kretprobes(struct kretprobe **rps, int num)
#ifdef CONFIG_KRETPROBE_ON_RETHOOK
rethook_free(rps[i]->rh);
#else
- rps[i]->rph->rp = NULL;
+ rcu_assign_pointer(rps[i]->rph->rp, NULL);
#endif
}
mutex_unlock(&kprobe_mutex);
@@ -2815,7 +2802,7 @@ static int show_kprobe_addr(struct seq_file *pi, void *v)
{
struct hlist_head *head;
struct kprobe *p, *kp;
- const char *sym = NULL;
+ const char *sym;
unsigned int i = *(loff_t *) v;
unsigned long offset = 0;
char *modname, namebuf[KSYM_NAME_LEN];
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 1eea53050bab..c5e40830c1f2 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -715,6 +715,24 @@ int kthread_stop(struct task_struct *k)
}
EXPORT_SYMBOL(kthread_stop);
+/**
+ * kthread_stop_put - stop a thread and put its task struct
+ * @k: thread created by kthread_create().
+ *
+ * Stops a thread created by kthread_create() and puts its task_struct.
+ * Only use when holding an extra task struct reference obtained by
+ * calling get_task_struct().
+ */
+int kthread_stop_put(struct task_struct *k)
+{
+ int ret;
+
+ ret = kthread_stop(k);
+ put_task_struct(k);
+ return ret;
+}
+EXPORT_SYMBOL(kthread_stop_put);
+
int kthreadd(void *unused)
{
struct task_struct *tsk = current;
@@ -1469,7 +1487,6 @@ void kthread_unuse_mm(struct mm_struct *mm)
* clearing tsk->mm.
*/
smp_mb__after_spinlock();
- sync_mm_rss(mm);
local_irq_disable();
tsk->mm = NULL;
membarrier_update_current_mm(NULL);
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index 61328328c474..ecbc9b6aba3a 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -243,7 +243,7 @@ static int klp_resolve_symbols(Elf_Shdr *sechdrs, const char *strtab,
* symbols are exported and normal relas can be used instead.
*/
if (!sec_vmlinux && sym_vmlinux) {
- pr_err("invalid access to vmlinux symbol '%s' from module-specific livepatch relocation section",
+ pr_err("invalid access to vmlinux symbol '%s' from module-specific livepatch relocation section\n",
sym_name);
return -EINVAL;
}
diff --git a/kernel/locking/lock_events.c b/kernel/locking/lock_events.c
index fa2c2f951c6b..e68d82099558 100644
--- a/kernel/locking/lock_events.c
+++ b/kernel/locking/lock_events.c
@@ -146,7 +146,7 @@ static int __init init_lockevent_counts(void)
struct dentry *d_counts = debugfs_create_dir(LOCK_EVENTS_DIR, NULL);
int i;
- if (!d_counts)
+ if (IS_ERR(d_counts))
goto out;
/*
@@ -159,14 +159,14 @@ static int __init init_lockevent_counts(void)
for (i = 0; i < lockevent_num; i++) {
if (skip_lockevent(lockevent_names[i]))
continue;
- if (!debugfs_create_file(lockevent_names[i], 0400, d_counts,
- (void *)(long)i, &fops_lockevent))
+ if (IS_ERR(debugfs_create_file(lockevent_names[i], 0400, d_counts,
+ (void *)(long)i, &fops_lockevent)))
goto fail_undo;
}
- if (!debugfs_create_file(lockevent_names[LOCKEVENT_reset_cnts], 0200,
+ if (IS_ERR(debugfs_create_file(lockevent_names[LOCKEVENT_reset_cnts], 0200,
d_counts, (void *)(long)LOCKEVENT_reset_cnts,
- &fops_lockevent))
+ &fops_lockevent)))
goto fail_undo;
return 0;
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index e85b5ad3e206..151bd3de5936 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -3497,7 +3497,8 @@ static int alloc_chain_hlocks(int req)
size = chain_block_size(curr);
if (likely(size >= req)) {
del_chain_block(0, size, chain_block_next(curr));
- add_chain_block(curr + req, size - req);
+ if (size > req)
+ add_chain_block(curr + req, size - req);
return curr;
}
}
diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c
index 15fdc7fa5c68..e2bfb1db589d 100644
--- a/kernel/locking/lockdep_proc.c
+++ b/kernel/locking/lockdep_proc.c
@@ -440,7 +440,7 @@ static void snprint_time(char *buf, size_t bufsiz, s64 nr)
static void seq_time(struct seq_file *m, s64 time)
{
- char num[15];
+ char num[22];
snprint_time(num, sizeof(num), time);
seq_printf(m, " %14s", num);
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index 270c7f80ce84..415d81e6ce70 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -33,21 +33,23 @@
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>");
-torture_param(int, nwriters_stress, -1, "Number of write-locking stress-test threads");
-torture_param(int, nreaders_stress, -1, "Number of read-locking stress-test threads");
+torture_param(int, acq_writer_lim, 0, "Write_acquisition time limit (jiffies).");
+torture_param(int, call_rcu_chains, 0, "Self-propagate call_rcu() chains during test (0=disable).");
torture_param(int, long_hold, 100, "Do occasional long hold of lock (ms), 0=disable");
+torture_param(int, nested_locks, 0, "Number of nested locks (max = 8)");
+torture_param(int, nreaders_stress, -1, "Number of read-locking stress-test threads");
+torture_param(int, nwriters_stress, -1, "Number of write-locking stress-test threads");
torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)");
torture_param(int, onoff_interval, 0, "Time between CPU hotplugs (s), 0=disable");
+torture_param(int, rt_boost, 2,
+ "Do periodic rt-boost. 0=Disable, 1=Only for rt_mutex, 2=For all lock types.");
+torture_param(int, rt_boost_factor, 50, "A factor determining how often rt-boost happens.");
torture_param(int, shuffle_interval, 3, "Number of jiffies between shuffles, 0=disable");
torture_param(int, shutdown_secs, 0, "Shutdown time (j), <= zero to disable.");
torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s");
torture_param(int, stutter, 5, "Number of jiffies to run/halt test, 0=disable");
-torture_param(int, rt_boost, 2,
- "Do periodic rt-boost. 0=Disable, 1=Only for rt_mutex, 2=For all lock types.");
-torture_param(int, rt_boost_factor, 50, "A factor determining how often rt-boost happens.");
-torture_param(int, writer_fifo, 0, "Run writers at sched_set_fifo() priority");
torture_param(int, verbose, 1, "Enable verbose debugging printk()s");
-torture_param(int, nested_locks, 0, "Number of nested locks (max = 8)");
+torture_param(int, writer_fifo, 0, "Run writers at sched_set_fifo() priority");
/* Going much higher trips "BUG: MAX_LOCKDEP_CHAIN_HLOCKS too low!" errors */
#define MAX_NESTED_LOCKS 8
@@ -56,6 +58,55 @@ module_param(torture_type, charp, 0444);
MODULE_PARM_DESC(torture_type,
"Type of lock to torture (spin_lock, spin_lock_irq, mutex_lock, ...)");
+static cpumask_var_t bind_readers; // Bind the readers to the specified set of CPUs.
+static cpumask_var_t bind_writers; // Bind the writers to the specified set of CPUs.
+
+// Parse a cpumask kernel parameter. If there are more users later on,
+// this might need to go to a more central location.
+static int param_set_cpumask(const char *val, const struct kernel_param *kp)
+{
+ cpumask_var_t *cm_bind = kp->arg;
+ int ret;
+ char *s;
+
+ if (!alloc_cpumask_var(cm_bind, GFP_KERNEL)) {
+ s = "Out of memory";
+ ret = -ENOMEM;
+ goto out_err;
+ }
+ ret = cpulist_parse(val, *cm_bind);
+ if (!ret)
+ return ret;
+ s = "Bad CPU range";
+out_err:
+ pr_warn("%s: %s, all CPUs set\n", kp->name, s);
+ cpumask_setall(*cm_bind);
+ return ret;
+}
+
+// Output a cpumask kernel parameter.
+static int param_get_cpumask(char *buffer, const struct kernel_param *kp)
+{
+ cpumask_var_t *cm_bind = kp->arg;
+
+ return sprintf(buffer, "%*pbl", cpumask_pr_args(*cm_bind));
+}
+
+static bool cpumask_nonempty(cpumask_var_t mask)
+{
+ return cpumask_available(mask) && !cpumask_empty(mask);
+}
+
+static const struct kernel_param_ops lt_bind_ops = {
+ .set = param_set_cpumask,
+ .get = param_get_cpumask,
+};
+
+module_param_cb(bind_readers, &lt_bind_ops, &bind_readers, 0644);
+module_param_cb(bind_writers, &lt_bind_ops, &bind_writers, 0644);
+
+long torture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask);
+
static struct task_struct *stats_task;
static struct task_struct **writer_tasks;
static struct task_struct **reader_tasks;
@@ -69,6 +120,12 @@ struct lock_stress_stats {
long n_lock_acquired;
};
+struct call_rcu_chain {
+ struct rcu_head crc_rh;
+ bool crc_stop;
+};
+struct call_rcu_chain *call_rcu_chain_list;
+
/* Forward reference. */
static void lock_torture_cleanup(void);
@@ -116,12 +173,9 @@ static int torture_lock_busted_write_lock(int tid __maybe_unused)
static void torture_lock_busted_write_delay(struct torture_random_state *trsp)
{
- const unsigned long longdelay_ms = long_hold ? long_hold : ULONG_MAX;
-
/* We want a long delay occasionally to force massive contention. */
- if (!(torture_random(trsp) %
- (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
- mdelay(longdelay_ms);
+ if (long_hold && !(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * long_hold)))
+ mdelay(long_hold);
if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
torture_preempt_schedule(); /* Allow test to be preempted. */
}
@@ -194,15 +248,14 @@ __acquires(torture_spinlock)
static void torture_spin_lock_write_delay(struct torture_random_state *trsp)
{
const unsigned long shortdelay_us = 2;
- const unsigned long longdelay_ms = long_hold ? long_hold : ULONG_MAX;
unsigned long j;
/* We want a short delay mostly to emulate likely code, and
* we want a long delay occasionally to force massive contention.
*/
- if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * longdelay_ms))) {
+ if (long_hold && !(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * long_hold))) {
j = jiffies;
- mdelay(longdelay_ms);
+ mdelay(long_hold);
pr_alert("%s: delay = %lu jiffies.\n", __func__, jiffies - j);
}
if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 200 * shortdelay_us)))
@@ -320,14 +373,12 @@ __acquires(torture_rwlock)
static void torture_rwlock_write_delay(struct torture_random_state *trsp)
{
const unsigned long shortdelay_us = 2;
- const unsigned long longdelay_ms = long_hold ? long_hold : ULONG_MAX;
/* We want a short delay mostly to emulate likely code, and
* we want a long delay occasionally to force massive contention.
*/
- if (!(torture_random(trsp) %
- (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
- mdelay(longdelay_ms);
+ if (long_hold && !(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * long_hold)))
+ mdelay(long_hold);
else
udelay(shortdelay_us);
}
@@ -348,14 +399,12 @@ __acquires(torture_rwlock)
static void torture_rwlock_read_delay(struct torture_random_state *trsp)
{
const unsigned long shortdelay_us = 10;
- const unsigned long longdelay_ms = 100;
/* We want a short delay mostly to emulate likely code, and
* we want a long delay occasionally to force massive contention.
*/
- if (!(torture_random(trsp) %
- (cxt.nrealreaders_stress * 2000 * longdelay_ms)))
- mdelay(longdelay_ms);
+ if (long_hold && !(torture_random(trsp) % (cxt.nrealreaders_stress * 2000 * long_hold)))
+ mdelay(long_hold);
else
udelay(shortdelay_us);
}
@@ -453,12 +502,9 @@ __acquires(torture_mutex)
static void torture_mutex_delay(struct torture_random_state *trsp)
{
- const unsigned long longdelay_ms = long_hold ? long_hold : ULONG_MAX;
-
/* We want a long delay occasionally to force massive contention. */
- if (!(torture_random(trsp) %
- (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
- mdelay(longdelay_ms * 5);
+ if (long_hold && !(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * long_hold)))
+ mdelay(long_hold * 5);
if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
torture_preempt_schedule(); /* Allow test to be preempted. */
}
@@ -626,15 +672,13 @@ __acquires(torture_rtmutex)
static void torture_rtmutex_delay(struct torture_random_state *trsp)
{
const unsigned long shortdelay_us = 2;
- const unsigned long longdelay_ms = long_hold ? long_hold : ULONG_MAX;
/*
* We want a short delay mostly to emulate likely code, and
* we want a long delay occasionally to force massive contention.
*/
- if (!(torture_random(trsp) %
- (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
- mdelay(longdelay_ms);
+ if (long_hold && !(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * long_hold)))
+ mdelay(long_hold);
if (!(torture_random(trsp) %
(cxt.nrealwriters_stress * 200 * shortdelay_us)))
udelay(shortdelay_us);
@@ -691,12 +735,9 @@ __acquires(torture_rwsem)
static void torture_rwsem_write_delay(struct torture_random_state *trsp)
{
- const unsigned long longdelay_ms = long_hold ? long_hold : ULONG_MAX;
-
/* We want a long delay occasionally to force massive contention. */
- if (!(torture_random(trsp) %
- (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
- mdelay(longdelay_ms * 10);
+ if (long_hold && !(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * long_hold)))
+ mdelay(long_hold * 10);
if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
torture_preempt_schedule(); /* Allow test to be preempted. */
}
@@ -716,14 +757,11 @@ __acquires(torture_rwsem)
static void torture_rwsem_read_delay(struct torture_random_state *trsp)
{
- const unsigned long longdelay_ms = 100;
-
/* We want a long delay occasionally to force massive contention. */
- if (!(torture_random(trsp) %
- (cxt.nrealreaders_stress * 2000 * longdelay_ms)))
- mdelay(longdelay_ms * 2);
+ if (long_hold && !(torture_random(trsp) % (cxt.nrealreaders_stress * 2000 * long_hold)))
+ mdelay(long_hold * 2);
else
- mdelay(longdelay_ms / 2);
+ mdelay(long_hold / 2);
if (!(torture_random(trsp) % (cxt.nrealreaders_stress * 20000)))
torture_preempt_schedule(); /* Allow test to be preempted. */
}
@@ -803,11 +841,13 @@ static struct lock_torture_ops percpu_rwsem_lock_ops = {
*/
static int lock_torture_writer(void *arg)
{
+ unsigned long j;
+ unsigned long j1;
+ u32 lockset_mask;
struct lock_stress_stats *lwsp = arg;
- int tid = lwsp - cxt.lwsa;
DEFINE_TORTURE_RANDOM(rand);
- u32 lockset_mask;
bool skip_main_lock;
+ int tid = lwsp - cxt.lwsa;
VERBOSE_TOROUT_STRING("lock_torture_writer task started");
if (!rt_task(current))
@@ -834,17 +874,24 @@ static int lock_torture_writer(void *arg)
cxt.cur_ops->nested_lock(tid, lockset_mask);
if (!skip_main_lock) {
+ if (acq_writer_lim > 0)
+ j = jiffies;
cxt.cur_ops->writelock(tid);
if (WARN_ON_ONCE(lock_is_write_held))
lwsp->n_lock_fail++;
lock_is_write_held = true;
if (WARN_ON_ONCE(atomic_read(&lock_is_read_held)))
lwsp->n_lock_fail++; /* rare, but... */
-
+ if (acq_writer_lim > 0) {
+ j1 = jiffies;
+ WARN_ONCE(time_after(j1, j + acq_writer_lim),
+ "%s: Lock acquisition took %lu jiffies.\n",
+ __func__, j1 - j);
+ }
lwsp->n_lock_acquired++;
- }
- if (!skip_main_lock) {
+
cxt.cur_ops->write_delay(&rand);
+
lock_is_write_held = false;
WRITE_ONCE(last_lock_release, jiffies);
cxt.cur_ops->writeunlock(tid);
@@ -986,16 +1033,69 @@ static int lock_torture_stats(void *arg)
return 0;
}
+
static inline void
lock_torture_print_module_parms(struct lock_torture_ops *cur_ops,
const char *tag)
{
+ static cpumask_t cpumask_all;
+ cpumask_t *rcmp = cpumask_nonempty(bind_readers) ? bind_readers : &cpumask_all;
+ cpumask_t *wcmp = cpumask_nonempty(bind_writers) ? bind_writers : &cpumask_all;
+
+ cpumask_setall(&cpumask_all);
pr_alert("%s" TORTURE_FLAG
- "--- %s%s: nwriters_stress=%d nreaders_stress=%d nested_locks=%d stat_interval=%d verbose=%d shuffle_interval=%d stutter=%d shutdown_secs=%d onoff_interval=%d onoff_holdoff=%d\n",
+ "--- %s%s: acq_writer_lim=%d bind_readers=%*pbl bind_writers=%*pbl call_rcu_chains=%d long_hold=%d nested_locks=%d nreaders_stress=%d nwriters_stress=%d onoff_holdoff=%d onoff_interval=%d rt_boost=%d rt_boost_factor=%d shuffle_interval=%d shutdown_secs=%d stat_interval=%d stutter=%d verbose=%d writer_fifo=%d\n",
torture_type, tag, cxt.debug_lock ? " [debug]": "",
- cxt.nrealwriters_stress, cxt.nrealreaders_stress,
- nested_locks, stat_interval, verbose, shuffle_interval,
- stutter, shutdown_secs, onoff_interval, onoff_holdoff);
+ acq_writer_lim, cpumask_pr_args(rcmp), cpumask_pr_args(wcmp),
+ call_rcu_chains, long_hold, nested_locks, cxt.nrealreaders_stress,
+ cxt.nrealwriters_stress, onoff_holdoff, onoff_interval, rt_boost,
+ rt_boost_factor, shuffle_interval, shutdown_secs, stat_interval, stutter,
+ verbose, writer_fifo);
+}
+
+// If requested, maintain call_rcu() chains to keep a grace period always
+// in flight. These increase the probability of getting an RCU CPU stall
+// warning and associated diagnostics when a locking primitive stalls.
+
+static void call_rcu_chain_cb(struct rcu_head *rhp)
+{
+ struct call_rcu_chain *crcp = container_of(rhp, struct call_rcu_chain, crc_rh);
+
+ if (!smp_load_acquire(&crcp->crc_stop)) {
+ (void)start_poll_synchronize_rcu(); // Start one grace period...
+ call_rcu(&crcp->crc_rh, call_rcu_chain_cb); // ... and later start another.
+ }
+}
+
+// Start the requested number of call_rcu() chains.
+static int call_rcu_chain_init(void)
+{
+ int i;
+
+ if (call_rcu_chains <= 0)
+ return 0;
+ call_rcu_chain_list = kcalloc(call_rcu_chains, sizeof(*call_rcu_chain_list), GFP_KERNEL);
+ if (!call_rcu_chain_list)
+ return -ENOMEM;
+ for (i = 0; i < call_rcu_chains; i++) {
+ call_rcu_chain_list[i].crc_stop = false;
+ call_rcu(&call_rcu_chain_list[i].crc_rh, call_rcu_chain_cb);
+ }
+ return 0;
+}
+
+// Stop all of the call_rcu() chains.
+static void call_rcu_chain_cleanup(void)
+{
+ int i;
+
+ if (!call_rcu_chain_list)
+ return;
+ for (i = 0; i < call_rcu_chains; i++)
+ smp_store_release(&call_rcu_chain_list[i].crc_stop, true);
+ rcu_barrier();
+ kfree(call_rcu_chain_list);
+ call_rcu_chain_list = NULL;
}
static void lock_torture_cleanup(void)
@@ -1048,6 +1148,8 @@ static void lock_torture_cleanup(void)
kfree(cxt.lrsa);
cxt.lrsa = NULL;
+ call_rcu_chain_cleanup();
+
end:
if (cxt.init_called) {
if (cxt.cur_ops->exit)
@@ -1177,6 +1279,10 @@ static int __init lock_torture_init(void)
}
}
+ firsterr = call_rcu_chain_init();
+ if (torture_init_error(firsterr))
+ goto unwind;
+
lock_torture_print_module_parms(cxt.cur_ops, "Start of test");
/* Prepare torture context. */
@@ -1250,6 +1356,8 @@ static int __init lock_torture_init(void)
writer_fifo ? sched_set_fifo : NULL);
if (torture_init_error(firsterr))
goto unwind;
+ if (cpumask_nonempty(bind_writers))
+ torture_sched_setaffinity(writer_tasks[i]->pid, bind_writers);
create_reader:
if (cxt.cur_ops->readlock == NULL || (j >= cxt.nrealreaders_stress))
@@ -1259,6 +1367,8 @@ static int __init lock_torture_init(void)
reader_tasks[j]);
if (torture_init_error(firsterr))
goto unwind;
+ if (cpumask_nonempty(bind_readers))
+ torture_sched_setaffinity(reader_tasks[j]->pid, bind_readers);
}
if (stat_interval > 0) {
firsterr = torture_create_kthread(lock_torture_stats, NULL,
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index d973fe6041bf..cbae8c0b89ab 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -532,6 +532,11 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne
* This function must not be used in interrupt context. Unlocking
* of a not locked mutex is not allowed.
*
+ * The caller must ensure that the mutex stays alive until this function has
+ * returned - mutex_unlock() can NOT directly be used to release an object such
+ * that another concurrent task can free it.
+ * Mutexes are different from spinlocks & refcounts in this aspect.
+ *
* This function is similar to (but not equivalent to) up().
*/
void __sched mutex_unlock(struct mutex *lock)
@@ -1126,6 +1131,9 @@ EXPORT_SYMBOL(ww_mutex_lock_interruptible);
#endif /* !CONFIG_DEBUG_LOCK_ALLOC */
#endif /* !CONFIG_PREEMPT_RT */
+EXPORT_TRACEPOINT_SYMBOL_GPL(contention_begin);
+EXPORT_TRACEPOINT_SYMBOL_GPL(contention_end);
+
/**
* atomic_dec_and_mutex_lock - return holding mutex if we dec to 0
* @cnt: the atomic which we are to dec
diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c
index d5610ad52b92..75a6f6133866 100644
--- a/kernel/locking/osq_lock.c
+++ b/kernel/locking/osq_lock.c
@@ -11,6 +11,13 @@
* called from interrupt context and we have preemption disabled while
* spinning.
*/
+
+struct optimistic_spin_node {
+ struct optimistic_spin_node *next, *prev;
+ int locked; /* 1 if lock acquired */
+ int cpu; /* encoded CPU # + 1 value */
+};
+
static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_node, osq_node);
/*
@@ -37,32 +44,28 @@ static inline struct optimistic_spin_node *decode_cpu(int encoded_cpu_val)
/*
* Get a stable @node->next pointer, either for unlock() or unqueue() purposes.
* Can return NULL in case we were the last queued and we updated @lock instead.
+ *
+ * If osq_lock() is being cancelled there must be a previous node
+ * and 'old_cpu' is its CPU #.
+ * For osq_unlock() there is never a previous node and old_cpu is
+ * set to OSQ_UNLOCKED_VAL.
*/
static inline struct optimistic_spin_node *
osq_wait_next(struct optimistic_spin_queue *lock,
struct optimistic_spin_node *node,
- struct optimistic_spin_node *prev)
+ int old_cpu)
{
- struct optimistic_spin_node *next = NULL;
int curr = encode_cpu(smp_processor_id());
- int old;
-
- /*
- * If there is a prev node in queue, then the 'old' value will be
- * the prev node's CPU #, else it's set to OSQ_UNLOCKED_VAL since if
- * we're currently last in queue, then the queue will then become empty.
- */
- old = prev ? prev->cpu : OSQ_UNLOCKED_VAL;
for (;;) {
if (atomic_read(&lock->tail) == curr &&
- atomic_cmpxchg_acquire(&lock->tail, curr, old) == curr) {
+ atomic_cmpxchg_acquire(&lock->tail, curr, old_cpu) == curr) {
/*
* We were the last queued, we moved @lock back. @prev
* will now observe @lock and will complete its
* unlock()/unqueue().
*/
- break;
+ return NULL;
}
/*
@@ -76,15 +79,15 @@ osq_wait_next(struct optimistic_spin_queue *lock,
* wait for a new @node->next from its Step-C.
*/
if (node->next) {
+ struct optimistic_spin_node *next;
+
next = xchg(&node->next, NULL);
if (next)
- break;
+ return next;
}
cpu_relax();
}
-
- return next;
}
bool osq_lock(struct optimistic_spin_queue *lock)
@@ -186,7 +189,7 @@ bool osq_lock(struct optimistic_spin_queue *lock)
* back to @prev.
*/
- next = osq_wait_next(lock, node, prev);
+ next = osq_wait_next(lock, node, prev->cpu);
if (!next)
return false;
@@ -226,7 +229,7 @@ void osq_unlock(struct optimistic_spin_queue *lock)
return;
}
- next = osq_wait_next(lock, node, NULL);
+ next = osq_wait_next(lock, node, OSQ_UNLOCKED_VAL);
if (next)
WRITE_ONCE(next->locked, 1);
}
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 21db0df0eb00..4a10e8c16fd2 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -218,6 +218,11 @@ static __always_inline bool rt_mutex_cmpxchg_acquire(struct rt_mutex_base *lock,
return try_cmpxchg_acquire(&lock->owner, &old, new);
}
+static __always_inline bool rt_mutex_try_acquire(struct rt_mutex_base *lock)
+{
+ return rt_mutex_cmpxchg_acquire(lock, NULL, current);
+}
+
static __always_inline bool rt_mutex_cmpxchg_release(struct rt_mutex_base *lock,
struct task_struct *old,
struct task_struct *new)
@@ -297,6 +302,20 @@ static __always_inline bool rt_mutex_cmpxchg_acquire(struct rt_mutex_base *lock,
}
+static int __sched rt_mutex_slowtrylock(struct rt_mutex_base *lock);
+
+static __always_inline bool rt_mutex_try_acquire(struct rt_mutex_base *lock)
+{
+ /*
+ * With debug enabled rt_mutex_cmpxchg trylock() will always fail.
+ *
+ * Avoid unconditionally taking the slow path by using
+ * rt_mutex_slow_trylock() which is covered by the debug code and can
+ * acquire a non-contended rtmutex.
+ */
+ return rt_mutex_slowtrylock(lock);
+}
+
static __always_inline bool rt_mutex_cmpxchg_release(struct rt_mutex_base *lock,
struct task_struct *old,
struct task_struct *new)
@@ -1613,7 +1632,7 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock,
raw_spin_unlock_irq(&lock->wait_lock);
if (!owner || !rtmutex_spin_on_owner(lock, waiter, owner))
- schedule();
+ rt_mutex_schedule();
raw_spin_lock_irq(&lock->wait_lock);
set_current_state(state);
@@ -1642,7 +1661,7 @@ static void __sched rt_mutex_handle_deadlock(int res, int detect_deadlock,
WARN(1, "rtmutex deadlock detected\n");
while (1) {
set_current_state(TASK_INTERRUPTIBLE);
- schedule();
+ rt_mutex_schedule();
}
}
@@ -1738,6 +1757,15 @@ static int __sched rt_mutex_slowlock(struct rt_mutex_base *lock,
int ret;
/*
+ * Do all pre-schedule work here, before we queue a waiter and invoke
+ * PI -- any such work that trips on rtlock (PREEMPT_RT spinlock) would
+ * otherwise recurse back into task_blocks_on_rt_mutex() through
+ * rtlock_slowlock() and will then enqueue a second waiter for this
+ * same task and things get really confusing real fast.
+ */
+ rt_mutex_pre_schedule();
+
+ /*
* Technically we could use raw_spin_[un]lock_irq() here, but this can
* be called in early boot if the cmpxchg() fast path is disabled
* (debug, no architecture support). In this case we will acquire the
@@ -1748,6 +1776,7 @@ static int __sched rt_mutex_slowlock(struct rt_mutex_base *lock,
raw_spin_lock_irqsave(&lock->wait_lock, flags);
ret = __rt_mutex_slowlock_locked(lock, ww_ctx, state);
raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
+ rt_mutex_post_schedule();
return ret;
}
@@ -1755,7 +1784,9 @@ static int __sched rt_mutex_slowlock(struct rt_mutex_base *lock,
static __always_inline int __rt_mutex_lock(struct rt_mutex_base *lock,
unsigned int state)
{
- if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
+ lockdep_assert(!current->pi_blocked_on);
+
+ if (likely(rt_mutex_try_acquire(lock)))
return 0;
return rt_mutex_slowlock(lock, NULL, state);
diff --git a/kernel/locking/rwbase_rt.c b/kernel/locking/rwbase_rt.c
index 25ec0239477c..34a59569db6b 100644
--- a/kernel/locking/rwbase_rt.c
+++ b/kernel/locking/rwbase_rt.c
@@ -71,6 +71,7 @@ static int __sched __rwbase_read_lock(struct rwbase_rt *rwb,
struct rt_mutex_base *rtm = &rwb->rtmutex;
int ret;
+ rwbase_pre_schedule();
raw_spin_lock_irq(&rtm->wait_lock);
/*
@@ -125,12 +126,15 @@ static int __sched __rwbase_read_lock(struct rwbase_rt *rwb,
rwbase_rtmutex_unlock(rtm);
trace_contention_end(rwb, ret);
+ rwbase_post_schedule();
return ret;
}
static __always_inline int rwbase_read_lock(struct rwbase_rt *rwb,
unsigned int state)
{
+ lockdep_assert(!current->pi_blocked_on);
+
if (rwbase_read_trylock(rwb))
return 0;
@@ -237,6 +241,8 @@ static int __sched rwbase_write_lock(struct rwbase_rt *rwb,
/* Force readers into slow path */
atomic_sub(READER_BIAS, &rwb->readers);
+ rwbase_pre_schedule();
+
raw_spin_lock_irqsave(&rtm->wait_lock, flags);
if (__rwbase_write_trylock(rwb))
goto out_unlock;
@@ -248,6 +254,7 @@ static int __sched rwbase_write_lock(struct rwbase_rt *rwb,
if (rwbase_signal_pending_state(state, current)) {
rwbase_restore_current_state();
__rwbase_write_unlock(rwb, 0, flags);
+ rwbase_post_schedule();
trace_contention_end(rwb, -EINTR);
return -EINTR;
}
@@ -266,6 +273,7 @@ static int __sched rwbase_write_lock(struct rwbase_rt *rwb,
out_unlock:
raw_spin_unlock_irqrestore(&rtm->wait_lock, flags);
+ rwbase_post_schedule();
return 0;
}
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 9eabd585ce7a..2340b6d90ec6 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -1427,8 +1427,14 @@ static inline void __downgrade_write(struct rw_semaphore *sem)
#define rwbase_signal_pending_state(state, current) \
signal_pending_state(state, current)
+#define rwbase_pre_schedule() \
+ rt_mutex_pre_schedule()
+
#define rwbase_schedule() \
- schedule()
+ rt_mutex_schedule()
+
+#define rwbase_post_schedule() \
+ rt_mutex_post_schedule()
#include "rwbase_rt.c"
diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c
index 14235671a1a7..87b03d2e41db 100644
--- a/kernel/locking/spinlock_debug.c
+++ b/kernel/locking/spinlock_debug.c
@@ -12,6 +12,7 @@
#include <linux/debug_locks.h>
#include <linux/delay.h>
#include <linux/export.h>
+#include <linux/pid.h>
void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name,
struct lock_class_key *key, short inner)
diff --git a/kernel/locking/spinlock_rt.c b/kernel/locking/spinlock_rt.c
index 48a19ed8486d..38e292454fcc 100644
--- a/kernel/locking/spinlock_rt.c
+++ b/kernel/locking/spinlock_rt.c
@@ -37,6 +37,8 @@
static __always_inline void rtlock_lock(struct rt_mutex_base *rtm)
{
+ lockdep_assert(!current->pi_blocked_on);
+
if (unlikely(!rt_mutex_cmpxchg_acquire(rtm, NULL, current)))
rtlock_slowlock(rtm);
}
@@ -184,9 +186,13 @@ static __always_inline int rwbase_rtmutex_trylock(struct rt_mutex_base *rtm)
#define rwbase_signal_pending_state(state, current) (0)
+#define rwbase_pre_schedule()
+
#define rwbase_schedule() \
schedule_rtlock()
+#define rwbase_post_schedule()
+
#include "rwbase_rt.c"
/*
* The common functions which get wrapped into the rwlock API.
diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c
index 93cca6e69860..78719e1ef1b1 100644
--- a/kernel/locking/test-ww_mutex.c
+++ b/kernel/locking/test-ww_mutex.c
@@ -9,7 +9,7 @@
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/module.h>
-#include <linux/random.h>
+#include <linux/prandom.h>
#include <linux/slab.h>
#include <linux/ww_mutex.h>
@@ -386,6 +386,19 @@ struct stress {
int nlocks;
};
+struct rnd_state rng;
+DEFINE_SPINLOCK(rng_lock);
+
+static inline u32 prandom_u32_below(u32 ceil)
+{
+ u32 ret;
+
+ spin_lock(&rng_lock);
+ ret = prandom_u32_state(&rng) % ceil;
+ spin_unlock(&rng_lock);
+ return ret;
+}
+
static int *get_random_order(int count)
{
int *order;
@@ -399,7 +412,7 @@ static int *get_random_order(int count)
order[n] = n;
for (n = count - 1; n > 1; n--) {
- r = get_random_u32_below(n + 1);
+ r = prandom_u32_below(n + 1);
if (r != n) {
tmp = order[n];
order[n] = order[r];
@@ -452,21 +465,21 @@ retry:
ww_mutex_unlock(&locks[order[n]]);
if (err == -EDEADLK) {
- ww_mutex_lock_slow(&locks[order[contended]], &ctx);
- goto retry;
+ if (!time_after(jiffies, stress->timeout)) {
+ ww_mutex_lock_slow(&locks[order[contended]], &ctx);
+ goto retry;
+ }
}
+ ww_acquire_fini(&ctx);
if (err) {
pr_err_once("stress (%s) failed with %d\n",
__func__, err);
break;
}
-
- ww_acquire_fini(&ctx);
} while (!time_after(jiffies, stress->timeout));
kfree(order);
- kfree(stress);
}
struct reorder_lock {
@@ -531,7 +544,6 @@ out:
list_for_each_entry_safe(ll, ln, &locks, link)
kfree(ll);
kfree(order);
- kfree(stress);
}
static void stress_one_work(struct work_struct *work)
@@ -552,8 +564,6 @@ static void stress_one_work(struct work_struct *work)
break;
}
} while (!time_after(jiffies, stress->timeout));
-
- kfree(stress);
}
#define STRESS_INORDER BIT(0)
@@ -564,15 +574,24 @@ static void stress_one_work(struct work_struct *work)
static int stress(int nlocks, int nthreads, unsigned int flags)
{
struct ww_mutex *locks;
- int n;
+ struct stress *stress_array;
+ int n, count;
locks = kmalloc_array(nlocks, sizeof(*locks), GFP_KERNEL);
if (!locks)
return -ENOMEM;
+ stress_array = kmalloc_array(nthreads, sizeof(*stress_array),
+ GFP_KERNEL);
+ if (!stress_array) {
+ kfree(locks);
+ return -ENOMEM;
+ }
+
for (n = 0; n < nlocks; n++)
ww_mutex_init(&locks[n], &ww_class);
+ count = 0;
for (n = 0; nthreads; n++) {
struct stress *stress;
void (*fn)(struct work_struct *work);
@@ -596,9 +615,7 @@ static int stress(int nlocks, int nthreads, unsigned int flags)
if (!fn)
continue;
- stress = kmalloc(sizeof(*stress), GFP_KERNEL);
- if (!stress)
- break;
+ stress = &stress_array[count++];
INIT_WORK(&stress->work, fn);
stress->locks = locks;
@@ -613,6 +630,7 @@ static int stress(int nlocks, int nthreads, unsigned int flags)
for (n = 0; n < nlocks; n++)
ww_mutex_destroy(&locks[n]);
+ kfree(stress_array);
kfree(locks);
return 0;
@@ -625,6 +643,8 @@ static int __init test_ww_mutex_init(void)
printk(KERN_INFO "Beginning ww mutex selftests\n");
+ prandom_seed_state(&rng, get_random_u64());
+
wq = alloc_workqueue("test-ww_mutex", WQ_UNBOUND, 0);
if (!wq)
return -ENOMEM;
diff --git a/kernel/locking/ww_rt_mutex.c b/kernel/locking/ww_rt_mutex.c
index d1473c624105..c7196de838ed 100644
--- a/kernel/locking/ww_rt_mutex.c
+++ b/kernel/locking/ww_rt_mutex.c
@@ -62,7 +62,7 @@ __ww_rt_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx,
}
mutex_acquire_nest(&rtm->dep_map, 0, 0, nest_lock, ip);
- if (likely(rt_mutex_cmpxchg_acquire(&rtm->rtmutex, NULL, current))) {
+ if (likely(rt_mutex_try_acquire(&rtm->rtmutex))) {
if (ww_ctx)
ww_mutex_set_context_fastpath(lock, ww_ctx);
return 0;
diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig
index 33a2e991f608..0ea1b2970a23 100644
--- a/kernel/module/Kconfig
+++ b/kernel/module/Kconfig
@@ -236,14 +236,6 @@ choice
possible to load a signed module containing the algorithm to check
the signature on that module.
-config MODULE_SIG_SHA1
- bool "Sign modules with SHA-1"
- select CRYPTO_SHA1
-
-config MODULE_SIG_SHA224
- bool "Sign modules with SHA-224"
- select CRYPTO_SHA256
-
config MODULE_SIG_SHA256
bool "Sign modules with SHA-256"
select CRYPTO_SHA256
@@ -256,16 +248,29 @@ config MODULE_SIG_SHA512
bool "Sign modules with SHA-512"
select CRYPTO_SHA512
+config MODULE_SIG_SHA3_256
+ bool "Sign modules with SHA3-256"
+ select CRYPTO_SHA3
+
+config MODULE_SIG_SHA3_384
+ bool "Sign modules with SHA3-384"
+ select CRYPTO_SHA3
+
+config MODULE_SIG_SHA3_512
+ bool "Sign modules with SHA3-512"
+ select CRYPTO_SHA3
+
endchoice
config MODULE_SIG_HASH
string
depends on MODULE_SIG || IMA_APPRAISE_MODSIG
- default "sha1" if MODULE_SIG_SHA1
- default "sha224" if MODULE_SIG_SHA224
default "sha256" if MODULE_SIG_SHA256
default "sha384" if MODULE_SIG_SHA384
default "sha512" if MODULE_SIG_SHA512
+ default "sha3-256" if MODULE_SIG_SHA3_256
+ default "sha3-384" if MODULE_SIG_SHA3_384
+ default "sha3-512" if MODULE_SIG_SHA3_512
choice
prompt "Module compression mode"
diff --git a/kernel/module/decompress.c b/kernel/module/decompress.c
index 87440f714c0c..474e68f0f063 100644
--- a/kernel/module/decompress.c
+++ b/kernel/module/decompress.c
@@ -100,7 +100,7 @@ static ssize_t module_gzip_decompress(struct load_info *info,
s.next_in = buf + gzip_hdr_len;
s.avail_in = size - gzip_hdr_len;
- s.workspace = kmalloc(zlib_inflate_workspacesize(), GFP_KERNEL);
+ s.workspace = kvmalloc(zlib_inflate_workspacesize(), GFP_KERNEL);
if (!s.workspace)
return -ENOMEM;
@@ -138,7 +138,7 @@ static ssize_t module_gzip_decompress(struct load_info *info,
out_inflate_end:
zlib_inflateEnd(&s);
out:
- kfree(s.workspace);
+ kvfree(s.workspace);
return retval;
}
#elif defined(CONFIG_MODULE_COMPRESS_XZ)
@@ -241,7 +241,7 @@ static ssize_t module_zstd_decompress(struct load_info *info,
}
wksp_size = zstd_dstream_workspace_bound(header.windowSize);
- wksp = vmalloc(wksp_size);
+ wksp = kvmalloc(wksp_size, GFP_KERNEL);
if (!wksp) {
retval = -ENOMEM;
goto out;
@@ -284,7 +284,7 @@ static ssize_t module_zstd_decompress(struct load_info *info,
retval = new_size;
out:
- vfree(wksp);
+ kvfree(wksp);
return retval;
}
#else
diff --git a/kernel/module/dups.c b/kernel/module/dups.c
index f3d7ea1e96d8..9a92f2f8c9d3 100644
--- a/kernel/module/dups.c
+++ b/kernel/module/dups.c
@@ -207,7 +207,7 @@ bool kmod_dup_request_exists_wait(char *module_name, bool wait, int *dup_ret)
* optimization enabled ...
*/
ret = wait_for_completion_state(&kmod_req->first_req_done,
- TASK_UNINTERRUPTIBLE | TASK_KILLABLE);
+ TASK_KILLABLE);
if (ret) {
*dup_ret = ret;
return true;
diff --git a/kernel/module/main.c b/kernel/module/main.c
index 98fedfdb8db5..36681911c05a 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -2199,6 +2199,9 @@ static int find_module_sections(struct module *mod, struct load_info *info)
mod->kunit_suites = section_objs(info, ".kunit_test_suites",
sizeof(*mod->kunit_suites),
&mod->num_kunit_suites);
+ mod->kunit_init_suites = section_objs(info, ".kunit_init_test_suites",
+ sizeof(*mod->kunit_init_suites),
+ &mod->num_kunit_init_suites);
#endif
mod->extable = section_objs(info, "__ex_table",
diff --git a/kernel/module/stats.c b/kernel/module/stats.c
index 6ab2c94d6bc3..3ba0e98b3c91 100644
--- a/kernel/module/stats.c
+++ b/kernel/module/stats.c
@@ -126,7 +126,7 @@ static LIST_HEAD(dup_failed_modules);
* These typically should not happen unless your system is under memory
* pressure.
* * invalid_becoming_bytes: total number of bytes allocated and freed used
- * used to read the kernel module userspace wants us to read before we
+ * to read the kernel module userspace wants us to read before we
* promote it to be processed to be added to our @modules linked list. These
* failures can happen if we had a check in between a successful kernel_read_file_from_fd()
* call and right before we allocate the our private memory for the module
diff --git a/kernel/module/sysfs.c b/kernel/module/sysfs.c
index c921bf044050..d964167c6658 100644
--- a/kernel/module/sysfs.c
+++ b/kernel/module/sysfs.c
@@ -143,7 +143,7 @@ static void remove_sect_attrs(struct module *mod)
struct module_notes_attrs {
struct kobject *dir;
unsigned int notes;
- struct bin_attribute attrs[];
+ struct bin_attribute attrs[] __counted_by(notes);
};
static ssize_t module_notes_read(struct file *filp, struct kobject *kobj,
diff --git a/kernel/numa.c b/kernel/numa.c
new file mode 100644
index 000000000000..67ca6b8585c0
--- /dev/null
+++ b/kernel/numa.c
@@ -0,0 +1,26 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <linux/printk.h>
+#include <linux/numa.h>
+
+/* Stub functions: */
+
+#ifndef memory_add_physaddr_to_nid
+int memory_add_physaddr_to_nid(u64 start)
+{
+ pr_info_once("Unknown online node for memory at 0x%llx, assuming node 0\n",
+ start);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
+#endif
+
+#ifndef phys_to_target_node
+int phys_to_target_node(u64 start)
+{
+ pr_info_once("Unknown target node for memory at 0x%llx, assuming node 0\n",
+ start);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(phys_to_target_node);
+#endif
diff --git a/kernel/padata.c b/kernel/padata.c
index 222d60195de6..179fb1518070 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -202,7 +202,7 @@ int padata_do_parallel(struct padata_shell *ps,
*cb_cpu = cpu;
}
- err = -EBUSY;
+ err = -EBUSY;
if ((pinst->flags & PADATA_RESET))
goto out;
@@ -1102,12 +1102,16 @@ EXPORT_SYMBOL(padata_alloc_shell);
*/
void padata_free_shell(struct padata_shell *ps)
{
+ struct parallel_data *pd;
+
if (!ps)
return;
mutex_lock(&ps->pinst->lock);
list_del(&ps->list);
- padata_free_pd(rcu_dereference_protected(ps->pd, 1));
+ pd = rcu_dereference_protected(ps->pd, 1);
+ if (refcount_dec_and_test(&pd->refcnt))
+ padata_free_pd(pd);
mutex_unlock(&ps->pinst->lock);
kfree(ps);
diff --git a/kernel/panic.c b/kernel/panic.c
index 07239d4ad81e..2807639aab51 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -192,14 +192,15 @@ atomic_t panic_cpu = ATOMIC_INIT(PANIC_CPU_INVALID);
*/
void nmi_panic(struct pt_regs *regs, const char *msg)
{
- int old_cpu, cpu;
+ int old_cpu, this_cpu;
- cpu = raw_smp_processor_id();
- old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, cpu);
+ old_cpu = PANIC_CPU_INVALID;
+ this_cpu = raw_smp_processor_id();
- if (old_cpu == PANIC_CPU_INVALID)
+ /* atomic_try_cmpxchg updates old_cpu on failure */
+ if (atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu))
panic("%s", msg);
- else if (old_cpu != cpu)
+ else if (old_cpu != this_cpu)
nmi_panic_self_stop(regs);
}
EXPORT_SYMBOL(nmi_panic);
@@ -311,15 +312,18 @@ void panic(const char *fmt, ...)
* stop themself or will wait until they are stopped by the 1st CPU
* with smp_send_stop().
*
- * `old_cpu == PANIC_CPU_INVALID' means this is the 1st CPU which
- * comes here, so go ahead.
+ * cmpxchg success means this is the 1st CPU which comes here,
+ * so go ahead.
* `old_cpu == this_cpu' means we came from nmi_panic() which sets
* panic_cpu to this CPU. In this case, this is also the 1st CPU.
*/
+ old_cpu = PANIC_CPU_INVALID;
this_cpu = raw_smp_processor_id();
- old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu);
- if (old_cpu != PANIC_CPU_INVALID && old_cpu != this_cpu)
+ /* atomic_try_cmpxchg updates old_cpu on failure */
+ if (atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu)) {
+ /* go ahead */
+ } else if (old_cpu != this_cpu)
panic_smp_self_stop();
console_verbose();
@@ -697,6 +701,7 @@ void warn_slowpath_fmt(const char *file, int line, unsigned taint,
if (!fmt) {
__warn(file, line, __builtin_return_address(0), taint,
NULL, NULL);
+ warn_rcu_exit(rcu);
return;
}
diff --git a/kernel/params.c b/kernel/params.c
index 2d4a0564697e..2e447f8ae183 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -1,19 +1,20 @@
// SPDX-License-Identifier: GPL-2.0-or-later
-/* Helpers for initial module or kernel cmdline parsing
- Copyright (C) 2001 Rusty Russell.
-
-*/
+/*
+ * Helpers for initial module or kernel cmdline parsing
+ * Copyright (C) 2001 Rusty Russell.
+ */
+#include <linux/ctype.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/kstrtox.h>
-#include <linux/string.h>
-#include <linux/errno.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
-#include <linux/device.h>
-#include <linux/err.h>
-#include <linux/slab.h>
-#include <linux/ctype.h>
+#include <linux/overflow.h>
#include <linux/security.h>
+#include <linux/slab.h>
+#include <linux/string.h>
#ifdef CONFIG_SYSFS
/* Protects all built-in parameters, modules use their own param_lock */
@@ -48,7 +49,7 @@ static void *kmalloc_parameter(unsigned int size)
{
struct kmalloced_param *p;
- p = kmalloc(sizeof(*p) + size, GFP_KERNEL);
+ p = kmalloc(size_add(sizeof(*p), size), GFP_KERNEL);
if (!p)
return NULL;
@@ -120,9 +121,7 @@ static int parse_one(char *param,
unsigned num_params,
s16 min_level,
s16 max_level,
- void *arg,
- int (*handle_unknown)(char *param, char *val,
- const char *doing, void *arg))
+ void *arg, parse_unknown_fn handle_unknown)
{
unsigned int i;
int err;
@@ -165,9 +164,7 @@ char *parse_args(const char *doing,
unsigned num,
s16 min_level,
s16 max_level,
- void *arg,
- int (*unknown)(char *param, char *val,
- const char *doing, void *arg))
+ void *arg, parse_unknown_fn unknown)
{
char *param, *val, *err = NULL;
@@ -264,17 +261,22 @@ EXPORT_SYMBOL_GPL(param_set_uint_minmax);
int param_set_charp(const char *val, const struct kernel_param *kp)
{
- if (strlen(val) > 1024) {
+ size_t len, maxlen = 1024;
+
+ len = strnlen(val, maxlen + 1);
+ if (len == maxlen + 1) {
pr_err("%s: string parameter too long\n", kp->name);
return -ENOSPC;
}
maybe_kfree_parameter(*(char **)kp->arg);
- /* This is a hack. We can't kmalloc in early boot, and we
- * don't need to; this mangled commandline is preserved. */
+ /*
+ * This is a hack. We can't kmalloc() in early boot, and we
+ * don't need to; this mangled commandline is preserved.
+ */
if (slab_is_available()) {
- *(char **)kp->arg = kmalloc_parameter(strlen(val)+1);
+ *(char **)kp->arg = kmalloc_parameter(len + 1);
if (!*(char **)kp->arg)
return -ENOMEM;
strcpy(*(char **)kp->arg, val);
@@ -512,7 +514,7 @@ int param_set_copystring(const char *val, const struct kernel_param *kp)
{
const struct kparam_string *kps = kp->str;
- if (strlen(val)+1 > kps->maxlen) {
+ if (strnlen(val, kps->maxlen) == kps->maxlen) {
pr_err("%s: string doesn't fit in %u chars.\n",
kp->name, kps->maxlen-1);
return -ENOSPC;
@@ -743,8 +745,10 @@ void module_param_sysfs_remove(struct module *mod)
{
if (mod->mkobj.mp) {
sysfs_remove_group(&mod->mkobj.kobj, &mod->mkobj.mp->grp);
- /* We are positive that no one is using any param
- * attrs at this point. Deallocate immediately. */
+ /*
+ * We are positive that no one is using any param
+ * attrs at this point. Deallocate immediately.
+ */
free_module_param_attrs(&mod->mkobj);
}
}
diff --git a/kernel/pid.c b/kernel/pid.c
index fee14a4486a3..b52b10865454 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -609,7 +609,7 @@ int pidfd_create(struct pid *pid, unsigned int flags)
}
/**
- * pidfd_open() - Open new pid file descriptor.
+ * sys_pidfd_open() - Open new pid file descriptor.
*
* @pid: pid for which to retrieve a pidfd
* @flags: flags to pass
@@ -700,7 +700,7 @@ static int pidfd_getfd(struct pid *pid, int fd)
if (IS_ERR(file))
return PTR_ERR(file);
- ret = receive_fd(file, O_CLOEXEC);
+ ret = receive_fd(file, NULL, O_CLOEXEC);
fput(file);
return ret;
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 619972c78774..7ade20e95232 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -23,6 +23,7 @@
#include <linux/sched/task.h>
#include <linux/sched/signal.h>
#include <linux/idr.h>
+#include <uapi/linux/wait.h>
#include "pid_sysctl.h"
static DEFINE_MUTEX(pid_caches_mutex);
@@ -286,12 +287,6 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write,
if (write && !checkpoint_restore_ns_capable(pid_ns->user_ns))
return -EPERM;
- /*
- * Writing directly to ns' last_pid field is OK, since this field
- * is volatile in a living namespace anyway and a code writing to
- * it should synchronize its usage with external means.
- */
-
next = idr_get_cursor(&pid_ns->idr) - 1;
tmp.data = &next;
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 2b4a946a6ff5..4b0b7cf2e019 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -642,9 +642,9 @@ int hibernation_platform_enter(void)
*/
static void power_down(void)
{
-#ifdef CONFIG_SUSPEND
int error;
+#ifdef CONFIG_SUSPEND
if (hibernation_mode == HIBERNATION_SUSPEND) {
error = suspend_devices_and_enter(mem_sleep_current);
if (error) {
@@ -667,7 +667,13 @@ static void power_down(void)
kernel_restart(NULL);
break;
case HIBERNATION_PLATFORM:
- hibernation_platform_enter();
+ error = hibernation_platform_enter();
+ if (error == -EAGAIN || error == -EBUSY) {
+ swsusp_unmark();
+ events_check_enabled = false;
+ pr_info("Wakeup event detected during hibernation, rolling back.\n");
+ return;
+ }
fallthrough;
case HIBERNATION_SHUTDOWN:
if (kernel_can_power_off())
@@ -684,7 +690,7 @@ static void power_down(void)
cpu_relax();
}
-static int load_image_and_restore(bool snapshot_test)
+static int load_image_and_restore(void)
{
int error;
unsigned int flags;
@@ -694,12 +700,12 @@ static int load_image_and_restore(bool snapshot_test)
lock_device_hotplug();
error = create_basic_memory_bitmaps();
if (error) {
- swsusp_close(snapshot_test);
+ swsusp_close();
goto Unlock;
}
error = swsusp_read(&flags);
- swsusp_close(snapshot_test);
+ swsusp_close();
if (!error)
error = hibernation_restore(flags & SF_PLATFORM_MODE);
@@ -786,9 +792,9 @@ int hibernate(void)
unlock_device_hotplug();
if (snapshot_test) {
pm_pr_dbg("Checking hibernation image\n");
- error = swsusp_check(snapshot_test);
+ error = swsusp_check(false);
if (!error)
- error = load_image_and_restore(snapshot_test);
+ error = load_image_and_restore();
}
thaw_processes();
@@ -945,14 +951,14 @@ static int software_resume(void)
pm_pr_dbg("Looking for hibernation image.\n");
mutex_lock(&system_transition_mutex);
- error = swsusp_check(false);
+ error = swsusp_check(true);
if (error)
goto Unlock;
/* The snapshot device should not be opened while we're running */
if (!hibernate_acquire()) {
error = -EBUSY;
- swsusp_close(false);
+ swsusp_close();
goto Unlock;
}
@@ -973,7 +979,7 @@ static int software_resume(void)
goto Close_Finish;
}
- error = load_image_and_restore(false);
+ error = load_image_and_restore();
thaw_processes();
Finish:
pm_notifier_call_chain(PM_POST_RESTORE);
@@ -987,7 +993,7 @@ static int software_resume(void)
pm_pr_dbg("Hibernation image not present or could not be loaded.\n");
return error;
Close_Finish:
- swsusp_close(false);
+ swsusp_close();
goto Finish;
}
diff --git a/kernel/power/main.c b/kernel/power/main.c
index f6425ae3e8b0..b1ae9b677d03 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -60,22 +60,6 @@ EXPORT_SYMBOL_GPL(lock_system_sleep);
void unlock_system_sleep(unsigned int flags)
{
- /*
- * Don't use freezer_count() because we don't want the call to
- * try_to_freeze() here.
- *
- * Reason:
- * Fundamentally, we just don't need it, because freezing condition
- * doesn't come into effect until we release the
- * system_transition_mutex lock, since the freezer always works with
- * system_transition_mutex held.
- *
- * More importantly, in the case of hibernation,
- * unlock_system_sleep() gets called in snapshot_read() and
- * snapshot_write() when the freezing condition is still in effect.
- * Which means, if we use try_to_freeze() here, it would make them
- * enter the refrigerator, thus causing hibernation to lockup.
- */
if (!(flags & PF_NOFREEZE))
current->flags &= ~PF_NOFREEZE;
mutex_unlock(&system_transition_mutex);
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 46eb14dc50c3..8499a39c62f4 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -168,13 +168,15 @@ extern int swsusp_swap_in_use(void);
#define SF_HW_SIG 8
/* kernel/power/hibernate.c */
-int swsusp_check(bool snapshot_test);
+int swsusp_check(bool exclusive);
extern void swsusp_free(void);
extern int swsusp_read(unsigned int *flags_p);
extern int swsusp_write(unsigned int flags);
-void swsusp_close(bool snapshot_test);
+void swsusp_close(void);
#ifdef CONFIG_SUSPEND
extern int swsusp_unmark(void);
+#else
+static inline int swsusp_unmark(void) { return 0; }
#endif
struct __kernel_old_timeval;
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 87e9f7e2bdc0..5c96ff067c64 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1119,7 +1119,7 @@ static void mark_nosave_pages(struct memory_bitmap *bm)
int create_basic_memory_bitmaps(void)
{
struct memory_bitmap *bm1, *bm2;
- int error = 0;
+ int error;
if (forbidden_pages_map && free_pages_map)
return 0;
@@ -1487,11 +1487,11 @@ static bool copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
s_page = pfn_to_page(src_pfn);
d_page = pfn_to_page(dst_pfn);
if (PageHighMem(s_page)) {
- src = kmap_atomic(s_page);
- dst = kmap_atomic(d_page);
+ src = kmap_local_page(s_page);
+ dst = kmap_local_page(d_page);
zeros_only = do_copy_page(dst, src);
- kunmap_atomic(dst);
- kunmap_atomic(src);
+ kunmap_local(dst);
+ kunmap_local(src);
} else {
if (PageHighMem(d_page)) {
/*
@@ -1499,9 +1499,9 @@ static bool copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
* data modified by kmap_atomic()
*/
zeros_only = safe_copy_page(buffer, s_page);
- dst = kmap_atomic(d_page);
+ dst = kmap_local_page(d_page);
copy_page(dst, buffer);
- kunmap_atomic(dst);
+ kunmap_local(dst);
} else {
zeros_only = safe_copy_page(page_address(d_page), s_page);
}
@@ -2545,8 +2545,9 @@ static void *get_highmem_page_buffer(struct page *page,
pbe->copy_page = tmp;
} else {
/* Copy of the page will be stored in normal memory */
- kaddr = safe_pages_list;
- safe_pages_list = safe_pages_list->next;
+ kaddr = __get_safe_page(ca->gfp_mask);
+ if (!kaddr)
+ return ERR_PTR(-ENOMEM);
pbe->copy_page = virt_to_page(kaddr);
}
pbe->next = highmem_pblist;
@@ -2647,7 +2648,7 @@ static int prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm,
memory_bm_free(bm, PG_UNSAFE_KEEP);
/* Make a copy of zero_bm so it can be created in safe pages */
- error = memory_bm_create(&tmp, GFP_ATOMIC, PG_ANY);
+ error = memory_bm_create(&tmp, GFP_ATOMIC, PG_SAFE);
if (error)
goto Free;
@@ -2660,7 +2661,7 @@ static int prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm,
goto Free;
duplicate_memory_bitmap(zero_bm, &tmp);
- memory_bm_free(&tmp, PG_UNSAFE_KEEP);
+ memory_bm_free(&tmp, PG_UNSAFE_CLEAR);
/* At this point zero_bm is in safe pages and it can be used for restoring. */
if (nr_highmem > 0) {
@@ -2750,8 +2751,9 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
return ERR_PTR(-ENOMEM);
}
pbe->orig_address = page_address(page);
- pbe->address = safe_pages_list;
- safe_pages_list = safe_pages_list->next;
+ pbe->address = __get_safe_page(ca->gfp_mask);
+ if (!pbe->address)
+ return ERR_PTR(-ENOMEM);
pbe->next = restore_pblist;
restore_pblist = pbe;
return pbe->address;
@@ -2776,15 +2778,13 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
int snapshot_write_next(struct snapshot_handle *handle)
{
static struct chain_allocator ca;
- int error = 0;
+ int error;
next:
/* Check if we have already loaded the entire image */
if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages + nr_zero_pages)
return 0;
- handle->sync_read = 1;
-
if (!handle->cur) {
if (!buffer)
/* This makes the buffer be freed by swsusp_free() */
@@ -2827,7 +2827,6 @@ next:
memory_bm_position_reset(&zero_bm);
restore_pblist = NULL;
handle->buffer = get_buffer(&orig_bm, &ca);
- handle->sync_read = 0;
if (IS_ERR(handle->buffer))
return PTR_ERR(handle->buffer);
}
@@ -2837,9 +2836,8 @@ next:
handle->buffer = get_buffer(&orig_bm, &ca);
if (IS_ERR(handle->buffer))
return PTR_ERR(handle->buffer);
- if (handle->buffer != buffer)
- handle->sync_read = 0;
}
+ handle->sync_read = (handle->buffer == buffer);
handle->cur++;
/* Zero pages were not included in the image, memset it and move on. */
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index f6ebcd00c410..6053ddddaf65 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -222,7 +222,7 @@ int swsusp_swap_in_use(void)
*/
static unsigned short root_swap = 0xffff;
-static struct block_device *hib_resume_bdev;
+static struct bdev_handle *hib_resume_bdev_handle;
struct hib_bio_batch {
atomic_t count;
@@ -276,7 +276,8 @@ static int hib_submit_io(blk_opf_t opf, pgoff_t page_off, void *addr,
struct bio *bio;
int error = 0;
- bio = bio_alloc(hib_resume_bdev, 1, opf, GFP_NOIO | __GFP_HIGH);
+ bio = bio_alloc(hib_resume_bdev_handle->bdev, 1, opf,
+ GFP_NOIO | __GFP_HIGH);
bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9);
if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
@@ -356,14 +357,14 @@ static int swsusp_swap_check(void)
return res;
root_swap = res;
- hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device,
+ hib_resume_bdev_handle = bdev_open_by_dev(swsusp_resume_device,
BLK_OPEN_WRITE, NULL, NULL);
- if (IS_ERR(hib_resume_bdev))
- return PTR_ERR(hib_resume_bdev);
+ if (IS_ERR(hib_resume_bdev_handle))
+ return PTR_ERR(hib_resume_bdev_handle);
- res = set_blocksize(hib_resume_bdev, PAGE_SIZE);
+ res = set_blocksize(hib_resume_bdev_handle->bdev, PAGE_SIZE);
if (res < 0)
- blkdev_put(hib_resume_bdev, NULL);
+ bdev_release(hib_resume_bdev_handle);
return res;
}
@@ -443,14 +444,14 @@ static int get_swap_writer(struct swap_map_handle *handle)
err_rel:
release_swap_writer(handle);
err_close:
- swsusp_close(false);
+ swsusp_close();
return ret;
}
static int swap_write_page(struct swap_map_handle *handle, void *buf,
struct hib_bio_batch *hb)
{
- int error = 0;
+ int error;
sector_t offset;
if (!handle->cur)
@@ -508,7 +509,7 @@ static int swap_writer_finish(struct swap_map_handle *handle,
if (error)
free_all_swap_pages(root_swap);
release_swap_writer(handle);
- swsusp_close(false);
+ swsusp_close();
return error;
}
@@ -605,11 +606,11 @@ static int crc32_threadfn(void *data)
unsigned i;
while (1) {
- wait_event(d->go, atomic_read(&d->ready) ||
+ wait_event(d->go, atomic_read_acquire(&d->ready) ||
kthread_should_stop());
if (kthread_should_stop()) {
d->thr = NULL;
- atomic_set(&d->stop, 1);
+ atomic_set_release(&d->stop, 1);
wake_up(&d->done);
break;
}
@@ -618,7 +619,7 @@ static int crc32_threadfn(void *data)
for (i = 0; i < d->run_threads; i++)
*d->crc32 = crc32_le(*d->crc32,
d->unc[i], *d->unc_len[i]);
- atomic_set(&d->stop, 1);
+ atomic_set_release(&d->stop, 1);
wake_up(&d->done);
}
return 0;
@@ -648,12 +649,12 @@ static int lzo_compress_threadfn(void *data)
struct cmp_data *d = data;
while (1) {
- wait_event(d->go, atomic_read(&d->ready) ||
+ wait_event(d->go, atomic_read_acquire(&d->ready) ||
kthread_should_stop());
if (kthread_should_stop()) {
d->thr = NULL;
d->ret = -1;
- atomic_set(&d->stop, 1);
+ atomic_set_release(&d->stop, 1);
wake_up(&d->done);
break;
}
@@ -662,7 +663,7 @@ static int lzo_compress_threadfn(void *data)
d->ret = lzo1x_1_compress(d->unc, d->unc_len,
d->cmp + LZO_HEADER, &d->cmp_len,
d->wrk);
- atomic_set(&d->stop, 1);
+ atomic_set_release(&d->stop, 1);
wake_up(&d->done);
}
return 0;
@@ -797,7 +798,7 @@ static int save_image_lzo(struct swap_map_handle *handle,
data[thr].unc_len = off;
- atomic_set(&data[thr].ready, 1);
+ atomic_set_release(&data[thr].ready, 1);
wake_up(&data[thr].go);
}
@@ -805,12 +806,12 @@ static int save_image_lzo(struct swap_map_handle *handle,
break;
crc->run_threads = thr;
- atomic_set(&crc->ready, 1);
+ atomic_set_release(&crc->ready, 1);
wake_up(&crc->go);
for (run_threads = thr, thr = 0; thr < run_threads; thr++) {
wait_event(data[thr].done,
- atomic_read(&data[thr].stop));
+ atomic_read_acquire(&data[thr].stop));
atomic_set(&data[thr].stop, 0);
ret = data[thr].ret;
@@ -849,7 +850,7 @@ static int save_image_lzo(struct swap_map_handle *handle,
}
}
- wait_event(crc->done, atomic_read(&crc->stop));
+ wait_event(crc->done, atomic_read_acquire(&crc->stop));
atomic_set(&crc->stop, 0);
}
@@ -1131,12 +1132,12 @@ static int lzo_decompress_threadfn(void *data)
struct dec_data *d = data;
while (1) {
- wait_event(d->go, atomic_read(&d->ready) ||
+ wait_event(d->go, atomic_read_acquire(&d->ready) ||
kthread_should_stop());
if (kthread_should_stop()) {
d->thr = NULL;
d->ret = -1;
- atomic_set(&d->stop, 1);
+ atomic_set_release(&d->stop, 1);
wake_up(&d->done);
break;
}
@@ -1149,7 +1150,7 @@ static int lzo_decompress_threadfn(void *data)
flush_icache_range((unsigned long)d->unc,
(unsigned long)d->unc + d->unc_len);
- atomic_set(&d->stop, 1);
+ atomic_set_release(&d->stop, 1);
wake_up(&d->done);
}
return 0;
@@ -1334,7 +1335,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
}
if (crc->run_threads) {
- wait_event(crc->done, atomic_read(&crc->stop));
+ wait_event(crc->done, atomic_read_acquire(&crc->stop));
atomic_set(&crc->stop, 0);
crc->run_threads = 0;
}
@@ -1370,7 +1371,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
pg = 0;
}
- atomic_set(&data[thr].ready, 1);
+ atomic_set_release(&data[thr].ready, 1);
wake_up(&data[thr].go);
}
@@ -1389,7 +1390,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
for (run_threads = thr, thr = 0; thr < run_threads; thr++) {
wait_event(data[thr].done,
- atomic_read(&data[thr].stop));
+ atomic_read_acquire(&data[thr].stop));
atomic_set(&data[thr].stop, 0);
ret = data[thr].ret;
@@ -1420,7 +1421,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
ret = snapshot_write_next(snapshot);
if (ret <= 0) {
crc->run_threads = thr + 1;
- atomic_set(&crc->ready, 1);
+ atomic_set_release(&crc->ready, 1);
wake_up(&crc->go);
goto out_finish;
}
@@ -1428,13 +1429,13 @@ static int load_image_lzo(struct swap_map_handle *handle,
}
crc->run_threads = thr;
- atomic_set(&crc->ready, 1);
+ atomic_set_release(&crc->ready, 1);
wake_up(&crc->go);
}
out_finish:
if (crc->run_threads) {
- wait_event(crc->done, atomic_read(&crc->stop));
+ wait_event(crc->done, atomic_read_acquire(&crc->stop));
atomic_set(&crc->stop, 0);
}
stop = ktime_get();
@@ -1513,18 +1514,19 @@ end:
static void *swsusp_holder;
/**
- * swsusp_check - Check for swsusp signature in the resume device
+ * swsusp_check - Open the resume device and check for the swsusp signature.
+ * @exclusive: Open the resume device exclusively.
*/
-int swsusp_check(bool snapshot_test)
+int swsusp_check(bool exclusive)
{
- void *holder = snapshot_test ? &swsusp_holder : NULL;
+ void *holder = exclusive ? &swsusp_holder : NULL;
int error;
- hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device, BLK_OPEN_READ,
- holder, NULL);
- if (!IS_ERR(hib_resume_bdev)) {
- set_blocksize(hib_resume_bdev, PAGE_SIZE);
+ hib_resume_bdev_handle = bdev_open_by_dev(swsusp_resume_device,
+ BLK_OPEN_READ, holder, NULL);
+ if (!IS_ERR(hib_resume_bdev_handle)) {
+ set_blocksize(hib_resume_bdev_handle->bdev, PAGE_SIZE);
clear_page(swsusp_header);
error = hib_submit_io(REQ_OP_READ, swsusp_resume_block,
swsusp_header, NULL);
@@ -1549,11 +1551,11 @@ int swsusp_check(bool snapshot_test)
put:
if (error)
- blkdev_put(hib_resume_bdev, holder);
+ bdev_release(hib_resume_bdev_handle);
else
pr_debug("Image signature found, resuming\n");
} else {
- error = PTR_ERR(hib_resume_bdev);
+ error = PTR_ERR(hib_resume_bdev_handle);
}
if (error)
@@ -1563,17 +1565,17 @@ put:
}
/**
- * swsusp_close - close swap device.
+ * swsusp_close - close resume device.
*/
-void swsusp_close(bool snapshot_test)
+void swsusp_close(void)
{
- if (IS_ERR(hib_resume_bdev)) {
+ if (IS_ERR(hib_resume_bdev_handle)) {
pr_debug("Image device not initialised\n");
return;
}
- blkdev_put(hib_resume_bdev, snapshot_test ? &swsusp_holder : NULL);
+ bdev_release(hib_resume_bdev_handle);
}
/**
diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile
index f5b388e810b9..39a2b61c7232 100644
--- a/kernel/printk/Makefile
+++ b/kernel/printk/Makefile
@@ -1,6 +1,6 @@
# SPDX-License-Identifier: GPL-2.0-only
obj-y = printk.o
-obj-$(CONFIG_PRINTK) += printk_safe.o
+obj-$(CONFIG_PRINTK) += printk_safe.o nbcon.o
obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o
obj-$(CONFIG_PRINTK_INDEX) += index.o
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index 2a17704136f1..6c2afee5ef62 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -3,6 +3,8 @@
* internal.h - printk internal definitions
*/
#include <linux/percpu.h>
+#include <linux/console.h>
+#include "printk_ringbuffer.h"
#if defined(CONFIG_PRINTK) && defined(CONFIG_SYSCTL)
void __init printk_sysctl_init(void);
@@ -12,6 +14,12 @@ int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write,
#define printk_sysctl_init() do { } while (0)
#endif
+#define con_printk(lvl, con, fmt, ...) \
+ printk(lvl pr_fmt("%s%sconsole [%s%d] " fmt), \
+ (con->flags & CON_NBCON) ? "" : "legacy ", \
+ (con->flags & CON_BOOT) ? "boot" : "", \
+ con->name, con->index, ##__VA_ARGS__)
+
#ifdef CONFIG_PRINTK
#ifdef CONFIG_PRINTK_CALLER
@@ -35,6 +43,8 @@ enum printk_info_flags {
LOG_CONT = 8, /* text is a fragment of a continuation line */
};
+extern struct printk_ringbuffer *prb;
+
__printf(4, 0)
int vprintk_store(int facility, int level,
const struct dev_printk_info *dev_info,
@@ -61,6 +71,13 @@ void defer_console_output(void);
u16 printk_parse_prefix(const char *text, int *level,
enum printk_info_flags *flags);
+
+u64 nbcon_seq_read(struct console *con);
+void nbcon_seq_force(struct console *con, u64 seq);
+bool nbcon_alloc(struct console *con);
+void nbcon_init(struct console *con);
+void nbcon_free(struct console *con);
+
#else
#define PRINTK_PREFIX_MAX 0
@@ -76,8 +93,16 @@ u16 printk_parse_prefix(const char *text, int *level,
#define printk_safe_exit_irqrestore(flags) local_irq_restore(flags)
static inline bool printk_percpu_data_ready(void) { return false; }
+static inline u64 nbcon_seq_read(struct console *con) { return 0; }
+static inline void nbcon_seq_force(struct console *con, u64 seq) { }
+static inline bool nbcon_alloc(struct console *con) { return false; }
+static inline void nbcon_init(struct console *con) { }
+static inline void nbcon_free(struct console *con) { }
+
#endif /* CONFIG_PRINTK */
+extern struct printk_buffers printk_shared_pbufs;
+
/**
* struct printk_buffers - Buffers to read/format/output printk messages.
* @outbuf: After formatting, contains text to output.
@@ -103,3 +128,11 @@ struct printk_message {
u64 seq;
unsigned long dropped;
};
+
+bool other_cpu_in_panic(void);
+bool printk_get_next_message(struct printk_message *pmsg, u64 seq,
+ bool is_extended, bool may_supress);
+
+#ifdef CONFIG_PRINTK
+void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped);
+#endif
diff --git a/kernel/printk/nbcon.c b/kernel/printk/nbcon.c
new file mode 100644
index 000000000000..b96077152f49
--- /dev/null
+++ b/kernel/printk/nbcon.c
@@ -0,0 +1,1029 @@
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright (C) 2022 Linutronix GmbH, John Ogness
+// Copyright (C) 2022 Intel, Thomas Gleixner
+
+#include <linux/kernel.h>
+#include <linux/console.h>
+#include <linux/delay.h>
+#include <linux/slab.h>
+#include "internal.h"
+/*
+ * Printk console printing implementation for consoles which do not depend
+ * on the legacy style console_lock mechanism.
+ *
+ * The state of the console is maintained in the "nbcon_state" atomic
+ * variable.
+ *
+ * The console is locked when:
+ *
+ * - The 'prio' field contains the priority of the context that owns the
+ * console. Only higher priority contexts are allowed to take over the
+ * lock. A value of 0 (NBCON_PRIO_NONE) means the console is not locked.
+ *
+ * - The 'cpu' field denotes on which CPU the console is locked. It is used
+ * to prevent busy waiting on the same CPU. Also it informs the lock owner
+ * that it has lost the lock in a more complex scenario when the lock was
+ * taken over by a higher priority context, released, and taken on another
+ * CPU with the same priority as the interrupted owner.
+ *
+ * The acquire mechanism uses a few more fields:
+ *
+ * - The 'req_prio' field is used by the handover approach to make the
+ * current owner aware that there is a context with a higher priority
+ * waiting for the friendly handover.
+ *
+ * - The 'unsafe' field allows to take over the console in a safe way in the
+ * middle of emitting a message. The field is set only when accessing some
+ * shared resources or when the console device is manipulated. It can be
+ * cleared, for example, after emitting one character when the console
+ * device is in a consistent state.
+ *
+ * - The 'unsafe_takeover' field is set when a hostile takeover took the
+ * console in an unsafe state. The console will stay in the unsafe state
+ * until re-initialized.
+ *
+ * The acquire mechanism uses three approaches:
+ *
+ * 1) Direct acquire when the console is not owned or is owned by a lower
+ * priority context and is in a safe state.
+ *
+ * 2) Friendly handover mechanism uses a request/grant handshake. It is used
+ * when the current owner has lower priority and the console is in an
+ * unsafe state.
+ *
+ * The requesting context:
+ *
+ * a) Sets its priority into the 'req_prio' field.
+ *
+ * b) Waits (with a timeout) for the owning context to unlock the
+ * console.
+ *
+ * c) Takes the lock and clears the 'req_prio' field.
+ *
+ * The owning context:
+ *
+ * a) Observes the 'req_prio' field set on exit from the unsafe
+ * console state.
+ *
+ * b) Gives up console ownership by clearing the 'prio' field.
+ *
+ * 3) Unsafe hostile takeover allows to take over the lock even when the
+ * console is in an unsafe state. It is used only in panic() by the final
+ * attempt to flush consoles in a try and hope mode.
+ *
+ * Note that separate record buffers are used in panic(). As a result,
+ * the messages can be read and formatted without any risk even after
+ * using the hostile takeover in unsafe state.
+ *
+ * The release function simply clears the 'prio' field.
+ *
+ * All operations on @console::nbcon_state are atomic cmpxchg based to
+ * handle concurrency.
+ *
+ * The acquire/release functions implement only minimal policies:
+ *
+ * - Preference for higher priority contexts.
+ * - Protection of the panic CPU.
+ *
+ * All other policy decisions must be made at the call sites:
+ *
+ * - What is marked as an unsafe section.
+ * - Whether to spin-wait if there is already an owner and the console is
+ * in an unsafe state.
+ * - Whether to attempt an unsafe hostile takeover.
+ *
+ * The design allows to implement the well known:
+ *
+ * acquire()
+ * output_one_printk_record()
+ * release()
+ *
+ * The output of one printk record might be interrupted with a higher priority
+ * context. The new owner is supposed to reprint the entire interrupted record
+ * from scratch.
+ */
+
+/**
+ * nbcon_state_set - Helper function to set the console state
+ * @con: Console to update
+ * @new: The new state to write
+ *
+ * Only to be used when the console is not yet or no longer visible in the
+ * system. Otherwise use nbcon_state_try_cmpxchg().
+ */
+static inline void nbcon_state_set(struct console *con, struct nbcon_state *new)
+{
+ atomic_set(&ACCESS_PRIVATE(con, nbcon_state), new->atom);
+}
+
+/**
+ * nbcon_state_read - Helper function to read the console state
+ * @con: Console to read
+ * @state: The state to store the result
+ */
+static inline void nbcon_state_read(struct console *con, struct nbcon_state *state)
+{
+ state->atom = atomic_read(&ACCESS_PRIVATE(con, nbcon_state));
+}
+
+/**
+ * nbcon_state_try_cmpxchg() - Helper function for atomic_try_cmpxchg() on console state
+ * @con: Console to update
+ * @cur: Old/expected state
+ * @new: New state
+ *
+ * Return: True on success. False on fail and @cur is updated.
+ */
+static inline bool nbcon_state_try_cmpxchg(struct console *con, struct nbcon_state *cur,
+ struct nbcon_state *new)
+{
+ return atomic_try_cmpxchg(&ACCESS_PRIVATE(con, nbcon_state), &cur->atom, new->atom);
+}
+
+#ifdef CONFIG_64BIT
+
+#define __seq_to_nbcon_seq(seq) (seq)
+#define __nbcon_seq_to_seq(seq) (seq)
+
+#else /* CONFIG_64BIT */
+
+#define __seq_to_nbcon_seq(seq) ((u32)seq)
+
+static inline u64 __nbcon_seq_to_seq(u32 nbcon_seq)
+{
+ u64 seq;
+ u64 rb_next_seq;
+
+ /*
+ * The provided sequence is only the lower 32 bits of the ringbuffer
+ * sequence. It needs to be expanded to 64bit. Get the next sequence
+ * number from the ringbuffer and fold it.
+ *
+ * Having a 32bit representation in the console is sufficient.
+ * If a console ever gets more than 2^31 records behind
+ * the ringbuffer then this is the least of the problems.
+ *
+ * Also the access to the ring buffer is always safe.
+ */
+ rb_next_seq = prb_next_seq(prb);
+ seq = rb_next_seq - ((u32)rb_next_seq - nbcon_seq);
+
+ return seq;
+}
+
+#endif /* CONFIG_64BIT */
+
+/**
+ * nbcon_seq_read - Read the current console sequence
+ * @con: Console to read the sequence of
+ *
+ * Return: Sequence number of the next record to print on @con.
+ */
+u64 nbcon_seq_read(struct console *con)
+{
+ unsigned long nbcon_seq = atomic_long_read(&ACCESS_PRIVATE(con, nbcon_seq));
+
+ return __nbcon_seq_to_seq(nbcon_seq);
+}
+
+/**
+ * nbcon_seq_force - Force console sequence to a specific value
+ * @con: Console to work on
+ * @seq: Sequence number value to set
+ *
+ * Only to be used during init (before registration) or in extreme situations
+ * (such as panic with CONSOLE_REPLAY_ALL).
+ */
+void nbcon_seq_force(struct console *con, u64 seq)
+{
+ /*
+ * If the specified record no longer exists, the oldest available record
+ * is chosen. This is especially important on 32bit systems because only
+ * the lower 32 bits of the sequence number are stored. The upper 32 bits
+ * are derived from the sequence numbers available in the ringbuffer.
+ */
+ u64 valid_seq = max_t(u64, seq, prb_first_valid_seq(prb));
+
+ atomic_long_set(&ACCESS_PRIVATE(con, nbcon_seq), __seq_to_nbcon_seq(valid_seq));
+
+ /* Clear con->seq since nbcon consoles use con->nbcon_seq instead. */
+ con->seq = 0;
+}
+
+/**
+ * nbcon_seq_try_update - Try to update the console sequence number
+ * @ctxt: Pointer to an acquire context that contains
+ * all information about the acquire mode
+ * @new_seq: The new sequence number to set
+ *
+ * @ctxt->seq is updated to the new value of @con::nbcon_seq (expanded to
+ * the 64bit value). This could be a different value than @new_seq if
+ * nbcon_seq_force() was used or the current context no longer owns the
+ * console. In the latter case, it will stop printing anyway.
+ */
+static void nbcon_seq_try_update(struct nbcon_context *ctxt, u64 new_seq)
+{
+ unsigned long nbcon_seq = __seq_to_nbcon_seq(ctxt->seq);
+ struct console *con = ctxt->console;
+
+ if (atomic_long_try_cmpxchg(&ACCESS_PRIVATE(con, nbcon_seq), &nbcon_seq,
+ __seq_to_nbcon_seq(new_seq))) {
+ ctxt->seq = new_seq;
+ } else {
+ ctxt->seq = nbcon_seq_read(con);
+ }
+}
+
+/**
+ * nbcon_context_try_acquire_direct - Try to acquire directly
+ * @ctxt: The context of the caller
+ * @cur: The current console state
+ *
+ * Acquire the console when it is released. Also acquire the console when
+ * the current owner has a lower priority and the console is in a safe state.
+ *
+ * Return: 0 on success. Otherwise, an error code on failure. Also @cur
+ * is updated to the latest state when failed to modify it.
+ *
+ * Errors:
+ *
+ * -EPERM: A panic is in progress and this is not the panic CPU.
+ * Or the current owner or waiter has the same or higher
+ * priority. No acquire method can be successful in
+ * this case.
+ *
+ * -EBUSY: The current owner has a lower priority but the console
+ * is in an unsafe state. The caller should try using
+ * the handover acquire method.
+ */
+static int nbcon_context_try_acquire_direct(struct nbcon_context *ctxt,
+ struct nbcon_state *cur)
+{
+ unsigned int cpu = smp_processor_id();
+ struct console *con = ctxt->console;
+ struct nbcon_state new;
+
+ do {
+ if (other_cpu_in_panic())
+ return -EPERM;
+
+ if (ctxt->prio <= cur->prio || ctxt->prio <= cur->req_prio)
+ return -EPERM;
+
+ if (cur->unsafe)
+ return -EBUSY;
+
+ /*
+ * The console should never be safe for a direct acquire
+ * if an unsafe hostile takeover has ever happened.
+ */
+ WARN_ON_ONCE(cur->unsafe_takeover);
+
+ new.atom = cur->atom;
+ new.prio = ctxt->prio;
+ new.req_prio = NBCON_PRIO_NONE;
+ new.unsafe = cur->unsafe_takeover;
+ new.cpu = cpu;
+
+ } while (!nbcon_state_try_cmpxchg(con, cur, &new));
+
+ return 0;
+}
+
+static bool nbcon_waiter_matches(struct nbcon_state *cur, int expected_prio)
+{
+ /*
+ * The request context is well defined by the @req_prio because:
+ *
+ * - Only a context with a higher priority can take over the request.
+ * - There are only three priorities.
+ * - Only one CPU is allowed to request PANIC priority.
+ * - Lower priorities are ignored during panic() until reboot.
+ *
+ * As a result, the following scenario is *not* possible:
+ *
+ * 1. Another context with a higher priority directly takes ownership.
+ * 2. The higher priority context releases the ownership.
+ * 3. A lower priority context takes the ownership.
+ * 4. Another context with the same priority as this context
+ * creates a request and starts waiting.
+ */
+
+ return (cur->req_prio == expected_prio);
+}
+
+/**
+ * nbcon_context_try_acquire_requested - Try to acquire after having
+ * requested a handover
+ * @ctxt: The context of the caller
+ * @cur: The current console state
+ *
+ * This is a helper function for nbcon_context_try_acquire_handover().
+ * It is called when the console is in an unsafe state. The current
+ * owner will release the console on exit from the unsafe region.
+ *
+ * Return: 0 on success and @cur is updated to the new console state.
+ * Otherwise an error code on failure.
+ *
+ * Errors:
+ *
+ * -EPERM: A panic is in progress and this is not the panic CPU
+ * or this context is no longer the waiter.
+ *
+ * -EBUSY: The console is still locked. The caller should
+ * continue waiting.
+ *
+ * Note: The caller must still remove the request when an error has occurred
+ * except when this context is no longer the waiter.
+ */
+static int nbcon_context_try_acquire_requested(struct nbcon_context *ctxt,
+ struct nbcon_state *cur)
+{
+ unsigned int cpu = smp_processor_id();
+ struct console *con = ctxt->console;
+ struct nbcon_state new;
+
+ /* Note that the caller must still remove the request! */
+ if (other_cpu_in_panic())
+ return -EPERM;
+
+ /*
+ * Note that the waiter will also change if there was an unsafe
+ * hostile takeover.
+ */
+ if (!nbcon_waiter_matches(cur, ctxt->prio))
+ return -EPERM;
+
+ /* If still locked, caller should continue waiting. */
+ if (cur->prio != NBCON_PRIO_NONE)
+ return -EBUSY;
+
+ /*
+ * The previous owner should have never released ownership
+ * in an unsafe region.
+ */
+ WARN_ON_ONCE(cur->unsafe);
+
+ new.atom = cur->atom;
+ new.prio = ctxt->prio;
+ new.req_prio = NBCON_PRIO_NONE;
+ new.unsafe = cur->unsafe_takeover;
+ new.cpu = cpu;
+
+ if (!nbcon_state_try_cmpxchg(con, cur, &new)) {
+ /*
+ * The acquire could fail only when it has been taken
+ * over by a higher priority context.
+ */
+ WARN_ON_ONCE(nbcon_waiter_matches(cur, ctxt->prio));
+ return -EPERM;
+ }
+
+ /* Handover success. This context now owns the console. */
+ return 0;
+}
+
+/**
+ * nbcon_context_try_acquire_handover - Try to acquire via handover
+ * @ctxt: The context of the caller
+ * @cur: The current console state
+ *
+ * The function must be called only when the context has higher priority
+ * than the current owner and the console is in an unsafe state.
+ * It is the case when nbcon_context_try_acquire_direct() returns -EBUSY.
+ *
+ * The function sets "req_prio" field to make the current owner aware of
+ * the request. Then it waits until the current owner releases the console,
+ * or an even higher context takes over the request, or timeout expires.
+ *
+ * The current owner checks the "req_prio" field on exit from the unsafe
+ * region and releases the console. It does not touch the "req_prio" field
+ * so that the console stays reserved for the waiter.
+ *
+ * Return: 0 on success. Otherwise, an error code on failure. Also @cur
+ * is updated to the latest state when failed to modify it.
+ *
+ * Errors:
+ *
+ * -EPERM: A panic is in progress and this is not the panic CPU.
+ * Or a higher priority context has taken over the
+ * console or the handover request.
+ *
+ * -EBUSY: The current owner is on the same CPU so that the hand
+ * shake could not work. Or the caller is not
+ * willing to wait (zero timeout). Or the console does
+ * not enter the safe state before timeout passed. The
+ * caller might still use the unsafe hostile takeover
+ * when allowed.
+ *
+ * -EAGAIN: @cur has changed when creating the handover request.
+ * The caller should retry with direct acquire.
+ */
+static int nbcon_context_try_acquire_handover(struct nbcon_context *ctxt,
+ struct nbcon_state *cur)
+{
+ unsigned int cpu = smp_processor_id();
+ struct console *con = ctxt->console;
+ struct nbcon_state new;
+ int timeout;
+ int request_err = -EBUSY;
+
+ /*
+ * Check that the handover is called when the direct acquire failed
+ * with -EBUSY. The checks warn (once) unless the caller has a
+ * strictly higher priority than both the current owner and any
+ * pending waiter and the console is in the unsafe state. They do
+ * not abort the attempt.
+ */
+ WARN_ON_ONCE(ctxt->prio <= cur->prio || ctxt->prio <= cur->req_prio);
+ WARN_ON_ONCE(!cur->unsafe);
+
+ /* Handover is not possible on the same CPU. */
+ if (cur->cpu == cpu)
+ return -EBUSY;
+
+ /*
+ * Console stays unsafe after an unsafe takeover until re-initialized.
+ * Waiting is not going to help in this case.
+ */
+ if (cur->unsafe_takeover)
+ return -EBUSY;
+
+ /* Is the caller willing to wait? */
+ if (ctxt->spinwait_max_us == 0)
+ return -EBUSY;
+
+ /*
+ * Setup a request for the handover. The caller should try to acquire
+ * the console directly when the current state has been modified.
+ * Only @req_prio differs from the snapshot in @cur. -EAGAIN reports
+ * that the compare-and-exchange did not succeed.
+ */
+ new.atom = cur->atom;
+ new.req_prio = ctxt->prio;
+ if (!nbcon_state_try_cmpxchg(con, cur, &new))
+ return -EAGAIN;
+
+ cur->atom = new.atom;
+
+ /* Wait until there is no owner and then acquire the console. */
+ for (timeout = ctxt->spinwait_max_us; timeout >= 0; timeout--) {
+ /* On successful acquire, this request is cleared. */
+ request_err = nbcon_context_try_acquire_requested(ctxt, cur);
+ if (!request_err)
+ return 0;
+
+ /*
+ * If the acquire should be aborted, it must be ensured
+ * that the request is removed before returning to caller.
+ * Any other error keeps polling, with a 1us delay per
+ * iteration, until @timeout is exhausted.
+ */
+ if (request_err == -EPERM)
+ break;
+
+ udelay(1);
+
+ /* Re-read the state because some time has passed. */
+ nbcon_state_read(con, cur);
+ }
+
+ /* Timed out or aborted. Carefully remove handover request. */
+ do {
+ /*
+ * No need to remove request if there is a new waiter. This
+ * can only happen if a higher priority context has taken over
+ * the console or the handover request.
+ */
+ if (!nbcon_waiter_matches(cur, ctxt->prio))
+ return -EPERM;
+
+ /* Unset request for handover. */
+ new.atom = cur->atom;
+ new.req_prio = NBCON_PRIO_NONE;
+ if (nbcon_state_try_cmpxchg(con, cur, &new)) {
+ /*
+ * Request successfully unset. Report failure of
+ * acquiring via handover. @request_err is the result
+ * of the last acquire attempt in the wait loop above.
+ */
+ cur->atom = new.atom;
+ return request_err;
+ }
+
+ /*
+ * Unable to remove request. Try to acquire in case
+ * the owner has released the lock.
+ */
+ } while (nbcon_context_try_acquire_requested(ctxt, cur));
+
+ /* Lucky timing. The acquire succeeded while removing the request. */
+ return 0;
+}
+
+/**
+ * nbcon_context_try_acquire_hostile - Acquire via unsafe hostile takeover
+ * @ctxt: The context of the caller
+ * @cur: The current console state
+ *
+ * Acquire the console even in the unsafe state.
+ *
+ * It can be permitted by setting the 'allow_unsafe_takeover' field only
+ * by the final attempt to flush messages in panic().
+ *
+ * The takeover is also refused, with a one-time warning, when @ctxt->prio
+ * is not NBCON_PRIO_PANIC. Otherwise the state update is retried until it
+ * succeeds. The new state records this CPU and @ctxt->prio as the owner.
+ * It additionally sets @unsafe_takeover when the console was in the unsafe
+ * state, and @unsafe when @unsafe_takeover was already set.
+ *
+ * Return: 0 on success. -EPERM when not allowed by the context.
+ */
+static int nbcon_context_try_acquire_hostile(struct nbcon_context *ctxt,
+ struct nbcon_state *cur)
+{
+ unsigned int cpu = smp_processor_id();
+ struct console *con = ctxt->console;
+ struct nbcon_state new;
+
+ if (!ctxt->allow_unsafe_takeover)
+ return -EPERM;
+
+ /* Ensure caller is allowed to perform unsafe hostile takeovers. */
+ if (WARN_ON_ONCE(ctxt->prio != NBCON_PRIO_PANIC))
+ return -EPERM;
+
+ /*
+ * Check that try_acquire_direct() and try_acquire_handover() returned
+ * -EBUSY in the right situation. A violation only triggers a one-time
+ * warning; the takeover still proceeds.
+ */
+ WARN_ON_ONCE(ctxt->prio <= cur->prio || ctxt->prio <= cur->req_prio);
+ WARN_ON_ONCE(cur->unsafe != true);
+
+ do {
+ new.atom = cur->atom;
+ new.cpu = cpu;
+ new.prio = ctxt->prio;
+ new.unsafe |= cur->unsafe_takeover;
+ new.unsafe_takeover |= cur->unsafe;
+
+ } while (!nbcon_state_try_cmpxchg(con, cur, &new));
+
+ return 0;
+}
+
+static struct printk_buffers panic_nbcon_pbufs;
+
+/**
+ * nbcon_context_try_acquire - Try to acquire nbcon console
+ * @ctxt: The context of the caller
+ *
+ * Return: True if the console was acquired. False otherwise.
+ *
+ * The acquire is attempted in up to three steps: a direct acquire, then a
+ * handover request if the direct acquire returned -EBUSY, and finally an
+ * unsafe hostile takeover if the handover also returned -EBUSY. The
+ * sequence restarts at the direct acquire when the handover returns
+ * -EAGAIN.
+ *
+ * On success, @ctxt->pbufs is set to the dedicated panic buffers
+ * (panic_nbcon_pbufs) when running on the panic CPU and to the console's
+ * own buffers otherwise, and @ctxt->seq is loaded via nbcon_seq_read().
+ *
+ * If the caller allowed an unsafe hostile takeover, on success the
+ * caller should check the current console state to see if it is
+ * in an unsafe state. Otherwise, on success the caller may assume
+ * the console is not in an unsafe state.
+ */
+__maybe_unused
+static bool nbcon_context_try_acquire(struct nbcon_context *ctxt)
+{
+ unsigned int cpu = smp_processor_id();
+ struct console *con = ctxt->console;
+ struct nbcon_state cur;
+ int err;
+
+ nbcon_state_read(con, &cur);
+try_again:
+ err = nbcon_context_try_acquire_direct(ctxt, &cur);
+ if (err != -EBUSY)
+ goto out;
+
+ err = nbcon_context_try_acquire_handover(ctxt, &cur);
+ if (err == -EAGAIN)
+ goto try_again;
+ if (err != -EBUSY)
+ goto out;
+
+ err = nbcon_context_try_acquire_hostile(ctxt, &cur);
+out:
+ if (err)
+ return false;
+
+ /* Acquire succeeded. */
+
+ /* Assign the appropriate buffer for this context. */
+ if (atomic_read(&panic_cpu) == cpu)
+ ctxt->pbufs = &panic_nbcon_pbufs;
+ else
+ ctxt->pbufs = con->pbufs;
+
+ /* Set the record sequence for this context to print. */
+ ctxt->seq = nbcon_seq_read(ctxt->console);
+
+ return true;
+}
+
+static bool nbcon_owner_matches(struct nbcon_state *cur, int expected_cpu,
+				int expected_prio)
+{
+	/*
+	 * A console can only be taken by a higher priority, so @prio
+	 * alone would normally identify the owning context. But an
+	 * owner can lose the console unexpectedly, and a later owner
+	 * may then show up with the very same priority. Comparing @cpu
+	 * as well tells those two apart.
+	 */
+	return cur->prio == expected_prio && cur->cpu == expected_cpu;
+}
+
+/**
+ * nbcon_context_release - Release the console
+ * @ctxt: The nbcon context from nbcon_context_try_acquire()
+ *
+ * Ownership is cleared by setting the owner priority to NBCON_PRIO_NONE.
+ * If the console state no longer names this CPU and @ctxt->prio as the
+ * owner, the state is left unmodified. @ctxt->pbufs is cleared in either
+ * case.
+ */
+static void nbcon_context_release(struct nbcon_context *ctxt)
+{
+ unsigned int cpu = smp_processor_id();
+ struct console *con = ctxt->console;
+ struct nbcon_state cur;
+ struct nbcon_state new;
+
+ nbcon_state_read(con, &cur);
+
+ do {
+ if (!nbcon_owner_matches(&cur, cpu, ctxt->prio))
+ break;
+
+ new.atom = cur.atom;
+ new.prio = NBCON_PRIO_NONE;
+
+ /*
+ * If @unsafe_takeover is set, it is kept set so that
+ * the state remains permanently unsafe.
+ */
+ new.unsafe |= cur.unsafe_takeover;
+
+ } while (!nbcon_state_try_cmpxchg(con, &cur, &new));
+
+ ctxt->pbufs = NULL;
+}
+
+/**
+ * nbcon_context_can_proceed - Check whether ownership can proceed
+ * @ctxt: The nbcon context from nbcon_context_try_acquire()
+ * @cur: The current console state
+ *
+ * Return: True if this context still owns the console. False if
+ * ownership was handed over or taken.
+ *
+ * Must be invoked when entering the unsafe state to make sure that it still
+ * owns the lock. Also must be invoked when exiting the unsafe context
+ * to eventually free the lock for a higher priority context which asked
+ * for the friendly handover.
+ *
+ * It can be called inside an unsafe section when the console is just
+ * temporarily in safe state instead of exiting and entering the unsafe
+ * state.
+ *
+ * Also it can be called in the safe context before doing an expensive
+ * safe operation. It does not make sense to do the operation when
+ * a higher priority context took the lock.
+ *
+ * If a waiter is present while the console is in a safe state, this
+ * function releases the console itself before returning false.
+ *
+ * When this function returns false then the calling context no longer owns
+ * the console and is no longer allowed to go forward. In this case it must
+ * back out immediately and carefully. The buffer content is also no longer
+ * trusted since it no longer belongs to the calling context.
+ */
+static bool nbcon_context_can_proceed(struct nbcon_context *ctxt, struct nbcon_state *cur)
+{
+ unsigned int cpu = smp_processor_id();
+
+ /* Make sure this context still owns the console. */
+ if (!nbcon_owner_matches(cur, cpu, ctxt->prio))
+ return false;
+
+ /* The console owner can proceed if there is no waiter. */
+ if (cur->req_prio == NBCON_PRIO_NONE)
+ return true;
+
+ /*
+ * A console owner within an unsafe region is always allowed to
+ * proceed, even if there are waiters. It can perform a handover
+ * when exiting the unsafe region. Otherwise the waiter will
+ * need to perform an unsafe hostile takeover.
+ */
+ if (cur->unsafe)
+ return true;
+
+ /* Waiters always have higher priorities than owners. */
+ WARN_ON_ONCE(cur->req_prio <= cur->prio);
+
+ /*
+ * Having a safe point for take over and eventually a few
+ * duplicated characters or a full line is way better than a
+ * hostile takeover. Post processing can take care of the garbage.
+ * Release and hand over.
+ */
+ nbcon_context_release(ctxt);
+
+ /*
+ * It is not clear whether the waiter really took over ownership. The
+ * outermost callsite must make the final decision whether console
+ * ownership is needed for it to proceed. If yes, it must reacquire
+ * ownership (possibly hostile) before carefully proceeding.
+ *
+ * The calling context no longer owns the console so go back all the
+ * way instead of trying to implement reacquire heuristics in tons of
+ * places.
+ */
+ return false;
+}
+
+/**
+ * nbcon_can_proceed - Check whether ownership can proceed
+ * @wctxt: The write context that was handed to the write function
+ *
+ * Return: True if this context still owns the console. False if
+ *	   ownership was handed over or taken.
+ *
+ * Takes a fresh snapshot of the console state and evaluates it with
+ * nbcon_context_can_proceed().
+ *
+ * nbcon_enter_unsafe() relies on this check to confirm that the lock is
+ * still owned. nbcon_exit_unsafe() relies on it to eventually free the
+ * lock for a higher priority context that asked for a friendly handover.
+ *
+ * A driver may also call it from within an unsafe section while the
+ * console is only temporarily in a safe state, as an alternative to
+ * exiting and re-entering the unsafe state, or from a safe context before
+ * starting an expensive safe operation. Such an operation is pointless
+ * once a higher priority context took the lock.
+ *
+ * A false return means the calling context no longer owns the console and
+ * must not go forward. It has to back out immediately and carefully. The
+ * buffer content must no longer be trusted because it no longer belongs
+ * to the calling context.
+ */
+bool nbcon_can_proceed(struct nbcon_write_context *wctxt)
+{
+	struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
+	struct nbcon_state snapshot;
+
+	nbcon_state_read(ctxt->console, &snapshot);
+
+	return nbcon_context_can_proceed(ctxt, &snapshot);
+}
+EXPORT_SYMBOL_GPL(nbcon_can_proceed);
+
+#define nbcon_context_enter_unsafe(c) __nbcon_context_update_unsafe(c, true)
+#define nbcon_context_exit_unsafe(c) __nbcon_context_update_unsafe(c, false)
+
+/**
+ * __nbcon_context_update_unsafe - Update the unsafe bit in @con->nbcon_state
+ * @ctxt: The nbcon context from nbcon_context_try_acquire()
+ * @unsafe: The new value for the unsafe bit
+ *
+ * Return: True if the unsafe state was updated and this context still
+ * owns the console. Otherwise false if ownership was handed
+ * over or taken.
+ *
+ * This function allows console owners to modify the unsafe status of the
+ * console. nbcon_context_enter_unsafe() and nbcon_context_exit_unsafe()
+ * are the wrappers that set and clear the bit, respectively.
+ *
+ * Clearing the bit is skipped when @unsafe_takeover is set in the console
+ * state. In that case only the ownership check is performed.
+ *
+ * When this function returns false then the calling context no longer owns
+ * the console and is no longer allowed to go forward. In this case it must
+ * back out immediately and carefully. The buffer content is also no longer
+ * trusted since it no longer belongs to the calling context.
+ *
+ * Internal helper to avoid duplicated code.
+ */
+static bool __nbcon_context_update_unsafe(struct nbcon_context *ctxt, bool unsafe)
+{
+ struct console *con = ctxt->console;
+ struct nbcon_state cur;
+ struct nbcon_state new;
+
+ nbcon_state_read(con, &cur);
+
+ do {
+ /*
+ * The unsafe bit must not be cleared if an
+ * unsafe hostile takeover has occurred.
+ */
+ if (!unsafe && cur.unsafe_takeover)
+ goto out;
+
+ if (!nbcon_context_can_proceed(ctxt, &cur))
+ return false;
+
+ new.atom = cur.atom;
+ new.unsafe = unsafe;
+ } while (!nbcon_state_try_cmpxchg(con, &cur, &new));
+
+ cur.atom = new.atom;
+out:
+ return nbcon_context_can_proceed(ctxt, &cur);
+}
+
+/**
+ * nbcon_enter_unsafe - Enter an unsafe region in the driver
+ * @wctxt: The write context that was handed to the write function
+ *
+ * Return: True if this context still owns the console. False if
+ *	   ownership was handed over or taken.
+ *
+ * A false return means the calling context no longer owns the console and
+ * must not go forward. It has to back out immediately and carefully. The
+ * buffer content must no longer be trusted because it no longer belongs
+ * to the calling context.
+ */
+bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt)
+{
+	return nbcon_context_enter_unsafe(&ACCESS_PRIVATE(wctxt, ctxt));
+}
+EXPORT_SYMBOL_GPL(nbcon_enter_unsafe);
+
+/**
+ * nbcon_exit_unsafe - Exit an unsafe region in the driver
+ * @wctxt: The write context that was handed to the write function
+ *
+ * Return: True if this context still owns the console. False if
+ *	   ownership was handed over or taken.
+ *
+ * A false return means the calling context no longer owns the console and
+ * must not go forward. It has to back out immediately and carefully. The
+ * buffer content must no longer be trusted because it no longer belongs
+ * to the calling context.
+ */
+bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt)
+{
+	return nbcon_context_exit_unsafe(&ACCESS_PRIVATE(wctxt, ctxt));
+}
+EXPORT_SYMBOL_GPL(nbcon_exit_unsafe);
+
+/**
+ * nbcon_emit_next_record - Emit a record in the acquired context
+ * @wctxt: The write context that will be handed to the write function
+ *
+ * Return: True if this context still owns the console. False if
+ * ownership was handed over or taken.
+ *
+ * When this function returns false then the calling context no longer owns
+ * the console and is no longer allowed to go forward. In this case it must
+ * back out immediately and carefully. The buffer content is also no longer
+ * trusted since it no longer belongs to the calling context. If the caller
+ * wants to do more it must reacquire the console first.
+ *
+ * If the console has no write_atomic() callback, the console is released,
+ * a warning is emitted and false is returned.
+ *
+ * When true is returned, @wctxt->ctxt.backlog indicates whether there are
+ * still records pending in the ringbuffer.
+ */
+__maybe_unused
+static bool nbcon_emit_next_record(struct nbcon_write_context *wctxt)
+{
+ struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
+ struct console *con = ctxt->console;
+ bool is_extended = console_srcu_read_flags(con) & CON_EXTENDED;
+ struct printk_message pmsg = {
+ .pbufs = ctxt->pbufs,
+ };
+ unsigned long con_dropped;
+ struct nbcon_state cur;
+ unsigned long dropped;
+ bool done;
+
+ /*
+ * The printk buffers are filled within an unsafe section. This
+ * prevents NBCON_PRIO_NORMAL and NBCON_PRIO_EMERGENCY from
+ * clobbering each other.
+ */
+
+ if (!nbcon_context_enter_unsafe(ctxt))
+ return false;
+
+ ctxt->backlog = printk_get_next_message(&pmsg, ctxt->seq, is_extended, true);
+ if (!ctxt->backlog)
+ return nbcon_context_exit_unsafe(ctxt);
+
+ /*
+ * @con->dropped is not protected in case of an unsafe hostile
+ * takeover. In that situation the update can be racy so
+ * annotate it accordingly.
+ */
+ con_dropped = data_race(READ_ONCE(con->dropped));
+
+ dropped = con_dropped + pmsg.dropped;
+ if (dropped && !is_extended)
+ console_prepend_dropped(&pmsg, dropped);
+
+ if (!nbcon_context_exit_unsafe(ctxt))
+ return false;
+
+ /* For skipped records just update seq/dropped in @con. */
+ if (pmsg.outbuf_len == 0)
+ goto update_con;
+
+ /* Initialize the write context for driver callbacks. */
+ wctxt->outbuf = &pmsg.pbufs->outbuf[0];
+ wctxt->len = pmsg.outbuf_len;
+ nbcon_state_read(con, &cur);
+ wctxt->unsafe_takeover = cur.unsafe_takeover;
+
+ if (con->write_atomic) {
+ done = con->write_atomic(con, wctxt);
+ } else {
+ nbcon_context_release(ctxt);
+ WARN_ON_ONCE(1);
+ done = false;
+ }
+
+ /* If not done, the emit was aborted. */
+ if (!done)
+ return false;
+
+ /*
+ * Since any dropped message was successfully output, reset the
+ * dropped count for the console.
+ */
+ dropped = 0;
+update_con:
+ /*
+ * The dropped count and the sequence number are updated within an
+ * unsafe section. This limits update races to the panic context and
+ * allows the panic context to win.
+ */
+
+ if (!nbcon_context_enter_unsafe(ctxt))
+ return false;
+
+ if (dropped != con_dropped) {
+ /* Counterpart to the READ_ONCE() above. */
+ WRITE_ONCE(con->dropped, dropped);
+ }
+
+ nbcon_seq_try_update(ctxt, pmsg.seq + 1);
+
+ return nbcon_context_exit_unsafe(ctxt);
+}
+
+/**
+ * nbcon_alloc - Allocate buffers needed by the nbcon console
+ * @con: Console to allocate buffers for
+ *
+ * Return: True on success. False otherwise and the console cannot
+ *	   be used.
+ *
+ * Kept separate from nbcon_init() because the buffers have to be
+ * allocated earlier in the console registration process.
+ */
+bool nbcon_alloc(struct console *con)
+{
+	/*
+	 * Boot console printing is synchronized with legacy console
+	 * printing, which lets boot consoles use the global printk
+	 * buffers instead of owning a private set.
+	 */
+	if (con->flags & CON_BOOT) {
+		con->pbufs = &printk_shared_pbufs;
+		return true;
+	}
+
+	con->pbufs = kmalloc(sizeof(*con->pbufs), GFP_KERNEL);
+	if (con->pbufs)
+		return true;
+
+	con_printk(KERN_ERR, con, "failed to allocate printing buffer\n");
+	return false;
+}
+
+/**
+ * nbcon_init - Initialize the nbcon console specific data
+ * @con: Console to initialize
+ *
+ * A successful nbcon_alloc() call is a hard prerequisite for this
+ * function.
+ *
+ * The legacy @con->seq is expected to be set already. It is used as the
+ * starting sequence number.
+ */
+void nbcon_init(struct console *con)
+{
+	struct nbcon_state initial = { };
+
+	/* Refuse to continue without the buffers from nbcon_alloc(). */
+	BUG_ON(con->pbufs == NULL);
+
+	nbcon_seq_force(con, con->seq);
+	nbcon_state_set(con, &initial);
+}
+
+/**
+ * nbcon_free - Free and cleanup the nbcon console specific data
+ * @con: Console to free/cleanup nbcon data
+ *
+ * Resets the console state to all zeroes and drops the reference to the
+ * printing buffers.
+ */
+void nbcon_free(struct console *con)
+{
+	struct nbcon_state cleared = { };
+
+	nbcon_state_set(con, &cleared);
+
+	/* Boot consoles only borrow the global printk buffers. */
+	if (con->flags & CON_BOOT) {
+		con->pbufs = NULL;
+		return;
+	}
+
+	kfree(con->pbufs);
+	con->pbufs = NULL;
+}
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 357a4d18f638..f2444b581e16 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -88,7 +88,7 @@ EXPORT_SYMBOL(oops_in_progress);
static DEFINE_MUTEX(console_mutex);
/*
- * console_sem protects updates to console->seq and console_suspended,
+ * console_sem protects updates to console->seq
* and also provides serialization for console printing.
*/
static DEFINE_SEMAPHORE(console_sem, 1);
@@ -102,12 +102,6 @@ DEFINE_STATIC_SRCU(console_srcu);
*/
int __read_mostly suppress_printk;
-/*
- * During panic, heavy printk by other CPUs can delay the
- * panic and risk deadlock on console resources.
- */
-static int __read_mostly suppress_panic_printk;
-
#ifdef CONFIG_LOCKDEP
static struct lockdep_map console_lock_dep_map = {
.name = "console_lock"
@@ -361,7 +355,7 @@ static bool panic_in_progress(void)
* paths in the console code where we end up in places I want
* locked without the console semaphore held).
*/
-static int console_locked, console_suspended;
+static int console_locked;
/*
* Array of consoles built from command line options (console=)
@@ -445,6 +439,12 @@ static int console_msg_format = MSG_FORMAT_DEFAULT;
static DEFINE_MUTEX(syslog_lock);
#ifdef CONFIG_PRINTK
+/*
+ * During panic, heavy printk by other CPUs can delay the
+ * panic and risk deadlock on console resources.
+ */
+static int __read_mostly suppress_panic_printk;
+
DECLARE_WAIT_QUEUE_HEAD(log_wait);
/* All 3 protected by @syslog_lock. */
/* the next printk record to read by syslog(READ) or /proc/kmsg */
@@ -494,7 +494,7 @@ _DEFINE_PRINTKRB(printk_rb_static, CONFIG_LOG_BUF_SHIFT - PRB_AVGBITS,
static struct printk_ringbuffer printk_rb_dynamic;
-static struct printk_ringbuffer *prb = &printk_rb_static;
+struct printk_ringbuffer *prb = &printk_rb_static;
/*
* We cannot access per-CPU data (e.g. per-CPU flush irq_work) before
@@ -698,9 +698,6 @@ out:
return len;
}
-static bool printk_get_next_message(struct printk_message *pmsg, u64 seq,
- bool is_extended, bool may_supress);
-
/* /dev/kmsg - userspace message inject/listen interface */
struct devkmsg_user {
atomic64_t seq;
@@ -1669,7 +1666,6 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
prb_rec_init_rd(&r, &info, text, PRINTK_MESSAGE_MAX);
- len = 0;
prb_for_each_record(seq, prb, seq, &r) {
int textlen;
@@ -2308,7 +2304,11 @@ asmlinkage int vprintk_emit(int facility, int level,
preempt_enable();
}
- wake_up_klogd();
+ if (in_sched)
+ defer_console_output();
+ else
+ wake_up_klogd();
+
return printed_len;
}
EXPORT_SYMBOL(vprintk_emit);
@@ -2345,22 +2345,6 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre
static u64 syslog_seq;
-static size_t record_print_text(const struct printk_record *r,
- bool syslog, bool time)
-{
- return 0;
-}
-static ssize_t info_print_ext_header(char *buf, size_t size,
- struct printk_info *info)
-{
- return 0;
-}
-static ssize_t msg_print_ext_body(char *buf, size_t size,
- char *text, size_t text_len,
- struct dev_printk_info *dev_info) { return 0; }
-static void console_lock_spinning_enable(void) { }
-static int console_lock_spinning_disable_and_check(int cookie) { return 0; }
-static bool suppress_message_printing(int level) { return false; }
static bool pr_flush(int timeout_ms, bool reset_on_progress) { return true; }
static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress) { return true; }
@@ -2400,13 +2384,21 @@ static void set_user_specified(struct console_cmdline *c, bool user_specified)
console_set_on_cmdline = 1;
}
-static int __add_preferred_console(char *name, int idx, char *options,
+static int __add_preferred_console(const char *name, const short idx, char *options,
char *brl_options, bool user_specified)
{
struct console_cmdline *c;
int i;
/*
+ * We use a signed short index for struct console for device drivers to
+ * indicate a not yet assigned index or port. However, a negative index
+ * value is not valid for preferred console.
+ */
+ if (idx < 0)
+ return -EINVAL;
+
+ /*
* See if this tty is not yet registered, and
* if we have a slot free.
*/
@@ -2509,7 +2501,7 @@ __setup("console=", console_setup);
* commonly to provide a default console (ie from PROM variables) when
* the user has not supplied one.
*/
-int add_preferred_console(char *name, int idx, char *options)
+int add_preferred_console(const char *name, const short idx, char *options)
{
return __add_preferred_console(name, idx, options, NULL, false);
}
@@ -2547,22 +2539,46 @@ MODULE_PARM_DESC(console_no_auto_verbose, "Disable console loglevel raise to hig
*/
void suspend_console(void)
{
+ struct console *con;
+
if (!console_suspend_enabled)
return;
pr_info("Suspending console(s) (use no_console_suspend to debug)\n");
pr_flush(1000, true);
- console_lock();
- console_suspended = 1;
- up_console_sem();
+
+ console_list_lock();
+ for_each_console(con)
+ console_srcu_write_flags(con, con->flags | CON_SUSPENDED);
+ console_list_unlock();
+
+ /*
+ * Ensure that all SRCU list walks have completed. All printing
+ * contexts must be able to see that they are suspended so that it
+ * is guaranteed that all printing has stopped when this function
+ * completes.
+ */
+ synchronize_srcu(&console_srcu);
}
void resume_console(void)
{
+ struct console *con;
+
if (!console_suspend_enabled)
return;
- down_console_sem();
- console_suspended = 0;
- console_unlock();
+
+ console_list_lock();
+ for_each_console(con)
+ console_srcu_write_flags(con, con->flags & ~CON_SUSPENDED);
+ console_list_unlock();
+
+ /*
+ * Ensure that all SRCU list walks have completed. All printing
+ * contexts must be able to see they are no longer suspended so
+ * that they are guaranteed to wake up and resume printing.
+ */
+ synchronize_srcu(&console_srcu);
+
pr_flush(1000, true);
}
@@ -2585,6 +2601,26 @@ static int console_cpu_notify(unsigned int cpu)
return 0;
}
+/*
+ * Report whether a panic is currently in progress on a CPU other than the
+ * one executing this function.
+ *
+ * When true is returned, the local CPU should immediately release any
+ * printing resources that may be needed by the panic CPU.
+ */
+bool other_cpu_in_panic(void)
+{
+	/*
+	 * raw_smp_processor_id() is sufficient here: a task cannot be
+	 * migrated to the panic_cpu, or away from it. Once panic_cpu has
+	 * been set, a context that is not currently executing on that CPU
+	 * never will be.
+	 */
+	return panic_in_progress() &&
+	       atomic_read(&panic_cpu) != raw_smp_processor_id();
+}
+
+
/**
* console_lock - block the console subsystem from printing
*
@@ -2597,9 +2633,11 @@ void console_lock(void)
{
might_sleep();
+ /* On panic, the console_lock must be left to the panic cpu. */
+ while (other_cpu_in_panic())
+ msleep(1000);
+
down_console_sem();
- if (console_suspended)
- return;
console_locked = 1;
console_may_schedule = 1;
}
@@ -2615,12 +2653,11 @@ EXPORT_SYMBOL(console_lock);
*/
int console_trylock(void)
{
- if (down_trylock_console_sem())
+ /* On panic, the console_lock must be left to the panic cpu. */
+ if (other_cpu_in_panic())
return 0;
- if (console_suspended) {
- up_console_sem();
+ if (down_trylock_console_sem())
return 0;
- }
console_locked = 1;
console_may_schedule = 0;
return 1;
@@ -2634,25 +2671,6 @@ int is_console_locked(void)
EXPORT_SYMBOL(is_console_locked);
/*
- * Return true when this CPU should unlock console_sem without pushing all
- * messages to the console. This reduces the chance that the console is
- * locked when the panic CPU tries to use it.
- */
-static bool abandon_console_lock_in_panic(void)
-{
- if (!panic_in_progress())
- return false;
-
- /*
- * We can use raw_smp_processor_id() here because it is impossible for
- * the task to be migrated to the panic_cpu, or away from it. If
- * panic_cpu has already been set, and we're not currently executing on
- * that CPU, then we never will be.
- */
- return atomic_read(&panic_cpu) != raw_smp_processor_id();
-}
-
-/*
* Check if the given console is currently capable and allowed to print
* records.
*
@@ -2665,6 +2683,9 @@ static inline bool console_is_usable(struct console *con)
if (!(flags & CON_ENABLED))
return false;
+ if ((flags & CON_SUSPENDED))
+ return false;
+
if (!con->write)
return false;
@@ -2685,6 +2706,8 @@ static void __console_unlock(void)
up_console_sem();
}
+#ifdef CONFIG_PRINTK
+
/*
* Prepend the message in @pmsg->pbufs->outbuf with a "dropped message". This
* is achieved by shifting the existing message over and inserting the dropped
@@ -2699,8 +2722,7 @@ static void __console_unlock(void)
*
* If @pmsg->pbufs->outbuf is modified, @pmsg->outbuf_len is updated.
*/
-#ifdef CONFIG_PRINTK
-static void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped)
+void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped)
{
struct printk_buffers *pbufs = pmsg->pbufs;
const size_t scratchbuf_sz = sizeof(pbufs->scratchbuf);
@@ -2731,9 +2753,6 @@ static void console_prepend_dropped(struct printk_message *pmsg, unsigned long d
memcpy(outbuf, scratchbuf, len);
pmsg->outbuf_len += len;
}
-#else
-#define console_prepend_dropped(pmsg, dropped)
-#endif /* CONFIG_PRINTK */
/*
* Read and format the specified record (or a later record if the specified
@@ -2754,8 +2773,8 @@ static void console_prepend_dropped(struct printk_message *pmsg, unsigned long d
* of @pmsg are valid. (See the documentation of struct printk_message
* for information about the @pmsg fields.)
*/
-static bool printk_get_next_message(struct printk_message *pmsg, u64 seq,
- bool is_extended, bool may_suppress)
+bool printk_get_next_message(struct printk_message *pmsg, u64 seq,
+ bool is_extended, bool may_suppress)
{
static int panic_console_dropped;
@@ -2814,6 +2833,13 @@ out:
}
/*
+ * Used as the printk buffers for non-panic, serialized console printing.
+ * This is for legacy (!CON_NBCON) as well as all boot (CON_BOOT) consoles.
+ * Its usage requires the console_lock held.
+ */
+struct printk_buffers printk_shared_pbufs;
+
+/*
* Print one record for the given console. The record printed is whatever
* record is the next available record for the given console.
*
@@ -2830,12 +2856,10 @@ out:
*/
static bool console_emit_next_record(struct console *con, bool *handover, int cookie)
{
- static struct printk_buffers pbufs;
-
bool is_extended = console_srcu_read_flags(con) & CON_EXTENDED;
- char *outbuf = &pbufs.outbuf[0];
+ char *outbuf = &printk_shared_pbufs.outbuf[0];
struct printk_message pmsg = {
- .pbufs = &pbufs,
+ .pbufs = &printk_shared_pbufs,
};
unsigned long flags;
@@ -2886,6 +2910,16 @@ skip:
return true;
}
+#else
+
+static bool console_emit_next_record(struct console *con, bool *handover, int cookie)
+{
+ *handover = false;
+ return false;
+}
+
+#endif /* CONFIG_PRINTK */
+
/*
* Print out all remaining records to all consoles.
*
@@ -2948,7 +2982,7 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove
any_progress = true;
/* Allow panic_cpu to take over the consoles safely. */
- if (abandon_console_lock_in_panic())
+ if (other_cpu_in_panic())
goto abandon;
if (do_cond_resched)
@@ -2983,11 +3017,6 @@ void console_unlock(void)
bool flushed;
u64 next_seq;
- if (console_suspended) {
- up_console_sem();
- return;
- }
-
/*
* Console drivers are called with interrupts disabled, so
* @console_may_schedule should be cleared before; however, we may
@@ -3045,10 +3074,28 @@ EXPORT_SYMBOL(console_conditional_schedule);
void console_unblank(void)
{
+ bool found_unblank = false;
struct console *c;
int cookie;
/*
+ * First check if there are any consoles implementing the unblank()
+ * callback. If not, there is no reason to continue and take the
+ * console lock, which in particular can be dangerous if
+ * @oops_in_progress is set.
+ */
+ cookie = console_srcu_read_lock();
+ for_each_console_srcu(c) {
+ if ((console_srcu_read_flags(c) & CON_ENABLED) && c->unblank) {
+ found_unblank = true;
+ break;
+ }
+ }
+ console_srcu_read_unlock(cookie);
+ if (!found_unblank)
+ return;
+
+ /*
* Stop console printing because the unblank() callback may
* assume the console is not within its write() callback.
*
@@ -3056,6 +3103,16 @@ void console_unblank(void)
* In that case, attempt a trylock as best-effort.
*/
if (oops_in_progress) {
+ /* Semaphores are not NMI-safe. */
+ if (in_nmi())
+ return;
+
+ /*
+ * Attempting to trylock the console lock can deadlock
+ * if another CPU was stopped while modifying the
+ * semaphore. "Hope and pray" that this is not the
+ * current situation.
+ */
if (down_trylock_console_sem() != 0)
return;
} else
@@ -3085,18 +3142,29 @@ void console_unblank(void)
*/
void console_flush_on_panic(enum con_flush_mode mode)
{
+ bool handover;
+ u64 next_seq;
+
+ /*
+ * Ignore the console lock and flush out the messages. Attempting a
+ * trylock would not be useful because:
+ *
+ * - if it is contended, it must be ignored anyway
+ * - console_lock() and console_trylock() block and fail
+ * respectively in panic for non-panic CPUs
+ * - semaphores are not NMI-safe
+ */
+
/*
- * If someone else is holding the console lock, trylock will fail
- * and may_schedule may be set. Ignore and proceed to unlock so
- * that messages are flushed out. As this can be called from any
- * context and we don't want to get preempted while flushing,
- * ensure may_schedule is cleared.
+ * If another context is holding the console lock,
+ * @console_may_schedule might be set. Clear it so that
+ * this context does not call cond_resched() while flushing.
*/
- console_trylock();
console_may_schedule = 0;
if (mode == CONSOLE_REPLAY_ALL) {
struct console *c;
+ short flags;
int cookie;
u64 seq;
@@ -3104,16 +3172,22 @@ void console_flush_on_panic(enum con_flush_mode mode)
cookie = console_srcu_read_lock();
for_each_console_srcu(c) {
- /*
- * If the above console_trylock() failed, this is an
- * unsynchronized assignment. But in that case, the
- * kernel is in "hope and pray" mode anyway.
- */
- c->seq = seq;
+ flags = console_srcu_read_flags(c);
+
+ if (flags & CON_NBCON) {
+ nbcon_seq_force(c, seq);
+ } else {
+ /*
+ * This is an unsynchronized assignment. On
+ * panic legacy consoles are only best effort.
+ */
+ c->seq = seq;
+ }
}
console_srcu_read_unlock(cookie);
}
- console_unlock();
+
+ console_flush_all(false, &next_seq, &handover);
}
/*
@@ -3260,11 +3334,6 @@ static void try_enable_default_console(struct console *newcon)
newcon->flags |= CON_CONSDEV;
}
-#define con_printk(lvl, con, fmt, ...) \
- printk(lvl pr_fmt("%sconsole [%s%d] " fmt), \
- (con->flags & CON_BOOT) ? "boot" : "", \
- con->name, con->index, ##__VA_ARGS__)
-
static void console_init_seq(struct console *newcon, bool bootcon_registered)
{
struct console *con;
@@ -3378,6 +3447,15 @@ void register_console(struct console *newcon)
goto unlock;
}
+ if (newcon->flags & CON_NBCON) {
+ /*
+ * Ensure the nbcon console buffers can be allocated
+ * before modifying any global data.
+ */
+ if (!nbcon_alloc(newcon))
+ goto unlock;
+ }
+
/*
* See if we want to enable this console driver by default.
*
@@ -3405,8 +3483,11 @@ void register_console(struct console *newcon)
err = try_enable_preferred_console(newcon, false);
/* printk() messages are not printed to the Braille console. */
- if (err || newcon->flags & CON_BRL)
+ if (err || newcon->flags & CON_BRL) {
+ if (newcon->flags & CON_NBCON)
+ nbcon_free(newcon);
goto unlock;
+ }
/*
* If we have a bootconsole, and are switching to a real console,
@@ -3422,6 +3503,9 @@ void register_console(struct console *newcon)
newcon->dropped = 0;
console_init_seq(newcon, bootcon_registered);
+ if (newcon->flags & CON_NBCON)
+ nbcon_init(newcon);
+
/*
* Put this console in the list - keep the
* preferred driver at the head of the list.
@@ -3513,6 +3597,9 @@ static int unregister_console_locked(struct console *console)
*/
synchronize_srcu(&console_srcu);
+ if (console->flags & CON_NBCON)
+ nbcon_free(console);
+
console_sysfs_notify();
if (console->exit)
@@ -3662,10 +3749,12 @@ late_initcall(printk_late_init);
/* If @con is specified, only wait for that console. Otherwise wait for all. */
static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress)
{
- int remaining = timeout_ms;
+ unsigned long timeout_jiffies = msecs_to_jiffies(timeout_ms);
+ unsigned long remaining_jiffies = timeout_jiffies;
struct console *c;
u64 last_diff = 0;
u64 printk_seq;
+ short flags;
int cookie;
u64 diff;
u64 seq;
@@ -3674,13 +3763,21 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre
seq = prb_next_seq(prb);
+ /* Flush the consoles so that records up to @seq are printed. */
+ console_lock();
+ console_unlock();
+
for (;;) {
+ unsigned long begin_jiffies;
+ unsigned long slept_jiffies;
+
diff = 0;
/*
* Hold the console_lock to guarantee safe access to
- * console->seq and to prevent changes to @console_suspended
- * until all consoles have been processed.
+ * console->seq. Releasing console_lock flushes more
+ * records in case @seq is still not printed on all
+ * usable consoles.
*/
console_lock();
@@ -3688,39 +3785,43 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre
for_each_console_srcu(c) {
if (con && con != c)
continue;
+
+ flags = console_srcu_read_flags(c);
+
+ /*
+ * If consoles are not usable, it cannot be expected
+ * that they make forward progress, so only increment
+ * @diff for usable consoles.
+ */
if (!console_is_usable(c))
continue;
- printk_seq = c->seq;
+
+ if (flags & CON_NBCON) {
+ printk_seq = nbcon_seq_read(c);
+ } else {
+ printk_seq = c->seq;
+ }
+
if (printk_seq < seq)
diff += seq - printk_seq;
}
console_srcu_read_unlock(cookie);
- /*
- * If consoles are suspended, it cannot be expected that they
- * make forward progress, so timeout immediately. @diff is
- * still used to return a valid flush status.
- */
- if (console_suspended)
- remaining = 0;
- else if (diff != last_diff && reset_on_progress)
- remaining = timeout_ms;
+ if (diff != last_diff && reset_on_progress)
+ remaining_jiffies = timeout_jiffies;
console_unlock();
- if (diff == 0 || remaining == 0)
+ /* Note: @diff is 0 if there are no usable consoles. */
+ if (diff == 0 || remaining_jiffies == 0)
break;
- if (remaining < 0) {
- /* no timeout limit */
- msleep(100);
- } else if (remaining < 100) {
- msleep(remaining);
- remaining = 0;
- } else {
- msleep(100);
- remaining -= 100;
- }
+ /* msleep(1) might sleep much longer. Check time by jiffies. */
+ begin_jiffies = jiffies;
+ msleep(1);
+ slept_jiffies = jiffies - begin_jiffies;
+
+ remaining_jiffies -= min(slept_jiffies, remaining_jiffies);
last_diff = diff;
}
@@ -3741,7 +3842,7 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre
* printer has been seen to make some forward progress.
*
* Context: Process context. May sleep while acquiring console lock.
- * Return: true if all enabled printers are caught up.
+ * Return: true if all usable printers are caught up.
*/
static bool pr_flush(int timeout_ms, bool reset_on_progress)
{
@@ -3798,11 +3899,33 @@ static void __wake_up_klogd(int val)
preempt_enable();
}
+/**
+ * wake_up_klogd - Wake kernel logging daemon
+ *
+ * Use this function when new records have been added to the ringbuffer
+ * and the console printing of those records has already occurred or is
+ * known to be handled by some other context. This function will only
+ * wake the logging daemon.
+ *
+ * Context: Any context.
+ */
void wake_up_klogd(void)
{
__wake_up_klogd(PRINTK_PENDING_WAKEUP);
}
+/**
+ * defer_console_output - Wake kernel logging daemon and trigger
+ * console printing in a deferred context
+ *
+ * Use this function when new records have been added to the ringbuffer
+ * and this context is responsible for console printing those records, but
+ * the current context is not allowed to perform the console printing.
+ * Trigger an irq_work context to perform the console printing. This
+ * function also wakes the logging daemon.
+ *
+ * Context: Any context.
+ */
void defer_console_output(void)
{
/*
@@ -3819,12 +3942,7 @@ void printk_trigger_flush(void)
int vprintk_deferred(const char *fmt, va_list args)
{
- int r;
-
- r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, fmt, args);
- defer_console_output();
-
- return r;
+ return vprintk_emit(0, LOGLEVEL_SCHED, NULL, fmt, args);
}
int _printk_deferred(const char *fmt, ...)
@@ -4107,7 +4225,6 @@ bool kmsg_dump_get_buffer(struct kmsg_dump_iter *iter, bool syslog,
prb_rec_init_rd(&r, &info, buf, size);
- len = 0;
prb_for_each_record(seq, prb, seq, &r) {
if (r.info->seq >= iter->next_seq)
break;
diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c
index 2dc4d5a1f1ff..fde338606ce8 100644
--- a/kernel/printk/printk_ringbuffer.c
+++ b/kernel/printk/printk_ringbuffer.c
@@ -1735,7 +1735,7 @@ static bool copy_data(struct prb_data_ring *data_ring,
if (!buf || !buf_size)
return true;
- data_size = min_t(u16, buf_size, len);
+ data_size = min_t(unsigned int, buf_size, len);
memcpy(&buf[0], data, data_size); /* LMM(copy_data:A) */
return true;
diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c
index ef0f9a2044da..6d10927a07d8 100644
--- a/kernel/printk/printk_safe.c
+++ b/kernel/printk/printk_safe.c
@@ -38,13 +38,8 @@ asmlinkage int vprintk(const char *fmt, va_list args)
* Use the main logbuf even in NMI. But avoid calling console
* drivers that might have their own locks.
*/
- if (this_cpu_read(printk_context) || in_nmi()) {
- int len;
-
- len = vprintk_store(0, LOGLEVEL_DEFAULT, NULL, fmt, args);
- defer_console_output();
- return len;
- }
+ if (this_cpu_read(printk_context) || in_nmi())
+ return vprintk_deferred(fmt, args);
/* No obstacles. */
return vprintk_default(fmt, args);
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 443057bee87c..2fabd497d659 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -59,7 +59,7 @@ int ptrace_access_vm(struct task_struct *tsk, unsigned long addr,
return 0;
}
- ret = __access_remote_vm(mm, addr, buf, len, gup_flags);
+ ret = access_remote_vm(mm, addr, buf, len, gup_flags);
mmput(mm);
return ret;
@@ -145,20 +145,9 @@ void __ptrace_unlink(struct task_struct *child)
*/
if (!(child->flags & PF_EXITING) &&
(child->signal->flags & SIGNAL_STOP_STOPPED ||
- child->signal->group_stop_count)) {
+ child->signal->group_stop_count))
child->jobctl |= JOBCTL_STOP_PENDING;
- /*
- * This is only possible if this thread was cloned by the
- * traced task running in the stopped group, set the signal
- * for the future reports.
- * FIXME: we should change ptrace_init_task() to handle this
- * case.
- */
- if (!(child->jobctl & JOBCTL_STOP_SIGMASK))
- child->jobctl |= SIGSTOP;
- }
-
/*
* If transition to TASK_STOPPED is pending or in TASK_TRACED, kick
* @child in the butt. Note that @resume should be used iff @child
@@ -386,6 +375,34 @@ static int check_ptrace_options(unsigned long data)
return 0;
}
+static inline void ptrace_set_stopped(struct task_struct *task)
+{
+ guard(spinlock)(&task->sighand->siglock);
+
+ /*
+ * If the task is already STOPPED, set JOBCTL_TRAP_STOP and
+ * TRAPPING, and kick it so that it transits to TRACED. TRAPPING
+ * will be cleared if the child completes the transition or any
+ * event which clears the group stop states happens. We'll wait
+ * for the transition to complete before returning from this
+ * function.
+ *
+ * This hides STOPPED -> RUNNING -> TRACED transition from the
+ * attaching thread but a different thread in the same group can
+ * still observe the transient RUNNING state. IOW, if another
+ * thread's WNOHANG wait(2) on the stopped tracee races against
+ * ATTACH, the wait(2) may fail due to the transient RUNNING.
+ *
+ * The following task_is_stopped() test is safe as both transitions
+ * in and out of STOPPED are protected by siglock.
+ */
+ if (task_is_stopped(task) &&
+ task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING)) {
+ task->jobctl &= ~JOBCTL_STOPPED;
+ signal_wake_up_state(task, __TASK_STOPPED);
+ }
+}
+
static int ptrace_attach(struct task_struct *task, long request,
unsigned long addr,
unsigned long flags)
@@ -393,17 +410,17 @@ static int ptrace_attach(struct task_struct *task, long request,
bool seize = (request == PTRACE_SEIZE);
int retval;
- retval = -EIO;
if (seize) {
if (addr != 0)
- goto out;
+ return -EIO;
/*
* This duplicates the check in check_ptrace_options() because
* ptrace_attach() and ptrace_setoptions() have historically
* used different error codes for unknown ptrace options.
*/
if (flags & ~(unsigned long)PTRACE_O_MASK)
- goto out;
+ return -EIO;
+
retval = check_ptrace_options(flags);
if (retval)
return retval;
@@ -414,88 +431,54 @@ static int ptrace_attach(struct task_struct *task, long request,
audit_ptrace(task);
- retval = -EPERM;
if (unlikely(task->flags & PF_KTHREAD))
- goto out;
+ return -EPERM;
if (same_thread_group(task, current))
- goto out;
+ return -EPERM;
/*
* Protect exec's credential calculations against our interference;
* SUID, SGID and LSM creds get determined differently
* under ptrace.
*/
- retval = -ERESTARTNOINTR;
- if (mutex_lock_interruptible(&task->signal->cred_guard_mutex))
- goto out;
+ scoped_cond_guard (mutex_intr, return -ERESTARTNOINTR,
+ &task->signal->cred_guard_mutex) {
- task_lock(task);
- retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS);
- task_unlock(task);
- if (retval)
- goto unlock_creds;
+ scoped_guard (task_lock, task) {
+ retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS);
+ if (retval)
+ return retval;
+ }
- write_lock_irq(&tasklist_lock);
- retval = -EPERM;
- if (unlikely(task->exit_state))
- goto unlock_tasklist;
- if (task->ptrace)
- goto unlock_tasklist;
+ scoped_guard (write_lock_irq, &tasklist_lock) {
+ if (unlikely(task->exit_state))
+ return -EPERM;
+ if (task->ptrace)
+ return -EPERM;
- task->ptrace = flags;
+ task->ptrace = flags;
- ptrace_link(task, current);
+ ptrace_link(task, current);
- /* SEIZE doesn't trap tracee on attach */
- if (!seize)
- send_sig_info(SIGSTOP, SEND_SIG_PRIV, task);
+ /* SEIZE doesn't trap tracee on attach */
+ if (!seize)
+ send_sig_info(SIGSTOP, SEND_SIG_PRIV, task);
- spin_lock(&task->sighand->siglock);
+ ptrace_set_stopped(task);
+ }
+ }
/*
- * If the task is already STOPPED, set JOBCTL_TRAP_STOP and
- * TRAPPING, and kick it so that it transits to TRACED. TRAPPING
- * will be cleared if the child completes the transition or any
- * event which clears the group stop states happens. We'll wait
- * for the transition to complete before returning from this
- * function.
- *
- * This hides STOPPED -> RUNNING -> TRACED transition from the
- * attaching thread but a different thread in the same group can
- * still observe the transient RUNNING state. IOW, if another
- * thread's WNOHANG wait(2) on the stopped tracee races against
- * ATTACH, the wait(2) may fail due to the transient RUNNING.
- *
- * The following task_is_stopped() test is safe as both transitions
- * in and out of STOPPED are protected by siglock.
+ * We do not bother to change retval or clear JOBCTL_TRAPPING
+ * if wait_on_bit() was interrupted by SIGKILL. The tracer will
+ * not return to user-mode, it will exit and clear this bit in
+ * __ptrace_unlink() if it wasn't already cleared by the tracee;
+ * and until then nobody can ptrace this task.
*/
- if (task_is_stopped(task) &&
- task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING)) {
- task->jobctl &= ~JOBCTL_STOPPED;
- signal_wake_up_state(task, __TASK_STOPPED);
- }
-
- spin_unlock(&task->sighand->siglock);
-
- retval = 0;
-unlock_tasklist:
- write_unlock_irq(&tasklist_lock);
-unlock_creds:
- mutex_unlock(&task->signal->cred_guard_mutex);
-out:
- if (!retval) {
- /*
- * We do not bother to change retval or clear JOBCTL_TRAPPING
- * if wait_on_bit() was interrupted by SIGKILL. The tracer will
- * not return to user-mode, it will exit and clear this bit in
- * __ptrace_unlink() if it wasn't already cleared by the tracee;
- * and until then nobody can ptrace this task.
- */
- wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT, TASK_KILLABLE);
- proc_ptrace_connector(task, PTRACE_ATTACH);
- }
+ wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT, TASK_KILLABLE);
+ proc_ptrace_connector(task, PTRACE_ATTACH);
- return retval;
+ return 0;
}
/**
diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug
index 2984de629f74..9b0b52e1836f 100644
--- a/kernel/rcu/Kconfig.debug
+++ b/kernel/rcu/Kconfig.debug
@@ -105,6 +105,31 @@ config RCU_CPU_STALL_CPUTIME
The boot option rcupdate.rcu_cpu_stall_cputime has the same function
as this one, but will override this if it exists.
+config RCU_CPU_STALL_NOTIFIER
+ bool "Provide RCU CPU-stall notifiers"
+ depends on RCU_STALL_COMMON
+ depends on DEBUG_KERNEL
+ depends on RCU_EXPERT
+ default n
+ help
+ WARNING: You almost certainly do not want this!!!
+
+ Enable RCU CPU-stall notifiers, which are invoked just before
+ printing the RCU CPU stall warning. As such, bugs in notifier
+ callbacks can prevent stall warnings from being printed.
+ And the whole reason that a stall warning is being printed is
+ that something is hung up somewhere. Therefore, the notifier
+ callbacks must be written extremely carefully, preferably
+ containing only lockless code. After all, it is quite possible
+ that the whole reason that the RCU CPU stall is happening in
+ the first place is that someone forgot to release whatever lock
+ that you are thinking of acquiring. In which case, having your
+ notifier callback acquire that lock will hang, preventing the
+ RCU CPU stall warning from appearing.
+
+ Say Y here if you want RCU CPU stall notifiers (you don't want them).
+ Say N if you are unsure.
+
config RCU_TRACE
bool "Enable tracing for RCU"
depends on DEBUG_KERNEL
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 98e13be411af..f94f65877f2b 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -10,6 +10,7 @@
#ifndef __LINUX_RCU_H
#define __LINUX_RCU_H
+#include <linux/slab.h>
#include <trace/events/rcu.h>
/*
@@ -248,6 +249,12 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head)
}
#endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
+static inline void debug_rcu_head_callback(struct rcu_head *rhp)
+{
+ if (unlikely(!rhp->func))
+ kmem_dump_obj(rhp);
+}
+
extern int rcu_cpu_stall_suppress_at_boot;
static inline bool rcu_stall_is_suppressed_at_boot(void)
@@ -255,6 +262,8 @@ static inline bool rcu_stall_is_suppressed_at_boot(void)
return rcu_cpu_stall_suppress_at_boot && !rcu_inkernel_boot_has_ended();
}
+extern int rcu_cpu_stall_notifiers;
+
#ifdef CONFIG_RCU_STALL_COMMON
extern int rcu_cpu_stall_ftrace_dump;
@@ -493,6 +502,7 @@ static inline void rcu_expedite_gp(void) { }
static inline void rcu_unexpedite_gp(void) { }
static inline void rcu_async_hurry(void) { }
static inline void rcu_async_relax(void) { }
+static inline bool rcu_cpu_online(int cpu) { return true; }
#else /* #ifdef CONFIG_TINY_RCU */
bool rcu_gp_is_normal(void); /* Internal RCU use. */
bool rcu_gp_is_expedited(void); /* Internal RCU use. */
@@ -502,6 +512,7 @@ void rcu_unexpedite_gp(void);
void rcu_async_hurry(void);
void rcu_async_relax(void);
void rcupdate_announce_bootup_oddness(void);
+bool rcu_cpu_online(int cpu);
#ifdef CONFIG_TASKS_RCU_GENERIC
void show_rcu_tasks_gp_kthreads(void);
#else /* #ifdef CONFIG_TASKS_RCU_GENERIC */
@@ -568,10 +579,6 @@ void do_trace_rcu_torture_read(const char *rcutorturename,
static inline void rcu_gp_set_torture_wait(int duration) { }
#endif
-#if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST)
-long rcutorture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask);
-#endif
-
#ifdef CONFIG_TINY_SRCU
static inline void srcutorture_get_gp_data(enum rcutorture_type test_type,
@@ -654,4 +661,10 @@ static inline bool rcu_cpu_beenfullyonline(int cpu) { return true; }
bool rcu_cpu_beenfullyonline(int cpu);
#endif
+#if defined(CONFIG_RCU_STALL_COMMON) && defined(CONFIG_RCU_CPU_STALL_NOTIFIER)
+int rcu_stall_notifier_call_chain(unsigned long val, void *v);
+#else // #if defined(CONFIG_RCU_STALL_COMMON) && defined(CONFIG_RCU_CPU_STALL_NOTIFIER)
+static inline int rcu_stall_notifier_call_chain(unsigned long val, void *v) { return NOTIFY_DONE; }
+#endif // #else // #if defined(CONFIG_RCU_STALL_COMMON) && defined(CONFIG_RCU_CPU_STALL_NOTIFIER)
+
#endif /* __LINUX_RCU_H */
diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
index f71fac422c8f..1693ea22ef1b 100644
--- a/kernel/rcu/rcu_segcblist.c
+++ b/kernel/rcu/rcu_segcblist.c
@@ -368,7 +368,7 @@ bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp,
smp_mb(); /* Ensure counts are updated before callback is entrained. */
rhp->next = NULL;
for (i = RCU_NEXT_TAIL; i > RCU_DONE_TAIL; i--)
- if (rsclp->tails[i] != rsclp->tails[i - 1])
+ if (!rcu_segcblist_segempty(rsclp, i))
break;
rcu_segcblist_inc_seglen(rsclp, i);
WRITE_ONCE(*rsclp->tails[i], rhp);
@@ -551,7 +551,7 @@ bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq)
* as their ->gp_seq[] grace-period completion sequence number.
*/
for (i = RCU_NEXT_READY_TAIL; i > RCU_DONE_TAIL; i--)
- if (rsclp->tails[i] != rsclp->tails[i - 1] &&
+ if (!rcu_segcblist_segempty(rsclp, i) &&
ULONG_CMP_LT(rsclp->gp_seq[i], seq))
break;
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index ade42d6a9d9b..7567ca8e743c 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -21,6 +21,7 @@
#include <linux/spinlock.h>
#include <linux/smp.h>
#include <linux/rcupdate_wait.h>
+#include <linux/rcu_notifier.h>
#include <linux/interrupt.h>
#include <linux/sched/signal.h>
#include <uapi/linux/sched/types.h>
@@ -810,7 +811,7 @@ static void synchronize_rcu_trivial(void)
int cpu;
for_each_online_cpu(cpu) {
- rcutorture_sched_setaffinity(current->pid, cpumask_of(cpu));
+ torture_sched_setaffinity(current->pid, cpumask_of(cpu));
WARN_ON_ONCE(raw_smp_processor_id() != cpu);
}
}
@@ -1149,7 +1150,7 @@ static int rcu_torture_boost(void *arg)
mutex_unlock(&boost_mutex);
break;
}
- schedule_timeout_uninterruptible(1);
+ schedule_timeout_uninterruptible(HZ / 20);
}
/* Go do the stutter. */
@@ -1160,7 +1161,7 @@ checkwait: if (stutter_wait("rcu_torture_boost"))
/* Clean up and exit. */
while (!kthread_should_stop()) {
torture_shutdown_absorb("rcu_torture_boost");
- schedule_timeout_uninterruptible(1);
+ schedule_timeout_uninterruptible(HZ / 20);
}
torture_kthread_stopping("rcu_torture_boost");
return 0;
@@ -1183,7 +1184,7 @@ rcu_torture_fqs(void *arg)
fqs_resume_time = jiffies + fqs_stutter * HZ;
while (time_before(jiffies, fqs_resume_time) &&
!kthread_should_stop()) {
- schedule_timeout_interruptible(1);
+ schedule_timeout_interruptible(HZ / 20);
}
fqs_burst_remaining = fqs_duration;
while (fqs_burst_remaining > 0 &&
@@ -2126,7 +2127,7 @@ static int rcu_nocb_toggle(void *arg)
VERBOSE_TOROUT_STRING("rcu_nocb_toggle task started");
while (!rcu_inkernel_boot_has_ended())
schedule_timeout_interruptible(HZ / 10);
- for_each_online_cpu(cpu)
+ for_each_possible_cpu(cpu)
maxcpu = cpu;
WARN_ON(maxcpu < 0);
if (toggle_interval > ULONG_MAX)
@@ -2428,6 +2429,16 @@ static int rcutorture_booster_init(unsigned int cpu)
return 0;
}
+static int rcu_torture_stall_nf(struct notifier_block *nb, unsigned long v, void *ptr)
+{
+ pr_info("%s: v=%lu, duration=%lu.\n", __func__, v, (unsigned long)ptr);
+ return NOTIFY_OK;
+}
+
+static struct notifier_block rcu_torture_stall_block = {
+ .notifier_call = rcu_torture_stall_nf,
+};
+
/*
* CPU-stall kthread. It waits as specified by stall_cpu_holdoff, then
* induces a CPU stall for the time specified by stall_cpu.
@@ -2435,9 +2446,16 @@ static int rcutorture_booster_init(unsigned int cpu)
static int rcu_torture_stall(void *args)
{
int idx;
+ int ret;
unsigned long stop_at;
VERBOSE_TOROUT_STRING("rcu_torture_stall task started");
+ if (rcu_cpu_stall_notifiers) {
+ ret = rcu_stall_chain_notifier_register(&rcu_torture_stall_block);
+ if (ret)
+ pr_info("%s: rcu_stall_chain_notifier_register() returned %d, %sexpected.\n",
+ __func__, ret, !IS_ENABLED(CONFIG_RCU_STALL_COMMON) ? "un" : "");
+ }
if (stall_cpu_holdoff > 0) {
VERBOSE_TOROUT_STRING("rcu_torture_stall begin holdoff");
schedule_timeout_interruptible(stall_cpu_holdoff * HZ);
@@ -2481,6 +2499,11 @@ static int rcu_torture_stall(void *args)
cur_ops->readunlock(idx);
}
pr_alert("%s end.\n", __func__);
+ if (rcu_cpu_stall_notifiers && !ret) {
+ ret = rcu_stall_chain_notifier_unregister(&rcu_torture_stall_block);
+ if (ret)
+ pr_info("%s: rcu_stall_chain_notifier_unregister() returned %d.\n", __func__, ret);
+ }
torture_shutdown_absorb("rcu_torture_stall");
while (!kthread_should_stop())
schedule_timeout_interruptible(10 * HZ);
@@ -2899,7 +2922,7 @@ static int rcu_torture_fwd_prog(void *args)
WRITE_ONCE(rcu_fwd_seq, rcu_fwd_seq + 1);
} else {
while (READ_ONCE(rcu_fwd_seq) == oldseq && !torture_must_stop())
- schedule_timeout_interruptible(1);
+ schedule_timeout_interruptible(HZ / 20);
oldseq = READ_ONCE(rcu_fwd_seq);
}
pr_alert("%s: Starting forward-progress test %d\n", __func__, rfp->rcu_fwd_id);
@@ -3200,7 +3223,7 @@ static int rcu_torture_read_exit_child(void *trsp_in)
set_user_nice(current, MAX_NICE);
// Minimize time between reading and exiting.
while (!kthread_should_stop())
- schedule_timeout_uninterruptible(1);
+ schedule_timeout_uninterruptible(HZ / 20);
(void)rcu_torture_one_read(trsp, -1);
return 0;
}
@@ -3248,7 +3271,7 @@ static int rcu_torture_read_exit(void *unused)
smp_mb(); // Store before wakeup.
wake_up(&read_exit_wq);
while (!torture_must_stop())
- schedule_timeout_uninterruptible(1);
+ schedule_timeout_uninterruptible(HZ / 20);
torture_kthread_stopping("rcu_torture_read_exit");
return 0;
}
@@ -3851,7 +3874,9 @@ rcu_torture_init(void)
}
if (fqs_duration < 0)
fqs_duration = 0;
- if (fqs_duration) {
+ if (fqs_holdoff < 0)
+ fqs_holdoff = 0;
+ if (fqs_duration && fqs_holdoff) {
/* Create the fqs thread */
firsterr = torture_create_kthread(rcu_torture_fqs, NULL,
fqs_task);
diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c
index 91a0fd0d4d9a..2c2648a3ad30 100644
--- a/kernel/rcu/refscale.c
+++ b/kernel/rcu/refscale.c
@@ -655,12 +655,12 @@ retry:
goto retry;
}
un_delay(udl, ndl);
+ b = READ_ONCE(rtsp->a);
// Remember, seqlock read-side release can fail.
if (!rts_release(rtsp, start)) {
rcu_read_unlock();
goto retry;
}
- b = READ_ONCE(rtsp->a);
WARN_ONCE(a != b, "Re-read of ->a changed from %u to %u.\n", a, b);
b = rtsp->b;
rcu_read_unlock();
@@ -1025,8 +1025,8 @@ static void
ref_scale_print_module_parms(struct ref_scale_ops *cur_ops, const char *tag)
{
pr_alert("%s" SCALE_FLAG
- "--- %s: verbose=%d shutdown=%d holdoff=%d loops=%ld nreaders=%d nruns=%d readdelay=%d\n", scale_type, tag,
- verbose, shutdown, holdoff, loops, nreaders, nruns, readdelay);
+ "--- %s: verbose=%d verbose_batched=%d shutdown=%d holdoff=%d lookup_instances=%ld loops=%ld nreaders=%d nruns=%d readdelay=%d\n", scale_type, tag,
+ verbose, verbose_batched, shutdown, holdoff, lookup_instances, loops, nreaders, nruns, readdelay);
}
static void
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
index 336af24e0fe3..c38e5933a5d6 100644
--- a/kernel/rcu/srcutiny.c
+++ b/kernel/rcu/srcutiny.c
@@ -138,6 +138,7 @@ void srcu_drive_gp(struct work_struct *wp)
while (lh) {
rhp = lh;
lh = lh->next;
+ debug_rcu_head_callback(rhp);
local_bh_disable();
rhp->func(rhp);
local_bh_enable();
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 20d7a238d675..0351a4e83529 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -223,7 +223,7 @@ static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags)
snp->grplo = cpu;
snp->grphi = cpu;
}
- sdp->grpmask = 1 << (cpu - sdp->mynode->grplo);
+ sdp->grpmask = 1UL << (cpu - sdp->mynode->grplo);
}
smp_store_release(&ssp->srcu_sup->srcu_size_state, SRCU_SIZE_WAIT_BARRIER);
return true;
@@ -255,29 +255,31 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static)
ssp->srcu_sup->sda_is_static = is_static;
if (!is_static)
ssp->sda = alloc_percpu(struct srcu_data);
- if (!ssp->sda) {
- if (!is_static)
- kfree(ssp->srcu_sup);
- return -ENOMEM;
- }
+ if (!ssp->sda)
+ goto err_free_sup;
init_srcu_struct_data(ssp);
ssp->srcu_sup->srcu_gp_seq_needed_exp = 0;
ssp->srcu_sup->srcu_last_gp_end = ktime_get_mono_fast_ns();
if (READ_ONCE(ssp->srcu_sup->srcu_size_state) == SRCU_SIZE_SMALL && SRCU_SIZING_IS_INIT()) {
- if (!init_srcu_struct_nodes(ssp, GFP_ATOMIC)) {
- if (!ssp->srcu_sup->sda_is_static) {
- free_percpu(ssp->sda);
- ssp->sda = NULL;
- kfree(ssp->srcu_sup);
- return -ENOMEM;
- }
- } else {
- WRITE_ONCE(ssp->srcu_sup->srcu_size_state, SRCU_SIZE_BIG);
- }
+ if (!init_srcu_struct_nodes(ssp, GFP_ATOMIC))
+ goto err_free_sda;
+ WRITE_ONCE(ssp->srcu_sup->srcu_size_state, SRCU_SIZE_BIG);
}
ssp->srcu_sup->srcu_ssp = ssp;
smp_store_release(&ssp->srcu_sup->srcu_gp_seq_needed, 0); /* Init done. */
return 0;
+
+err_free_sda:
+ if (!is_static) {
+ free_percpu(ssp->sda);
+ ssp->sda = NULL;
+ }
+err_free_sup:
+ if (!is_static) {
+ kfree(ssp->srcu_sup);
+ ssp->srcu_sup = NULL;
+ }
+ return -ENOMEM;
}
#ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -770,21 +772,10 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock_nmisafe);
*/
static void srcu_gp_start(struct srcu_struct *ssp)
{
- struct srcu_data *sdp;
int state;
- if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
- sdp = per_cpu_ptr(ssp->sda, get_boot_cpu_id());
- else
- sdp = this_cpu_ptr(ssp->sda);
lockdep_assert_held(&ACCESS_PRIVATE(ssp->srcu_sup, lock));
WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_sup->srcu_gp_seq, ssp->srcu_sup->srcu_gp_seq_needed));
- spin_lock_rcu_node(sdp); /* Interrupts already disabled. */
- rcu_segcblist_advance(&sdp->srcu_cblist,
- rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq));
- (void)rcu_segcblist_accelerate(&sdp->srcu_cblist,
- rcu_seq_snap(&ssp->srcu_sup->srcu_gp_seq));
- spin_unlock_rcu_node(sdp); /* Interrupts remain disabled. */
WRITE_ONCE(ssp->srcu_sup->srcu_gp_start, jiffies);
WRITE_ONCE(ssp->srcu_sup->srcu_n_exp_nodelay, 0);
smp_mb(); /* Order prior store to ->srcu_gp_seq_needed vs. GP start. */
@@ -833,7 +824,7 @@ static void srcu_schedule_cbs_snp(struct srcu_struct *ssp, struct srcu_node *snp
int cpu;
for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) {
- if (!(mask & (1 << (cpu - snp->grplo))))
+ if (!(mask & (1UL << (cpu - snp->grplo))))
continue;
srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, cpu), delay);
}
@@ -1242,10 +1233,39 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp,
spin_lock_irqsave_sdp_contention(sdp, &flags);
if (rhp)
rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp);
- rcu_segcblist_advance(&sdp->srcu_cblist,
- rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq));
+ /*
+ * The snapshot for acceleration must be taken _before_ the read of the
+ * current gp sequence used for advancing, otherwise advancing may fail
+ * and acceleration may then fail too.
+ *
+ * This could happen if:
+ *
+ * 1) The RCU_WAIT_TAIL segment has callbacks (gp_num = X + 4) and the
+ * RCU_NEXT_READY_TAIL also has callbacks (gp_num = X + 8).
+ *
+ * 2) The grace period for RCU_WAIT_TAIL is seen as started but not
+ * completed so rcu_seq_current() returns X + SRCU_STATE_SCAN1.
+ *
+ * 3) This value is passed to rcu_segcblist_advance() which can't move
+ * any segment forward and fails.
+ *
+ * 4) srcu_gp_start_if_needed() still proceeds with callback acceleration.
+ * But then the call to rcu_seq_snap() observes the grace period for the
+ * RCU_WAIT_TAIL segment as completed and the subsequent one for the
+ * RCU_NEXT_READY_TAIL segment as started (i.e. X + 4 + SRCU_STATE_SCAN1)
+ * so it returns a snapshot of the next grace period, which is X + 12.
+ *
+ * 5) The value of X + 12 is passed to rcu_segcblist_accelerate() but the
+ * freshly enqueued callback in RCU_NEXT_TAIL can't move to
+ * RCU_NEXT_READY_TAIL which already has callbacks for a previous grace
+ * period (gp_num = X + 8). So acceleration fails.
+ */
s = rcu_seq_snap(&ssp->srcu_sup->srcu_gp_seq);
- (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, s);
+ if (rhp) {
+ rcu_segcblist_advance(&sdp->srcu_cblist,
+ rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq));
+ WARN_ON_ONCE(!rcu_segcblist_accelerate(&sdp->srcu_cblist, s));
+ }
if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) {
sdp->srcu_gp_seq_needed = s;
needgp = true;
@@ -1692,8 +1712,14 @@ static void srcu_invoke_callbacks(struct work_struct *work)
ssp = sdp->ssp;
rcu_cblist_init(&ready_cbs);
spin_lock_irq_rcu_node(sdp);
+ WARN_ON_ONCE(!rcu_segcblist_segempty(&sdp->srcu_cblist, RCU_NEXT_TAIL));
rcu_segcblist_advance(&sdp->srcu_cblist,
rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq));
+ /*
+ * Although this function is theoretically re-entrant, concurrent
+ * callback invocation is disallowed to avoid executing an SRCU barrier
+ * too early.
+ */
if (sdp->srcu_cblist_invoking ||
!rcu_segcblist_ready_cbs(&sdp->srcu_cblist)) {
spin_unlock_irq_rcu_node(sdp);
@@ -1708,6 +1734,7 @@ static void srcu_invoke_callbacks(struct work_struct *work)
rhp = rcu_cblist_dequeue(&ready_cbs);
for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) {
debug_rcu_head_unqueue(rhp);
+ debug_rcu_head_callback(rhp);
local_bh_disable();
rhp->func(rhp);
local_bh_enable();
@@ -1720,11 +1747,10 @@ static void srcu_invoke_callbacks(struct work_struct *work)
*/
spin_lock_irq_rcu_node(sdp);
rcu_segcblist_add_len(&sdp->srcu_cblist, -len);
- (void)rcu_segcblist_accelerate(&sdp->srcu_cblist,
- rcu_seq_snap(&ssp->srcu_sup->srcu_gp_seq));
sdp->srcu_cblist_invoking = false;
more = rcu_segcblist_ready_cbs(&sdp->srcu_cblist);
spin_unlock_irq_rcu_node(sdp);
+ /* An SRCU barrier or callbacks from a previous nesting work are pending. */
if (more)
srcu_schedule_cbs_sdp(sdp, 0);
}
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 8d65f7d576a3..732ad5b39946 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -432,6 +432,7 @@ static void rcu_barrier_tasks_generic(struct rcu_tasks *rtp)
static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp)
{
int cpu;
+ int dequeue_limit;
unsigned long flags;
bool gpdone = poll_state_synchronize_rcu(rtp->percpu_dequeue_gpseq);
long n;
@@ -439,7 +440,8 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp)
long ncbsnz = 0;
int needgpcb = 0;
- for (cpu = 0; cpu < smp_load_acquire(&rtp->percpu_dequeue_lim); cpu++) {
+ dequeue_limit = smp_load_acquire(&rtp->percpu_dequeue_lim);
+ for (cpu = 0; cpu < dequeue_limit; cpu++) {
struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu);
/* Advance and accelerate any new callbacks. */
@@ -538,6 +540,7 @@ static void rcu_tasks_invoke_cbs(struct rcu_tasks *rtp, struct rcu_tasks_percpu
raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
len = rcl.len;
for (rhp = rcu_cblist_dequeue(&rcl); rhp; rhp = rcu_cblist_dequeue(&rcl)) {
+ debug_rcu_head_callback(rhp);
local_bh_disable();
rhp->func(rhp);
local_bh_enable();
@@ -892,10 +895,36 @@ static void rcu_tasks_pregp_step(struct list_head *hop)
synchronize_rcu();
}
+/* Check for quiescent states since the pregp's synchronize_rcu() */
+static bool rcu_tasks_is_holdout(struct task_struct *t)
+{
+ int cpu;
+
+ /* Has the task been seen voluntarily sleeping? */
+ if (!READ_ONCE(t->on_rq))
+ return false;
+
+ /*
+ * Idle tasks (or idle injection) within the idle loop are RCU-tasks
+ * quiescent states. But CPU boot code performed by the idle task
+ * isn't a quiescent state.
+ */
+ if (is_idle_task(t))
+ return false;
+
+ cpu = task_cpu(t);
+
+ /* Idle tasks on offline CPUs are RCU-tasks quiescent states. */
+ if (t == idle_task(cpu) && !rcu_cpu_online(cpu))
+ return false;
+
+ return true;
+}
+
/* Per-task initial processing. */
static void rcu_tasks_pertask(struct task_struct *t, struct list_head *hop)
{
- if (t != current && READ_ONCE(t->on_rq) && !is_idle_task(t)) {
+ if (t != current && rcu_tasks_is_holdout(t)) {
get_task_struct(t);
t->rcu_tasks_nvcsw = READ_ONCE(t->nvcsw);
WRITE_ONCE(t->rcu_tasks_holdout, true);
@@ -944,9 +973,9 @@ static void check_holdout_task(struct task_struct *t,
if (!READ_ONCE(t->rcu_tasks_holdout) ||
t->rcu_tasks_nvcsw != READ_ONCE(t->nvcsw) ||
- !READ_ONCE(t->on_rq) ||
+ !rcu_tasks_is_holdout(t) ||
(IS_ENABLED(CONFIG_NO_HZ_FULL) &&
- !is_idle_task(t) && t->rcu_tasks_idle_cpu >= 0)) {
+ !is_idle_task(t) && READ_ONCE(t->rcu_tasks_idle_cpu) >= 0)) {
WRITE_ONCE(t->rcu_tasks_holdout, false);
list_del_init(&t->rcu_tasks_holdout_list);
put_task_struct(t);
@@ -964,7 +993,7 @@ static void check_holdout_task(struct task_struct *t,
t, ".I"[is_idle_task(t)],
"N."[cpu < 0 || !tick_nohz_full_cpu(cpu)],
t->rcu_tasks_nvcsw, t->nvcsw, t->rcu_tasks_holdout,
- t->rcu_tasks_idle_cpu, cpu);
+ data_race(t->rcu_tasks_idle_cpu), cpu);
sched_show_task(t);
}
@@ -1084,7 +1113,7 @@ void rcu_barrier_tasks(void)
}
EXPORT_SYMBOL_GPL(rcu_barrier_tasks);
-int rcu_tasks_lazy_ms = -1;
+static int rcu_tasks_lazy_ms = -1;
module_param(rcu_tasks_lazy_ms, int, 0444);
static int __init rcu_spawn_tasks_kthread(void)
@@ -1522,7 +1551,7 @@ static int trc_inspect_reader(struct task_struct *t, void *bhp_in)
} else {
// The task is not running, so C-language access is safe.
nesting = t->trc_reader_nesting;
- WARN_ON_ONCE(ofl && task_curr(t) && !is_idle_task(t));
+ WARN_ON_ONCE(ofl && task_curr(t) && (t != idle_task(task_cpu(t))));
if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) && ofl)
n_heavy_reader_ofl_updates++;
}
@@ -1979,20 +2008,22 @@ static void test_rcu_tasks_callback(struct rcu_head *rhp)
static void rcu_tasks_initiate_self_tests(void)
{
- pr_info("Running RCU-tasks wait API self tests\n");
#ifdef CONFIG_TASKS_RCU
+ pr_info("Running RCU Tasks wait API self tests\n");
tests[0].runstart = jiffies;
synchronize_rcu_tasks();
call_rcu_tasks(&tests[0].rh, test_rcu_tasks_callback);
#endif
#ifdef CONFIG_TASKS_RUDE_RCU
+ pr_info("Running RCU Tasks Rude wait API self tests\n");
tests[1].runstart = jiffies;
synchronize_rcu_tasks_rude();
call_rcu_tasks_rude(&tests[1].rh, test_rcu_tasks_callback);
#endif
#ifdef CONFIG_TASKS_TRACE_RCU
+ pr_info("Running RCU Tasks Trace wait API self tests\n");
tests[2].runstart = jiffies;
synchronize_rcu_tasks_trace();
call_rcu_tasks_trace(&tests[2].rh, test_rcu_tasks_callback);
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index 42f7589e51e0..fec804b79080 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -97,6 +97,7 @@ static inline bool rcu_reclaim_tiny(struct rcu_head *head)
trace_rcu_invoke_callback("", head);
f = head->func;
+ debug_rcu_head_callback(head);
WRITE_ONCE(head->func, (rcu_callback_t)0L);
f(head);
rcu_lock_release(&rcu_callback_map);
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index cb1caefa8bd0..b2bccfd37c38 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -31,6 +31,7 @@
#include <linux/bitops.h>
#include <linux/export.h>
#include <linux/completion.h>
+#include <linux/kmemleak.h>
#include <linux/moduleparam.h>
#include <linux/panic.h>
#include <linux/panic_notifier.h>
@@ -754,14 +755,19 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp)
}
/*
- * Return true if the specified CPU has passed through a quiescent
- * state by virtue of being in or having passed through an dynticks
- * idle state since the last call to dyntick_save_progress_counter()
- * for this same CPU, or by virtue of having been offline.
+ * Returns positive if the specified CPU has passed through a quiescent state
+ * by virtue of being in or having passed through a dynticks idle state since
+ * the last call to dyntick_save_progress_counter() for this same CPU, or by
+ * virtue of having been offline.
+ *
+ * Returns negative if the specified CPU needs a force resched.
+ *
+ * Returns zero otherwise.
*/
static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
{
unsigned long jtsq;
+ int ret = 0;
struct rcu_node *rnp = rdp->mynode;
/*
@@ -847,8 +853,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
(time_after(jiffies, READ_ONCE(rdp->last_fqs_resched) + jtsq * 3) ||
rcu_state.cbovld)) {
WRITE_ONCE(rdp->rcu_urgent_qs, true);
- resched_cpu(rdp->cpu);
WRITE_ONCE(rdp->last_fqs_resched, jiffies);
+ ret = -1;
}
/*
@@ -861,8 +867,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
if (time_after(jiffies, rcu_state.jiffies_resched)) {
if (time_after(jiffies,
READ_ONCE(rdp->last_fqs_resched) + jtsq)) {
- resched_cpu(rdp->cpu);
WRITE_ONCE(rdp->last_fqs_resched, jiffies);
+ ret = -1;
}
if (IS_ENABLED(CONFIG_IRQ_WORK) &&
!rdp->rcu_iw_pending && rdp->rcu_iw_gp_seq != rnp->gp_seq &&
@@ -891,7 +897,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
}
}
- return 0;
+ return ret;
}
/* Trace-event wrapper function for trace_rcu_future_grace_period. */
@@ -1007,6 +1013,38 @@ static bool rcu_future_gp_cleanup(struct rcu_node *rnp)
return needmore;
}
+static void swake_up_one_online_ipi(void *arg)
+{
+ struct swait_queue_head *wqh = arg;
+
+ swake_up_one(wqh);
+}
+
+static void swake_up_one_online(struct swait_queue_head *wqh)
+{
+ int cpu = get_cpu();
+
+ /*
+ * If called from rcutree_report_cpu_dead(), wake up
+ * is dangerous that late in the CPU-down hotplug process. The
+ * scheduler might queue an ignored hrtimer. Defer the wake up
+ * to an online CPU instead.
+ */
+ if (unlikely(cpu_is_offline(cpu))) {
+ int target;
+
+ target = cpumask_any_and(housekeeping_cpumask(HK_TYPE_RCU),
+ cpu_online_mask);
+
+ smp_call_function_single(target, swake_up_one_online_ipi,
+ wqh, 0);
+ put_cpu();
+ } else {
+ put_cpu();
+ swake_up_one(wqh);
+ }
+}
+
/*
* Awaken the grace-period kthread. Don't do a self-awaken (unless in an
* interrupt or softirq handler, in which case we just might immediately
@@ -1031,7 +1069,7 @@ static void rcu_gp_kthread_wake(void)
return;
WRITE_ONCE(rcu_state.gp_wake_time, jiffies);
WRITE_ONCE(rcu_state.gp_wake_seq, READ_ONCE(rcu_state.gp_seq));
- swake_up_one(&rcu_state.gp_wq);
+ swake_up_one_online(&rcu_state.gp_wq);
}
/*
@@ -1260,7 +1298,7 @@ EXPORT_SYMBOL_GPL(rcu_gp_slow_register);
/* Unregister a counter, with NULL for not caring which. */
void rcu_gp_slow_unregister(atomic_t *rgssp)
{
- WARN_ON_ONCE(rgssp && rgssp != rcu_gp_slow_suppress);
+ WARN_ON_ONCE(rgssp && rgssp != rcu_gp_slow_suppress && rcu_gp_slow_suppress != NULL);
WRITE_ONCE(rcu_gp_slow_suppress, NULL);
}
@@ -1556,10 +1594,22 @@ static bool rcu_gp_fqs_check_wake(int *gfp)
*/
static void rcu_gp_fqs(bool first_time)
{
+ int nr_fqs = READ_ONCE(rcu_state.nr_fqs_jiffies_stall);
struct rcu_node *rnp = rcu_get_root();
WRITE_ONCE(rcu_state.gp_activity, jiffies);
WRITE_ONCE(rcu_state.n_force_qs, rcu_state.n_force_qs + 1);
+
+ WARN_ON_ONCE(nr_fqs > 3);
+ /* Only countdown nr_fqs for stall purposes if jiffies moves. */
+ if (nr_fqs) {
+ if (nr_fqs == 1) {
+ WRITE_ONCE(rcu_state.jiffies_stall,
+ jiffies + rcu_jiffies_till_stall_check());
+ }
+ WRITE_ONCE(rcu_state.nr_fqs_jiffies_stall, --nr_fqs);
+ }
+
if (first_time) {
/* Collect dyntick-idle snapshots. */
force_qs_rnp(dyntick_save_progress_counter);
@@ -2135,6 +2185,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
trace_rcu_invoke_callback(rcu_state.name, rhp);
f = rhp->func;
+ debug_rcu_head_callback(rhp);
WRITE_ONCE(rhp->func, (rcu_callback_t)0L);
f(rhp);
@@ -2257,15 +2308,15 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp))
{
int cpu;
unsigned long flags;
- unsigned long mask;
- struct rcu_data *rdp;
struct rcu_node *rnp;
rcu_state.cbovld = rcu_state.cbovldnext;
rcu_state.cbovldnext = false;
rcu_for_each_leaf_node(rnp) {
+ unsigned long mask = 0;
+ unsigned long rsmask = 0;
+
cond_resched_tasks_rcu_qs();
- mask = 0;
raw_spin_lock_irqsave_rcu_node(rnp, flags);
rcu_state.cbovldnext |= !!rnp->cbovldmask;
if (rnp->qsmask == 0) {
@@ -2283,11 +2334,17 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp))
continue;
}
for_each_leaf_node_cpu_mask(rnp, cpu, rnp->qsmask) {
+ struct rcu_data *rdp;
+ int ret;
+
rdp = per_cpu_ptr(&rcu_data, cpu);
- if (f(rdp)) {
+ ret = f(rdp);
+ if (ret > 0) {
mask |= rdp->grpmask;
rcu_disable_urgency_upon_qs(rdp);
}
+ if (ret < 0)
+ rsmask |= rdp->grpmask;
}
if (mask != 0) {
/* Idle/offline CPUs, report (releases rnp->lock). */
@@ -2296,6 +2353,9 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp))
/* Nothing to do here, so just drop the lock. */
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
+
+ for_each_leaf_node_cpu_mask(rnp, cpu, rsmask)
+ resched_cpu(cpu);
}
}
@@ -2310,6 +2370,8 @@ void rcu_force_quiescent_state(void)
struct rcu_node *rnp;
struct rcu_node *rnp_old = NULL;
+ if (!rcu_gp_in_progress())
+ return;
/* Funnel through hierarchy to reduce memory contention. */
rnp = raw_cpu_read(rcu_data.mynode);
for (; rnp != NULL; rnp = rnp->parent) {
@@ -2713,7 +2775,7 @@ __call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy_in)
*/
void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func)
{
- return __call_rcu_common(head, func, false);
+ __call_rcu_common(head, func, false);
}
EXPORT_SYMBOL_GPL(call_rcu_hurry);
#endif
@@ -2764,7 +2826,7 @@ EXPORT_SYMBOL_GPL(call_rcu_hurry);
*/
void call_rcu(struct rcu_head *head, rcu_callback_t func)
{
- return __call_rcu_common(head, func, IS_ENABLED(CONFIG_RCU_LAZY));
+ __call_rcu_common(head, func, IS_ENABLED(CONFIG_RCU_LAZY));
}
EXPORT_SYMBOL_GPL(call_rcu);
@@ -3388,6 +3450,14 @@ void kvfree_call_rcu(struct rcu_head *head, void *ptr)
success = true;
}
+ /*
+ * The kvfree_rcu() caller considers the pointer freed at this point
+ * and likely removes any references to it. Since the actual slab
+ * freeing (and kmemleak_free()) is deferred, tell kmemleak to ignore
+ * this object (no scanning or false positives reporting).
+ */
+ kmemleak_ignore(ptr);
+
// Set timer to drain after KFREE_DRAIN_JIFFIES.
if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING)
schedule_delayed_monitor_work(krcp);
@@ -3449,13 +3519,6 @@ kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
return freed == 0 ? SHRINK_STOP : freed;
}
-static struct shrinker kfree_rcu_shrinker = {
- .count_objects = kfree_rcu_shrink_count,
- .scan_objects = kfree_rcu_shrink_scan,
- .batch = 0,
- .seeks = DEFAULT_SEEKS,
-};
-
void __init kfree_rcu_scheduler_running(void)
{
int cpu;
@@ -4083,6 +4146,82 @@ retry:
}
EXPORT_SYMBOL_GPL(rcu_barrier);
+static unsigned long rcu_barrier_last_throttle;
+
+/**
+ * rcu_barrier_throttled - Do rcu_barrier(), but limit to one per HZ/16 jiffies
+ *
+ * This can be thought of as guard rails around rcu_barrier() that
+ * permit unrestricted userspace use, at least assuming the hardware's
+ * try_cmpxchg() is robust. There will be at most one call per HZ/16 jiffies
+ * (1/16 second) to rcu_barrier() system-wide from use of this function, which
+ * means that callers might needlessly wait for a few such intervals.
+ *
+ * This is intended for use by test suites to avoid OOM by flushing RCU
+ * callbacks from the previous test before starting the next. See the
+ * rcutree.do_rcu_barrier module parameter for more information.
+ *
+ * Why not simply make rcu_barrier() more scalable? That might be
+ * the eventual endpoint, but let's keep it simple for the time being.
+ * Note that the module parameter infrastructure serializes calls to a
+ * given .set() function, but should concurrent .set() invocation ever be
+ * possible, we are ready!
+ */
+static void rcu_barrier_throttled(void)
+{
+ unsigned long j = jiffies;
+ unsigned long old = READ_ONCE(rcu_barrier_last_throttle);
+ unsigned long s = rcu_seq_snap(&rcu_state.barrier_sequence);
+
+ while (time_in_range(j, old, old + HZ / 16) ||
+ !try_cmpxchg(&rcu_barrier_last_throttle, &old, j)) {
+ schedule_timeout_idle(HZ / 16);
+ if (rcu_seq_done(&rcu_state.barrier_sequence, s)) {
+ smp_mb(); /* caller's subsequent code after above check. */
+ return;
+ }
+ j = jiffies;
+ old = READ_ONCE(rcu_barrier_last_throttle);
+ }
+ rcu_barrier();
+}
+
+/*
+ * Invoke rcu_barrier_throttled() when a rcutree.do_rcu_barrier
+ * request arrives. We insist on a true value to allow for possible
+ * future expansion.
+ */
+static int param_set_do_rcu_barrier(const char *val, const struct kernel_param *kp)
+{
+ bool b;
+ int ret;
+
+ if (rcu_scheduler_active != RCU_SCHEDULER_RUNNING)
+ return -EAGAIN;
+ ret = kstrtobool(val, &b);
+ if (!ret && b) {
+ atomic_inc((atomic_t *)kp->arg);
+ rcu_barrier_throttled();
+ atomic_dec((atomic_t *)kp->arg);
+ }
+ return ret;
+}
+
+/*
+ * Output the number of outstanding rcutree.do_rcu_barrier requests.
+ */
+static int param_get_do_rcu_barrier(char *buffer, const struct kernel_param *kp)
+{
+ return sprintf(buffer, "%d\n", atomic_read((atomic_t *)kp->arg));
+}
+
+static const struct kernel_param_ops do_rcu_barrier_ops = {
+ .set = param_set_do_rcu_barrier,
+ .get = param_get_do_rcu_barrier,
+};
+static atomic_t do_rcu_barrier;
+module_param_cb(do_rcu_barrier, &do_rcu_barrier_ops, &do_rcu_barrier, 0644);
+
/*
* Compute the mask of online CPUs for the specified rcu_node structure.
* This will not be stable unless the rcu_node structure's ->lock is
@@ -4104,6 +4243,13 @@ static bool rcu_rdp_cpu_online(struct rcu_data *rdp)
return !!(rdp->grpmask & rcu_rnp_online_cpus(rdp->mynode));
}
+bool rcu_cpu_online(int cpu)
+{
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+
+ return rcu_rdp_cpu_online(rdp);
+}
+
#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU)
/*
@@ -4130,7 +4276,7 @@ bool rcu_lockdep_current_cpu_online(void)
rdp = this_cpu_ptr(&rcu_data);
/*
* Strictly, we care here about the case where the current CPU is
- * in rcu_cpu_starting() and thus has an excuse for rdp->grpmask
+ * in rcutree_report_cpu_starting() and thus has an excuse for rdp->grpmask
* not being up to date. So arch_spin_is_locked() might have a
* false positive if it's held by some *other* CPU, but that's
* OK because that just means a false *negative* on the warning.
@@ -4152,25 +4298,6 @@ static bool rcu_init_invoked(void)
}
/*
- * Near the end of the offline process. Trace the fact that this CPU
- * is going offline.
- */
-int rcutree_dying_cpu(unsigned int cpu)
-{
- bool blkd;
- struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
- struct rcu_node *rnp = rdp->mynode;
-
- if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
- return 0;
-
- blkd = !!(READ_ONCE(rnp->qsmask) & rdp->grpmask);
- trace_rcu_grace_period(rcu_state.name, READ_ONCE(rnp->gp_seq),
- blkd ? TPS("cpuofl-bgp") : TPS("cpuofl"));
- return 0;
-}
-
-/*
* All CPUs for the specified rcu_node structure have gone offline,
* and all tasks that were preempted within an RCU read-side critical
* section while running on one of those CPUs have since exited their RCU
@@ -4216,23 +4343,6 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
}
/*
- * The CPU has been completely removed, and some other CPU is reporting
- * this fact from process context. Do the remainder of the cleanup.
- * There can only be one CPU hotplug operation at a time, so no need for
- * explicit locking.
- */
-int rcutree_dead_cpu(unsigned int cpu)
-{
- if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
- return 0;
-
- WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus - 1);
- // Stop-machine done, so allow nohz_full to disable tick.
- tick_dep_clear(TICK_DEP_BIT_RCU);
- return 0;
-}
-
-/*
* Propagate ->qsinitmask bits up the rcu_node tree to account for the
* first CPU in a given leaf rcu_node structure coming online. The caller
* must hold the corresponding leaf rcu_node ->lock with interrupts
@@ -4385,29 +4495,6 @@ int rcutree_online_cpu(unsigned int cpu)
}
/*
- * Near the beginning of the process. The CPU is still very much alive
- * with pretty much all services enabled.
- */
-int rcutree_offline_cpu(unsigned int cpu)
-{
- unsigned long flags;
- struct rcu_data *rdp;
- struct rcu_node *rnp;
-
- rdp = per_cpu_ptr(&rcu_data, cpu);
- rnp = rdp->mynode;
- raw_spin_lock_irqsave_rcu_node(rnp, flags);
- rnp->ffmask &= ~rdp->grpmask;
- raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
-
- rcutree_affinity_setting(cpu, cpu);
-
- // nohz_full CPUs need the tick for stop-machine to work quickly
- tick_dep_set(TICK_DEP_BIT_RCU);
- return 0;
-}
-
-/*
* Mark the specified CPU as being online so that subsequent grace periods
* (both expedited and normal) will wait on it. Note that this means that
* incoming CPUs are not allowed to use RCU read-side critical sections
@@ -4418,8 +4505,10 @@ int rcutree_offline_cpu(unsigned int cpu)
* from the incoming CPU rather than from the cpuhp_step mechanism.
* This is because this function must be invoked at a precise location.
* This incoming CPU must not have enabled interrupts yet.
+ *
+ * This mirrors the effects of rcutree_report_cpu_dead().
*/
-void rcu_cpu_starting(unsigned int cpu)
+void rcutree_report_cpu_starting(unsigned int cpu)
{
unsigned long mask;
struct rcu_data *rdp;
@@ -4473,14 +4562,21 @@ void rcu_cpu_starting(unsigned int cpu)
* Note that this function is special in that it is invoked directly
* from the outgoing CPU rather than from the cpuhp_step mechanism.
* This is because this function must be invoked at a precise location.
+ *
+ * This mirrors the effect of rcutree_report_cpu_starting().
*/
-void rcu_report_dead(unsigned int cpu)
+void rcutree_report_cpu_dead(void)
{
- unsigned long flags, seq_flags;
+ unsigned long flags;
unsigned long mask;
- struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+ struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */
+ /*
+ * IRQS must be disabled from now on and until the CPU dies, or an interrupt
+ * may introduce a new READ-side while it is actually off the QS masks.
+ */
+ lockdep_assert_irqs_disabled();
// Do any dangling deferred wakeups.
do_nocb_deferred_wakeup(rdp);
@@ -4488,7 +4584,6 @@ void rcu_report_dead(unsigned int cpu)
/* Remove outgoing CPU from mask in the leaf rcu_node structure. */
mask = rdp->grpmask;
- local_irq_save(seq_flags);
arch_spin_lock(&rcu_state.ofl_lock);
raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */
rdp->rcu_ofl_gp_seq = READ_ONCE(rcu_state.gp_seq);
@@ -4502,8 +4597,6 @@ void rcu_report_dead(unsigned int cpu)
WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext & ~mask);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
arch_spin_unlock(&rcu_state.ofl_lock);
- local_irq_restore(seq_flags);
-
rdp->cpu_started = false;
}
@@ -4558,7 +4651,60 @@ void rcutree_migrate_callbacks(int cpu)
cpu, rcu_segcblist_n_cbs(&rdp->cblist),
rcu_segcblist_first_cb(&rdp->cblist));
}
-#endif
+
+/*
+ * The CPU has been completely removed, and some other CPU is reporting
+ * this fact from process context. Do the remainder of the cleanup.
+ * There can only be one CPU hotplug operation at a time, so no need for
+ * explicit locking.
+ */
+int rcutree_dead_cpu(unsigned int cpu)
+{
+ WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus - 1);
+ // Stop-machine done, so allow nohz_full to disable tick.
+ tick_dep_clear(TICK_DEP_BIT_RCU);
+ return 0;
+}
+
+/*
+ * Near the end of the offline process. Trace the fact that this CPU
+ * is going offline.
+ */
+int rcutree_dying_cpu(unsigned int cpu)
+{
+ bool blkd;
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+ struct rcu_node *rnp = rdp->mynode;
+
+ blkd = !!(READ_ONCE(rnp->qsmask) & rdp->grpmask);
+ trace_rcu_grace_period(rcu_state.name, READ_ONCE(rnp->gp_seq),
+ blkd ? TPS("cpuofl-bgp") : TPS("cpuofl"));
+ return 0;
+}
+
+/*
+ * Near the beginning of the process. The CPU is still very much alive
+ * with pretty much all services enabled.
+ */
+int rcutree_offline_cpu(unsigned int cpu)
+{
+ unsigned long flags;
+ struct rcu_data *rdp;
+ struct rcu_node *rnp;
+
+ rdp = per_cpu_ptr(&rcu_data, cpu);
+ rnp = rdp->mynode;
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ rnp->ffmask &= ~rdp->grpmask;
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+
+ rcutree_affinity_setting(cpu, cpu);
+
+ // nohz_full CPUs need the tick for stop-machine to work quickly
+ tick_dep_set(TICK_DEP_BIT_RCU);
+ return 0;
+}
+#endif /* #ifdef CONFIG_HOTPLUG_CPU */
/*
* On non-huge systems, use expedited RCU grace periods to make suspend
@@ -4931,6 +5077,7 @@ static void __init kfree_rcu_batch_init(void)
{
int cpu;
int i, j;
+ struct shrinker *kfree_rcu_shrinker;
/* Clamp it to [0:100] seconds interval. */
if (rcu_delay_page_cache_fill_msec < 0 ||
@@ -4962,8 +5109,17 @@ static void __init kfree_rcu_batch_init(void)
INIT_DELAYED_WORK(&krcp->page_cache_work, fill_page_cache_func);
krcp->initialized = true;
}
- if (register_shrinker(&kfree_rcu_shrinker, "rcu-kfree"))
- pr_err("Failed to register kfree_rcu() shrinker!\n");
+
+ kfree_rcu_shrinker = shrinker_alloc(0, "rcu-kfree");
+ if (!kfree_rcu_shrinker) {
+ pr_err("Failed to allocate kfree_rcu() shrinker!\n");
+ return;
+ }
+
+ kfree_rcu_shrinker->count_objects = kfree_rcu_shrink_count;
+ kfree_rcu_shrinker->scan_objects = kfree_rcu_shrink_scan;
+
+ shrinker_register(kfree_rcu_shrinker);
}
void __init rcu_init(void)
@@ -4990,7 +5146,7 @@ void __init rcu_init(void)
pm_notifier(rcu_pm_notify, 0);
WARN_ON(num_online_cpus() > 1); // Only one CPU this early in boot.
rcutree_prepare_cpu(cpu);
- rcu_cpu_starting(cpu);
+ rcutree_report_cpu_starting(cpu);
rcutree_online_cpu(cpu);
/* Create workqueue for Tree SRCU and for expedited GPs. */
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 192536916f9a..e9821a8422db 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -386,6 +386,10 @@ struct rcu_state {
/* in jiffies. */
unsigned long jiffies_stall; /* Time at which to check */
/* for CPU stalls. */
+ int nr_fqs_jiffies_stall; /* Number of fqs loops after
+ * which read jiffies and set
+ * jiffies_stall. Stall
+ * warnings disabled if !0. */
unsigned long jiffies_resched; /* Time at which to resched */
/* a reluctant CPU. */
unsigned long n_force_qs_gpstart; /* Snapshot of n_force_qs at */
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 8239b39d945b..2ac440bc7e10 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -173,7 +173,6 @@ static bool sync_rcu_exp_done_unlocked(struct rcu_node *rnp)
return ret;
}
-
/*
* Report the exit from RCU read-side critical section for the last task
* that queued itself during or before the current expedited preemptible-RCU
@@ -201,7 +200,7 @@ static void __rcu_report_exp_rnp(struct rcu_node *rnp,
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
if (wake) {
smp_mb(); /* EGP done before wake_up(). */
- swake_up_one(&rcu_state.expedited_wq);
+ swake_up_one_online(&rcu_state.expedited_wq);
}
break;
}
@@ -621,10 +620,14 @@ static void synchronize_rcu_expedited_wait(void)
}
for (;;) {
+ unsigned long j;
+
if (synchronize_rcu_expedited_wait_once(jiffies_stall))
return;
if (rcu_stall_is_suppressed())
continue;
+ j = jiffies;
+ rcu_stall_notifier_call_chain(RCU_STALL_NOTIFY_EXP, (void *)(j - jiffies_start));
trace_rcu_stall_warning(rcu_state.name, TPS("ExpeditedStall"));
pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {",
rcu_state.name);
@@ -647,7 +650,7 @@ static void synchronize_rcu_expedited_wait(void)
}
}
pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n",
- jiffies - jiffies_start, rcu_state.expedited_sequence,
+ j - jiffies_start, rcu_state.expedited_sequence,
data_race(rnp_root->expmask),
".T"[!!data_race(rnp_root->exp_tasks)]);
if (ndetected) {
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index 5598212d1f27..4efbf7333d4e 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -1396,13 +1396,6 @@ lazy_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
return count ? count : SHRINK_STOP;
}
-
-static struct shrinker lazy_rcu_shrinker = {
- .count_objects = lazy_rcu_shrink_count,
- .scan_objects = lazy_rcu_shrink_scan,
- .batch = 0,
- .seeks = DEFAULT_SEEKS,
-};
#endif // #ifdef CONFIG_RCU_LAZY
void __init rcu_init_nohz(void)
@@ -1410,6 +1403,7 @@ void __init rcu_init_nohz(void)
int cpu;
struct rcu_data *rdp;
const struct cpumask *cpumask = NULL;
+ struct shrinker * __maybe_unused lazy_rcu_shrinker;
#if defined(CONFIG_NO_HZ_FULL)
if (tick_nohz_full_running && !cpumask_empty(tick_nohz_full_mask))
@@ -1436,8 +1430,15 @@ void __init rcu_init_nohz(void)
return;
#ifdef CONFIG_RCU_LAZY
- if (register_shrinker(&lazy_rcu_shrinker, "rcu-lazy"))
- pr_err("Failed to register lazy_rcu shrinker!\n");
+ lazy_rcu_shrinker = shrinker_alloc(0, "rcu-lazy");
+ if (!lazy_rcu_shrinker) {
+ pr_err("Failed to allocate lazy_rcu shrinker!\n");
+ } else {
+ lazy_rcu_shrinker->count_objects = lazy_rcu_shrink_count;
+ lazy_rcu_shrinker->scan_objects = lazy_rcu_shrink_scan;
+
+ shrinker_register(lazy_rcu_shrinker);
+ }
#endif // #ifdef CONFIG_RCU_LAZY
if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) {
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index 6f06dc12904a..5d666428546b 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -8,6 +8,7 @@
*/
#include <linux/kvm_para.h>
+#include <linux/rcu_notifier.h>
//////////////////////////////////////////////////////////////////////////////
//
@@ -149,12 +150,17 @@ static void panic_on_rcu_stall(void)
/**
* rcu_cpu_stall_reset - restart stall-warning timeout for current grace period
*
+ * To perform the reset request from the caller, disable stall detection until
+ * 3 fqs loops have passed. This is required to ensure a fresh jiffies is
+ * loaded. It should be safe to do from the fqs loop as enough timer
+ * interrupts and context switches should have passed.
+ *
* The caller must disable hard irqs.
*/
void rcu_cpu_stall_reset(void)
{
- WRITE_ONCE(rcu_state.jiffies_stall,
- jiffies + rcu_jiffies_till_stall_check());
+ WRITE_ONCE(rcu_state.nr_fqs_jiffies_stall, 3);
+ WRITE_ONCE(rcu_state.jiffies_stall, ULONG_MAX);
}
//////////////////////////////////////////////////////////////////////////////
@@ -170,6 +176,7 @@ static void record_gp_stall_check_time(void)
WRITE_ONCE(rcu_state.gp_start, j);
j1 = rcu_jiffies_till_stall_check();
smp_mb(); // ->gp_start before ->jiffies_stall and caller's ->gp_seq.
+ WRITE_ONCE(rcu_state.nr_fqs_jiffies_stall, 0);
WRITE_ONCE(rcu_state.jiffies_stall, j + j1);
rcu_state.jiffies_resched = j + j1 / 2;
rcu_state.n_force_qs_gpstart = READ_ONCE(rcu_state.n_force_qs);
@@ -534,16 +541,16 @@ static void rcu_check_gp_kthread_starvation(void)
data_race(READ_ONCE(rcu_state.gp_state)),
gpk ? data_race(READ_ONCE(gpk->__state)) : ~0, cpu);
if (gpk) {
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+
pr_err("\tUnless %s kthread gets sufficient CPU time, OOM is now expected behavior.\n", rcu_state.name);
pr_err("RCU grace-period kthread stack dump:\n");
sched_show_task(gpk);
- if (cpu >= 0) {
- if (cpu_is_offline(cpu)) {
- pr_err("RCU GP kthread last ran on offline CPU %d.\n", cpu);
- } else {
- pr_err("Stack dump where RCU GP kthread last ran:\n");
- dump_cpu_task(cpu);
- }
+ if (cpu_is_offline(cpu)) {
+ pr_err("RCU GP kthread last ran on offline CPU %d.\n", cpu);
+ } else if (!(data_race(READ_ONCE(rdp->mynode->qsmask)) & rdp->grpmask)) {
+ pr_err("Stack dump where RCU GP kthread last ran:\n");
+ dump_cpu_task(cpu);
}
wake_up_process(gpk);
}
@@ -711,7 +718,7 @@ static void print_cpu_stall(unsigned long gps)
static void check_cpu_stall(struct rcu_data *rdp)
{
- bool didstall = false;
+ bool self_detected;
unsigned long gs1;
unsigned long gs2;
unsigned long gps;
@@ -725,6 +732,16 @@ static void check_cpu_stall(struct rcu_data *rdp)
!rcu_gp_in_progress())
return;
rcu_stall_kick_kthreads();
+
+ /*
+ * Check if it was requested (via rcu_cpu_stall_reset()) that the FQS
+ * loop has to set jiffies to ensure a non-stale jiffies value. This
+ * is required to have good jiffies value after coming out of long
+ * breaks of jiffies updates. Not doing so can cause false positives.
+ */
+ if (READ_ONCE(rcu_state.nr_fqs_jiffies_stall) > 0)
+ return;
+
j = jiffies;
/*
@@ -758,10 +775,10 @@ static void check_cpu_stall(struct rcu_data *rdp)
return; /* No stall or GP completed since entering function. */
rnp = rdp->mynode;
jn = jiffies + ULONG_MAX / 2;
+ self_detected = READ_ONCE(rnp->qsmask) & rdp->grpmask;
if (rcu_gp_in_progress() &&
- (READ_ONCE(rnp->qsmask) & rdp->grpmask) &&
+ (self_detected || ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) &&
cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) {
-
/*
* If a virtual machine is stopped by the host it can look to
* the watchdog like an RCU stall. Check to see if the host
@@ -770,39 +787,28 @@ static void check_cpu_stall(struct rcu_data *rdp)
if (kvm_check_and_clear_guest_paused())
return;
- /* We haven't checked in, so go dump stack. */
- print_cpu_stall(gps);
- if (READ_ONCE(rcu_cpu_stall_ftrace_dump))
- rcu_ftrace_dump(DUMP_ALL);
- didstall = true;
-
- } else if (rcu_gp_in_progress() &&
- ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY) &&
- cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) {
-
- /*
- * If a virtual machine is stopped by the host it can look to
- * the watchdog like an RCU stall. Check to see if the host
- * stopped the vm.
- */
- if (kvm_check_and_clear_guest_paused())
- return;
+ rcu_stall_notifier_call_chain(RCU_STALL_NOTIFY_NORM, (void *)j - gps);
+ if (self_detected) {
+ /* We haven't checked in, so go dump stack. */
+ print_cpu_stall(gps);
+ } else {
+ /* They had a few time units to dump stack, so complain. */
+ print_other_cpu_stall(gs2, gps);
+ }
- /* They had a few time units to dump stack, so complain. */
- print_other_cpu_stall(gs2, gps);
if (READ_ONCE(rcu_cpu_stall_ftrace_dump))
rcu_ftrace_dump(DUMP_ALL);
- didstall = true;
- }
- if (didstall && READ_ONCE(rcu_state.jiffies_stall) == jn) {
- jn = jiffies + 3 * rcu_jiffies_till_stall_check() + 3;
- WRITE_ONCE(rcu_state.jiffies_stall, jn);
+
+ if (READ_ONCE(rcu_state.jiffies_stall) == jn) {
+ jn = jiffies + 3 * rcu_jiffies_till_stall_check() + 3;
+ WRITE_ONCE(rcu_state.jiffies_stall, jn);
+ }
}
}
//////////////////////////////////////////////////////////////////////////////
//
-// RCU forward-progress mechanisms, including of callback invocation.
+// RCU forward-progress mechanisms, including for callback invocation.
/*
@@ -1054,3 +1060,67 @@ static int __init rcu_sysrq_init(void)
return 0;
}
early_initcall(rcu_sysrq_init);
+
+#ifdef CONFIG_RCU_CPU_STALL_NOTIFIER
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// RCU CPU stall-warning notifiers
+
+static ATOMIC_NOTIFIER_HEAD(rcu_cpu_stall_notifier_list);
+
+/**
+ * rcu_stall_chain_notifier_register - Add an RCU CPU stall notifier
+ * @n: Entry to add.
+ *
+ * Adds an RCU CPU stall notifier to an atomic notifier chain.
+ * The @action passed to a notifier will be @RCU_STALL_NOTIFY_NORM or
+ * friends. The @data will be the duration of the stalled grace period,
+ * in jiffies, coerced to a void* pointer.
+ *
+ * Returns 0 on success, %-EEXIST on error.
+ */
+int rcu_stall_chain_notifier_register(struct notifier_block *n)
+{
+ int rcsn = rcu_cpu_stall_notifiers;
+
+ WARN(1, "Adding %pS() to RCU stall notifier list (%s).\n", n->notifier_call,
+ rcsn ? "possibly suppressing RCU CPU stall warnings" : "failed, so all is well");
+ if (rcsn)
+ return atomic_notifier_chain_register(&rcu_cpu_stall_notifier_list, n);
+ return -EEXIST;
+}
+EXPORT_SYMBOL_GPL(rcu_stall_chain_notifier_register);
+
+/**
+ * rcu_stall_chain_notifier_unregister - Remove an RCU CPU stall notifier
+ * @n: Entry to remove.
+ *
+ * Removes an RCU CPU stall notifier from an atomic notifier chain.
+ *
+ * Returns zero on success, %-ENOENT on failure.
+ */
+int rcu_stall_chain_notifier_unregister(struct notifier_block *n)
+{
+ return atomic_notifier_chain_unregister(&rcu_cpu_stall_notifier_list, n);
+}
+EXPORT_SYMBOL_GPL(rcu_stall_chain_notifier_unregister);
+
+/*
+ * rcu_stall_notifier_call_chain - Call functions in an RCU CPU stall notifier chain
+ * @val: Value passed unmodified to notifier function
+ * @v: Pointer passed unmodified to notifier function
+ *
+ * Calls each function in the RCU CPU stall notifier chain in turn, which
+ * is an atomic call chain. See atomic_notifier_call_chain() for more
+ * information.
+ *
+ * This is for use within RCU, hence the omission of the extra asterisk
+ * to indicate a non-kerneldoc format header comment.
+ */
+int rcu_stall_notifier_call_chain(unsigned long val, void *v)
+{
+ return atomic_notifier_call_chain(&rcu_cpu_stall_notifier_list, val, v);
+}
+
+#endif // #ifdef CONFIG_RCU_CPU_STALL_NOTIFIER
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 19bf6fa3ee6a..46aaaa9fe339 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -25,6 +25,7 @@
#include <linux/interrupt.h>
#include <linux/sched/signal.h>
#include <linux/sched/debug.h>
+#include <linux/torture.h>
#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/percpu.h>
@@ -524,22 +525,28 @@ EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read);
do { } while (0)
#endif
-#if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST)
+#if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST) || IS_ENABLED(CONFIG_LOCK_TORTURE_TEST) || IS_MODULE(CONFIG_LOCK_TORTURE_TEST)
/* Get rcutorture access to sched_setaffinity(). */
-long rcutorture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
+long torture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
{
int ret;
ret = sched_setaffinity(pid, in_mask);
- WARN_ONCE(ret, "%s: sched_setaffinity() returned %d\n", __func__, ret);
+ WARN_ONCE(ret, "%s: sched_setaffinity(%d) returned %d\n", __func__, pid, ret);
return ret;
}
-EXPORT_SYMBOL_GPL(rcutorture_sched_setaffinity);
+EXPORT_SYMBOL_GPL(torture_sched_setaffinity);
#endif
+int rcu_cpu_stall_notifiers __read_mostly; // !0 = provide stall notifiers (rarely useful)
+EXPORT_SYMBOL_GPL(rcu_cpu_stall_notifiers);
+
#ifdef CONFIG_RCU_STALL_COMMON
int rcu_cpu_stall_ftrace_dump __read_mostly;
module_param(rcu_cpu_stall_ftrace_dump, int, 0644);
+#ifdef CONFIG_RCU_CPU_STALL_NOTIFIER
+module_param(rcu_cpu_stall_notifiers, int, 0444);
+#endif // #ifdef CONFIG_RCU_CPU_STALL_NOTIFIER
int rcu_cpu_stall_suppress __read_mostly; // !0 = suppress stall warnings.
EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress);
module_param(rcu_cpu_stall_suppress, int, 0644);
diff --git a/kernel/reboot.c b/kernel/reboot.c
index 3bba88c7ffc6..22c16e2564cc 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -55,9 +55,18 @@ struct sys_off_handler {
enum sys_off_mode mode;
bool blocking;
void *list;
+ struct device *dev;
};
/*
+ * This variable is used to indicate if a halt was initiated instead of a
+ * reboot when the reboot call was invoked with LINUX_REBOOT_CMD_POWER_OFF, but
+ * the system cannot be powered off. This allows kernel_halt() to notify users
+ * of that.
+ */
+static bool poweroff_fallback_to_halt;
+
+/*
* Temporary stub that prevents linkage failure while we're in process
* of removing all uses of legacy pm_power_off() around the kernel.
*/
@@ -74,6 +83,7 @@ void __weak (*pm_power_off)(void);
void emergency_restart(void)
{
kmsg_dump(KMSG_DUMP_EMERG);
+ system_state = SYSTEM_RESTART;
machine_emergency_restart();
}
EXPORT_SYMBOL_GPL(emergency_restart);
@@ -295,7 +305,10 @@ void kernel_halt(void)
kernel_shutdown_prepare(SYSTEM_HALT);
migrate_to_reboot_cpu();
syscore_shutdown();
- pr_emerg("System halted\n");
+ if (poweroff_fallback_to_halt)
+ pr_emerg("Power off not available: System halted instead\n");
+ else
+ pr_emerg("System halted\n");
kmsg_dump(KMSG_DUMP_SHUTDOWN);
machine_halt();
}
@@ -323,6 +336,7 @@ static int sys_off_notify(struct notifier_block *nb,
data.cb_data = handler->cb_data;
data.mode = mode;
data.cmd = cmd;
+ data.dev = handler->dev;
return handler->sys_off_cb(&data);
}
@@ -510,6 +524,7 @@ int devm_register_sys_off_handler(struct device *dev,
handler = register_sys_off_handler(mode, priority, callback, cb_data);
if (IS_ERR(handler))
return PTR_ERR(handler);
+ handler->dev = dev;
return devm_add_action_or_reset(dev, devm_unregister_sys_off_handler,
handler);
@@ -728,8 +743,10 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
/* Instead of trying to make the power_off code look like
* halt when pm_power_off is not set do it the easy way.
*/
- if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !kernel_can_power_off())
+ if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !kernel_can_power_off()) {
+ poweroff_fallback_to_halt = true;
cmd = LINUX_REBOOT_CMD_HALT;
+ }
mutex_lock(&system_transition_mutex);
switch (cmd) {
@@ -953,21 +970,24 @@ static void hw_failure_emergency_poweroff(int poweroff_delay_ms)
}
/**
- * hw_protection_shutdown - Trigger an emergency system poweroff
+ * __hw_protection_shutdown - Trigger an emergency system shutdown or reboot
*
- * @reason: Reason of emergency shutdown to be printed.
- * @ms_until_forced: Time to wait for orderly shutdown before tiggering a
- * forced shudown. Negative value disables the forced
- * shutdown.
+ * @reason: Reason of emergency shutdown or reboot to be printed.
+ * @ms_until_forced: Time to wait for orderly shutdown or reboot before
+ * triggering it. Negative value disables the forced
+ * shutdown or reboot.
+ * @shutdown: If true, indicates that a shutdown will happen
+ * after the critical temperature is reached.
+ * If false, indicates that a reboot will happen
+ * after the critical temperature is reached.
*
- * Initiate an emergency system shutdown in order to protect hardware from
- * further damage. Usage examples include a thermal protection or a voltage or
- * current regulator failures.
- * NOTE: The request is ignored if protection shutdown is already pending even
- * if the previous request has given a large timeout for forced shutdown.
- * Can be called from any context.
+ * Initiate an emergency system shutdown or reboot in order to protect
+ * hardware from further damage. Usage examples include a thermal protection.
+ * NOTE: The request is ignored if protection shutdown or reboot is already
+ * pending even if the previous request has given a large timeout for forced
+ * shutdown/reboot.
*/
-void hw_protection_shutdown(const char *reason, int ms_until_forced)
+void __hw_protection_shutdown(const char *reason, int ms_until_forced, bool shutdown)
{
static atomic_t allow_proceed = ATOMIC_INIT(1);
@@ -982,9 +1002,12 @@ void hw_protection_shutdown(const char *reason, int ms_until_forced)
* orderly_poweroff failure
*/
hw_failure_emergency_poweroff(ms_until_forced);
- orderly_poweroff(true);
+ if (shutdown)
+ orderly_poweroff(true);
+ else
+ orderly_reboot();
}
-EXPORT_SYMBOL_GPL(hw_protection_shutdown);
+EXPORT_SYMBOL_GPL(__hw_protection_shutdown);
static int __init reboot_setup(char *str)
{
diff --git a/kernel/relay.c b/kernel/relay.c
index 83fe0325cde1..a8e90e98bf2c 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -1073,167 +1073,6 @@ static ssize_t relay_file_read(struct file *filp,
return written;
}
-static void relay_consume_bytes(struct rchan_buf *rbuf, int bytes_consumed)
-{
- rbuf->bytes_consumed += bytes_consumed;
-
- if (rbuf->bytes_consumed >= rbuf->chan->subbuf_size) {
- relay_subbufs_consumed(rbuf->chan, rbuf->cpu, 1);
- rbuf->bytes_consumed %= rbuf->chan->subbuf_size;
- }
-}
-
-static void relay_pipe_buf_release(struct pipe_inode_info *pipe,
- struct pipe_buffer *buf)
-{
- struct rchan_buf *rbuf;
-
- rbuf = (struct rchan_buf *)page_private(buf->page);
- relay_consume_bytes(rbuf, buf->private);
-}
-
-static const struct pipe_buf_operations relay_pipe_buf_ops = {
- .release = relay_pipe_buf_release,
- .try_steal = generic_pipe_buf_try_steal,
- .get = generic_pipe_buf_get,
-};
-
-static void relay_page_release(struct splice_pipe_desc *spd, unsigned int i)
-{
-}
-
-/*
- * subbuf_splice_actor - splice up to one subbuf's worth of data
- */
-static ssize_t subbuf_splice_actor(struct file *in,
- loff_t *ppos,
- struct pipe_inode_info *pipe,
- size_t len,
- unsigned int flags,
- int *nonpad_ret)
-{
- unsigned int pidx, poff, total_len, subbuf_pages, nr_pages;
- struct rchan_buf *rbuf = in->private_data;
- unsigned int subbuf_size = rbuf->chan->subbuf_size;
- uint64_t pos = (uint64_t) *ppos;
- uint32_t alloc_size = (uint32_t) rbuf->chan->alloc_size;
- size_t read_start = (size_t) do_div(pos, alloc_size);
- size_t read_subbuf = read_start / subbuf_size;
- size_t padding = rbuf->padding[read_subbuf];
- size_t nonpad_end = read_subbuf * subbuf_size + subbuf_size - padding;
- struct page *pages[PIPE_DEF_BUFFERS];
- struct partial_page partial[PIPE_DEF_BUFFERS];
- struct splice_pipe_desc spd = {
- .pages = pages,
- .nr_pages = 0,
- .nr_pages_max = PIPE_DEF_BUFFERS,
- .partial = partial,
- .ops = &relay_pipe_buf_ops,
- .spd_release = relay_page_release,
- };
- ssize_t ret;
-
- if (rbuf->subbufs_produced == rbuf->subbufs_consumed)
- return 0;
- if (splice_grow_spd(pipe, &spd))
- return -ENOMEM;
-
- /*
- * Adjust read len, if longer than what is available
- */
- if (len > (subbuf_size - read_start % subbuf_size))
- len = subbuf_size - read_start % subbuf_size;
-
- subbuf_pages = rbuf->chan->alloc_size >> PAGE_SHIFT;
- pidx = (read_start / PAGE_SIZE) % subbuf_pages;
- poff = read_start & ~PAGE_MASK;
- nr_pages = min_t(unsigned int, subbuf_pages, spd.nr_pages_max);
-
- for (total_len = 0; spd.nr_pages < nr_pages; spd.nr_pages++) {
- unsigned int this_len, this_end, private;
- unsigned int cur_pos = read_start + total_len;
-
- if (!len)
- break;
-
- this_len = min_t(unsigned long, len, PAGE_SIZE - poff);
- private = this_len;
-
- spd.pages[spd.nr_pages] = rbuf->page_array[pidx];
- spd.partial[spd.nr_pages].offset = poff;
-
- this_end = cur_pos + this_len;
- if (this_end >= nonpad_end) {
- this_len = nonpad_end - cur_pos;
- private = this_len + padding;
- }
- spd.partial[spd.nr_pages].len = this_len;
- spd.partial[spd.nr_pages].private = private;
-
- len -= this_len;
- total_len += this_len;
- poff = 0;
- pidx = (pidx + 1) % subbuf_pages;
-
- if (this_end >= nonpad_end) {
- spd.nr_pages++;
- break;
- }
- }
-
- ret = 0;
- if (!spd.nr_pages)
- goto out;
-
- ret = *nonpad_ret = splice_to_pipe(pipe, &spd);
- if (ret < 0 || ret < total_len)
- goto out;
-
- if (read_start + ret == nonpad_end)
- ret += padding;
-
-out:
- splice_shrink_spd(&spd);
- return ret;
-}
-
-static ssize_t relay_file_splice_read(struct file *in,
- loff_t *ppos,
- struct pipe_inode_info *pipe,
- size_t len,
- unsigned int flags)
-{
- ssize_t spliced;
- int ret;
- int nonpad_ret = 0;
-
- ret = 0;
- spliced = 0;
-
- while (len && !spliced) {
- ret = subbuf_splice_actor(in, ppos, pipe, len, flags, &nonpad_ret);
- if (ret < 0)
- break;
- else if (!ret) {
- if (flags & SPLICE_F_NONBLOCK)
- ret = -EAGAIN;
- break;
- }
-
- *ppos += ret;
- if (ret > len)
- len = 0;
- else
- len -= ret;
- spliced += nonpad_ret;
- nonpad_ret = 0;
- }
-
- if (spliced)
- return spliced;
-
- return ret;
-}
const struct file_operations relay_file_operations = {
.open = relay_file_open,
@@ -1242,6 +1081,5 @@ const struct file_operations relay_file_operations = {
.read = relay_file_read,
.llseek = no_llseek,
.release = relay_file_release,
- .splice_read = relay_file_splice_read,
};
EXPORT_SYMBOL_GPL(relay_file_operations);
diff --git a/kernel/resource.c b/kernel/resource.c
index b1763b2fd7ef..fcbca39dbc45 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -27,6 +27,8 @@
#include <linux/mount.h>
#include <linux/resource_ext.h>
#include <uapi/linux/magic.h>
+#include <linux/string.h>
+#include <linux/vmalloc.h>
#include <asm/io.h>
@@ -56,33 +58,17 @@ struct resource_constraint {
static DEFINE_RWLOCK(resource_lock);
-static struct resource *next_resource(struct resource *p)
+static struct resource *next_resource(struct resource *p, bool skip_children)
{
- if (p->child)
+ if (!skip_children && p->child)
return p->child;
while (!p->sibling && p->parent)
p = p->parent;
return p->sibling;
}
-static struct resource *next_resource_skip_children(struct resource *p)
-{
- while (!p->sibling && p->parent)
- p = p->parent;
- return p->sibling;
-}
-
#define for_each_resource(_root, _p, _skip_children) \
- for ((_p) = (_root)->child; (_p); \
- (_p) = (_skip_children) ? next_resource_skip_children(_p) : \
- next_resource(_p))
-
-static void *r_next(struct seq_file *m, void *v, loff_t *pos)
-{
- struct resource *p = v;
- (*pos)++;
- return (void *)next_resource(p);
-}
+ for ((_p) = (_root)->child; (_p); (_p) = next_resource(_p, _skip_children))
#ifdef CONFIG_PROC_FS
@@ -91,14 +77,28 @@ enum { MAX_IORES_LEVEL = 5 };
static void *r_start(struct seq_file *m, loff_t *pos)
__acquires(resource_lock)
{
- struct resource *p = pde_data(file_inode(m->file));
- loff_t l = 0;
+ struct resource *root = pde_data(file_inode(m->file));
+ struct resource *p;
+ loff_t l = *pos;
+
read_lock(&resource_lock);
- for (p = p->child; p && l < *pos; p = r_next(m, p, &l))
- ;
+ for_each_resource(root, p, false) {
+ if (l-- == 0)
+ break;
+ }
+
return p;
}
+static void *r_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ struct resource *p = v;
+
+ (*pos)++;
+
+ return (void *)next_resource(p, false);
+}
+
static void r_stop(struct seq_file *m, void *v)
__releases(resource_lock)
{
@@ -336,7 +336,7 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end,
read_lock(&resource_lock);
- for (p = iomem_resource.child; p; p = next_resource(p)) {
+ for_each_resource(&iomem_resource, p, false) {
/* If we passed the resource we are looking for, stop */
if (p->start > end) {
p = NULL;
@@ -432,6 +432,61 @@ int walk_system_ram_res(u64 start, u64 end, void *arg,
}
/*
+ * This function, being a variant of walk_system_ram_res(), calls the @func
+ * callback against all memory ranges of type System RAM which are marked as
+ * IORESOURCE_SYSTEM_RAM and IORESOURCE_BUSY in reversed order, i.e., from
+ * higher to lower.
+ */
+int walk_system_ram_res_rev(u64 start, u64 end, void *arg,
+ int (*func)(struct resource *, void *))
+{
+ struct resource res, *rams;
+ int rams_size = 16, i;
+ unsigned long flags;
+ int ret = -1;
+
+ /* create a list */
+ rams = kvcalloc(rams_size, sizeof(struct resource), GFP_KERNEL);
+ if (!rams)
+ return ret;
+
+ flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
+ i = 0;
+ while ((start < end) &&
+ (!find_next_iomem_res(start, end, flags, IORES_DESC_NONE, &res))) {
+ if (i >= rams_size) {
+ /* re-alloc */
+ struct resource *rams_new;
+
+ rams_new = kvrealloc(rams, rams_size * sizeof(struct resource),
+ (rams_size + 16) * sizeof(struct resource),
+ GFP_KERNEL);
+ if (!rams_new)
+ goto out;
+
+ rams = rams_new;
+ rams_size += 16;
+ }
+
+ rams[i].start = res.start;
+ rams[i++].end = res.end;
+
+ start = res.end + 1;
+ }
+
+ /* go reverse */
+ for (i--; i >= 0; i--) {
+ ret = (*func)(&rams[i], arg);
+ if (ret)
+ break;
+ }
+
+out:
+ kvfree(rams);
+ return ret;
+}
+
+/*
* This function calls the @func callback against all memory ranges, which
* are ranges marked as IORESOURCE_MEM and IORESOUCE_BUSY.
*/
@@ -1641,13 +1696,12 @@ __setup("reserve=", reserve_setup);
*/
int iomem_map_sanity_check(resource_size_t addr, unsigned long size)
{
- struct resource *p = &iomem_resource;
resource_size_t end = addr + size - 1;
+ struct resource *p;
int err = 0;
- loff_t l;
read_lock(&resource_lock);
- for (p = p->child; p ; p = r_next(NULL, p, &l)) {
+ for_each_resource(&iomem_resource, p, false) {
/*
* We can probably skip the resources without
* IORESOURCE_IO attribute?
@@ -1847,8 +1901,8 @@ get_free_mem_region(struct device *dev, struct resource *base,
write_lock(&resource_lock);
for (addr = gfr_start(base, size, align, flags);
- gfr_continue(base, addr, size, flags);
- addr = gfr_next(addr, size, flags)) {
+ gfr_continue(base, addr, align, flags);
+ addr = gfr_next(addr, align, flags)) {
if (__region_intersects(base, addr, size, 0, IORES_DESC_NONE) !=
REGION_DISJOINT)
continue;
diff --git a/kernel/sched/build_utility.c b/kernel/sched/build_utility.c
index 99bdd96f454f..80a3df49ab47 100644
--- a/kernel/sched/build_utility.c
+++ b/kernel/sched/build_utility.c
@@ -34,7 +34,6 @@
#include <linux/nospec.h>
#include <linux/proc_fs.h>
#include <linux/psi.h>
-#include <linux/psi.h>
#include <linux/ptrace_api.h>
#include <linux/sched_clock.h>
#include <linux/security.h>
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2299a5cfbfb9..9116bcc90346 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -57,6 +57,7 @@
#include <linux/profile.h>
#include <linux/psi.h>
#include <linux/rcuwait_api.h>
+#include <linux/rseq.h>
#include <linux/sched/wake_q.h>
#include <linux/scs.h>
#include <linux/slab.h>
@@ -85,7 +86,6 @@
#include "sched.h"
#include "stats.h"
-#include "autogroup.h"
#include "autogroup.h"
#include "pelt.h"
@@ -114,6 +114,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp);
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
@@ -919,14 +920,13 @@ static bool set_nr_if_polling(struct task_struct *p)
struct thread_info *ti = task_thread_info(p);
typeof(ti->flags) val = READ_ONCE(ti->flags);
- for (;;) {
+ do {
if (!(val & _TIF_POLLING_NRFLAG))
return false;
if (val & _TIF_NEED_RESCHED)
return true;
- if (try_cmpxchg(&ti->flags, &val, val | _TIF_NEED_RESCHED))
- break;
- }
+ } while (!try_cmpxchg(&ti->flags, &val, val | _TIF_NEED_RESCHED));
+
return true;
}
@@ -1132,6 +1132,28 @@ static void wake_up_idle_cpu(int cpu)
if (cpu == smp_processor_id())
return;
+ /*
+ * Set TIF_NEED_RESCHED and send an IPI if in the non-polling
+ * part of the idle loop. This forces an exit from the idle loop
+ * and a round trip to schedule(). Now this could be optimized
+ * because a simple new idle loop iteration is enough to
+ * re-evaluate the next tick. Provided some re-ordering of tick
+ * nohz functions that would need to follow TIF_NR_POLLING
+ * clearing:
+ *
+ * - On most archs, a simple fetch_or on ti::flags with a
+ * "0" value would be enough to know if an IPI needs to be sent.
+ *
+ * - x86 needs to perform a last need_resched() check between
+ * monitor and mwait which doesn't take timers into account.
+ * There a dedicated TIF_TIMER flag would be required to
+ * fetch_or here and be checked along with TIF_NEED_RESCHED
+ * before mwait().
+ *
+ * However, remote timer enqueue is not such a frequent event
+ * and testing of the above solutions didn't appear to report
+ * much benefits.
+ */
if (set_nr_and_not_polling(rq->idle))
smp_send_reschedule(cpu);
else
@@ -1480,16 +1502,12 @@ static void __uclamp_update_util_min_rt_default(struct task_struct *p)
static void uclamp_update_util_min_rt_default(struct task_struct *p)
{
- struct rq_flags rf;
- struct rq *rq;
-
if (!rt_task(p))
return;
/* Protect updates to p->uclamp_* */
- rq = task_rq_lock(p, &rf);
+ guard(task_rq_lock)(p);
__uclamp_update_util_min_rt_default(p);
- task_rq_unlock(rq, p, &rf);
}
static inline struct uclamp_se
@@ -1785,9 +1803,8 @@ static void uclamp_update_root_tg(void)
uclamp_se_set(&tg->uclamp_req[UCLAMP_MAX],
sysctl_sched_uclamp_util_max, false);
- rcu_read_lock();
+ guard(rcu)();
cpu_util_update_eff(&root_task_group.css);
- rcu_read_unlock();
}
#else
static void uclamp_update_root_tg(void) { }
@@ -1814,10 +1831,9 @@ static void uclamp_sync_util_min_rt_default(void)
smp_mb__after_spinlock();
read_unlock(&tasklist_lock);
- rcu_read_lock();
+ guard(rcu)();
for_each_process_thread(g, p)
uclamp_update_util_min_rt_default(p);
- rcu_read_unlock();
}
static int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
@@ -2131,12 +2147,14 @@ void activate_task(struct rq *rq, struct task_struct *p, int flags)
enqueue_task(rq, p, flags);
- p->on_rq = TASK_ON_RQ_QUEUED;
+ WRITE_ONCE(p->on_rq, TASK_ON_RQ_QUEUED);
+ ASSERT_EXCLUSIVE_WRITER(p->on_rq);
}
void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
{
- p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING;
+ WRITE_ONCE(p->on_rq, (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING);
+ ASSERT_EXCLUSIVE_WRITER(p->on_rq);
dequeue_task(rq, p, flags);
}
@@ -2218,10 +2236,10 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
p->sched_class->prio_changed(rq, p, oldprio);
}
-void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
+void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags)
{
if (p->sched_class == rq->curr->sched_class)
- rq->curr->sched_class->check_preempt_curr(rq, p, flags);
+ rq->curr->sched_class->wakeup_preempt(rq, p, flags);
else if (sched_class_above(p->sched_class, rq->curr->sched_class))
resched_curr(rq);
@@ -2239,31 +2257,21 @@ int __task_state_match(struct task_struct *p, unsigned int state)
if (READ_ONCE(p->__state) & state)
return 1;
-#ifdef CONFIG_PREEMPT_RT
if (READ_ONCE(p->saved_state) & state)
return -1;
-#endif
+
return 0;
}
static __always_inline
int task_state_match(struct task_struct *p, unsigned int state)
{
-#ifdef CONFIG_PREEMPT_RT
- int match;
-
/*
- * Serialize against current_save_and_set_rtlock_wait_state() and
- * current_restore_rtlock_saved_state().
+ * Serialize against current_save_and_set_rtlock_wait_state(),
+ * current_restore_rtlock_saved_state(), and __refrigerator().
*/
- raw_spin_lock_irq(&p->pi_lock);
- match = __task_state_match(p, state);
- raw_spin_unlock_irq(&p->pi_lock);
-
- return match;
-#else
+ guard(raw_spinlock_irq)(&p->pi_lock);
return __task_state_match(p, state);
-#endif
}
/*
@@ -2417,10 +2425,9 @@ void migrate_disable(void)
return;
}
- preempt_disable();
+ guard(preempt)();
this_rq()->nr_pinned++;
p->migration_disabled = 1;
- preempt_enable();
}
EXPORT_SYMBOL_GPL(migrate_disable);
@@ -2444,7 +2451,7 @@ void migrate_enable(void)
* Ensure stop_task runs either before or after this, and that
* __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule().
*/
- preempt_disable();
+ guard(preempt)();
if (p->cpus_ptr != &p->cpus_mask)
__set_cpus_allowed_ptr(p, &ac);
/*
@@ -2455,7 +2462,6 @@ void migrate_enable(void)
barrier();
p->migration_disabled = 0;
this_rq()->nr_pinned--;
- preempt_enable();
}
EXPORT_SYMBOL_GPL(migrate_enable);
@@ -2527,7 +2533,7 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
rq_lock(rq, rf);
WARN_ON_ONCE(task_cpu(p) != new_cpu);
activate_task(rq, p, 0);
- check_preempt_curr(rq, p, 0);
+ wakeup_preempt(rq, p, 0);
return rq;
}
@@ -2664,9 +2670,11 @@ static int migration_cpu_stop(void *data)
* it.
*/
WARN_ON_ONCE(!pending->stop_pending);
+ preempt_disable();
task_rq_unlock(rq, p, &rf);
stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop,
&pending->arg, &pending->stop_work);
+ preempt_enable();
return 0;
}
out:
@@ -2986,12 +2994,13 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
complete = true;
}
+ preempt_disable();
task_rq_unlock(rq, p, rf);
-
if (push_task) {
stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
p, &rq->push_work);
}
+ preempt_enable();
if (complete)
complete_all(&pending->done);
@@ -3057,12 +3066,13 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
if (flags & SCA_MIGRATE_ENABLE)
p->migration_flags &= ~MDF_PUSH;
+ preempt_disable();
task_rq_unlock(rq, p, rf);
-
if (!stop_pending) {
stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop,
&pending->arg, &pending->stop_work);
}
+ preempt_enable();
if (flags & SCA_MIGRATE_ENABLE)
return 0;
@@ -3409,7 +3419,7 @@ static void __migrate_swap_task(struct task_struct *p, int cpu)
deactivate_task(src_rq, p, 0);
set_task_cpu(p, cpu);
activate_task(dst_rq, p, 0);
- check_preempt_curr(dst_rq, p, 0);
+ wakeup_preempt(dst_rq, p, 0);
rq_unpin_lock(dst_rq, &drf);
rq_unpin_lock(src_rq, &srf);
@@ -3516,13 +3526,11 @@ out:
*/
void kick_process(struct task_struct *p)
{
- int cpu;
+ guard(preempt)();
+ int cpu = task_cpu(p);
- preempt_disable();
- cpu = task_cpu(p);
if ((cpu != smp_processor_id()) && task_curr(p))
smp_send_reschedule(cpu);
- preempt_enable();
}
EXPORT_SYMBOL_GPL(kick_process);
@@ -3785,7 +3793,7 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
}
activate_task(rq, p, en_flags);
- check_preempt_curr(rq, p, wake_flags);
+ wakeup_preempt(rq, p, wake_flags);
ttwu_do_wakeup(p);
@@ -3809,12 +3817,11 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
if (rq->avg_idle > max)
rq->avg_idle = max;
- rq->wake_stamp = jiffies;
- rq->wake_avg_idle = rq->avg_idle / 2;
-
rq->idle_stamp = 0;
}
#endif
+
+ p->dl_server = NULL;
}
/*
@@ -3856,7 +3863,7 @@ static int ttwu_runnable(struct task_struct *p, int wake_flags)
* it should preempt the task that is current now.
*/
update_rq_clock(rq);
- check_preempt_curr(rq, p, wake_flags);
+ wakeup_preempt(rq, p, wake_flags);
}
ttwu_do_wakeup(p);
ret = 1;
@@ -3956,6 +3963,18 @@ bool cpus_share_cache(int this_cpu, int that_cpu)
return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
}
+/*
+ * Whether CPUs share cache resources, which means LLC on non-cluster
+ * machines and LLC tag or L2 on machines with clusters.
+ */
+bool cpus_share_resources(int this_cpu, int that_cpu)
+{
+ if (this_cpu == that_cpu)
+ return true;
+
+ return per_cpu(sd_share_id, this_cpu) == per_cpu(sd_share_id, that_cpu);
+}
+
static inline bool ttwu_queue_cond(struct task_struct *p, int cpu)
{
/*
@@ -4036,13 +4055,17 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
* The caller holds p::pi_lock if p != current or has preemption
* disabled when p == current.
*
- * The rules of PREEMPT_RT saved_state:
+ * The rules of saved_state:
*
* The related locking code always holds p::pi_lock when updating
* p::saved_state, which means the code is fully serialized in both cases.
*
- * The lock wait and lock wakeups happen via TASK_RTLOCK_WAIT. No other
- * bits set. This allows to distinguish all wakeup scenarios.
+ * For PREEMPT_RT, the lock wait and lock wakeups happen via TASK_RTLOCK_WAIT.
+ * No other bits set. This allows to distinguish all wakeup scenarios.
+ *
+ * For FREEZER, the wakeup happens via TASK_FROZEN. No other bits set. This
+ * allows us to prevent early wakeup of tasks before they can be run on
+ * asymmetric ISA architectures (eg ARMv9).
*/
static __always_inline
bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success)
@@ -4056,13 +4079,13 @@ bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success)
*success = !!(match = __task_state_match(p, state));
-#ifdef CONFIG_PREEMPT_RT
/*
* Saved state preserves the task state across blocking on
- * an RT lock. If the state matches, set p::saved_state to
- * TASK_RUNNING, but do not wake the task because it waits
- * for a lock wakeup. Also indicate success because from
- * the regular waker's point of view this has succeeded.
+ * an RT lock or TASK_FREEZABLE tasks. If the state matches,
+ * set p::saved_state to TASK_RUNNING, but do not wake the task
+ * because it waits for a lock wakeup or __thaw_task(). Also
+ * indicate success because from the regular waker's point of
+ * view this has succeeded.
*
* After acquiring the lock the task will restore p::__state
* from p::saved_state which ensures that the regular
@@ -4072,7 +4095,7 @@ bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success)
*/
if (match < 0)
p->saved_state = TASK_RUNNING;
-#endif
+
return match > 0;
}
@@ -4254,7 +4277,7 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
* Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
* __schedule(). See the comment for smp_mb__after_spinlock().
*
- * A similar smb_rmb() lives in try_invoke_on_locked_down_task().
+ * A similar smp_rmb() lives in __task_needs_rq_lock().
*/
smp_rmb();
if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags))
@@ -4513,10 +4536,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
memset(&p->stats, 0, sizeof(p->stats));
#endif
- RB_CLEAR_NODE(&p->dl.rb_node);
- init_dl_task_timer(&p->dl);
- init_dl_inactive_task_timer(&p->dl);
- __dl_clear_params(p);
+ init_dl_entity(&p->dl);
INIT_LIST_HEAD(&p->rt.run_list);
p->rt.timeout = 0;
@@ -4871,7 +4891,7 @@ void wake_up_new_task(struct task_struct *p)
activate_task(rq, p, ENQUEUE_NOCLOCK);
trace_sched_wakeup_new(p);
- check_preempt_curr(rq, p, WF_FORK);
+ wakeup_preempt(rq, p, WF_FORK);
#ifdef CONFIG_SMP
if (p->sched_class->task_woken) {
/*
@@ -5374,8 +5394,6 @@ context_switch(struct rq *rq, struct task_struct *prev,
/* switch_mm_cid() requires the memory barriers above. */
switch_mm_cid(rq, prev, next);
- rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
-
prepare_lock_switch(rq, next, rf);
/* Here we just switch the register state and the stack. */
@@ -5916,8 +5934,7 @@ static noinline void __schedule_bug(struct task_struct *prev)
print_modules();
if (irqs_disabled())
print_irqtrace_events(prev);
- if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
- && in_atomic_preempt_off()) {
+ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)) {
pr_err("Preemption disabled at:");
print_ip_sym(KERN_ERR, preempt_disable_ip);
}
@@ -6011,12 +6028,27 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
p = pick_next_task_idle(rq);
}
+ /*
+ * This is the fast path; it cannot be a DL server pick;
+ * therefore even if @p == @prev, ->dl_server must be NULL.
+ */
+ if (p->dl_server)
+ p->dl_server = NULL;
+
return p;
}
restart:
put_prev_task_balance(rq, prev, rf);
+ /*
+ * We've updated @prev and no longer need the server link, clear it.
+ * Must be done before ->pick_next_task() because that can (re)set
+ * ->dl_server.
+ */
+ if (prev->dl_server)
+ prev->dl_server = NULL;
+
for_each_class(class) {
p = class->pick_next_task(rq);
if (p)
@@ -6368,8 +6400,9 @@ static void sched_core_balance(struct rq *rq)
struct sched_domain *sd;
int cpu = cpu_of(rq);
- preempt_disable();
- rcu_read_lock();
+ guard(preempt)();
+ guard(rcu)();
+
raw_spin_rq_unlock_irq(rq);
for_each_domain(cpu, sd) {
if (need_resched())
@@ -6379,8 +6412,6 @@ static void sched_core_balance(struct rq *rq)
break;
}
raw_spin_rq_lock_irq(rq);
- rcu_read_unlock();
- preempt_enable();
}
static DEFINE_PER_CPU(struct balance_callback, core_balance_head);
@@ -6615,6 +6646,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
/* Promote REQ to ACT */
rq->clock_update_flags <<= 1;
update_rq_clock(rq);
+ rq->clock_update_flags = RQCF_UPDATED;
switch_count = &prev->nivcsw;
@@ -6694,8 +6726,6 @@ static void __sched notrace __schedule(unsigned int sched_mode)
/* Also unlocks the rq: */
rq = context_switch(rq, prev, next, &rf);
} else {
- rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
-
rq_unpin_lock(rq, &rf);
__balance_callbacks(rq);
raw_spin_rq_unlock_irq(rq);
@@ -6720,22 +6750,24 @@ void __noreturn do_task_dead(void)
static inline void sched_submit_work(struct task_struct *tsk)
{
+ static DEFINE_WAIT_OVERRIDE_MAP(sched_map, LD_WAIT_CONFIG);
unsigned int task_flags;
- if (task_is_running(tsk))
- return;
+ /*
+ * Establish LD_WAIT_CONFIG context to ensure none of the code called
+ * will use a blocking primitive -- which would lead to recursion.
+ */
+ lock_map_acquire_try(&sched_map);
task_flags = tsk->flags;
/*
* If a worker goes to sleep, notify and ask workqueue whether it
* wants to wake up a task to maintain concurrency.
*/
- if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
- if (task_flags & PF_WQ_WORKER)
- wq_worker_sleeping(tsk);
- else
- io_wq_worker_sleeping(tsk);
- }
+ if (task_flags & PF_WQ_WORKER)
+ wq_worker_sleeping(tsk);
+ else if (task_flags & PF_IO_WORKER)
+ io_wq_worker_sleeping(tsk);
/*
* spinlock and rwlock must not flush block requests. This will
@@ -6749,6 +6781,8 @@ static inline void sched_submit_work(struct task_struct *tsk)
* make sure to submit it to avoid deadlocks.
*/
blk_flush_plug(tsk->plug, true);
+
+ lock_map_release(&sched_map);
}
static void sched_update_worker(struct task_struct *tsk)
@@ -6761,16 +6795,26 @@ static void sched_update_worker(struct task_struct *tsk)
}
}
-asmlinkage __visible void __sched schedule(void)
+static __always_inline void __schedule_loop(unsigned int sched_mode)
{
- struct task_struct *tsk = current;
-
- sched_submit_work(tsk);
do {
preempt_disable();
- __schedule(SM_NONE);
+ __schedule(sched_mode);
sched_preempt_enable_no_resched();
} while (need_resched());
+}
+
+asmlinkage __visible void __sched schedule(void)
+{
+ struct task_struct *tsk = current;
+
+#ifdef CONFIG_RT_MUTEXES
+ lockdep_assert(!tsk->sched_rt_mutex);
+#endif
+
+ if (!task_is_running(tsk))
+ sched_submit_work(tsk);
+ __schedule_loop(SM_NONE);
sched_update_worker(tsk);
}
EXPORT_SYMBOL(schedule);
@@ -6834,11 +6878,7 @@ void __sched schedule_preempt_disabled(void)
#ifdef CONFIG_PREEMPT_RT
void __sched notrace schedule_rtlock(void)
{
- do {
- preempt_disable();
- __schedule(SM_RTLOCK_WAIT);
- sched_preempt_enable_no_resched();
- } while (need_resched());
+ __schedule_loop(SM_RTLOCK_WAIT);
}
NOKPROBE_SYMBOL(schedule_rtlock);
#endif
@@ -7034,6 +7074,32 @@ static void __setscheduler_prio(struct task_struct *p, int prio)
#ifdef CONFIG_RT_MUTEXES
+/*
+ * Would be more useful with typeof()/auto_type but they don't mix with
+ * bit-fields. Since it's a local thing, use int. Keep the generic sounding
+ * name such that if someone were to implement this function we get to compare
+ * notes.
+ */
+#define fetch_and_set(x, v) ({ int _x = (x); (x) = (v); _x; })
+
+void rt_mutex_pre_schedule(void)
+{
+ lockdep_assert(!fetch_and_set(current->sched_rt_mutex, 1));
+ sched_submit_work(current);
+}
+
+void rt_mutex_schedule(void)
+{
+ lockdep_assert(current->sched_rt_mutex);
+ __schedule_loop(SM_NONE);
+}
+
+void rt_mutex_post_schedule(void)
+{
+ sched_update_worker(current);
+ lockdep_assert(fetch_and_set(current->sched_rt_mutex, 0));
+}
+
static inline int __rt_effective_prio(struct task_struct *pi_task, int prio)
{
if (pi_task)
@@ -7187,9 +7253,8 @@ static inline int rt_effective_prio(struct task_struct *p, int prio)
void set_user_nice(struct task_struct *p, long nice)
{
bool queued, running;
- int old_prio;
- struct rq_flags rf;
struct rq *rq;
+ int old_prio;
if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
return;
@@ -7197,7 +7262,9 @@ void set_user_nice(struct task_struct *p, long nice)
* We have to be careful, if called from sys_setpriority(),
* the task might be in the middle of scheduling on another CPU.
*/
- rq = task_rq_lock(p, &rf);
+ CLASS(task_rq_lock, rq_guard)(p);
+ rq = rq_guard.rq;
+
update_rq_clock(rq);
/*
@@ -7208,8 +7275,9 @@ void set_user_nice(struct task_struct *p, long nice)
*/
if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
p->static_prio = NICE_TO_PRIO(nice);
- goto out_unlock;
+ return;
}
+
queued = task_on_rq_queued(p);
running = task_current(rq, p);
if (queued)
@@ -7232,9 +7300,6 @@ void set_user_nice(struct task_struct *p, long nice)
* lowered its priority, then reschedule its CPU:
*/
p->sched_class->prio_changed(rq, p, old_prio);
-
-out_unlock:
- task_rq_unlock(rq, p, &rf);
}
EXPORT_SYMBOL(set_user_nice);
@@ -7403,18 +7468,13 @@ int sched_core_idle_cpu(int cpu)
* required to meet deadlines.
*/
unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
- enum cpu_util_type type,
- struct task_struct *p)
+ unsigned long *min,
+ unsigned long *max)
{
- unsigned long dl_util, util, irq, max;
+ unsigned long util, irq, scale;
struct rq *rq = cpu_rq(cpu);
- max = arch_scale_cpu_capacity(cpu);
-
- if (!uclamp_is_used() &&
- type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) {
- return max;
- }
+ scale = arch_scale_cpu_capacity(cpu);
/*
* Early check to see if IRQ/steal time saturates the CPU, can be
@@ -7422,45 +7482,49 @@ unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
* update_irq_load_avg().
*/
irq = cpu_util_irq(rq);
- if (unlikely(irq >= max))
- return max;
+ if (unlikely(irq >= scale)) {
+ if (min)
+ *min = scale;
+ if (max)
+ *max = scale;
+ return scale;
+ }
+
+ if (min) {
+ /*
+ * The minimum utilization returns the highest level between:
+ * - the computed DL bandwidth needed with the IRQ pressure which
+ * steals time to the deadline task.
+ * - The minimum performance requirement for CFS and/or RT.
+ */
+ *min = max(irq + cpu_bw_dl(rq), uclamp_rq_get(rq, UCLAMP_MIN));
+
+ /*
+ * When an RT task is runnable and uclamp is not used, we must
+ * ensure that the task will run at maximum compute capacity.
+ */
+ if (!uclamp_is_used() && rt_rq_is_runnable(&rq->rt))
+ *min = max(*min, scale);
+ }
/*
* Because the time spend on RT/DL tasks is visible as 'lost' time to
* CFS tasks and we use the same metric to track the effective
* utilization (PELT windows are synchronized) we can directly add them
* to obtain the CPU's actual utilization.
- *
- * CFS and RT utilization can be boosted or capped, depending on
- * utilization clamp constraints requested by currently RUNNABLE
- * tasks.
- * When there are no CFS RUNNABLE tasks, clamps are released and
- * frequency will be gracefully reduced with the utilization decay.
*/
util = util_cfs + cpu_util_rt(rq);
- if (type == FREQUENCY_UTIL)
- util = uclamp_rq_util_with(rq, util, p);
-
- dl_util = cpu_util_dl(rq);
+ util += cpu_util_dl(rq);
/*
- * For frequency selection we do not make cpu_util_dl() a permanent part
- * of this sum because we want to use cpu_bw_dl() later on, but we need
- * to check if the CFS+RT+DL sum is saturated (ie. no idle time) such
- * that we select f_max when there is no idle time.
- *
- * NOTE: numerical errors or stop class might cause us to not quite hit
- * saturation when we should -- something for later.
+ * The maximum hint is a soft bandwidth requirement, which can be lower
+ * than the actual utilization because of uclamp_max requirements.
*/
- if (util + dl_util >= max)
- return max;
+ if (max)
+ *max = min(scale, uclamp_rq_get(rq, UCLAMP_MAX));
- /*
- * OTOH, for energy computation we need the estimated running time, so
- * include util_dl and ignore dl_bw.
- */
- if (type == ENERGY_UTIL)
- util += dl_util;
+ if (util >= scale)
+ return scale;
/*
* There is still idle time; further improve the number by using the
@@ -7471,28 +7535,15 @@ unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
* U' = irq + --------- * U
* max
*/
- util = scale_irq_capacity(util, irq, max);
+ util = scale_irq_capacity(util, irq, scale);
util += irq;
- /*
- * Bandwidth required by DEADLINE must always be granted while, for
- * FAIR and RT, we use blocked utilization of IDLE CPUs as a mechanism
- * to gracefully reduce the frequency when no tasks show up for longer
- * periods of time.
- *
- * Ideally we would like to set bw_dl as min/guaranteed freq and util +
- * bw_dl as requested freq. However, cpufreq is not yet ready for such
- * an interface. So, we only do the latter for now.
- */
- if (type == FREQUENCY_UTIL)
- util += cpu_bw_dl(rq);
-
- return min(max, util);
+ return min(scale, util);
}
unsigned long sched_cpu_util(int cpu)
{
- return effective_cpu_util(cpu, cpu_util_cfs(cpu), ENERGY_UTIL, NULL);
+ return effective_cpu_util(cpu, cpu_util_cfs(cpu), NULL, NULL);
}
#endif /* CONFIG_SMP */
@@ -7507,6 +7558,21 @@ static struct task_struct *find_process_by_pid(pid_t pid)
return pid ? find_task_by_vpid(pid) : current;
}
+static struct task_struct *find_get_task(pid_t pid)
+{
+ struct task_struct *p;
+ guard(rcu)();
+
+ p = find_process_by_pid(pid);
+ if (likely(p))
+ get_task_struct(p);
+
+ return p;
+}
+
+DEFINE_CLASS(find_get_task, struct task_struct *, if (_T) put_task_struct(_T),
+ find_get_task(pid), pid_t pid)
+
/*
* sched_setparam() passes in -1 for its policy, to let the functions
* it calls know not to change it.
@@ -7544,14 +7610,11 @@ static void __setscheduler_params(struct task_struct *p,
static bool check_same_owner(struct task_struct *p)
{
const struct cred *cred = current_cred(), *pcred;
- bool match;
+ guard(rcu)();
- rcu_read_lock();
pcred = __task_cred(p);
- match = (uid_eq(cred->euid, pcred->euid) ||
- uid_eq(cred->euid, pcred->uid));
- rcu_read_unlock();
- return match;
+ return (uid_eq(cred->euid, pcred->euid) ||
+ uid_eq(cred->euid, pcred->uid));
}
/*
@@ -7963,27 +8026,17 @@ static int
do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
{
struct sched_param lparam;
- struct task_struct *p;
- int retval;
if (!param || pid < 0)
return -EINVAL;
if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
return -EFAULT;
- rcu_read_lock();
- retval = -ESRCH;
- p = find_process_by_pid(pid);
- if (likely(p))
- get_task_struct(p);
- rcu_read_unlock();
-
- if (likely(p)) {
- retval = sched_setscheduler(p, policy, &lparam);
- put_task_struct(p);
- }
+ CLASS(find_get_task, p)(pid);
+ if (!p)
+ return -ESRCH;
- return retval;
+ return sched_setscheduler(p, policy, &lparam);
}
/*
@@ -8079,7 +8132,6 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
unsigned int, flags)
{
struct sched_attr attr;
- struct task_struct *p;
int retval;
if (!uattr || pid < 0 || flags)
@@ -8094,21 +8146,14 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY)
attr.sched_policy = SETPARAM_POLICY;
- rcu_read_lock();
- retval = -ESRCH;
- p = find_process_by_pid(pid);
- if (likely(p))
- get_task_struct(p);
- rcu_read_unlock();
+ CLASS(find_get_task, p)(pid);
+ if (!p)
+ return -ESRCH;
- if (likely(p)) {
- if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS)
- get_params(p, &attr);
- retval = sched_setattr(p, &attr);
- put_task_struct(p);
- }
+ if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS)
+ get_params(p, &attr);
- return retval;
+ return sched_setattr(p, &attr);
}
/**
@@ -8126,16 +8171,17 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
if (pid < 0)
return -EINVAL;
- retval = -ESRCH;
- rcu_read_lock();
+ guard(rcu)();
p = find_process_by_pid(pid);
- if (p) {
- retval = security_task_getscheduler(p);
- if (!retval)
- retval = p->policy
- | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
+ if (!p)
+ return -ESRCH;
+
+ retval = security_task_getscheduler(p);
+ if (!retval) {
+ retval = p->policy;
+ if (p->sched_reset_on_fork)
+ retval |= SCHED_RESET_ON_FORK;
}
- rcu_read_unlock();
return retval;
}
@@ -8156,30 +8202,23 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
if (!param || pid < 0)
return -EINVAL;
- rcu_read_lock();
- p = find_process_by_pid(pid);
- retval = -ESRCH;
- if (!p)
- goto out_unlock;
+ scoped_guard (rcu) {
+ p = find_process_by_pid(pid);
+ if (!p)
+ return -ESRCH;
- retval = security_task_getscheduler(p);
- if (retval)
- goto out_unlock;
+ retval = security_task_getscheduler(p);
+ if (retval)
+ return retval;
- if (task_has_rt_policy(p))
- lp.sched_priority = p->rt_priority;
- rcu_read_unlock();
+ if (task_has_rt_policy(p))
+ lp.sched_priority = p->rt_priority;
+ }
/*
* This one might sleep, we cannot do it with a spinlock held ...
*/
- retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
-
- return retval;
-
-out_unlock:
- rcu_read_unlock();
- return retval;
+ return copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
}
/*
@@ -8239,46 +8278,38 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
usize < SCHED_ATTR_SIZE_VER0 || flags)
return -EINVAL;
- rcu_read_lock();
- p = find_process_by_pid(pid);
- retval = -ESRCH;
- if (!p)
- goto out_unlock;
+ scoped_guard (rcu) {
+ p = find_process_by_pid(pid);
+ if (!p)
+ return -ESRCH;
- retval = security_task_getscheduler(p);
- if (retval)
- goto out_unlock;
+ retval = security_task_getscheduler(p);
+ if (retval)
+ return retval;
- kattr.sched_policy = p->policy;
- if (p->sched_reset_on_fork)
- kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
- get_params(p, &kattr);
- kattr.sched_flags &= SCHED_FLAG_ALL;
+ kattr.sched_policy = p->policy;
+ if (p->sched_reset_on_fork)
+ kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
+ get_params(p, &kattr);
+ kattr.sched_flags &= SCHED_FLAG_ALL;
#ifdef CONFIG_UCLAMP_TASK
- /*
- * This could race with another potential updater, but this is fine
- * because it'll correctly read the old or the new value. We don't need
- * to guarantee who wins the race as long as it doesn't return garbage.
- */
- kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
- kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
+ /*
+ * This could race with another potential updater, but this is fine
+ * because it'll correctly read the old or the new value. We don't need
+ * to guarantee who wins the race as long as it doesn't return garbage.
+ */
+ kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
+ kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
#endif
-
- rcu_read_unlock();
+ }
return sched_attr_copy_to_user(uattr, &kattr, usize);
-
-out_unlock:
- rcu_read_unlock();
- return retval;
}
#ifdef CONFIG_SMP
int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
{
- int ret = 0;
-
/*
* If the task isn't a deadline task or admission control is
* disabled then we don't care about affinity changes.
@@ -8292,11 +8323,11 @@ int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
* tasks allowed to run on all the CPUs in the task's
* root_domain.
*/
- rcu_read_lock();
+ guard(rcu)();
if (!cpumask_subset(task_rq(p)->rd->span, mask))
- ret = -EBUSY;
- rcu_read_unlock();
- return ret;
+ return -EBUSY;
+
+ return 0;
}
#endif
@@ -8366,39 +8397,24 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
{
struct affinity_context ac;
struct cpumask *user_mask;
- struct task_struct *p;
int retval;
- rcu_read_lock();
-
- p = find_process_by_pid(pid);
- if (!p) {
- rcu_read_unlock();
+ CLASS(find_get_task, p)(pid);
+ if (!p)
return -ESRCH;
- }
-
- /* Prevent p going away */
- get_task_struct(p);
- rcu_read_unlock();
- if (p->flags & PF_NO_SETAFFINITY) {
- retval = -EINVAL;
- goto out_put_task;
- }
+ if (p->flags & PF_NO_SETAFFINITY)
+ return -EINVAL;
if (!check_same_owner(p)) {
- rcu_read_lock();
- if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
- rcu_read_unlock();
- retval = -EPERM;
- goto out_put_task;
- }
- rcu_read_unlock();
+ guard(rcu)();
+ if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE))
+ return -EPERM;
}
retval = security_task_setscheduler(p);
if (retval)
- goto out_put_task;
+ return retval;
/*
* With non-SMP configs, user_cpus_ptr/user_mask isn't used and
@@ -8408,8 +8424,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
if (user_mask) {
cpumask_copy(user_mask, in_mask);
} else if (IS_ENABLED(CONFIG_SMP)) {
- retval = -ENOMEM;
- goto out_put_task;
+ return -ENOMEM;
}
ac = (struct affinity_context){
@@ -8421,8 +8436,6 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
retval = __sched_setaffinity(p, &ac);
kfree(ac.user_mask);
-out_put_task:
- put_task_struct(p);
return retval;
}
@@ -8464,28 +8477,21 @@ SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
long sched_getaffinity(pid_t pid, struct cpumask *mask)
{
struct task_struct *p;
- unsigned long flags;
int retval;
- rcu_read_lock();
-
- retval = -ESRCH;
+ guard(rcu)();
p = find_process_by_pid(pid);
if (!p)
- goto out_unlock;
+ return -ESRCH;
retval = security_task_getscheduler(p);
if (retval)
- goto out_unlock;
+ return retval;
- raw_spin_lock_irqsave(&p->pi_lock, flags);
+ guard(raw_spinlock_irqsave)(&p->pi_lock);
cpumask_and(mask, &p->cpus_mask, cpu_active_mask);
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
-
-out_unlock:
- rcu_read_unlock();
- return retval;
+ return 0;
}
/**
@@ -8932,55 +8938,46 @@ int __sched yield_to(struct task_struct *p, bool preempt)
{
struct task_struct *curr = current;
struct rq *rq, *p_rq;
- unsigned long flags;
int yielded = 0;
- local_irq_save(flags);
- rq = this_rq();
+ scoped_guard (irqsave) {
+ rq = this_rq();
again:
- p_rq = task_rq(p);
- /*
- * If we're the only runnable task on the rq and target rq also
- * has only one task, there's absolutely no point in yielding.
- */
- if (rq->nr_running == 1 && p_rq->nr_running == 1) {
- yielded = -ESRCH;
- goto out_irq;
- }
+ p_rq = task_rq(p);
+ /*
+ * If we're the only runnable task on the rq and target rq also
+ * has only one task, there's absolutely no point in yielding.
+ */
+ if (rq->nr_running == 1 && p_rq->nr_running == 1)
+ return -ESRCH;
- double_rq_lock(rq, p_rq);
- if (task_rq(p) != p_rq) {
- double_rq_unlock(rq, p_rq);
- goto again;
- }
+ guard(double_rq_lock)(rq, p_rq);
+ if (task_rq(p) != p_rq)
+ goto again;
- if (!curr->sched_class->yield_to_task)
- goto out_unlock;
+ if (!curr->sched_class->yield_to_task)
+ return 0;
- if (curr->sched_class != p->sched_class)
- goto out_unlock;
+ if (curr->sched_class != p->sched_class)
+ return 0;
- if (task_on_cpu(p_rq, p) || !task_is_running(p))
- goto out_unlock;
+ if (task_on_cpu(p_rq, p) || !task_is_running(p))
+ return 0;
- yielded = curr->sched_class->yield_to_task(rq, p);
- if (yielded) {
- schedstat_inc(rq->yld_count);
- /*
- * Make p's CPU reschedule; pick_next_entity takes care of
- * fairness.
- */
- if (preempt && rq != p_rq)
- resched_curr(p_rq);
+ yielded = curr->sched_class->yield_to_task(rq, p);
+ if (yielded) {
+ schedstat_inc(rq->yld_count);
+ /*
+ * Make p's CPU reschedule; pick_next_entity
+ * takes care of fairness.
+ */
+ if (preempt && rq != p_rq)
+ resched_curr(p_rq);
+ }
}
-out_unlock:
- double_rq_unlock(rq, p_rq);
-out_irq:
- local_irq_restore(flags);
-
- if (yielded > 0)
+ if (yielded)
schedule();
return yielded;
@@ -9083,38 +9080,30 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
static int sched_rr_get_interval(pid_t pid, struct timespec64 *t)
{
- struct task_struct *p;
- unsigned int time_slice;
- struct rq_flags rf;
- struct rq *rq;
+ unsigned int time_slice = 0;
int retval;
if (pid < 0)
return -EINVAL;
- retval = -ESRCH;
- rcu_read_lock();
- p = find_process_by_pid(pid);
- if (!p)
- goto out_unlock;
+ scoped_guard (rcu) {
+ struct task_struct *p = find_process_by_pid(pid);
+ if (!p)
+ return -ESRCH;
- retval = security_task_getscheduler(p);
- if (retval)
- goto out_unlock;
+ retval = security_task_getscheduler(p);
+ if (retval)
+ return retval;
- rq = task_rq_lock(p, &rf);
- time_slice = 0;
- if (p->sched_class->get_rr_interval)
- time_slice = p->sched_class->get_rr_interval(rq, p);
- task_rq_unlock(rq, p, &rf);
+ scoped_guard (task_rq_lock, p) {
+ struct rq *rq = scope.rq;
+ if (p->sched_class->get_rr_interval)
+ time_slice = p->sched_class->get_rr_interval(rq, p);
+ }
+ }
- rcu_read_unlock();
jiffies_to_timespec64(time_slice, t);
return 0;
-
-out_unlock:
- rcu_read_unlock();
- return retval;
}
/**
@@ -9173,9 +9162,9 @@ void sched_show_task(struct task_struct *p)
if (pid_alive(p))
ppid = task_pid_nr(rcu_dereference(p->real_parent));
rcu_read_unlock();
- pr_cont(" stack:%-5lu pid:%-5d ppid:%-6d flags:0x%08lx\n",
- free, task_pid_nr(p), ppid,
- read_task_thread_flags(p));
+ pr_cont(" stack:%-5lu pid:%-5d tgid:%-5d ppid:%-6d flags:0x%08lx\n",
+ free, task_pid_nr(p), task_tgid_nr(p),
+ ppid, read_task_thread_flags(p));
print_worker_info(KERN_INFO, p);
print_stop_info(KERN_INFO, p);
@@ -9269,7 +9258,7 @@ void __init init_idle(struct task_struct *idle, int cpu)
* PF_KTHREAD should already be set at this point; regardless, make it
* look like a proper per-CPU kthread.
*/
- idle->flags |= PF_IDLE | PF_KTHREAD | PF_NO_SETAFFINITY;
+ idle->flags |= PF_KTHREAD | PF_NO_SETAFFINITY;
kthread_set_per_cpu(idle, cpu);
#ifdef CONFIG_SMP
@@ -9505,9 +9494,11 @@ static void balance_push(struct rq *rq)
* Temporarily drop rq->lock such that we can wake-up the stop task.
* Both preemption and IRQs are still disabled.
*/
+ preempt_disable();
raw_spin_rq_unlock(rq);
stop_one_cpu_nowait(rq->cpu, __balance_push_cpu_stop, push_task,
this_cpu_ptr(&push_work));
+ preempt_enable();
/*
* At this point need_resched() is true and we'll take the loop in
* schedule(). The next pick is obviously going to be the stop task
@@ -9903,7 +9894,7 @@ struct task_group root_task_group;
LIST_HEAD(task_groups);
/* Cacheline aligned slab cache for task_group */
-static struct kmem_cache *task_group_cache __read_mostly;
+static struct kmem_cache *task_group_cache __ro_after_init;
#endif
void __init sched_init(void)
@@ -10013,7 +10004,7 @@ void __init sched_init(void)
#ifdef CONFIG_SMP
rq->sd = NULL;
rq->rd = NULL;
- rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
+ rq->cpu_capacity = SCHED_CAPACITY_SCALE;
rq->balance_callback = &balance_push_callback;
rq->active_balance = 0;
rq->next_balance = jiffies;
@@ -10022,8 +10013,6 @@ void __init sched_init(void)
rq->online = 0;
rq->idle_stamp = 0;
rq->avg_idle = 2*sysctl_sched_migration_cost;
- rq->wake_stamp = jiffies;
- rq->wake_avg_idle = rq->avg_idle;
rq->max_idle_balance_cost = sysctl_sched_migration_cost;
INIT_LIST_HEAD(&rq->cfs_tasks);
@@ -10289,9 +10278,9 @@ void normalize_rt_tasks(void)
#endif /* CONFIG_MAGIC_SYSRQ */
-#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
+#if defined(CONFIG_KGDB_KDB)
/*
- * These functions are only useful for the IA64 MCA handling, or kdb.
+ * These functions are only useful for kdb.
*
* They can only be called when the whole system has been
* stopped - every CPU needs to be quiescent, and no scheduling
@@ -10313,30 +10302,7 @@ struct task_struct *curr_task(int cpu)
return cpu_curr(cpu);
}
-#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */
-
-#ifdef CONFIG_IA64
-/**
- * ia64_set_curr_task - set the current task for a given CPU.
- * @cpu: the processor in question.
- * @p: the task pointer to set.
- *
- * Description: This function must only be used when non-maskable interrupts
- * are serviced on a separate stack. It allows the architecture to switch the
- * notion of the current task on a CPU in a non-blocking manner. This function
- * must be called with all CPU's synchronized, and interrupts disabled, the
- * and caller must save the original value of the current task (see
- * curr_task() above) and restore that value before reenabling interrupts and
- * re-starting the system.
- *
- * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
- */
-void ia64_set_curr_task(int cpu, struct task_struct *p)
-{
- cpu_curr(cpu) = p;
-}
-
-#endif
+#endif /* defined(CONFIG_KGDB_KDB) */
#ifdef CONFIG_CGROUP_SCHED
/* task_group_lock serializes the addition/removal of task groups */
@@ -10498,17 +10464,18 @@ void sched_move_task(struct task_struct *tsk)
int queued, running, queue_flags =
DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
struct task_group *group;
- struct rq_flags rf;
struct rq *rq;
- rq = task_rq_lock(tsk, &rf);
+ CLASS(task_rq_lock, rq_guard)(tsk);
+ rq = rq_guard.rq;
+
/*
* Esp. with SCHED_AUTOGROUP enabled it is possible to get superfluous
* group changes.
*/
group = sched_get_task_group(tsk);
if (group == tsk->sched_task_group)
- goto unlock;
+ return;
update_rq_clock(rq);
@@ -10533,9 +10500,6 @@ void sched_move_task(struct task_struct *tsk)
*/
resched_curr(rq);
}
-
-unlock:
- task_rq_unlock(rq, tsk, &rf);
}
static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
@@ -10572,11 +10536,9 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
#ifdef CONFIG_UCLAMP_TASK_GROUP
/* Propagate the effective uclamp value for the new group */
- mutex_lock(&uclamp_mutex);
- rcu_read_lock();
+ guard(mutex)(&uclamp_mutex);
+ guard(rcu)();
cpu_util_update_eff(css);
- rcu_read_unlock();
- mutex_unlock(&uclamp_mutex);
#endif
return 0;
@@ -10727,8 +10689,8 @@ static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf,
static_branch_enable(&sched_uclamp_used);
- mutex_lock(&uclamp_mutex);
- rcu_read_lock();
+ guard(mutex)(&uclamp_mutex);
+ guard(rcu)();
tg = css_tg(of_css(of));
if (tg->uclamp_req[clamp_id].value != req.util)
@@ -10743,9 +10705,6 @@ static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf,
/* Update effective clamps to track the most restrictive value */
cpu_util_update_eff(of_css(of));
- rcu_read_unlock();
- mutex_unlock(&uclamp_mutex);
-
return nbytes;
}
@@ -10771,10 +10730,10 @@ static inline void cpu_uclamp_print(struct seq_file *sf,
u64 percent;
u32 rem;
- rcu_read_lock();
- tg = css_tg(seq_css(sf));
- util_clamp = tg->uclamp_req[clamp_id].value;
- rcu_read_unlock();
+ scoped_guard (rcu) {
+ tg = css_tg(seq_css(sf));
+ util_clamp = tg->uclamp_req[clamp_id].value;
+ }
if (util_clamp == SCHED_CAPACITY_SCALE) {
seq_puts(sf, "max\n");
@@ -10865,11 +10824,12 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota,
* Prevent race between setting of cfs_rq->runtime_enabled and
* unthrottle_offline_cfs_rqs().
*/
- cpus_read_lock();
- mutex_lock(&cfs_constraints_mutex);
+ guard(cpus_read_lock)();
+ guard(mutex)(&cfs_constraints_mutex);
+
ret = __cfs_schedulable(tg, period, quota);
if (ret)
- goto out_unlock;
+ return ret;
runtime_enabled = quota != RUNTIME_INF;
runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
@@ -10879,39 +10839,38 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota,
*/
if (runtime_enabled && !runtime_was_enabled)
cfs_bandwidth_usage_inc();
- raw_spin_lock_irq(&cfs_b->lock);
- cfs_b->period = ns_to_ktime(period);
- cfs_b->quota = quota;
- cfs_b->burst = burst;
- __refill_cfs_bandwidth_runtime(cfs_b);
+ scoped_guard (raw_spinlock_irq, &cfs_b->lock) {
+ cfs_b->period = ns_to_ktime(period);
+ cfs_b->quota = quota;
+ cfs_b->burst = burst;
- /* Restart the period timer (if active) to handle new period expiry: */
- if (runtime_enabled)
- start_cfs_bandwidth(cfs_b);
+ __refill_cfs_bandwidth_runtime(cfs_b);
- raw_spin_unlock_irq(&cfs_b->lock);
+ /*
+ * Restart the period timer (if active) to handle new
+ * period expiry:
+ */
+ if (runtime_enabled)
+ start_cfs_bandwidth(cfs_b);
+ }
for_each_online_cpu(i) {
struct cfs_rq *cfs_rq = tg->cfs_rq[i];
struct rq *rq = cfs_rq->rq;
- struct rq_flags rf;
- rq_lock_irq(rq, &rf);
+ guard(rq_lock_irq)(rq);
cfs_rq->runtime_enabled = runtime_enabled;
cfs_rq->runtime_remaining = 0;
if (cfs_rq->throttled)
unthrottle_cfs_rq(cfs_rq);
- rq_unlock_irq(rq, &rf);
}
+
if (runtime_was_enabled && !runtime_enabled)
cfs_bandwidth_usage_dec();
-out_unlock:
- mutex_unlock(&cfs_constraints_mutex);
- cpus_read_unlock();
- return ret;
+ return 0;
}
static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
@@ -11096,7 +11055,6 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
{
- int ret;
struct cfs_schedulable_data data = {
.tg = tg,
.period = period,
@@ -11108,11 +11066,8 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
do_div(data.quota, NSEC_PER_USEC);
}
- rcu_read_lock();
- ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
- rcu_read_unlock();
-
- return ret;
+ guard(rcu)();
+ return walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
}
static int cpu_cfs_stat_show(struct seq_file *sf, void *v)
@@ -11717,14 +11672,12 @@ int __sched_mm_cid_migrate_from_fetch_cid(struct rq *src_rq,
* are not the last task to be migrated from this cpu for this mm, so
* there is no need to move src_cid to the destination cpu.
*/
- rcu_read_lock();
+ guard(rcu)();
src_task = rcu_dereference(src_rq->curr);
if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) {
- rcu_read_unlock();
t->last_mm_cid = -1;
return -1;
}
- rcu_read_unlock();
return src_cid;
}
@@ -11768,18 +11721,17 @@ int __sched_mm_cid_migrate_from_try_steal_cid(struct rq *src_rq,
* the lazy-put flag, this task will be responsible for transitioning
* from lazy-put flag set to MM_CID_UNSET.
*/
- rcu_read_lock();
- src_task = rcu_dereference(src_rq->curr);
- if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) {
- rcu_read_unlock();
- /*
- * We observed an active task for this mm, there is therefore
- * no point in moving this cid to the destination cpu.
- */
- t->last_mm_cid = -1;
- return -1;
+ scoped_guard (rcu) {
+ src_task = rcu_dereference(src_rq->curr);
+ if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) {
+ /*
+ * We observed an active task for this mm, there is therefore
+ * no point in moving this cid to the destination cpu.
+ */
+ t->last_mm_cid = -1;
+ return -1;
+ }
}
- rcu_read_unlock();
/*
* The src_cid is unused, so it can be unset.
@@ -11852,7 +11804,6 @@ static void sched_mm_cid_remote_clear(struct mm_struct *mm, struct mm_cid *pcpu_
{
struct rq *rq = cpu_rq(cpu);
struct task_struct *t;
- unsigned long flags;
int cid, lazy_cid;
cid = READ_ONCE(pcpu_cid->cid);
@@ -11887,23 +11838,21 @@ static void sched_mm_cid_remote_clear(struct mm_struct *mm, struct mm_cid *pcpu_
* the lazy-put flag, that task will be responsible for transitioning
* from lazy-put flag set to MM_CID_UNSET.
*/
- rcu_read_lock();
- t = rcu_dereference(rq->curr);
- if (READ_ONCE(t->mm_cid_active) && t->mm == mm) {
- rcu_read_unlock();
- return;
+ scoped_guard (rcu) {
+ t = rcu_dereference(rq->curr);
+ if (READ_ONCE(t->mm_cid_active) && t->mm == mm)
+ return;
}
- rcu_read_unlock();
/*
* The cid is unused, so it can be unset.
* Disable interrupts to keep the window of cid ownership without rq
* lock small.
*/
- local_irq_save(flags);
- if (try_cmpxchg(&pcpu_cid->cid, &lazy_cid, MM_CID_UNSET))
- __mm_cid_put(mm, cid);
- local_irq_restore(flags);
+ scoped_guard (irqsave) {
+ if (try_cmpxchg(&pcpu_cid->cid, &lazy_cid, MM_CID_UNSET))
+ __mm_cid_put(mm, cid);
+ }
}
static void sched_mm_cid_remote_clear_old(struct mm_struct *mm, int cpu)
@@ -11925,14 +11874,13 @@ static void sched_mm_cid_remote_clear_old(struct mm_struct *mm, int cpu)
* snapshot associated with this cid if an active task using the mm is
* observed on this rq.
*/
- rcu_read_lock();
- curr = rcu_dereference(rq->curr);
- if (READ_ONCE(curr->mm_cid_active) && curr->mm == mm) {
- WRITE_ONCE(pcpu_cid->time, rq_clock);
- rcu_read_unlock();
- return;
+ scoped_guard (rcu) {
+ curr = rcu_dereference(rq->curr);
+ if (READ_ONCE(curr->mm_cid_active) && curr->mm == mm) {
+ WRITE_ONCE(pcpu_cid->time, rq_clock);
+ return;
+ }
}
- rcu_read_unlock();
if (rq_clock < pcpu_cid->time + SCHED_MM_CID_PERIOD_NS)
return;
@@ -12026,7 +11974,6 @@ void task_tick_mm_cid(struct rq *rq, struct task_struct *curr)
void sched_mm_cid_exit_signals(struct task_struct *t)
{
struct mm_struct *mm = t->mm;
- struct rq_flags rf;
struct rq *rq;
if (!mm)
@@ -12034,7 +11981,7 @@ void sched_mm_cid_exit_signals(struct task_struct *t)
preempt_disable();
rq = this_rq();
- rq_lock_irqsave(rq, &rf);
+ guard(rq_lock_irqsave)(rq);
preempt_enable_no_resched(); /* holding spinlock */
WRITE_ONCE(t->mm_cid_active, 0);
/*
@@ -12044,13 +11991,11 @@ void sched_mm_cid_exit_signals(struct task_struct *t)
smp_mb();
mm_cid_put(mm);
t->last_mm_cid = t->mm_cid = -1;
- rq_unlock_irqrestore(rq, &rf);
}
void sched_mm_cid_before_execve(struct task_struct *t)
{
struct mm_struct *mm = t->mm;
- struct rq_flags rf;
struct rq *rq;
if (!mm)
@@ -12058,7 +12003,7 @@ void sched_mm_cid_before_execve(struct task_struct *t)
preempt_disable();
rq = this_rq();
- rq_lock_irqsave(rq, &rf);
+ guard(rq_lock_irqsave)(rq);
preempt_enable_no_resched(); /* holding spinlock */
WRITE_ONCE(t->mm_cid_active, 0);
/*
@@ -12068,13 +12013,11 @@ void sched_mm_cid_before_execve(struct task_struct *t)
smp_mb();
mm_cid_put(mm);
t->last_mm_cid = t->mm_cid = -1;
- rq_unlock_irqrestore(rq, &rf);
}
void sched_mm_cid_after_execve(struct task_struct *t)
{
struct mm_struct *mm = t->mm;
- struct rq_flags rf;
struct rq *rq;
if (!mm)
@@ -12082,16 +12025,16 @@ void sched_mm_cid_after_execve(struct task_struct *t)
preempt_disable();
rq = this_rq();
- rq_lock_irqsave(rq, &rf);
- preempt_enable_no_resched(); /* holding spinlock */
- WRITE_ONCE(t->mm_cid_active, 1);
- /*
- * Store t->mm_cid_active before loading per-mm/cpu cid.
- * Matches barrier in sched_mm_cid_remote_clear_old().
- */
- smp_mb();
- t->last_mm_cid = t->mm_cid = mm_cid_get(rq, mm);
- rq_unlock_irqrestore(rq, &rf);
+ scoped_guard (rq_lock_irqsave, rq) {
+ preempt_enable_no_resched(); /* holding spinlock */
+ WRITE_ONCE(t->mm_cid_active, 1);
+ /*
+ * Store t->mm_cid_active before loading per-mm/cpu cid.
+ * Matches barrier in sched_mm_cid_remote_clear_old().
+ */
+ smp_mb();
+ t->last_mm_cid = t->mm_cid = mm_cid_get(rq, mm);
+ }
rseq_set_notify_resume(t);
}
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 57c92d751bcd..95baa12a1029 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -131,7 +131,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
if (!dl_task_fits_capacity(p, cpu)) {
cpumask_clear_cpu(cpu, later_mask);
- cap = capacity_orig_of(cpu);
+ cap = arch_scale_cpu_capacity(cpu);
if (cap > max_cap ||
(cpu == task_cpu(p) && cap == max_cap)) {
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 4492608b7d7f..eece6244f9d2 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -47,7 +47,7 @@ struct sugov_cpu {
u64 last_update;
unsigned long util;
- unsigned long bw_dl;
+ unsigned long bw_min;
/* The field below is for single-CPU policies only: */
#ifdef CONFIG_NO_HZ_COMMON
@@ -115,6 +115,32 @@ static void sugov_deferred_update(struct sugov_policy *sg_policy)
}
/**
+ * get_capacity_ref_freq - get the reference frequency that has been used to
+ * correlate frequency and compute capacity for a given cpufreq policy. We use
+ * the CPU managing it for the arch_scale_freq_ref() call in the function.
+ * @policy: the cpufreq policy of the CPU in question.
+ *
+ * Return: the reference CPU frequency to compute a capacity.
+ */
+static __always_inline
+unsigned long get_capacity_ref_freq(struct cpufreq_policy *policy)
+{
+ unsigned int freq = arch_scale_freq_ref(policy->cpu);
+
+ if (freq)
+ return freq;
+
+ if (arch_scale_freq_invariant())
+ return policy->cpuinfo.max_freq;
+
+ /*
+ * Apply a 25% margin so that we select a higher frequency than
+ * the current one before the CPU is fully busy:
+ */
+ return policy->cur + (policy->cur >> 2);
+}
+
+/**
* get_next_freq - Compute a new frequency for a given cpufreq policy.
* @sg_policy: schedutil policy object to compute the new frequency for.
* @util: Current CPU utilization.
@@ -140,10 +166,9 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy,
unsigned long util, unsigned long max)
{
struct cpufreq_policy *policy = sg_policy->policy;
- unsigned int freq = arch_scale_freq_invariant() ?
- policy->cpuinfo.max_freq : policy->cur;
+ unsigned int freq;
- util = map_util_perf(util);
+ freq = get_capacity_ref_freq(policy);
freq = map_util_freq(util, freq, max);
if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update)
@@ -153,14 +178,31 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy,
return cpufreq_driver_resolve_freq(policy, freq);
}
-static void sugov_get_util(struct sugov_cpu *sg_cpu)
+unsigned long sugov_effective_cpu_perf(int cpu, unsigned long actual,
+ unsigned long min,
+ unsigned long max)
{
- unsigned long util = cpu_util_cfs_boost(sg_cpu->cpu);
- struct rq *rq = cpu_rq(sg_cpu->cpu);
+ /* Add dvfs headroom to actual utilization */
+ actual = map_util_perf(actual);
+ /* Actually we don't need to target the max performance */
+ if (actual < max)
+ max = actual;
- sg_cpu->bw_dl = cpu_bw_dl(rq);
- sg_cpu->util = effective_cpu_util(sg_cpu->cpu, util,
- FREQUENCY_UTIL, NULL);
+ /*
+ * Ensure at least minimum performance while providing more compute
+ * capacity when possible.
+ */
+ return max(min, max);
+}
+
+static void sugov_get_util(struct sugov_cpu *sg_cpu, unsigned long boost)
+{
+ unsigned long min, max, util = cpu_util_cfs_boost(sg_cpu->cpu);
+
+ util = effective_cpu_util(sg_cpu->cpu, util, &min, &max);
+ util = max(util, boost);
+ sg_cpu->bw_min = min;
+ sg_cpu->util = sugov_effective_cpu_perf(sg_cpu->cpu, util, min, max);
}
/**
@@ -251,18 +293,16 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
* This mechanism is designed to boost high frequently IO waiting tasks, while
* being more conservative on tasks which does sporadic IO operations.
*/
-static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time,
+static unsigned long sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time,
unsigned long max_cap)
{
- unsigned long boost;
-
/* No boost currently required */
if (!sg_cpu->iowait_boost)
- return;
+ return 0;
/* Reset boost if the CPU appears to have been idle enough */
if (sugov_iowait_reset(sg_cpu, time, false))
- return;
+ return 0;
if (!sg_cpu->iowait_boost_pending) {
/*
@@ -271,7 +311,7 @@ static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time,
sg_cpu->iowait_boost >>= 1;
if (sg_cpu->iowait_boost < IOWAIT_BOOST_MIN) {
sg_cpu->iowait_boost = 0;
- return;
+ return 0;
}
}
@@ -281,10 +321,7 @@ static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time,
* sg_cpu->util is already in capacity scale; convert iowait_boost
* into the same scale so we can compare.
*/
- boost = (sg_cpu->iowait_boost * max_cap) >> SCHED_CAPACITY_SHIFT;
- boost = uclamp_rq_util_with(cpu_rq(sg_cpu->cpu), boost, NULL);
- if (sg_cpu->util < boost)
- sg_cpu->util = boost;
+ return (sg_cpu->iowait_boost * max_cap) >> SCHED_CAPACITY_SHIFT;
}
#ifdef CONFIG_NO_HZ_COMMON
@@ -306,7 +343,7 @@ static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; }
*/
static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu)
{
- if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl)
+ if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_min)
sg_cpu->sg_policy->limits_changed = true;
}
@@ -314,6 +351,8 @@ static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu,
u64 time, unsigned long max_cap,
unsigned int flags)
{
+ unsigned long boost;
+
sugov_iowait_boost(sg_cpu, time, flags);
sg_cpu->last_update = time;
@@ -322,8 +361,8 @@ static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu,
if (!sugov_should_update_freq(sg_cpu->sg_policy, time))
return false;
- sugov_get_util(sg_cpu);
- sugov_iowait_apply(sg_cpu, time, max_cap);
+ boost = sugov_iowait_apply(sg_cpu, time, max_cap);
+ sugov_get_util(sg_cpu, boost);
return true;
}
@@ -350,7 +389,8 @@ static void sugov_update_single_freq(struct update_util_data *hook, u64 time,
* Except when the rq is capped by uclamp_max.
*/
if (!uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu)) &&
- sugov_cpu_is_busy(sg_cpu) && next_f < sg_policy->next_freq) {
+ sugov_cpu_is_busy(sg_cpu) && next_f < sg_policy->next_freq &&
+ !sg_policy->need_freq_update) {
next_f = sg_policy->next_freq;
/* Restore cached freq as next_freq has changed */
@@ -406,8 +446,8 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time,
sugov_cpu_is_busy(sg_cpu) && sg_cpu->util < prev_util)
sg_cpu->util = prev_util;
- cpufreq_driver_adjust_perf(sg_cpu->cpu, map_util_perf(sg_cpu->bw_dl),
- map_util_perf(sg_cpu->util), max_cap);
+ cpufreq_driver_adjust_perf(sg_cpu->cpu, sg_cpu->bw_min,
+ sg_cpu->util, max_cap);
sg_cpu->sg_policy->last_freq_update_time = time;
}
@@ -423,9 +463,10 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
for_each_cpu(j, policy->cpus) {
struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j);
+ unsigned long boost;
- sugov_get_util(j_sg_cpu);
- sugov_iowait_apply(j_sg_cpu, time, max_cap);
+ boost = sugov_iowait_apply(j_sg_cpu, time, max_cap);
+ sugov_get_util(j_sg_cpu, boost);
util = max(j_sg_cpu->util, util);
}
@@ -555,6 +596,31 @@ static const struct kobj_type sugov_tunables_ktype = {
/********************** cpufreq governor interface *********************/
+#ifdef CONFIG_ENERGY_MODEL
+static void rebuild_sd_workfn(struct work_struct *work)
+{
+ rebuild_sched_domains_energy();
+}
+
+static DECLARE_WORK(rebuild_sd_work, rebuild_sd_workfn);
+
+/*
+ * EAS shouldn't be attempted without sugov, so rebuild the sched_domains
+ * on governor changes to make sure the scheduler knows about it.
+ */
+static void sugov_eas_rebuild_sd(void)
+{
+ /*
+ * When called from the cpufreq_register_driver() path, the
+ * cpu_hotplug_lock is already held, so use a work item to
+ * avoid nested locking in rebuild_sched_domains().
+ */
+ schedule_work(&rebuild_sd_work);
+}
+#else
+static inline void sugov_eas_rebuild_sd(void) { };
+#endif
+
struct cpufreq_governor schedutil_gov;
static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy)
@@ -709,6 +775,8 @@ static int sugov_init(struct cpufreq_policy *policy)
if (ret)
goto fail;
+ sugov_eas_rebuild_sd();
+
out:
mutex_unlock(&global_tunables_lock);
return 0;
@@ -750,6 +818,8 @@ static void sugov_exit(struct cpufreq_policy *policy)
sugov_kthread_stop(sg_policy);
sugov_policy_free(sg_policy);
cpufreq_disable_fast_switch(policy);
+
+ sugov_eas_rebuild_sd();
}
static int sugov_start(struct cpufreq_policy *policy)
@@ -767,14 +837,6 @@ static int sugov_start(struct cpufreq_policy *policy)
sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS);
- for_each_cpu(cpu, policy->cpus) {
- struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);
-
- memset(sg_cpu, 0, sizeof(*sg_cpu));
- sg_cpu->cpu = cpu;
- sg_cpu->sg_policy = sg_policy;
- }
-
if (policy_is_shared(policy))
uu = sugov_update_shared;
else if (policy->fast_switch_enabled && cpufreq_driver_has_adjust_perf())
@@ -785,6 +847,9 @@ static int sugov_start(struct cpufreq_policy *policy)
for_each_cpu(cpu, policy->cpus) {
struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);
+ memset(sg_cpu, 0, sizeof(*sg_cpu));
+ sg_cpu->cpu = cpu;
+ sg_cpu->sg_policy = sg_policy;
cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, uu);
}
return 0;
@@ -838,29 +903,3 @@ struct cpufreq_governor *cpufreq_default_governor(void)
#endif
cpufreq_governor_init(schedutil_gov);
-
-#ifdef CONFIG_ENERGY_MODEL
-static void rebuild_sd_workfn(struct work_struct *work)
-{
- rebuild_sched_domains_energy();
-}
-static DECLARE_WORK(rebuild_sd_work, rebuild_sd_workfn);
-
-/*
- * EAS shouldn't be attempted without sugov, so rebuild the sched_domains
- * on governor changes to make sure the scheduler knows about it.
- */
-void sched_cpufreq_governor_change(struct cpufreq_policy *policy,
- struct cpufreq_governor *old_gov)
-{
- if (old_gov == &schedutil_gov || policy->governor == &schedutil_gov) {
- /*
- * When called from the cpufreq_register_driver() path, the
- * cpu_hotplug_lock is already held, so use a work item to
- * avoid nested locking in rebuild_sched_domains().
- */
- schedule_work(&rebuild_sd_work);
- }
-
-}
-#endif
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index a286e726eb4b..42c40cfdf836 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -101,6 +101,7 @@ static inline int __cpupri_find(struct cpupri *cp, struct task_struct *p,
if (lowest_mask) {
cpumask_and(lowest_mask, &p->cpus_mask, vec->mask);
+ cpumask_and(lowest_mask, lowest_mask, cpu_active_mask);
/*
* We have to ensure that we have at least one bit
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 58b542bf2893..a04a436af8cc 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -54,8 +54,14 @@ static int __init sched_dl_sysctl_init(void)
late_initcall(sched_dl_sysctl_init);
#endif
+static bool dl_server(struct sched_dl_entity *dl_se)
+{
+ return dl_se->dl_server;
+}
+
static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se)
{
+ BUG_ON(dl_server(dl_se));
return container_of(dl_se, struct task_struct, dl);
}
@@ -64,12 +70,19 @@ static inline struct rq *rq_of_dl_rq(struct dl_rq *dl_rq)
return container_of(dl_rq, struct rq, dl);
}
-static inline struct dl_rq *dl_rq_of_se(struct sched_dl_entity *dl_se)
+static inline struct rq *rq_of_dl_se(struct sched_dl_entity *dl_se)
{
- struct task_struct *p = dl_task_of(dl_se);
- struct rq *rq = task_rq(p);
+ struct rq *rq = dl_se->rq;
- return &rq->dl;
+ if (!dl_server(dl_se))
+ rq = task_rq(dl_task_of(dl_se));
+
+ return rq;
+}
+
+static inline struct dl_rq *dl_rq_of_se(struct sched_dl_entity *dl_se)
+{
+ return &rq_of_dl_se(dl_se)->dl;
}
static inline int on_dl_rq(struct sched_dl_entity *dl_se)
@@ -132,7 +145,7 @@ static inline unsigned long __dl_bw_capacity(const struct cpumask *mask)
int i;
for_each_cpu_and(i, mask, cpu_active_mask)
- cap += capacity_orig_of(i);
+ cap += arch_scale_cpu_capacity(i);
return cap;
}
@@ -144,7 +157,7 @@ static inline unsigned long __dl_bw_capacity(const struct cpumask *mask)
static inline unsigned long dl_bw_capacity(int i)
{
if (!sched_asym_cpucap_active() &&
- capacity_orig_of(i) == SCHED_CAPACITY_SCALE) {
+ arch_scale_cpu_capacity(i) == SCHED_CAPACITY_SCALE) {
return dl_bw_cpus(i) << SCHED_CAPACITY_SHIFT;
} else {
RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
@@ -335,6 +348,8 @@ static void dl_change_utilization(struct task_struct *p, u64 new_bw)
__add_rq_bw(new_bw, &rq->dl);
}
+static void __dl_clear_params(struct sched_dl_entity *dl_se);
+
/*
* The utilization of a task cannot be immediately removed from
* the rq active utilization (running_bw) when the task blocks.
@@ -389,12 +404,11 @@ static void dl_change_utilization(struct task_struct *p, u64 new_bw)
* up, and checks if the task is still in the "ACTIVE non contending"
* state or not (in the second case, it updates running_bw).
*/
-static void task_non_contending(struct task_struct *p)
+static void task_non_contending(struct sched_dl_entity *dl_se)
{
- struct sched_dl_entity *dl_se = &p->dl;
struct hrtimer *timer = &dl_se->inactive_timer;
- struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
- struct rq *rq = rq_of_dl_rq(dl_rq);
+ struct rq *rq = rq_of_dl_se(dl_se);
+ struct dl_rq *dl_rq = &rq->dl;
s64 zerolag_time;
/*
@@ -424,24 +438,33 @@ static void task_non_contending(struct task_struct *p)
* utilization now, instead of starting a timer
*/
if ((zerolag_time < 0) || hrtimer_active(&dl_se->inactive_timer)) {
- if (dl_task(p))
+ if (dl_server(dl_se)) {
sub_running_bw(dl_se, dl_rq);
- if (!dl_task(p) || READ_ONCE(p->__state) == TASK_DEAD) {
- struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
-
- if (READ_ONCE(p->__state) == TASK_DEAD)
- sub_rq_bw(&p->dl, &rq->dl);
- raw_spin_lock(&dl_b->lock);
- __dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
- raw_spin_unlock(&dl_b->lock);
- __dl_clear_params(p);
+ } else {
+ struct task_struct *p = dl_task_of(dl_se);
+
+ if (dl_task(p))
+ sub_running_bw(dl_se, dl_rq);
+
+ if (!dl_task(p) || READ_ONCE(p->__state) == TASK_DEAD) {
+ struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
+
+ if (READ_ONCE(p->__state) == TASK_DEAD)
+ sub_rq_bw(dl_se, &rq->dl);
+ raw_spin_lock(&dl_b->lock);
+ __dl_sub(dl_b, dl_se->dl_bw, dl_bw_cpus(task_cpu(p)));
+ raw_spin_unlock(&dl_b->lock);
+ __dl_clear_params(dl_se);
+ }
}
return;
}
dl_se->dl_non_contending = 1;
- get_task_struct(p);
+ if (!dl_server(dl_se))
+ get_task_struct(dl_task_of(dl_se));
+
hrtimer_start(timer, ns_to_ktime(zerolag_time), HRTIMER_MODE_REL_HARD);
}
@@ -468,8 +491,10 @@ static void task_contending(struct sched_dl_entity *dl_se, int flags)
* will not touch the rq's active utilization,
* so we are still safe.
*/
- if (hrtimer_try_to_cancel(&dl_se->inactive_timer) == 1)
- put_task_struct(dl_task_of(dl_se));
+ if (hrtimer_try_to_cancel(&dl_se->inactive_timer) == 1) {
+ if (!dl_server(dl_se))
+ put_task_struct(dl_task_of(dl_se));
+ }
} else {
/*
* Since "dl_non_contending" is not set, the
@@ -482,10 +507,8 @@ static void task_contending(struct sched_dl_entity *dl_se, int flags)
}
}
-static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq)
+static inline int is_leftmost(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
{
- struct sched_dl_entity *dl_se = &p->dl;
-
return rb_first_cached(&dl_rq->root) == &dl_se->rb_node;
}
@@ -509,7 +532,6 @@ void init_dl_rq(struct dl_rq *dl_rq)
/* zero means no -deadline tasks */
dl_rq->earliest_dl.curr = dl_rq->earliest_dl.next = 0;
- dl_rq->dl_nr_migratory = 0;
dl_rq->overloaded = 0;
dl_rq->pushable_dl_tasks_root = RB_ROOT_CACHED;
#else
@@ -553,39 +575,6 @@ static inline void dl_clear_overload(struct rq *rq)
cpumask_clear_cpu(rq->cpu, rq->rd->dlo_mask);
}
-static void update_dl_migration(struct dl_rq *dl_rq)
-{
- if (dl_rq->dl_nr_migratory && dl_rq->dl_nr_running > 1) {
- if (!dl_rq->overloaded) {
- dl_set_overload(rq_of_dl_rq(dl_rq));
- dl_rq->overloaded = 1;
- }
- } else if (dl_rq->overloaded) {
- dl_clear_overload(rq_of_dl_rq(dl_rq));
- dl_rq->overloaded = 0;
- }
-}
-
-static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
-{
- struct task_struct *p = dl_task_of(dl_se);
-
- if (p->nr_cpus_allowed > 1)
- dl_rq->dl_nr_migratory++;
-
- update_dl_migration(dl_rq);
-}
-
-static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
-{
- struct task_struct *p = dl_task_of(dl_se);
-
- if (p->nr_cpus_allowed > 1)
- dl_rq->dl_nr_migratory--;
-
- update_dl_migration(dl_rq);
-}
-
#define __node_2_pdl(node) \
rb_entry((node), struct task_struct, pushable_dl_tasks)
@@ -594,6 +583,11 @@ static inline bool __pushable_less(struct rb_node *a, const struct rb_node *b)
return dl_entity_preempt(&__node_2_pdl(a)->dl, &__node_2_pdl(b)->dl);
}
+static inline int has_pushable_dl_tasks(struct rq *rq)
+{
+ return !RB_EMPTY_ROOT(&rq->dl.pushable_dl_tasks_root.rb_root);
+}
+
/*
* The list of pushable -deadline task is not a plist, like in
* sched_rt.c, it is an rb-tree with tasks ordered by deadline.
@@ -609,6 +603,11 @@ static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p)
__pushable_less);
if (leftmost)
rq->dl.earliest_dl.next = p->dl.deadline;
+
+ if (!rq->dl.overloaded) {
+ dl_set_overload(rq);
+ rq->dl.overloaded = 1;
+ }
}
static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p)
@@ -625,11 +624,11 @@ static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p)
dl_rq->earliest_dl.next = __node_2_pdl(leftmost)->dl.deadline;
RB_CLEAR_NODE(&p->pushable_dl_tasks);
-}
-static inline int has_pushable_dl_tasks(struct rq *rq)
-{
- return !RB_EMPTY_ROOT(&rq->dl.pushable_dl_tasks_root.rb_root);
+ if (!has_pushable_dl_tasks(rq) && rq->dl.overloaded) {
+ dl_clear_overload(rq);
+ rq->dl.overloaded = 0;
+ }
}
static int push_dl_task(struct rq *rq);
@@ -761,9 +760,11 @@ static inline void deadline_queue_pull_task(struct rq *rq)
}
#endif /* CONFIG_SMP */
+static void
+enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags);
static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags);
-static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags);
-static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, int flags);
+static void dequeue_dl_entity(struct sched_dl_entity *dl_se, int flags);
+static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, int flags);
static inline void replenish_dl_new_period(struct sched_dl_entity *dl_se,
struct rq *rq)
@@ -1010,8 +1011,7 @@ static inline bool dl_is_implicit(struct sched_dl_entity *dl_se)
*/
static void update_dl_entity(struct sched_dl_entity *dl_se)
{
- struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
- struct rq *rq = rq_of_dl_rq(dl_rq);
+ struct rq *rq = rq_of_dl_se(dl_se);
if (dl_time_before(dl_se->deadline, rq_clock(rq)) ||
dl_entity_overflow(dl_se, rq_clock(rq))) {
@@ -1042,11 +1042,11 @@ static inline u64 dl_next_period(struct sched_dl_entity *dl_se)
* actually started or not (i.e., the replenishment instant is in
* the future or in the past).
*/
-static int start_dl_timer(struct task_struct *p)
+static int start_dl_timer(struct sched_dl_entity *dl_se)
{
- struct sched_dl_entity *dl_se = &p->dl;
struct hrtimer *timer = &dl_se->dl_timer;
- struct rq *rq = task_rq(p);
+ struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
+ struct rq *rq = rq_of_dl_rq(dl_rq);
ktime_t now, act;
s64 delta;
@@ -1080,13 +1080,33 @@ static int start_dl_timer(struct task_struct *p)
* and observe our state.
*/
if (!hrtimer_is_queued(timer)) {
- get_task_struct(p);
+ if (!dl_server(dl_se))
+ get_task_struct(dl_task_of(dl_se));
hrtimer_start(timer, act, HRTIMER_MODE_ABS_HARD);
}
return 1;
}
+static void __push_dl_task(struct rq *rq, struct rq_flags *rf)
+{
+#ifdef CONFIG_SMP
+ /*
+ * Queueing this task back might have overloaded rq, check if we need
+ * to kick someone away.
+ */
+ if (has_pushable_dl_tasks(rq)) {
+ /*
+ * Nothing relies on rq->lock after this, so its safe to drop
+ * rq->lock.
+ */
+ rq_unpin_lock(rq, rf);
+ push_dl_task(rq);
+ rq_repin_lock(rq, rf);
+ }
+#endif
+}
+
/*
* This is the bandwidth enforcement timer callback. If here, we know
* a task is not on its dl_rq, since the fact that the timer was running
@@ -1105,10 +1125,34 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
struct sched_dl_entity *dl_se = container_of(timer,
struct sched_dl_entity,
dl_timer);
- struct task_struct *p = dl_task_of(dl_se);
+ struct task_struct *p;
struct rq_flags rf;
struct rq *rq;
+ if (dl_server(dl_se)) {
+ struct rq *rq = rq_of_dl_se(dl_se);
+ struct rq_flags rf;
+
+ rq_lock(rq, &rf);
+ if (dl_se->dl_throttled) {
+ sched_clock_tick();
+ update_rq_clock(rq);
+
+ if (dl_se->server_has_tasks(dl_se)) {
+ enqueue_dl_entity(dl_se, ENQUEUE_REPLENISH);
+ resched_curr(rq);
+ __push_dl_task(rq, &rf);
+ } else {
+ replenish_dl_entity(dl_se);
+ }
+
+ }
+ rq_unlock(rq, &rf);
+
+ return HRTIMER_NORESTART;
+ }
+
+ p = dl_task_of(dl_se);
rq = task_rq_lock(p, &rf);
/*
@@ -1175,25 +1219,11 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
if (dl_task(rq->curr))
- check_preempt_curr_dl(rq, p, 0);
+ wakeup_preempt_dl(rq, p, 0);
else
resched_curr(rq);
-#ifdef CONFIG_SMP
- /*
- * Queueing this task back might have overloaded rq, check if we need
- * to kick someone away.
- */
- if (has_pushable_dl_tasks(rq)) {
- /*
- * Nothing relies on rq->lock after this, so its safe to drop
- * rq->lock.
- */
- rq_unpin_lock(rq, &rf);
- push_dl_task(rq);
- rq_repin_lock(rq, &rf);
- }
-#endif
+ __push_dl_task(rq, &rf);
unlock:
task_rq_unlock(rq, p, &rf);
@@ -1207,7 +1237,7 @@ unlock:
return HRTIMER_NORESTART;
}
-void init_dl_task_timer(struct sched_dl_entity *dl_se)
+static void init_dl_task_timer(struct sched_dl_entity *dl_se)
{
struct hrtimer *timer = &dl_se->dl_timer;
@@ -1235,12 +1265,11 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se)
*/
static inline void dl_check_constrained_dl(struct sched_dl_entity *dl_se)
{
- struct task_struct *p = dl_task_of(dl_se);
- struct rq *rq = rq_of_dl_rq(dl_rq_of_se(dl_se));
+ struct rq *rq = rq_of_dl_se(dl_se);
if (dl_time_before(dl_se->deadline, rq_clock(rq)) &&
dl_time_before(rq_clock(rq), dl_next_period(dl_se))) {
- if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(p)))
+ if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(dl_se)))
return;
dl_se->dl_throttled = 1;
if (dl_se->runtime > 0)
@@ -1291,44 +1320,19 @@ static u64 grub_reclaim(u64 delta, struct rq *rq, struct sched_dl_entity *dl_se)
return (delta * u_act) >> BW_SHIFT;
}
-/*
- * Update the current task's runtime statistics (provided it is still
- * a -deadline task and has not been removed from the dl_rq).
- */
-static void update_curr_dl(struct rq *rq)
+static inline void
+update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se,
+ int flags);
+static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec)
{
- struct task_struct *curr = rq->curr;
- struct sched_dl_entity *dl_se = &curr->dl;
- u64 delta_exec, scaled_delta_exec;
- int cpu = cpu_of(rq);
- u64 now;
+ s64 scaled_delta_exec;
- if (!dl_task(curr) || !on_dl_rq(dl_se))
- return;
-
- /*
- * Consumed budget is computed considering the time as
- * observed by schedulable tasks (excluding time spent
- * in hardirq context, etc.). Deadlines are instead
- * computed using hard walltime. This seems to be the more
- * natural solution, but the full ramifications of this
- * approach need further study.
- */
- now = rq_clock_task(rq);
- delta_exec = now - curr->se.exec_start;
- if (unlikely((s64)delta_exec <= 0)) {
+ if (unlikely(delta_exec <= 0)) {
if (unlikely(dl_se->dl_yielded))
goto throttle;
return;
}
- schedstat_set(curr->stats.exec_max,
- max(curr->stats.exec_max, delta_exec));
-
- trace_sched_stat_runtime(curr, delta_exec, 0);
-
- update_current_exec_runtime(curr, now, delta_exec);
-
if (dl_entity_is_special(dl_se))
return;
@@ -1340,10 +1344,9 @@ static void update_curr_dl(struct rq *rq)
* according to current frequency and CPU maximum capacity.
*/
if (unlikely(dl_se->flags & SCHED_FLAG_RECLAIM)) {
- scaled_delta_exec = grub_reclaim(delta_exec,
- rq,
- &curr->dl);
+ scaled_delta_exec = grub_reclaim(delta_exec, rq, dl_se);
} else {
+ int cpu = cpu_of(rq);
unsigned long scale_freq = arch_scale_freq_capacity(cpu);
unsigned long scale_cpu = arch_scale_cpu_capacity(cpu);
@@ -1362,11 +1365,20 @@ throttle:
(dl_se->flags & SCHED_FLAG_DL_OVERRUN))
dl_se->dl_overrun = 1;
- __dequeue_task_dl(rq, curr, 0);
- if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(curr)))
- enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);
+ dequeue_dl_entity(dl_se, 0);
+ if (!dl_server(dl_se)) {
+ update_stats_dequeue_dl(&rq->dl, dl_se, 0);
+ dequeue_pushable_dl_task(rq, dl_task_of(dl_se));
+ }
- if (!is_leftmost(curr, &rq->dl))
+ if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(dl_se))) {
+ if (dl_server(dl_se))
+ enqueue_dl_entity(dl_se, ENQUEUE_REPLENISH);
+ else
+ enqueue_task_dl(rq, dl_task_of(dl_se), ENQUEUE_REPLENISH);
+ }
+
+ if (!is_leftmost(dl_se, &rq->dl))
resched_curr(rq);
}
@@ -1396,20 +1408,82 @@ throttle:
}
}
+void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec)
+{
+ update_curr_dl_se(dl_se->rq, dl_se, delta_exec);
+}
+
+void dl_server_start(struct sched_dl_entity *dl_se)
+{
+ if (!dl_server(dl_se)) {
+ dl_se->dl_server = 1;
+ setup_new_dl_entity(dl_se);
+ }
+ enqueue_dl_entity(dl_se, ENQUEUE_WAKEUP);
+}
+
+void dl_server_stop(struct sched_dl_entity *dl_se)
+{
+ dequeue_dl_entity(dl_se, DEQUEUE_SLEEP);
+}
+
+void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
+ dl_server_has_tasks_f has_tasks,
+ dl_server_pick_f pick)
+{
+ dl_se->rq = rq;
+ dl_se->server_has_tasks = has_tasks;
+ dl_se->server_pick = pick;
+}
+
+/*
+ * Update the current task's runtime statistics (provided it is still
+ * a -deadline task and has not been removed from the dl_rq).
+ */
+static void update_curr_dl(struct rq *rq)
+{
+ struct task_struct *curr = rq->curr;
+ struct sched_dl_entity *dl_se = &curr->dl;
+ s64 delta_exec;
+
+ if (!dl_task(curr) || !on_dl_rq(dl_se))
+ return;
+
+ /*
+ * Consumed budget is computed considering the time as
+ * observed by schedulable tasks (excluding time spent
+ * in hardirq context, etc.). Deadlines are instead
+ * computed using hard walltime. This seems to be the more
+ * natural solution, but the full ramifications of this
+ * approach need further study.
+ */
+ delta_exec = update_curr_common(rq);
+ update_curr_dl_se(rq, dl_se, delta_exec);
+}
+
static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer)
{
struct sched_dl_entity *dl_se = container_of(timer,
struct sched_dl_entity,
inactive_timer);
- struct task_struct *p = dl_task_of(dl_se);
+ struct task_struct *p = NULL;
struct rq_flags rf;
struct rq *rq;
- rq = task_rq_lock(p, &rf);
+ if (!dl_server(dl_se)) {
+ p = dl_task_of(dl_se);
+ rq = task_rq_lock(p, &rf);
+ } else {
+ rq = dl_se->rq;
+ rq_lock(rq, &rf);
+ }
sched_clock_tick();
update_rq_clock(rq);
+ if (dl_server(dl_se))
+ goto no_task;
+
if (!dl_task(p) || READ_ONCE(p->__state) == TASK_DEAD) {
struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
@@ -1422,23 +1496,30 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer)
raw_spin_lock(&dl_b->lock);
__dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
raw_spin_unlock(&dl_b->lock);
- __dl_clear_params(p);
+ __dl_clear_params(dl_se);
goto unlock;
}
+
+no_task:
if (dl_se->dl_non_contending == 0)
goto unlock;
sub_running_bw(dl_se, &rq->dl);
dl_se->dl_non_contending = 0;
unlock:
- task_rq_unlock(rq, p, &rf);
- put_task_struct(p);
+
+ if (!dl_server(dl_se)) {
+ task_rq_unlock(rq, p, &rf);
+ put_task_struct(p);
+ } else {
+ rq_unlock(rq, &rf);
+ }
return HRTIMER_NORESTART;
}
-void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se)
+static void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se)
{
struct hrtimer *timer = &dl_se->inactive_timer;
@@ -1496,29 +1577,22 @@ static inline void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {}
static inline
void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
{
- int prio = dl_task_of(dl_se)->prio;
u64 deadline = dl_se->deadline;
- WARN_ON(!dl_prio(prio));
dl_rq->dl_nr_running++;
add_nr_running(rq_of_dl_rq(dl_rq), 1);
inc_dl_deadline(dl_rq, deadline);
- inc_dl_migration(dl_se, dl_rq);
}
static inline
void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
{
- int prio = dl_task_of(dl_se)->prio;
-
- WARN_ON(!dl_prio(prio));
WARN_ON(!dl_rq->dl_nr_running);
dl_rq->dl_nr_running--;
sub_nr_running(rq_of_dl_rq(dl_rq), 1);
dec_dl_deadline(dl_rq, dl_se->deadline);
- dec_dl_migration(dl_se, dl_rq);
}
static inline bool __dl_less(struct rb_node *a, const struct rb_node *b)
@@ -1635,6 +1709,41 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags)
update_stats_enqueue_dl(dl_rq_of_se(dl_se), dl_se, flags);
/*
+ * Check if a constrained deadline task was activated
+ * after the deadline but before the next period.
+ * If that is the case, the task will be throttled and
+ * the replenishment timer will be set to the next period.
+ */
+ if (!dl_se->dl_throttled && !dl_is_implicit(dl_se))
+ dl_check_constrained_dl(dl_se);
+
+ if (flags & (ENQUEUE_RESTORE|ENQUEUE_MIGRATING)) {
+ struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
+
+ add_rq_bw(dl_se, dl_rq);
+ add_running_bw(dl_se, dl_rq);
+ }
+
+ /*
+ * If p is throttled, we do not enqueue it. In fact, if it exhausted
+ * its budget it needs a replenishment and, since it now is on
+ * its rq, the bandwidth timer callback (which clearly has not
+ * run yet) will take care of this.
+ * However, the active utilization does not depend on the fact
+ * that the task is on the runqueue or not (but depends on the
+ * task's state - in GRUB parlance, "inactive" vs "active contending").
+ * In other words, even if a task is throttled its utilization must
+ * be counted in the active utilization; hence, we need to call
+ * add_running_bw().
+ */
+ if (dl_se->dl_throttled && !(flags & ENQUEUE_REPLENISH)) {
+ if (flags & ENQUEUE_WAKEUP)
+ task_contending(dl_se, flags);
+
+ return;
+ }
+
+ /*
* If this is a wakeup or a new instance, the scheduling
* parameters of the task might need updating. Otherwise,
* we want a replenishment of its runtime.
@@ -1645,17 +1754,35 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags)
} else if (flags & ENQUEUE_REPLENISH) {
replenish_dl_entity(dl_se);
} else if ((flags & ENQUEUE_RESTORE) &&
- dl_time_before(dl_se->deadline,
- rq_clock(rq_of_dl_rq(dl_rq_of_se(dl_se))))) {
+ dl_time_before(dl_se->deadline, rq_clock(rq_of_dl_se(dl_se)))) {
setup_new_dl_entity(dl_se);
}
__enqueue_dl_entity(dl_se);
}
-static void dequeue_dl_entity(struct sched_dl_entity *dl_se)
+static void dequeue_dl_entity(struct sched_dl_entity *dl_se, int flags)
{
__dequeue_dl_entity(dl_se);
+
+ if (flags & (DEQUEUE_SAVE|DEQUEUE_MIGRATING)) {
+ struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
+
+ sub_running_bw(dl_se, dl_rq);
+ sub_rq_bw(dl_se, dl_rq);
+ }
+
+ /*
+ * This check allows to start the inactive timer (or to immediately
+ * decrease the active utilization, if needed) in two cases:
+ * when the task blocks and when it is terminating
+ * (p->state == TASK_DEAD). We can handle the two cases in the same
+ * way, because from GRUB's point of view the same thing is happening
+ * (the task moves from "active contending" to "active non contending"
+ * or "inactive")
+ */
+ if (flags & DEQUEUE_SLEEP)
+ task_non_contending(dl_se);
}
static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
@@ -1700,76 +1827,31 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
return;
}
- /*
- * Check if a constrained deadline task was activated
- * after the deadline but before the next period.
- * If that is the case, the task will be throttled and
- * the replenishment timer will be set to the next period.
- */
- if (!p->dl.dl_throttled && !dl_is_implicit(&p->dl))
- dl_check_constrained_dl(&p->dl);
-
- if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & ENQUEUE_RESTORE) {
- add_rq_bw(&p->dl, &rq->dl);
- add_running_bw(&p->dl, &rq->dl);
- }
-
- /*
- * If p is throttled, we do not enqueue it. In fact, if it exhausted
- * its budget it needs a replenishment and, since it now is on
- * its rq, the bandwidth timer callback (which clearly has not
- * run yet) will take care of this.
- * However, the active utilization does not depend on the fact
- * that the task is on the runqueue or not (but depends on the
- * task's state - in GRUB parlance, "inactive" vs "active contending").
- * In other words, even if a task is throttled its utilization must
- * be counted in the active utilization; hence, we need to call
- * add_running_bw().
- */
- if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH)) {
- if (flags & ENQUEUE_WAKEUP)
- task_contending(&p->dl, flags);
-
- return;
- }
-
check_schedstat_required();
update_stats_wait_start_dl(dl_rq_of_se(&p->dl), &p->dl);
+ if (p->on_rq == TASK_ON_RQ_MIGRATING)
+ flags |= ENQUEUE_MIGRATING;
+
enqueue_dl_entity(&p->dl, flags);
- if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
- enqueue_pushable_dl_task(rq, p);
-}
+ if (dl_server(&p->dl))
+ return;
-static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
-{
- update_stats_dequeue_dl(&rq->dl, &p->dl, flags);
- dequeue_dl_entity(&p->dl);
- dequeue_pushable_dl_task(rq, p);
+ if (!task_current(rq, p) && !p->dl.dl_throttled && p->nr_cpus_allowed > 1)
+ enqueue_pushable_dl_task(rq, p);
}
static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
{
update_curr_dl(rq);
- __dequeue_task_dl(rq, p, flags);
- if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & DEQUEUE_SAVE) {
- sub_running_bw(&p->dl, &rq->dl);
- sub_rq_bw(&p->dl, &rq->dl);
- }
+ if (p->on_rq == TASK_ON_RQ_MIGRATING)
+ flags |= DEQUEUE_MIGRATING;
- /*
- * This check allows to start the inactive timer (or to immediately
- * decrease the active utilization, if needed) in two cases:
- * when the task blocks and when it is terminating
- * (p->state == TASK_DEAD). We can handle the two cases in the same
- * way, because from GRUB's point of view the same thing is happening
- * (the task moves from "active contending" to "active non contending"
- * or "inactive")
- */
- if (flags & DEQUEUE_SLEEP)
- task_non_contending(p);
+ dequeue_dl_entity(&p->dl, flags);
+ if (!p->dl.dl_throttled && !dl_server(&p->dl))
+ dequeue_pushable_dl_task(rq, p);
}
/*
@@ -1939,7 +2021,7 @@ static int balance_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
* Only called when both the current and waking task are -deadline
* tasks.
*/
-static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
+static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p,
int flags)
{
if (dl_entity_preempt(&p->dl, &rq->curr->dl)) {
@@ -1959,12 +2041,12 @@ static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
}
#ifdef CONFIG_SCHED_HRTICK
-static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
+static void start_hrtick_dl(struct rq *rq, struct sched_dl_entity *dl_se)
{
- hrtick_start(rq, p->dl.runtime);
+ hrtick_start(rq, dl_se->runtime);
}
#else /* !CONFIG_SCHED_HRTICK */
-static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
+static void start_hrtick_dl(struct rq *rq, struct sched_dl_entity *dl_se)
{
}
#endif
@@ -1984,9 +2066,6 @@ static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first)
if (!first)
return;
- if (hrtick_enabled_dl(rq))
- start_hrtick_dl(rq, p);
-
if (rq->curr->sched_class != &dl_sched_class)
update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0);
@@ -2009,12 +2088,25 @@ static struct task_struct *pick_task_dl(struct rq *rq)
struct dl_rq *dl_rq = &rq->dl;
struct task_struct *p;
+again:
if (!sched_dl_runnable(rq))
return NULL;
dl_se = pick_next_dl_entity(dl_rq);
WARN_ON_ONCE(!dl_se);
- p = dl_task_of(dl_se);
+
+ if (dl_server(dl_se)) {
+ p = dl_se->server_pick(dl_se);
+ if (!p) {
+ WARN_ON_ONCE(1);
+ dl_se->dl_yielded = 1;
+ update_curr_dl_se(rq, dl_se, 0);
+ goto again;
+ }
+ p->dl_server = dl_se;
+ } else {
+ p = dl_task_of(dl_se);
+ }
return p;
}
@@ -2024,9 +2116,15 @@ static struct task_struct *pick_next_task_dl(struct rq *rq)
struct task_struct *p;
p = pick_task_dl(rq);
- if (p)
+ if (!p)
+ return p;
+
+ if (!p->dl_server)
set_next_task_dl(rq, p, true);
+ if (hrtick_enabled(rq))
+ start_hrtick_dl(rq, &p->dl);
+
return p;
}
@@ -2064,8 +2162,8 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued)
* be set and schedule() will start a new hrtick for the next task.
*/
if (hrtick_enabled_dl(rq) && queued && p->dl.runtime > 0 &&
- is_leftmost(p, &rq->dl))
- start_hrtick_dl(rq, p);
+ is_leftmost(&p->dl, &rq->dl))
+ start_hrtick_dl(rq, &p->dl);
}
static void task_fork_dl(struct task_struct *p)
@@ -2291,9 +2389,6 @@ static int push_dl_task(struct rq *rq)
struct rq *later_rq;
int ret = 0;
- if (!rq->dl.overloaded)
- return 0;
-
next_task = pick_next_pushable_dl_task(rq);
if (!next_task)
return 0;
@@ -2449,9 +2544,11 @@ skip:
double_unlock_balance(this_rq, src_rq);
if (push_task) {
+ preempt_disable();
raw_spin_rq_unlock(this_rq);
stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop,
push_task, &src_rq->push_work);
+ preempt_enable();
raw_spin_rq_lock(this_rq);
}
}
@@ -2585,7 +2682,7 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
* will reset the task parameters.
*/
if (task_on_rq_queued(p) && p->dl.dl_runtime)
- task_non_contending(p);
+ task_non_contending(&p->dl);
/*
* In case a task is setscheduled out from SCHED_DEADLINE we need to
@@ -2652,7 +2749,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
deadline_queue_push_tasks(rq);
#endif
if (dl_task(rq->curr))
- check_preempt_curr_dl(rq, p, 0);
+ wakeup_preempt_dl(rq, p, 0);
else
resched_curr(rq);
} else {
@@ -2721,7 +2818,7 @@ DEFINE_SCHED_CLASS(dl) = {
.dequeue_task = dequeue_task_dl,
.yield_task = yield_task_dl,
- .check_preempt_curr = check_preempt_curr_dl,
+ .wakeup_preempt = wakeup_preempt_dl,
.pick_next_task = pick_next_task_dl,
.put_prev_task = put_prev_task_dl,
@@ -2993,10 +3090,8 @@ bool __checkparam_dl(const struct sched_attr *attr)
/*
* This function clears the sched_dl_entity static params.
*/
-void __dl_clear_params(struct task_struct *p)
+static void __dl_clear_params(struct sched_dl_entity *dl_se)
{
- struct sched_dl_entity *dl_se = &p->dl;
-
dl_se->dl_runtime = 0;
dl_se->dl_deadline = 0;
dl_se->dl_period = 0;
@@ -3008,12 +3103,21 @@ void __dl_clear_params(struct task_struct *p)
dl_se->dl_yielded = 0;
dl_se->dl_non_contending = 0;
dl_se->dl_overrun = 0;
+ dl_se->dl_server = 0;
#ifdef CONFIG_RT_MUTEXES
dl_se->pi_se = dl_se;
#endif
}
+void init_dl_entity(struct sched_dl_entity *dl_se)
+{
+ RB_CLEAR_NODE(&dl_se->rb_node);
+ init_dl_task_timer(dl_se);
+ init_dl_inactive_task_timer(dl_se);
+ __dl_clear_params(dl_se);
+}
+
bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
{
struct sched_dl_entity *dl_se = &p->dl;
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 4c3d0d9f3db6..8d5d98a5834d 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -8,7 +8,7 @@
*/
/*
- * This allows printing both to /proc/sched_debug and
+ * This allows printing both to /sys/kernel/debug/sched/debug and
* to the console
*/
#define SEQ_printf(m, x...) \
@@ -628,8 +628,8 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
{
- s64 left_vruntime = -1, min_vruntime, right_vruntime = -1, spread;
- struct sched_entity *last, *first;
+ s64 left_vruntime = -1, min_vruntime, right_vruntime = -1, left_deadline = -1, spread;
+ struct sched_entity *last, *first, *root;
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
@@ -644,15 +644,20 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
SPLIT_NS(cfs_rq->exec_clock));
raw_spin_rq_lock_irqsave(rq, flags);
+ root = __pick_root_entity(cfs_rq);
+ if (root)
+ left_vruntime = root->min_vruntime;
first = __pick_first_entity(cfs_rq);
if (first)
- left_vruntime = first->vruntime;
+ left_deadline = first->deadline;
last = __pick_last_entity(cfs_rq);
if (last)
right_vruntime = last->vruntime;
min_vruntime = cfs_rq->min_vruntime;
raw_spin_rq_unlock_irqrestore(rq, flags);
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "left_deadline",
+ SPLIT_NS(left_deadline));
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "left_vruntime",
SPLIT_NS(left_vruntime));
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime",
@@ -679,8 +684,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
cfs_rq->avg.runnable_avg);
SEQ_printf(m, " .%-30s: %lu\n", "util_avg",
cfs_rq->avg.util_avg);
- SEQ_printf(m, " .%-30s: %u\n", "util_est_enqueued",
- cfs_rq->avg.util_est.enqueued);
+ SEQ_printf(m, " .%-30s: %u\n", "util_est",
+ cfs_rq->avg.util_est);
SEQ_printf(m, " .%-30s: %ld\n", "removed.load_avg",
cfs_rq->removed.load_avg);
SEQ_printf(m, " .%-30s: %ld\n", "removed.util_avg",
@@ -724,9 +729,6 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rt_rq->x))
PU(rt_nr_running);
-#ifdef CONFIG_SMP
- PU(rt_nr_migratory);
-#endif
P(rt_throttled);
PN(rt_time);
PN(rt_runtime);
@@ -748,7 +750,6 @@ void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq)
PU(dl_nr_running);
#ifdef CONFIG_SMP
- PU(dl_nr_migratory);
dl_bw = &cpu_rq(cpu)->rd->dl_bw;
#else
dl_bw = &dl_rq->dl_bw;
@@ -864,7 +865,6 @@ static void sched_debug_header(struct seq_file *m)
#define PN(x) \
SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
PN(sysctl_sched_base_slice);
- P(sysctl_sched_child_runs_first);
P(sysctl_sched_features);
#undef PN
#undef P
@@ -1075,8 +1075,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
P(se.avg.runnable_avg);
P(se.avg.util_avg);
P(se.avg.last_update_time);
- P(se.avg.util_est.ewma);
- PM(se.avg.util_est.enqueued, ~UTIL_AVG_UNCHANGED);
+ PM(se.avg.util_est, ~UTIL_AVG_UNCHANGED);
#endif
#ifdef CONFIG_UCLAMP_TASK
__PS("uclamp.min", p->uclamp_req[UCLAMP_MIN].value);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8dbff6e7ad4f..533547e3c90a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -51,8 +51,6 @@
#include <asm/switch_to.h>
-#include <linux/sched/cond_resched.h>
-
#include "sched.h"
#include "stats.h"
#include "autogroup.h"
@@ -78,12 +76,6 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
unsigned int sysctl_sched_base_slice = 750000ULL;
static unsigned int normalized_sysctl_sched_base_slice = 750000ULL;
-/*
- * After fork, child runs first. If set to 0 (default) then
- * parent will (try to) run first.
- */
-unsigned int sysctl_sched_child_runs_first __read_mostly;
-
const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
int sched_thermal_decay_shift;
@@ -145,13 +137,6 @@ static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536;
#ifdef CONFIG_SYSCTL
static struct ctl_table sched_fair_sysctls[] = {
- {
- .procname = "sched_child_runs_first",
- .data = &sysctl_sched_child_runs_first,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
#ifdef CONFIG_CFS_BANDWIDTH
{
.procname = "sched_cfs_bandwidth_slice_us",
@@ -566,7 +551,11 @@ static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
static inline bool entity_before(const struct sched_entity *a,
const struct sched_entity *b)
{
- return (s64)(a->vruntime - b->vruntime) < 0;
+ /*
+ * Tiebreak on vruntime seems unnecessary since it can
+ * hardly happen.
+ */
+ return (s64)(a->deadline - b->deadline) < 0;
}
static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -664,6 +653,10 @@ void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta)
cfs_rq->avg_vruntime -= cfs_rq->avg_load * delta;
}
+/*
+ * Specifically: avg_vruntime() + 0 must result in entity_eligible() := true
+ * For this to be so, the result of this function must have a left bias.
+ */
u64 avg_vruntime(struct cfs_rq *cfs_rq)
{
struct sched_entity *curr = cfs_rq->curr;
@@ -677,8 +670,12 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq)
load += weight;
}
- if (load)
+ if (load) {
+ /* sign flips effective floor / ceil */
+ if (avg < 0)
+ avg -= (load - 1);
avg = div_s64(avg, load);
+ }
return cfs_rq->min_vruntime + avg;
}
@@ -727,7 +724,7 @@ static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
* Note: using 'avg_vruntime() > se->vruntime' is inacurate due
* to the loss in precision caused by the division.
*/
-int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime)
{
struct sched_entity *curr = cfs_rq->curr;
s64 avg = cfs_rq->avg_vruntime;
@@ -740,7 +737,12 @@ int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se)
load += weight;
}
- return avg >= entity_key(cfs_rq, se) * load;
+ return avg >= (s64)(vruntime - cfs_rq->min_vruntime) * load;
+}
+
+int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ return vruntime_eligible(cfs_rq, se->vruntime);
}
static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime)
@@ -759,9 +761,8 @@ static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime)
static void update_min_vruntime(struct cfs_rq *cfs_rq)
{
- struct sched_entity *se = __pick_first_entity(cfs_rq);
+ struct sched_entity *se = __pick_root_entity(cfs_rq);
struct sched_entity *curr = cfs_rq->curr;
-
u64 vruntime = cfs_rq->min_vruntime;
if (curr) {
@@ -773,9 +774,9 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
if (se) {
if (!curr)
- vruntime = se->vruntime;
+ vruntime = se->min_vruntime;
else
- vruntime = min_vruntime(vruntime, se->vruntime);
+ vruntime = min_vruntime(vruntime, se->min_vruntime);
}
/* ensure we never gain time by being placed backwards. */
@@ -788,34 +789,34 @@ static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
return entity_before(__node_2_se(a), __node_2_se(b));
}
-#define deadline_gt(field, lse, rse) ({ (s64)((lse)->field - (rse)->field) > 0; })
+#define vruntime_gt(field, lse, rse) ({ (s64)((lse)->field - (rse)->field) > 0; })
-static inline void __update_min_deadline(struct sched_entity *se, struct rb_node *node)
+static inline void __min_vruntime_update(struct sched_entity *se, struct rb_node *node)
{
if (node) {
struct sched_entity *rse = __node_2_se(node);
- if (deadline_gt(min_deadline, se, rse))
- se->min_deadline = rse->min_deadline;
+ if (vruntime_gt(min_vruntime, se, rse))
+ se->min_vruntime = rse->min_vruntime;
}
}
/*
- * se->min_deadline = min(se->deadline, left->min_deadline, right->min_deadline)
+ * se->min_vruntime = min(se->vruntime, {left,right}->min_vruntime)
*/
-static inline bool min_deadline_update(struct sched_entity *se, bool exit)
+static inline bool min_vruntime_update(struct sched_entity *se, bool exit)
{
- u64 old_min_deadline = se->min_deadline;
+ u64 old_min_vruntime = se->min_vruntime;
struct rb_node *node = &se->run_node;
- se->min_deadline = se->deadline;
- __update_min_deadline(se, node->rb_right);
- __update_min_deadline(se, node->rb_left);
+ se->min_vruntime = se->vruntime;
+ __min_vruntime_update(se, node->rb_right);
+ __min_vruntime_update(se, node->rb_left);
- return se->min_deadline == old_min_deadline;
+ return se->min_vruntime == old_min_vruntime;
}
-RB_DECLARE_CALLBACKS(static, min_deadline_cb, struct sched_entity,
- run_node, min_deadline, min_deadline_update);
+RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity,
+ run_node, min_vruntime, min_vruntime_update);
/*
* Enqueue an entity into the rb-tree:
@@ -823,18 +824,28 @@ RB_DECLARE_CALLBACKS(static, min_deadline_cb, struct sched_entity,
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
avg_vruntime_add(cfs_rq, se);
- se->min_deadline = se->deadline;
+ se->min_vruntime = se->vruntime;
rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
- __entity_less, &min_deadline_cb);
+ __entity_less, &min_vruntime_cb);
}
static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
- &min_deadline_cb);
+ &min_vruntime_cb);
avg_vruntime_sub(cfs_rq, se);
}
+struct sched_entity *__pick_root_entity(struct cfs_rq *cfs_rq)
+{
+ struct rb_node *root = cfs_rq->tasks_timeline.rb_root.rb_node;
+
+ if (!root)
+ return NULL;
+
+ return __node_2_se(root);
+}
+
struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
{
struct rb_node *left = rb_first_cached(&cfs_rq->tasks_timeline);
@@ -857,19 +868,27 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
* with the earliest virtual deadline.
*
* We can do this in O(log n) time due to an augmented RB-tree. The
- * tree keeps the entries sorted on service, but also functions as a
- * heap based on the deadline by keeping:
+ * tree keeps the entries sorted on deadline, but also functions as a
+ * heap based on the vruntime by keeping:
*
- * se->min_deadline = min(se->deadline, se->{left,right}->min_deadline)
+ * se->min_vruntime = min(se->vruntime, se->{left,right}->min_vruntime)
*
- * Which allows an EDF like search on (sub)trees.
+ * Which allows tree pruning through eligibility.
*/
static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
{
struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node;
+ struct sched_entity *se = __pick_first_entity(cfs_rq);
struct sched_entity *curr = cfs_rq->curr;
struct sched_entity *best = NULL;
+ /*
+ * We can safely skip eligibility check if there is only one entity
+ * in this cfs_rq, saving some cycles.
+ */
+ if (cfs_rq->nr_running == 1)
+ return curr && curr->on_rq ? curr : se;
+
if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
curr = NULL;
@@ -880,52 +899,44 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
if (sched_feat(RUN_TO_PARITY) && curr && curr->vlag == curr->deadline)
return curr;
+ /* Pick the leftmost entity if it's eligible */
+ if (se && entity_eligible(cfs_rq, se)) {
+ best = se;
+ goto found;
+ }
+
+ /* Heap search for the EEVD entity */
while (node) {
- struct sched_entity *se = __node_2_se(node);
+ struct rb_node *left = node->rb_left;
/*
- * If this entity is not eligible, try the left subtree.
+ * Eligible entities in left subtree are always better
+ * choices, since they have earlier deadlines.
*/
- if (!entity_eligible(cfs_rq, se)) {
- node = node->rb_left;
+ if (left && vruntime_eligible(cfs_rq,
+ __node_2_se(left)->min_vruntime)) {
+ node = left;
continue;
}
- /*
- * If this entity has an earlier deadline than the previous
- * best, take this one. If it also has the earliest deadline
- * of its subtree, we're done.
- */
- if (!best || deadline_gt(deadline, best, se)) {
- best = se;
- if (best->deadline == best->min_deadline)
- break;
- }
+ se = __node_2_se(node);
/*
- * If the earlest deadline in this subtree is in the fully
- * eligible left half of our space, go there.
+ * The left subtree either is empty or has no eligible
+ * entity, so check the current node since it is the one
+ * with earliest deadline that might be eligible.
*/
- if (node->rb_left &&
- __node_2_se(node->rb_left)->min_deadline == se->min_deadline) {
- node = node->rb_left;
- continue;
+ if (entity_eligible(cfs_rq, se)) {
+ best = se;
+ break;
}
node = node->rb_right;
}
-
- if (!best || (curr && deadline_gt(deadline, best, curr)))
+found:
+ if (!best || (curr && entity_before(curr, best)))
best = curr;
- if (unlikely(!best)) {
- struct sched_entity *left = __pick_first_entity(cfs_rq);
- if (left) {
- pr_err("EEVDF scheduling fail, picking leftmost\n");
- return left;
- }
- }
-
return best;
}
@@ -1092,23 +1103,17 @@ static void update_tg_load_avg(struct cfs_rq *cfs_rq)
}
#endif /* CONFIG_SMP */
-/*
- * Update the current task's runtime statistics.
- */
-static void update_curr(struct cfs_rq *cfs_rq)
+static s64 update_curr_se(struct rq *rq, struct sched_entity *curr)
{
- struct sched_entity *curr = cfs_rq->curr;
- u64 now = rq_clock_task(rq_of(cfs_rq));
- u64 delta_exec;
-
- if (unlikely(!curr))
- return;
+ u64 now = rq_clock_task(rq);
+ s64 delta_exec;
delta_exec = now - curr->exec_start;
- if (unlikely((s64)delta_exec <= 0))
- return;
+ if (unlikely(delta_exec <= 0))
+ return delta_exec;
curr->exec_start = now;
+ curr->sum_exec_runtime += delta_exec;
if (schedstat_enabled()) {
struct sched_statistics *stats;
@@ -1118,20 +1123,54 @@ static void update_curr(struct cfs_rq *cfs_rq)
max(delta_exec, stats->exec_max));
}
- curr->sum_exec_runtime += delta_exec;
- schedstat_add(cfs_rq->exec_clock, delta_exec);
+ return delta_exec;
+}
+
+static inline void update_curr_task(struct task_struct *p, s64 delta_exec)
+{
+ trace_sched_stat_runtime(p, delta_exec);
+ account_group_exec_runtime(p, delta_exec);
+ cgroup_account_cputime(p, delta_exec);
+ if (p->dl_server)
+ dl_server_update(p->dl_server, delta_exec);
+}
+
+/*
+ * Used by other classes to account runtime.
+ */
+s64 update_curr_common(struct rq *rq)
+{
+ struct task_struct *curr = rq->curr;
+ s64 delta_exec;
+
+ delta_exec = update_curr_se(rq, &curr->se);
+ if (likely(delta_exec > 0))
+ update_curr_task(curr, delta_exec);
+
+ return delta_exec;
+}
+
+/*
+ * Update the current task's runtime statistics.
+ */
+static void update_curr(struct cfs_rq *cfs_rq)
+{
+ struct sched_entity *curr = cfs_rq->curr;
+ s64 delta_exec;
+
+ if (unlikely(!curr))
+ return;
+
+ delta_exec = update_curr_se(rq_of(cfs_rq), curr);
+ if (unlikely(delta_exec <= 0))
+ return;
curr->vruntime += calc_delta_fair(delta_exec, curr);
update_deadline(cfs_rq, curr);
update_min_vruntime(cfs_rq);
- if (entity_is_task(curr)) {
- struct task_struct *curtask = task_of(curr);
-
- trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
- cgroup_account_cputime(curtask, delta_exec);
- account_group_exec_runtime(curtask, delta_exec);
- }
+ if (entity_is_task(curr))
+ update_curr_task(task_of(curr), delta_exec);
account_cfs_rq_runtime(cfs_rq, delta_exec);
}
@@ -1722,12 +1761,12 @@ static bool pgdat_free_space_enough(struct pglist_data *pgdat)
* The smaller the hint page fault latency, the higher the possibility
* for the page to be hot.
*/
-static int numa_hint_fault_latency(struct page *page)
+static int numa_hint_fault_latency(struct folio *folio)
{
int last_time, time;
time = jiffies_to_msecs(jiffies);
- last_time = xchg_page_access_time(page, time);
+ last_time = folio_xchg_access_time(folio, time);
return (time - last_time) & PAGE_ACCESS_TIME_MASK;
}
@@ -1784,7 +1823,7 @@ static void numa_promotion_adjust_threshold(struct pglist_data *pgdat,
}
}
-bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
+bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio,
int src_nid, int dst_cpu)
{
struct numa_group *ng = deref_curr_numa_group(p);
@@ -1814,16 +1853,16 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
numa_promotion_adjust_threshold(pgdat, rate_limit, def_th);
th = pgdat->nbp_threshold ? : def_th;
- latency = numa_hint_fault_latency(page);
+ latency = numa_hint_fault_latency(folio);
if (latency >= th)
return false;
return !numa_promotion_rate_limit(pgdat, rate_limit,
- thp_nr_pages(page));
+ folio_nr_pages(folio));
}
this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
- last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
+ last_cpupid = folio_xchg_last_cpupid(folio, this_cpupid);
if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
!node_is_toptier(src_nid) && !cpupid_valid(last_cpupid))
@@ -2847,19 +2886,7 @@ static void task_numa_placement(struct task_struct *p)
}
/* Cannot migrate task to CPU-less node */
- if (max_nid != NUMA_NO_NODE && !node_state(max_nid, N_CPU)) {
- int near_nid = max_nid;
- int distance, near_distance = INT_MAX;
-
- for_each_node_state(nid, N_CPU) {
- distance = node_distance(max_nid, nid);
- if (distance < near_distance) {
- near_nid = nid;
- near_distance = distance;
- }
- }
- max_nid = near_nid;
- }
+ max_nid = numa_nearest_node(max_nid, N_CPU);
if (ng) {
numa_group_count_active_nodes(ng);
@@ -3130,7 +3157,7 @@ static void reset_ptenuma_scan(struct task_struct *p)
p->mm->numa_scan_offset = 0;
}
-static bool vma_is_accessed(struct vm_area_struct *vma)
+static bool vma_is_accessed(struct mm_struct *mm, struct vm_area_struct *vma)
{
unsigned long pids;
/*
@@ -3139,11 +3166,23 @@ static bool vma_is_accessed(struct vm_area_struct *vma)
* This is also done to avoid any side effect of task scanning
* amplifying the unfairness of disjoint set of VMAs' access.
*/
- if (READ_ONCE(current->mm->numa_scan_seq) < 2)
+ if ((READ_ONCE(current->mm->numa_scan_seq) - vma->numab_state->start_scan_seq) < 2)
return true;
- pids = vma->numab_state->access_pids[0] | vma->numab_state->access_pids[1];
- return test_bit(hash_32(current->pid, ilog2(BITS_PER_LONG)), &pids);
+ pids = vma->numab_state->pids_active[0] | vma->numab_state->pids_active[1];
+ if (test_bit(hash_32(current->pid, ilog2(BITS_PER_LONG)), &pids))
+ return true;
+
+ /*
+ * Complete a scan that has already started regardless of PID access, or
+ * some VMAs may never be scanned in multi-threaded applications:
+ */
+ if (mm->numa_scan_offset > vma->vm_start) {
+ trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_IGNORE_PID);
+ return true;
+ }
+
+ return false;
}
#define VMA_PID_RESET_PERIOD (4 * sysctl_numa_balancing_scan_delay)
@@ -3163,6 +3202,8 @@ static void task_numa_work(struct callback_head *work)
unsigned long nr_pte_updates = 0;
long pages, virtpages;
struct vma_iterator vmi;
+ bool vma_pids_skipped;
+ bool vma_pids_forced = false;
SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
@@ -3205,7 +3246,6 @@ static void task_numa_work(struct callback_head *work)
*/
p->node_stamp += 2 * TICK_NSEC;
- start = mm->numa_scan_offset;
pages = sysctl_numa_balancing_scan_size;
pages <<= 20 - PAGE_SHIFT; /* MB in pages */
virtpages = pages * 8; /* Scan up to this much virtual space */
@@ -3215,6 +3255,16 @@ static void task_numa_work(struct callback_head *work)
if (!mmap_read_trylock(mm))
return;
+
+ /*
+ * VMAs are skipped if the current PID has not trapped a fault within
+ * the VMA recently. Allow scanning to be forced if there is no
+ * suitable VMA remaining.
+ */
+ vma_pids_skipped = false;
+
+retry_pids:
+ start = mm->numa_scan_offset;
vma_iter_init(&vmi, mm, start);
vma = vma_next(&vmi);
if (!vma) {
@@ -3227,6 +3277,7 @@ static void task_numa_work(struct callback_head *work)
do {
if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
+ trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_UNSUITABLE);
continue;
}
@@ -3237,15 +3288,19 @@ static void task_numa_work(struct callback_head *work)
* as migrating the pages will be of marginal benefit.
*/
if (!vma->vm_mm ||
- (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
+ (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ))) {
+ trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SHARED_RO);
continue;
+ }
/*
* Skip inaccessible VMAs to avoid any confusion between
* PROT_NONE and NUMA hinting ptes
*/
- if (!vma_is_accessible(vma))
+ if (!vma_is_accessible(vma)) {
+ trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_INACCESSIBLE);
continue;
+ }
/* Initialise new per-VMA NUMAB state. */
if (!vma->numab_state) {
@@ -3254,12 +3309,21 @@ static void task_numa_work(struct callback_head *work)
if (!vma->numab_state)
continue;
+ vma->numab_state->start_scan_seq = mm->numa_scan_seq;
+
vma->numab_state->next_scan = now +
msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
/* Reset happens after 4 times scan delay of scan start */
- vma->numab_state->next_pid_reset = vma->numab_state->next_scan +
+ vma->numab_state->pids_active_reset = vma->numab_state->next_scan +
msecs_to_jiffies(VMA_PID_RESET_PERIOD);
+
+ /*
+ * Ensure prev_scan_seq does not match numa_scan_seq,
+ * to prevent VMAs being skipped prematurely on the
+ * first scan:
+ */
+ vma->numab_state->prev_scan_seq = mm->numa_scan_seq - 1;
}
/*
@@ -3267,23 +3331,35 @@ static void task_numa_work(struct callback_head *work)
* delay the scan for new VMAs.
*/
if (mm->numa_scan_seq && time_before(jiffies,
- vma->numab_state->next_scan))
+ vma->numab_state->next_scan)) {
+ trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SCAN_DELAY);
continue;
+ }
+
+ /* RESET access PIDs regularly for old VMAs. */
+ if (mm->numa_scan_seq &&
+ time_after(jiffies, vma->numab_state->pids_active_reset)) {
+ vma->numab_state->pids_active_reset = vma->numab_state->pids_active_reset +
+ msecs_to_jiffies(VMA_PID_RESET_PERIOD);
+ vma->numab_state->pids_active[0] = READ_ONCE(vma->numab_state->pids_active[1]);
+ vma->numab_state->pids_active[1] = 0;
+ }
- /* Do not scan the VMA if task has not accessed */
- if (!vma_is_accessed(vma))
+ /* Do not rescan VMAs twice within the same sequence. */
+ if (vma->numab_state->prev_scan_seq == mm->numa_scan_seq) {
+ mm->numa_scan_offset = vma->vm_end;
+ trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SEQ_COMPLETED);
continue;
+ }
/*
- * RESET access PIDs regularly for old VMAs. Resetting after checking
- * vma for recent access to avoid clearing PID info before access..
+ * Do not scan the VMA if task has not accessed it, unless no other
+ * VMA candidate exists.
*/
- if (mm->numa_scan_seq &&
- time_after(jiffies, vma->numab_state->next_pid_reset)) {
- vma->numab_state->next_pid_reset = vma->numab_state->next_pid_reset +
- msecs_to_jiffies(VMA_PID_RESET_PERIOD);
- vma->numab_state->access_pids[0] = READ_ONCE(vma->numab_state->access_pids[1]);
- vma->numab_state->access_pids[1] = 0;
+ if (!vma_pids_forced && !vma_is_accessed(mm, vma)) {
+ vma_pids_skipped = true;
+ trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_PID_INACTIVE);
+ continue;
}
do {
@@ -3310,8 +3386,28 @@ static void task_numa_work(struct callback_head *work)
cond_resched();
} while (end != vma->vm_end);
+
+ /* VMA scan is complete, do not scan until next sequence. */
+ vma->numab_state->prev_scan_seq = mm->numa_scan_seq;
+
+ /*
+ * Only force scan within one VMA at a time, to limit the
+ * cost of scanning a potentially uninteresting VMA.
+ */
+ if (vma_pids_forced)
+ break;
} for_each_vma(vmi, vma);
+ /*
+ * If no VMAs are remaining and VMAs were skipped due to the PID
+ * not accessing the VMA previously, then force a scan to ensure
+ * forward progress:
+ */
+ if (!vma && !vma_pids_forced && vma_pids_skipped) {
+ vma_pids_forced = true;
+ goto retry_pids;
+ }
+
out:
/*
* It is possible to reach the end of the VMA list but the last few
@@ -3574,39 +3670,140 @@ static inline void
dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
#endif
+static void reweight_eevdf(struct cfs_rq *cfs_rq, struct sched_entity *se,
+ unsigned long weight)
+{
+ unsigned long old_weight = se->load.weight;
+ u64 avruntime = avg_vruntime(cfs_rq);
+ s64 vlag, vslice;
+
+ /*
+ * VRUNTIME
+ * ========
+ *
+ * COROLLARY #1: The virtual runtime of the entity needs to be
+ * adjusted if re-weight at !0-lag point.
+ *
+ * Proof: For contradiction assume this is not true, so we can
+ * re-weight without changing vruntime at !0-lag point.
+ *
+ * Weight VRuntime Avg-VRuntime
+ * before w v V
+ * after w' v' V'
+ *
+ * Since lag needs to be preserved through re-weight:
+ *
+ * lag = (V - v)*w = (V'- v')*w', where v = v'
+ * ==> V' = (V - v)*w/w' + v (1)
+ *
+ * Let W be the total weight of the entities before reweight,
+ * since V' is the new weighted average of entities:
+ *
+ * V' = (WV + w'v - wv) / (W + w' - w) (2)
+ *
+ * by using (1) & (2) we obtain:
+ *
+ * (WV + w'v - wv) / (W + w' - w) = (V - v)*w/w' + v
+ * ==> (WV-Wv+Wv+w'v-wv)/(W+w'-w) = (V - v)*w/w' + v
+ * ==> (WV - Wv)/(W + w' - w) + v = (V - v)*w/w' + v
+ * ==> (V - v)*W/(W + w' - w) = (V - v)*w/w' (3)
+ *
+ * Since we are re-weighting at a !0-lag point, which means
+ * V != v, we can simplify (3):
+ *
+ * ==> W / (W + w' - w) = w / w'
+ * ==> Ww' = Ww + ww' - ww
+ * ==> W * (w' - w) = w * (w' - w)
+ * ==> W = w (re-weight indicates w' != w)
+ *
+ * So the cfs_rq contains only one entity, hence the vruntime of
+ * the entity @v always equals the cfs_rq's weighted average
+ * vruntime @V, which means we always re-weight at the 0-lag
+ * point, contradicting the assumption. Proof completed.
+ *
+ *
+ * COROLLARY #2: Re-weight does NOT affect weighted average
+ * vruntime of all the entities.
+ *
+ * Proof: According to corollary #1, Eq. (1) should be:
+ *
+ * (V - v)*w = (V' - v')*w'
+ * ==> v' = V' - (V - v)*w/w' (4)
+ *
+ * According to the weighted average formula, we have:
+ *
+ * V' = (WV - wv + w'v') / (W - w + w')
+ * = (WV - wv + w'(V' - (V - v)w/w')) / (W - w + w')
+ * = (WV - wv + w'V' - Vw + wv) / (W - w + w')
+ * = (WV + w'V' - Vw) / (W - w + w')
+ *
+ * ==> V'*(W - w + w') = WV + w'V' - Vw
+ * ==> V' * (W - w) = (W - w) * V (5)
+ *
+ * If the entity is the only one in the cfs_rq, then reweight
+ * always occurs at 0-lag point, so V won't change. Or else
+ * there are other entities, hence W != w, then Eq. (5) turns
+ * into V' = V. So V won't change in either case, proof done.
+ *
+ *
+ * So according to corollary #1 & #2, the effect of re-weight
+ * on vruntime should be:
+ *
+ * v' = V' - (V - v) * w / w' (4)
+ * = V - (V - v) * w / w'
+ * = V - vl * w / w'
+ * = V - vl'
+ */
+ if (avruntime != se->vruntime) {
+ vlag = (s64)(avruntime - se->vruntime);
+ vlag = div_s64(vlag * old_weight, weight);
+ se->vruntime = avruntime - vlag;
+ }
+
+ /*
+ * DEADLINE
+ * ========
+ *
+ * When the weight changes, the virtual time slope changes and
+ * we should adjust the relative virtual deadline accordingly.
+ *
+ * d' = v' + (d - v)*w/w'
+ * = V' - (V - v)*w/w' + (d - v)*w/w'
+ * = V - (V - v)*w/w' + (d - v)*w/w'
+ * = V + (d - V)*w/w'
+ */
+ vslice = (s64)(se->deadline - avruntime);
+ vslice = div_s64(vslice * old_weight, weight);
+ se->deadline = avruntime + vslice;
+}
+
static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
unsigned long weight)
{
- unsigned long old_weight = se->load.weight;
+ bool curr = cfs_rq->curr == se;
if (se->on_rq) {
/* commit outstanding execution time */
- if (cfs_rq->curr == se)
+ if (curr)
update_curr(cfs_rq);
else
- avg_vruntime_sub(cfs_rq, se);
+ __dequeue_entity(cfs_rq, se);
update_load_sub(&cfs_rq->load, se->load.weight);
}
dequeue_load_avg(cfs_rq, se);
- update_load_set(&se->load, weight);
-
if (!se->on_rq) {
/*
* Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i),
* we need to scale se->vlag when w_i changes.
*/
- se->vlag = div_s64(se->vlag * old_weight, weight);
+ se->vlag = div_s64(se->vlag * se->load.weight, weight);
} else {
- s64 deadline = se->deadline - se->vruntime;
- /*
- * When the weight changes, the virtual time slope changes and
- * we should adjust the relative virtual deadline accordingly.
- */
- deadline = div_s64(deadline * old_weight, weight);
- se->deadline = se->vruntime + deadline;
+ reweight_eevdf(cfs_rq, se, weight);
}
+ update_load_set(&se->load, weight);
+
#ifdef CONFIG_SMP
do {
u32 divider = get_pelt_divider(&se->avg);
@@ -3618,8 +3815,17 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
enqueue_load_avg(cfs_rq, se);
if (se->on_rq) {
update_load_add(&cfs_rq->load, se->load.weight);
- if (cfs_rq->curr != se)
- avg_vruntime_add(cfs_rq, se);
+ if (!curr)
+ __enqueue_entity(cfs_rq, se);
+
+ /*
+ * The entity's vruntime has been adjusted, so let's check
+ * whether the rq-wide min_vruntime needs updated too. Since
+ * the calculations above require stable min_vruntime rather
+ * than up-to-date one, we do the update at the end of the
+ * reweight process.
+ */
+ update_min_vruntime(cfs_rq);
}
}
@@ -3763,14 +3969,11 @@ static void update_cfs_group(struct sched_entity *se)
#ifndef CONFIG_SMP
shares = READ_ONCE(gcfs_rq->tg->shares);
-
- if (likely(se->load.weight == shares))
- return;
#else
- shares = calc_group_shares(gcfs_rq);
+ shares = calc_group_shares(gcfs_rq);
#endif
-
- reweight_entity(cfs_rq_of(se), se, shares);
+ if (unlikely(se->load.weight != shares))
+ reweight_entity(cfs_rq_of(se), se, shares);
}
#else /* CONFIG_FAIR_GROUP_SCHED */
@@ -3888,7 +4091,8 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
*/
static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
{
- long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
+ long delta;
+ u64 now;
/*
* No need to update load_avg for root_task_group as it is not used.
@@ -3896,10 +4100,67 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
if (cfs_rq->tg == &root_task_group)
return;
+ /* rq has been offline and doesn't contribute to the share anymore: */
+ if (!cpu_active(cpu_of(rq_of(cfs_rq))))
+ return;
+
+ /*
+ * For migration-heavy workloads, access to tg->load_avg can be
+ * unbounded. Limit the update rate to at most once per ms.
+ */
+ now = sched_clock_cpu(cpu_of(rq_of(cfs_rq)));
+ if (now - cfs_rq->last_update_tg_load_avg < NSEC_PER_MSEC)
+ return;
+
+ delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
if (abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
atomic_long_add(delta, &cfs_rq->tg->load_avg);
cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
+ cfs_rq->last_update_tg_load_avg = now;
+ }
+}
+
+static inline void clear_tg_load_avg(struct cfs_rq *cfs_rq)
+{
+ long delta;
+ u64 now;
+
+ /*
+ * No need to update load_avg for root_task_group, as it is not used.
+ */
+ if (cfs_rq->tg == &root_task_group)
+ return;
+
+ now = sched_clock_cpu(cpu_of(rq_of(cfs_rq)));
+ delta = 0 - cfs_rq->tg_load_avg_contrib;
+ atomic_long_add(delta, &cfs_rq->tg->load_avg);
+ cfs_rq->tg_load_avg_contrib = 0;
+ cfs_rq->last_update_tg_load_avg = now;
+}
+
+/* CPU offline callback: */
+static void __maybe_unused clear_tg_offline_cfs_rqs(struct rq *rq)
+{
+ struct task_group *tg;
+
+ lockdep_assert_rq_held(rq);
+
+ /*
+ * The rq clock has already been updated in
+ * set_rq_offline(), so we should skip updating
+ * the rq clock again in unthrottle_cfs_rq().
+ */
+ rq_clock_start_loop_update(rq);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(tg, &task_groups, list) {
+ struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
+
+ clear_tg_load_avg(cfs_rq);
}
+ rcu_read_unlock();
+
+ rq_clock_stop_loop_update(rq);
}
/*
@@ -4198,6 +4459,8 @@ static inline bool skip_blocked_update(struct sched_entity *se)
static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) {}
+static inline void clear_tg_offline_cfs_rqs(struct rq *rq) {}
+
static inline int propagate_entity_load_avg(struct sched_entity *se)
{
return 0;
@@ -4560,33 +4823,20 @@ static inline unsigned long task_util(struct task_struct *p)
return READ_ONCE(p->se.avg.util_avg);
}
-static inline unsigned long _task_util_est(struct task_struct *p)
+static inline unsigned long task_runnable(struct task_struct *p)
{
- struct util_est ue = READ_ONCE(p->se.avg.util_est);
-
- return max(ue.ewma, (ue.enqueued & ~UTIL_AVG_UNCHANGED));
+ return READ_ONCE(p->se.avg.runnable_avg);
}
-static inline unsigned long task_util_est(struct task_struct *p)
+static inline unsigned long _task_util_est(struct task_struct *p)
{
- return max(task_util(p), _task_util_est(p));
+ return READ_ONCE(p->se.avg.util_est) & ~UTIL_AVG_UNCHANGED;
}
-#ifdef CONFIG_UCLAMP_TASK
-static inline unsigned long uclamp_task_util(struct task_struct *p,
- unsigned long uclamp_min,
- unsigned long uclamp_max)
-{
- return clamp(task_util_est(p), uclamp_min, uclamp_max);
-}
-#else
-static inline unsigned long uclamp_task_util(struct task_struct *p,
- unsigned long uclamp_min,
- unsigned long uclamp_max)
+static inline unsigned long task_util_est(struct task_struct *p)
{
- return task_util_est(p);
+ return max(task_util(p), _task_util_est(p));
}
-#endif
static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
struct task_struct *p)
@@ -4597,9 +4847,9 @@ static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
return;
/* Update root cfs_rq's estimated utilization */
- enqueued = cfs_rq->avg.util_est.enqueued;
+ enqueued = cfs_rq->avg.util_est;
enqueued += _task_util_est(p);
- WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
+ WRITE_ONCE(cfs_rq->avg.util_est, enqueued);
trace_sched_util_est_cfs_tp(cfs_rq);
}
@@ -4613,34 +4863,20 @@ static inline void util_est_dequeue(struct cfs_rq *cfs_rq,
return;
/* Update root cfs_rq's estimated utilization */
- enqueued = cfs_rq->avg.util_est.enqueued;
+ enqueued = cfs_rq->avg.util_est;
enqueued -= min_t(unsigned int, enqueued, _task_util_est(p));
- WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
+ WRITE_ONCE(cfs_rq->avg.util_est, enqueued);
trace_sched_util_est_cfs_tp(cfs_rq);
}
#define UTIL_EST_MARGIN (SCHED_CAPACITY_SCALE / 100)
-/*
- * Check if a (signed) value is within a specified (unsigned) margin,
- * based on the observation that:
- *
- * abs(x) < y := (unsigned)(x + y - 1) < (2 * y - 1)
- *
- * NOTE: this only works when value + margin < INT_MAX.
- */
-static inline bool within_margin(int value, int margin)
-{
- return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
-}
-
static inline void util_est_update(struct cfs_rq *cfs_rq,
struct task_struct *p,
bool task_sleep)
{
- long last_ewma_diff, last_enqueued_diff;
- struct util_est ue;
+ unsigned int ewma, dequeued, last_ewma_diff;
if (!sched_feat(UTIL_EST))
return;
@@ -4652,71 +4888,73 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
if (!task_sleep)
return;
+ /* Get current estimate of utilization */
+ ewma = READ_ONCE(p->se.avg.util_est);
+
/*
* If the PELT values haven't changed since enqueue time,
* skip the util_est update.
*/
- ue = p->se.avg.util_est;
- if (ue.enqueued & UTIL_AVG_UNCHANGED)
+ if (ewma & UTIL_AVG_UNCHANGED)
return;
- last_enqueued_diff = ue.enqueued;
+ /* Get utilization at dequeue */
+ dequeued = task_util(p);
/*
* Reset EWMA on utilization increases, the moving average is used only
* to smooth utilization decreases.
*/
- ue.enqueued = task_util(p);
- if (sched_feat(UTIL_EST_FASTUP)) {
- if (ue.ewma < ue.enqueued) {
- ue.ewma = ue.enqueued;
- goto done;
- }
+ if (ewma <= dequeued) {
+ ewma = dequeued;
+ goto done;
}
/*
* Skip update of task's estimated utilization when its members are
* already ~1% close to its last activation value.
*/
- last_ewma_diff = ue.enqueued - ue.ewma;
- last_enqueued_diff -= ue.enqueued;
- if (within_margin(last_ewma_diff, UTIL_EST_MARGIN)) {
- if (!within_margin(last_enqueued_diff, UTIL_EST_MARGIN))
- goto done;
-
- return;
- }
+ last_ewma_diff = ewma - dequeued;
+ if (last_ewma_diff < UTIL_EST_MARGIN)
+ goto done;
/*
* To avoid overestimation of actual task utilization, skip updates if
* we cannot grant there is idle time in this CPU.
*/
- if (task_util(p) > capacity_orig_of(cpu_of(rq_of(cfs_rq))))
+ if (dequeued > arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq))))
return;
/*
+ * To avoid underestimating task utilization, skip updates of EWMA if
+ * we cannot guarantee that the thread got all the CPU time it wanted.
+ */
+ if ((dequeued + UTIL_EST_MARGIN) < task_runnable(p))
+ goto done;
+
+
+ /*
* Update Task's estimated utilization
*
* When *p completes an activation we can consolidate another sample
- * of the task size. This is done by storing the current PELT value
- * as ue.enqueued and by using this value to update the Exponential
- * Weighted Moving Average (EWMA):
+ * of the task size. This is done by using this value to update the
+ * Exponential Weighted Moving Average (EWMA):
*
* ewma(t) = w * task_util(p) + (1-w) * ewma(t-1)
* = w * task_util(p) + ewma(t-1) - w * ewma(t-1)
* = w * (task_util(p) - ewma(t-1)) + ewma(t-1)
- * = w * ( last_ewma_diff ) + ewma(t-1)
- * = w * (last_ewma_diff + ewma(t-1) / w)
+ * = w * ( -last_ewma_diff ) + ewma(t-1)
+ * = w * (-last_ewma_diff + ewma(t-1) / w)
*
* Where 'w' is the weight of new samples, which is configured to be
* 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT)
*/
- ue.ewma <<= UTIL_EST_WEIGHT_SHIFT;
- ue.ewma += last_ewma_diff;
- ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
+ ewma <<= UTIL_EST_WEIGHT_SHIFT;
+ ewma -= last_ewma_diff;
+ ewma >>= UTIL_EST_WEIGHT_SHIFT;
done:
- ue.enqueued |= UTIL_AVG_UNCHANGED;
- WRITE_ONCE(p->se.avg.util_est, ue);
+ ewma |= UTIL_AVG_UNCHANGED;
+ WRITE_ONCE(p->se.avg.util_est, ewma);
trace_sched_util_est_se_tp(&p->se);
}
@@ -4739,14 +4977,14 @@ static inline int util_fits_cpu(unsigned long util,
return fits;
/*
- * We must use capacity_orig_of() for comparing against uclamp_min and
+ * We must use arch_scale_cpu_capacity() for comparing against uclamp_min and
* uclamp_max. We only care about capacity pressure (by using
* capacity_of()) for comparing against the real util.
*
* If a task is boosted to 1024 for example, we don't want a tiny
* pressure to skew the check whether it fits a CPU or not.
*
- * Similarly if a task is capped to capacity_orig_of(little_cpu), it
+ * Similarly if a task is capped to arch_scale_cpu_capacity(little_cpu), it
* should fit a little cpu even if there's some pressure.
*
* Only exception is for thermal pressure since it has a direct impact
@@ -4758,7 +4996,7 @@ static inline int util_fits_cpu(unsigned long util,
* For uclamp_max, we can tolerate a drop in performance level as the
* goal is to cap the task. So it's okay if it's getting less.
*/
- capacity_orig = capacity_orig_of(cpu);
+ capacity_orig = arch_scale_cpu_capacity(cpu);
capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu);
/*
@@ -4878,7 +5116,7 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
{
- return true;
+ return !cfs_rq->nr_running;
}
#define UPDATE_TG 0x0
@@ -4919,10 +5157,12 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
static void
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
- u64 vslice = calc_delta_fair(se->slice, se);
- u64 vruntime = avg_vruntime(cfs_rq);
+ u64 vslice, vruntime = avg_vruntime(cfs_rq);
s64 lag = 0;
+ se->slice = sysctl_sched_base_slice;
+ vslice = calc_delta_fair(se->slice, se);
+
/*
* Due to how V is constructed as the weighted average of entities,
* adding tasks with positive lag, or removing tasks with negative lag
@@ -5211,7 +5451,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
* 4) do not run the "skip" process, if something else is available
*/
static struct sched_entity *
-pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
+pick_next_entity(struct cfs_rq *cfs_rq)
{
/*
* Enabling NEXT_BUDDY will affect latency but not fairness.
@@ -5755,13 +5995,13 @@ static void unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
{
- struct cfs_rq *local_unthrottle = NULL;
int this_cpu = smp_processor_id();
u64 runtime, remaining = 1;
bool throttled = false;
- struct cfs_rq *cfs_rq;
+ struct cfs_rq *cfs_rq, *tmp;
struct rq_flags rf;
struct rq *rq;
+ LIST_HEAD(local_unthrottle);
rcu_read_lock();
list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
@@ -5777,11 +6017,9 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
if (!cfs_rq_throttled(cfs_rq))
goto next;
-#ifdef CONFIG_SMP
/* Already queued for async unthrottle */
if (!list_empty(&cfs_rq->throttled_csd_list))
goto next;
-#endif
/* By the above checks, this should never be true */
SCHED_WARN_ON(cfs_rq->runtime_remaining > 0);
@@ -5798,11 +6036,17 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
/* we check whether we're throttled above */
if (cfs_rq->runtime_remaining > 0) {
- if (cpu_of(rq) != this_cpu ||
- SCHED_WARN_ON(local_unthrottle))
+ if (cpu_of(rq) != this_cpu) {
unthrottle_cfs_rq_async(cfs_rq);
- else
- local_unthrottle = cfs_rq;
+ } else {
+ /*
+ * We currently only expect to be unthrottling
+ * a single cfs_rq locally.
+ */
+ SCHED_WARN_ON(!list_empty(&local_unthrottle));
+ list_add_tail(&cfs_rq->throttled_csd_list,
+ &local_unthrottle);
+ }
} else {
throttled = true;
}
@@ -5810,15 +6054,23 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
next:
rq_unlock_irqrestore(rq, &rf);
}
- rcu_read_unlock();
- if (local_unthrottle) {
- rq = cpu_rq(this_cpu);
+ list_for_each_entry_safe(cfs_rq, tmp, &local_unthrottle,
+ throttled_csd_list) {
+ struct rq *rq = rq_of(cfs_rq);
+
rq_lock_irqsave(rq, &rf);
- if (cfs_rq_throttled(local_unthrottle))
- unthrottle_cfs_rq(local_unthrottle);
+
+ list_del_init(&cfs_rq->throttled_csd_list);
+
+ if (cfs_rq_throttled(cfs_rq))
+ unthrottle_cfs_rq(cfs_rq);
+
rq_unlock_irqrestore(rq, &rf);
}
+ SCHED_WARN_ON(!list_empty(&local_unthrottle));
+
+ rcu_read_unlock();
return throttled;
}
@@ -6148,9 +6400,7 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
cfs_rq->runtime_enabled = 0;
INIT_LIST_HEAD(&cfs_rq->throttled_list);
-#ifdef CONFIG_SMP
INIT_LIST_HEAD(&cfs_rq->throttled_csd_list);
-#endif
}
void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
@@ -6619,6 +6869,7 @@ dequeue_throttle:
/* Working cpumask for: load_balance, load_balance_newidle. */
static DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
static DEFINE_PER_CPU(cpumask_var_t, select_rq_mask);
+static DEFINE_PER_CPU(cpumask_var_t, should_we_balance_tmpmask);
#ifdef CONFIG_NO_HZ_COMMON
@@ -7107,45 +7358,9 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
int i, cpu, idle_cpu = -1, nr = INT_MAX;
struct sched_domain_shared *sd_share;
- struct rq *this_rq = this_rq();
- int this = smp_processor_id();
- struct sched_domain *this_sd = NULL;
- u64 time = 0;
cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
- if (sched_feat(SIS_PROP) && !has_idle_core) {
- u64 avg_cost, avg_idle, span_avg;
- unsigned long now = jiffies;
-
- this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
- if (!this_sd)
- return -1;
-
- /*
- * If we're busy, the assumption that the last idle period
- * predicts the future is flawed; age away the remaining
- * predicted idle time.
- */
- if (unlikely(this_rq->wake_stamp < now)) {
- while (this_rq->wake_stamp < now && this_rq->wake_avg_idle) {
- this_rq->wake_stamp++;
- this_rq->wake_avg_idle >>= 1;
- }
- }
-
- avg_idle = this_rq->wake_avg_idle;
- avg_cost = this_sd->avg_scan_cost + 1;
-
- span_avg = sd->span_weight * avg_idle;
- if (span_avg > 4*avg_cost)
- nr = div_u64(span_avg, avg_cost);
- else
- nr = 4;
-
- time = cpu_clock(this);
- }
-
if (sched_feat(SIS_UTIL)) {
sd_share = rcu_dereference(per_cpu(sd_llc_shared, target));
if (sd_share) {
@@ -7157,6 +7372,30 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
}
}
+ if (static_branch_unlikely(&sched_cluster_active)) {
+ struct sched_group *sg = sd->groups;
+
+ if (sg->flags & SD_CLUSTER) {
+ for_each_cpu_wrap(cpu, sched_group_span(sg), target + 1) {
+ if (!cpumask_test_cpu(cpu, cpus))
+ continue;
+
+ if (has_idle_core) {
+ i = select_idle_core(p, cpu, cpus, &idle_cpu);
+ if ((unsigned int)i < nr_cpumask_bits)
+ return i;
+ } else {
+ if (--nr <= 0)
+ return -1;
+ idle_cpu = __select_idle_cpu(cpu, p);
+ if ((unsigned int)idle_cpu < nr_cpumask_bits)
+ return idle_cpu;
+ }
+ }
+ cpumask_andnot(cpus, cpus, sched_group_span(sg));
+ }
+ }
+
for_each_cpu_wrap(cpu, cpus, target + 1) {
if (has_idle_core) {
i = select_idle_core(p, cpu, cpus, &idle_cpu);
@@ -7164,7 +7403,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
return i;
} else {
- if (!--nr)
+ if (--nr <= 0)
return -1;
idle_cpu = __select_idle_cpu(cpu, p);
if ((unsigned int)idle_cpu < nr_cpumask_bits)
@@ -7175,18 +7414,6 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
if (has_idle_core)
set_idle_cores(target, false);
- if (sched_feat(SIS_PROP) && this_sd && !has_idle_core) {
- time = cpu_clock(this) - time;
-
- /*
- * Account for the scan cost of wakeups against the average
- * idle time.
- */
- this_rq->wake_avg_idle -= min(this_rq->wake_avg_idle, time);
-
- update_avg(&this_sd->avg_scan_cost, time);
- }
-
return idle_cpu;
}
@@ -7226,7 +7453,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
* Look for the CPU with best capacity.
*/
else if (fits < 0)
- cpu_cap = capacity_orig_of(cpu) - thermal_load_avg(cpu_rq(cpu));
+ cpu_cap = arch_scale_cpu_capacity(cpu) - thermal_load_avg(cpu_rq(cpu));
/*
* First, select CPU which fits better (-1 being better than 0).
@@ -7266,7 +7493,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
bool has_idle_core = false;
struct sched_domain *sd;
unsigned long task_util, util_min, util_max;
- int i, recent_used_cpu;
+ int i, recent_used_cpu, prev_aff = -1;
/*
* On asymmetric system, update task utilization because we will check
@@ -7293,8 +7520,14 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
*/
if (prev != target && cpus_share_cache(prev, target) &&
(available_idle_cpu(prev) || sched_idle_cpu(prev)) &&
- asym_fits_cpu(task_util, util_min, util_max, prev))
- return prev;
+ asym_fits_cpu(task_util, util_min, util_max, prev)) {
+
+ if (!static_branch_unlikely(&sched_cluster_active) ||
+ cpus_share_resources(prev, target))
+ return prev;
+
+ prev_aff = prev;
+ }
/*
* Allow a per-cpu kthread to stack with the wakee if the
@@ -7321,7 +7554,13 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
(available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
cpumask_test_cpu(recent_used_cpu, p->cpus_ptr) &&
asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) {
- return recent_used_cpu;
+
+ if (!static_branch_unlikely(&sched_cluster_active) ||
+ cpus_share_resources(recent_used_cpu, target))
+ return recent_used_cpu;
+
+ } else {
+ recent_used_cpu = -1;
}
/*
@@ -7362,6 +7601,17 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
if ((unsigned)i < nr_cpumask_bits)
return i;
+ /*
+ * For cluster machines which have a lower-level shared cache like L2
+ * or LLC Tag, we tend to find an idle CPU in the target's cluster
+ * first. But prev_cpu or recent_used_cpu may also be good candidates,
+ * so use them if possible when no idle CPU is found in select_idle_cpu().
+ */
+ if ((unsigned int)prev_aff < nr_cpumask_bits)
+ return prev_aff;
+ if ((unsigned int)recent_used_cpu < nr_cpumask_bits)
+ return recent_used_cpu;
+
return target;
}
@@ -7432,16 +7682,16 @@ cpu_util(int cpu, struct task_struct *p, int dst_cpu, int boost)
if (sched_feat(UTIL_EST)) {
unsigned long util_est;
- util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued);
+ util_est = READ_ONCE(cfs_rq->avg.util_est);
/*
* During wake-up @p isn't enqueued yet and doesn't contribute
- * to any cpu_rq(cpu)->cfs.avg.util_est.enqueued.
+ * to any cpu_rq(cpu)->cfs.avg.util_est.
* If @dst_cpu == @cpu add it to "simulate" cpu_util after @p
* has been enqueued.
*
* During exec (@dst_cpu = -1) @p is enqueued and does
- * contribute to cpu_rq(cpu)->cfs.util_est.enqueued.
+ * contribute to cpu_rq(cpu)->cfs.util_est.
* Remove it to "simulate" cpu_util without @p's contribution.
*
* Despite the task_on_rq_queued(@p) check there is still a
@@ -7468,7 +7718,7 @@ cpu_util(int cpu, struct task_struct *p, int dst_cpu, int boost)
util = max(util, util_est);
}
- return min(util, capacity_orig_of(cpu));
+ return min(util, arch_scale_cpu_capacity(cpu));
}
unsigned long cpu_util_cfs(int cpu)
@@ -7570,7 +7820,7 @@ static inline void eenv_pd_busy_time(struct energy_env *eenv,
for_each_cpu(cpu, pd_cpus) {
unsigned long util = cpu_util(cpu, p, -1, 0);
- busy_time += effective_cpu_util(cpu, util, ENERGY_UTIL, NULL);
+ busy_time += effective_cpu_util(cpu, util, NULL, NULL);
}
eenv->pd_busy_time = min(eenv->pd_cap, busy_time);
@@ -7593,7 +7843,7 @@ eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus,
for_each_cpu(cpu, pd_cpus) {
struct task_struct *tsk = (cpu == dst_cpu) ? p : NULL;
unsigned long util = cpu_util(cpu, p, dst_cpu, 1);
- unsigned long eff_util;
+ unsigned long eff_util, min, max;
/*
* Performance domain frequency: utilization clamping
@@ -7602,7 +7852,23 @@ eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus,
* NOTE: in case RT tasks are running, by default the
* FREQUENCY_UTIL's utilization can be max OPP.
*/
- eff_util = effective_cpu_util(cpu, util, FREQUENCY_UTIL, tsk);
+ eff_util = effective_cpu_util(cpu, util, &min, &max);
+
+ /* Task's uclamp can modify min and max value */
+ if (tsk && uclamp_is_used()) {
+ min = max(min, uclamp_eff_value(p, UCLAMP_MIN));
+
+ /*
+ * If there is no active max uclamp constraint,
+ * directly use task's one, otherwise keep max.
+ */
+ if (uclamp_rq_is_idle(cpu_rq(cpu)))
+ max = uclamp_eff_value(p, UCLAMP_MAX);
+ else
+ max = max(max, uclamp_eff_value(p, UCLAMP_MAX));
+ }
+
+ eff_util = sugov_effective_cpu_perf(cpu, eff_util, min, max);
max_util = max(max_util, eff_util);
}
@@ -7620,11 +7886,16 @@ compute_energy(struct energy_env *eenv, struct perf_domain *pd,
{
unsigned long max_util = eenv_pd_max_util(eenv, pd_cpus, p, dst_cpu);
unsigned long busy_time = eenv->pd_busy_time;
+ unsigned long energy;
if (dst_cpu >= 0)
busy_time = min(eenv->pd_cap, busy_time + eenv->task_busy_time);
- return em_cpu_energy(pd->em_pd, max_util, busy_time, eenv->cpu_cap);
+ energy = em_cpu_energy(pd->em_pd, max_util, busy_time, eenv->cpu_cap);
+
+ trace_sched_compute_energy_tp(p, dst_cpu, energy, max_util, busy_time);
+
+ return energy;
}
/*
@@ -7699,7 +7970,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
target = prev_cpu;
sync_entity_load_avg(&p->se);
- if (!uclamp_task_util(p, p_util_min, p_util_max))
+ if (!task_util_est(p) && p_util_min == 0)
goto unlock;
eenv_task_busy_time(&eenv, p, prev_cpu);
@@ -7707,11 +7978,10 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
for (; pd; pd = pd->next) {
unsigned long util_min = p_util_min, util_max = p_util_max;
unsigned long cpu_cap, cpu_thermal_cap, util;
- unsigned long cur_delta, max_spare_cap = 0;
+ long prev_spare_cap = -1, max_spare_cap = -1;
unsigned long rq_util_min, rq_util_max;
- unsigned long prev_spare_cap = 0;
+ unsigned long cur_delta, base_energy;
int max_spare_cap_cpu = -1;
- unsigned long base_energy;
int fits, max_fits = -1;
cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask);
@@ -7774,7 +8044,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
prev_spare_cap = cpu_cap;
prev_fits = fits;
} else if ((fits > max_fits) ||
- ((fits == max_fits) && (cpu_cap > max_spare_cap))) {
+ ((fits == max_fits) && ((long)cpu_cap > max_spare_cap))) {
/*
* Find the CPU with the maximum spare capacity
* among the remaining CPUs in the performance
@@ -7786,7 +8056,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
}
}
- if (max_spare_cap_cpu < 0 && prev_spare_cap == 0)
+ if (max_spare_cap_cpu < 0 && prev_spare_cap < 0)
continue;
eenv_pd_busy_time(&eenv, cpus, p);
@@ -7794,7 +8064,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
base_energy = compute_energy(&eenv, pd, cpus, p, -1);
/* Evaluate the energy impact of using prev_cpu. */
- if (prev_spare_cap > 0) {
+ if (prev_spare_cap > -1) {
prev_delta = compute_energy(&eenv, pd, cpus, p,
prev_cpu);
/* CPU utilization has changed */
@@ -7995,12 +8265,11 @@ static void set_next_buddy(struct sched_entity *se)
/*
* Preempt the current task with a newly woken task if needed:
*/
-static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
+static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int wake_flags)
{
struct task_struct *curr = rq->curr;
struct sched_entity *se = &curr->se, *pse = &p->se;
struct cfs_rq *cfs_rq = task_cfs_rq(curr);
- int next_buddy_marked = 0;
int cse_is_idle, pse_is_idle;
if (unlikely(se == pse))
@@ -8008,7 +8277,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
/*
* This is possible from callers such as attach_tasks(), in which we
- * unconditionally check_preempt_curr() after an enqueue (which may have
+ * unconditionally wakeup_preempt() after an enqueue (which may have
* lead to a throttle). This both saves work and prevents false
* next-buddy nomination below.
*/
@@ -8017,7 +8286,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK)) {
set_next_buddy(pse);
- next_buddy_marked = 1;
}
/*
@@ -8100,7 +8368,7 @@ again:
goto again;
}
- se = pick_next_entity(cfs_rq, curr);
+ se = pick_next_entity(cfs_rq);
cfs_rq = group_cfs_rq(se);
} while (cfs_rq);
@@ -8163,7 +8431,7 @@ again:
}
}
- se = pick_next_entity(cfs_rq, curr);
+ se = pick_next_entity(cfs_rq);
cfs_rq = group_cfs_rq(se);
} while (cfs_rq);
@@ -8202,7 +8470,7 @@ simple:
put_prev_task(rq, prev);
do {
- se = pick_next_entity(cfs_rq, NULL);
+ se = pick_next_entity(cfs_rq);
set_next_entity(cfs_rq, se);
cfs_rq = group_cfs_rq(se);
} while (cfs_rq);
@@ -8850,7 +9118,7 @@ static int detach_tasks(struct lb_env *env)
case migrate_util:
util = task_util_est(p);
- if (util > env->imbalance)
+ if (shr_bound(util, env->sd->nr_balance_failed) > env->imbalance)
goto next;
env->imbalance -= util;
@@ -8915,7 +9183,7 @@ static void attach_task(struct rq *rq, struct task_struct *p)
WARN_ON_ONCE(task_rq(p) != rq);
activate_task(rq, p, ENQUEUE_NOCLOCK);
- check_preempt_curr(rq, p, 0);
+ wakeup_preempt(rq, p, 0);
}
/*
@@ -9255,8 +9523,6 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
unsigned long capacity = scale_rt_capacity(cpu);
struct sched_group *sdg = sd->groups;
- cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu);
-
if (!capacity)
capacity = 1;
@@ -9332,7 +9598,7 @@ static inline int
check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
{
return ((rq->cpu_capacity * sd->imbalance_pct) <
- (rq->cpu_capacity_orig * 100));
+ (arch_scale_cpu_capacity(cpu_of(rq)) * 100));
}
/*
@@ -9343,7 +9609,7 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd)
{
return rq->misfit_task_load &&
- (rq->cpu_capacity_orig < rq->rd->max_cpu_capacity ||
+ (arch_scale_cpu_capacity(rq->cpu) < rq->rd->max_cpu_capacity ||
check_cpu_capacity(rq, sd));
}
@@ -9495,7 +9761,7 @@ static bool sched_use_asym_prio(struct sched_domain *sd, int cpu)
* can only do it if @group is an SMT group and has exactly on busy CPU. Larger
* imbalances in the number of CPUS are dealt with in find_busiest_group().
*
- * If we are balancing load within an SMT core, or at DIE domain level, always
+ * If we are balancing load within an SMT core, or at PKG domain level, always
* proceed.
*
* Return: true if @env::dst_cpu can do with asym_packing load balance. False
@@ -9579,7 +9845,7 @@ static inline long sibling_imbalance(struct lb_env *env,
imbalance /= ncores_local + ncores_busiest;
/* Take advantage of resource in an empty sched group */
- if (imbalance == 0 && local->sum_nr_running == 0 &&
+ if (imbalance <= 1 && local->sum_nr_running == 0 &&
busiest->sum_nr_running > 1)
imbalance = 2;
@@ -9767,6 +10033,15 @@ static bool update_sd_pick_busiest(struct lb_env *env,
break;
case group_smt_balance:
+ /*
+ * Check if we have spare CPUs on either SMT group to
+ * choose between has_spare and fully busy handling.
+ */
+ if (sgs->idle_cpus != 0 || busiest->idle_cpus != 0)
+ goto has_spare;
+
+ fallthrough;
+
case group_fully_busy:
/*
* Select the fully busy group with highest avg_load. In
@@ -9806,6 +10081,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
else
return true;
}
+has_spare:
/*
* Select not overloaded group with lowest number of idle cpus
@@ -10917,6 +11193,7 @@ static int active_load_balance_cpu_stop(void *data);
static int should_we_balance(struct lb_env *env)
{
+ struct cpumask *swb_cpus = this_cpu_cpumask_var_ptr(should_we_balance_tmpmask);
struct sched_group *sg = env->sd->groups;
int cpu, idle_smt = -1;
@@ -10940,8 +11217,9 @@ static int should_we_balance(struct lb_env *env)
return 1;
}
+ cpumask_copy(swb_cpus, group_balance_mask(sg));
/* Try to find first idle CPU */
- for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) {
+ for_each_cpu_and(cpu, swb_cpus, env->cpus) {
if (!idle_cpu(cpu))
continue;
@@ -10953,15 +11231,27 @@ static int should_we_balance(struct lb_env *env)
if (!(env->sd->flags & SD_SHARE_CPUCAPACITY) && !is_core_idle(cpu)) {
if (idle_smt == -1)
idle_smt = cpu;
+ /*
+ * If the core is not idle, and first SMT sibling which is
+ * idle has been found, then it's not needed to check other
+ * SMT siblings for idleness:
+ */
+#ifdef CONFIG_SCHED_SMT
+ cpumask_andnot(swb_cpus, swb_cpus, cpu_smt_mask(cpu));
+#endif
continue;
}
- /* Are we the first idle CPU? */
+ /*
+ * Are we the first idle core in a non-SMT domain or higher,
+ * or the first idle CPU in a SMT domain?
+ */
return cpu == env->dst_cpu;
}
- if (idle_smt == env->dst_cpu)
- return true;
+ /* Are we the first idle CPU with busy siblings? */
+ if (idle_smt != -1)
+ return idle_smt == env->dst_cpu;
/* Are we the first CPU of this group ? */
return group_balance_cpu(sg) == env->dst_cpu;
@@ -11174,13 +11464,15 @@ more_balance:
busiest->push_cpu = this_cpu;
active_balance = 1;
}
- raw_spin_rq_unlock_irqrestore(busiest, flags);
+ preempt_disable();
+ raw_spin_rq_unlock_irqrestore(busiest, flags);
if (active_balance) {
stop_one_cpu_nowait(cpu_of(busiest),
active_load_balance_cpu_stop, busiest,
&busiest->active_balance_work);
}
+ preempt_enable();
}
} else {
sd->nr_balance_failed = 0;
@@ -11488,36 +11780,39 @@ static inline int on_null_domain(struct rq *rq)
#ifdef CONFIG_NO_HZ_COMMON
/*
- * idle load balancing details
- * - When one of the busy CPUs notice that there may be an idle rebalancing
+ * NOHZ idle load balancing (ILB) details:
+ *
+ * - When one of the busy CPUs notices that there may be an idle rebalancing
* needed, they will kick the idle load balancer, which then does idle
* load balancing for all the idle CPUs.
- * - HK_TYPE_MISC CPUs are used for this task, because HK_TYPE_SCHED not set
+ *
+ * - HK_TYPE_MISC CPUs are used for this task, because HK_TYPE_SCHED is not set
* anywhere yet.
*/
-
static inline int find_new_ilb(void)
{
- int ilb;
const struct cpumask *hk_mask;
+ int ilb_cpu;
hk_mask = housekeeping_cpumask(HK_TYPE_MISC);
- for_each_cpu_and(ilb, nohz.idle_cpus_mask, hk_mask) {
+ for_each_cpu_and(ilb_cpu, nohz.idle_cpus_mask, hk_mask) {
- if (ilb == smp_processor_id())
+ if (ilb_cpu == smp_processor_id())
continue;
- if (idle_cpu(ilb))
- return ilb;
+ if (idle_cpu(ilb_cpu))
+ return ilb_cpu;
}
- return nr_cpu_ids;
+ return -1;
}
/*
- * Kick a CPU to do the nohz balancing, if it is time for it. We pick any
- * idle CPU in the HK_TYPE_MISC housekeeping set (if there is one).
+ * Kick a CPU to do the NOHZ balancing, if it is time for it, via a cross-CPU
+ * SMP function call (IPI).
+ *
+ * We pick the first idle CPU in the HK_TYPE_MISC housekeeping set (if there is one).
*/
static void kick_ilb(unsigned int flags)
{
@@ -11531,8 +11826,7 @@ static void kick_ilb(unsigned int flags)
nohz.next_balance = jiffies+1;
ilb_cpu = find_new_ilb();
-
- if (ilb_cpu >= nr_cpu_ids)
+ if (ilb_cpu < 0)
return;
/*
@@ -11545,7 +11839,7 @@ static void kick_ilb(unsigned int flags)
/*
* This way we generate an IPI on the target CPU which
- * is idle. And the softirq performing nohz idle load balance
+ * is idle, and the softirq performing NOHZ idle load balancing
* will be run before returning from the IPI.
*/
smp_call_function_single_async(ilb_cpu, &cpu_rq(ilb_cpu)->nohz_csd);
@@ -11574,7 +11868,7 @@ static void nohz_balancer_kick(struct rq *rq)
/*
* None are in tickless mode and hence no need for NOHZ idle load
- * balancing.
+ * balancing:
*/
if (likely(!atomic_read(&nohz.nr_cpus)))
return;
@@ -11596,9 +11890,8 @@ static void nohz_balancer_kick(struct rq *rq)
sd = rcu_dereference(rq->sd);
if (sd) {
/*
- * If there's a CFS task and the current CPU has reduced
- * capacity; kick the ILB to see if there's a better CPU to run
- * on.
+ * If there's a runnable CFS task and the current CPU has reduced
+ * capacity, kick the ILB to see if there's a better CPU to run on:
*/
if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) {
flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
@@ -11650,11 +11943,11 @@ static void nohz_balancer_kick(struct rq *rq)
if (sds) {
/*
* If there is an imbalance between LLC domains (IOW we could
- * increase the overall cache use), we need some less-loaded LLC
- * domain to pull some load. Likewise, we may need to spread
+ * increase the overall cache utilization), we need a less-loaded LLC
+ * domain to pull some load from. Likewise, we may need to spread
* load within the current LLC domain (e.g. packed SMT cores but
* other CPUs are idle). We can't really know from here how busy
- * the others are - so just get a nohz balance going if it looks
+ * the others are - so just get a NOHZ balance going if it looks
* like this LLC domain has tasks we could move.
*/
nr_busy = atomic_read(&sds->nr_busy_cpus);
@@ -11924,8 +12217,19 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
}
/*
- * Check if we need to run the ILB for updating blocked load before entering
- * idle state.
+ * Check if we need to directly run the ILB for updating blocked load before
+ * entering idle state. Here we run ILB directly without issuing IPIs.
+ *
+ * Note that when this function is called, the tick may not yet be stopped on
+ * this CPU. nohz.idle_cpus_mask is updated only when the tick is stopped and
+ * cleared on the next busy tick. In other words, nohz.idle_cpus_mask updates
+ * don't align with CPUs entering/exiting idle, to avoid bottlenecks due to high idle
+ * entry/exit rate (usec). So it is possible that _nohz_idle_balance() is
+ * called from this function on (this) CPU that's not yet in the mask. That's
+ * OK because the goal of nohz_run_idle_balance() is to run ILB only for
+ * updating the blocked load of already idle CPUs without waking up one of
+ * those idle CPUs and outside the preempt disable / irq off phase of the local
+ * cpu about to enter idle, because it can take a long time.
*/
void nohz_run_idle_balance(int cpu)
{
@@ -12167,6 +12471,9 @@ static void rq_offline_fair(struct rq *rq)
/* Ensure any throttled groups are reachable by pick_next_task */
unthrottle_offline_cfs_rqs(rq);
+
+ /* Ensure that we remove rq contribution to group share: */
+ clear_tg_offline_cfs_rqs(rq);
}
#endif /* CONFIG_SMP */
@@ -12370,7 +12677,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
if (p->prio > oldprio)
resched_curr(rq);
} else
- check_preempt_curr(rq, p, 0);
+ wakeup_preempt(rq, p, 0);
}
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -12472,7 +12779,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p)
if (task_current(rq, p))
resched_curr(rq);
else
- check_preempt_curr(rq, p, 0);
+ wakeup_preempt(rq, p, 0);
}
}
@@ -12790,19 +13097,6 @@ next_cpu:
return 0;
}
-#else /* CONFIG_FAIR_GROUP_SCHED */
-
-void free_fair_sched_group(struct task_group *tg) { }
-
-int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
-{
- return 1;
-}
-
-void online_fair_sched_group(struct task_group *tg) { }
-
-void unregister_fair_sched_group(struct task_group *tg) { }
-
#endif /* CONFIG_FAIR_GROUP_SCHED */
@@ -12831,7 +13125,7 @@ DEFINE_SCHED_CLASS(fair) = {
.yield_task = yield_task_fair,
.yield_to_task = yield_to_task_fair,
- .check_preempt_curr = check_preempt_wakeup,
+ .wakeup_preempt = check_preempt_wakeup_fair,
.pick_next_task = __pick_next_task_fair,
.put_prev_task = put_prev_task_fair,
@@ -12918,6 +13212,8 @@ __init void init_sched_fair_class(void)
for_each_possible_cpu(i) {
zalloc_cpumask_var_node(&per_cpu(load_balance_mask, i), GFP_KERNEL, cpu_to_node(i));
zalloc_cpumask_var_node(&per_cpu(select_rq_mask, i), GFP_KERNEL, cpu_to_node(i));
+ zalloc_cpumask_var_node(&per_cpu(should_we_balance_tmpmask, i),
+ GFP_KERNEL, cpu_to_node(i));
#ifdef CONFIG_CFS_BANDWIDTH
INIT_CSD(&cpu_rq(i)->cfsb_csd, __cfsb_csd_unthrottle, cpu_rq(i));
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index f770168230ae..143f55df890b 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -49,7 +49,6 @@ SCHED_FEAT(TTWU_QUEUE, true)
/*
* When doing wakeups, attempt to limit superfluous scans of the LLC domain.
*/
-SCHED_FEAT(SIS_PROP, false)
SCHED_FEAT(SIS_UTIL, true)
/*
@@ -84,7 +83,6 @@ SCHED_FEAT(WA_BIAS, true)
* UtilEstimation. Use estimated CPU utilization.
*/
SCHED_FEAT(UTIL_EST, true)
-SCHED_FEAT(UTIL_EST_FASTUP, true)
SCHED_FEAT(LATENCY_WARN, false)
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 342f58a329f5..31231925f1ec 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -258,6 +258,36 @@ static void do_idle(void)
while (!need_resched()) {
rmb();
+ /*
+ * Interrupts shouldn't be re-enabled from that point on until
+ * the CPU sleeping instruction is reached. Otherwise an interrupt
+ * may fire and queue a timer that would be ignored until the CPU
+ * wakes from the sleeping instruction. And testing need_resched()
+ * doesn't tell whether a timer reprogram is pending.
+ *
+ * Several cases to consider:
+ *
+ * - SLEEP-UNTIL-PENDING-INTERRUPT based instructions such as
+ * "wfi" or "mwait" are fine because they can be entered with
+ * interrupt disabled.
+ *
+ * - sti;mwait() couple is fine because the interrupts are
+ * re-enabled only upon the execution of mwait, leaving no gap
+ * in-between.
+ *
+ * - ROLLBACK based idle handlers with the sleeping instruction
+ * called with interrupts enabled are NOT fine. In this scheme
+ * when the interrupt detects it has interrupted an idle handler,
+ * it rolls back to its beginning which performs the
+ * need_resched() check before re-executing the sleeping
+ * instruction. This can leak a pending needed timer reprogram.
+ * If such a scheme is really mandatory due to the lack of an
+ * appropriate CPU sleeping instruction, then a FAST-FORWARD
+ * must instead be applied: when the interrupt detects it has
+ * interrupted an idle handler, it must resume to the end of
+ * this idle handler so that the generic idle loop is iterated
+ * again to reprogram the tick.
+ */
local_irq_disable();
if (cpu_is_offline(cpu)) {
@@ -373,6 +403,7 @@ EXPORT_SYMBOL_GPL(play_idle_precise);
void cpu_startup_entry(enum cpuhp_state state)
{
+ current->flags |= PF_IDLE;
arch_cpu_idle_prepare();
cpuhp_online_idle(state);
while (1)
@@ -400,7 +431,7 @@ balance_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
/*
* Idle tasks are unconditionally rescheduled:
*/
-static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags)
+static void wakeup_preempt_idle(struct rq *rq, struct task_struct *p, int flags)
{
resched_curr(rq);
}
@@ -481,7 +512,7 @@ DEFINE_SCHED_CLASS(idle) = {
/* dequeue is not valid, we print a debug message there: */
.dequeue_task = dequeue_task_idle,
- .check_preempt_curr = check_preempt_curr_idle,
+ .wakeup_preempt = wakeup_preempt_idle,
.pick_next_task = pick_next_task_idle,
.put_prev_task = put_prev_task_idle,
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index 0f310768260c..63b6cf898220 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * Per Entity Load Tracking
+ * Per Entity Load Tracking (PELT)
*
* Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
*
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index 3a0e0dc28721..9e1083465fbc 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -52,13 +52,13 @@ static inline void cfs_se_util_change(struct sched_avg *avg)
return;
/* Avoid store if the flag has been already reset */
- enqueued = avg->util_est.enqueued;
+ enqueued = avg->util_est;
if (!(enqueued & UTIL_AVG_UNCHANGED))
return;
/* Reset flag to report util_avg has been updated */
enqueued &= ~UTIL_AVG_UNCHANGED;
- WRITE_ONCE(avg->util_est.enqueued, enqueued);
+ WRITE_ONCE(avg->util_est, enqueued);
}
static inline u64 rq_clock_pelt(struct rq *rq)
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 1d0f634725a6..7b4aa5809c0f 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -434,14 +434,13 @@ static u64 window_update(struct psi_window *win, u64 now, u64 value)
return growth;
}
-static u64 update_triggers(struct psi_group *group, u64 now, bool *update_total,
+static void update_triggers(struct psi_group *group, u64 now,
enum psi_aggregators aggregator)
{
struct psi_trigger *t;
u64 *total = group->total[aggregator];
struct list_head *triggers;
u64 *aggregator_total;
- *update_total = false;
if (aggregator == PSI_AVGS) {
triggers = &group->avg_triggers;
@@ -471,14 +470,6 @@ static u64 update_triggers(struct psi_group *group, u64 now, bool *update_total,
* events without dropping any).
*/
if (new_stall) {
- /*
- * Multiple triggers might be looking at the same state,
- * remember to update group->polling_total[] once we've
- * been through all of them. Also remember to extend the
- * polling time if we see new stall activity.
- */
- *update_total = true;
-
/* Calculate growth since last update */
growth = window_update(&t->win, now, total[t->state]);
if (!t->pending_event) {
@@ -503,8 +494,6 @@ static u64 update_triggers(struct psi_group *group, u64 now, bool *update_total,
/* Reset threshold breach flag once event got generated */
t->pending_event = false;
}
-
- return now + group->rtpoll_min_period;
}
static u64 update_averages(struct psi_group *group, u64 now)
@@ -565,7 +554,6 @@ static void psi_avgs_work(struct work_struct *work)
struct delayed_work *dwork;
struct psi_group *group;
u32 changed_states;
- bool update_total;
u64 now;
dwork = to_delayed_work(work);
@@ -584,7 +572,7 @@ static void psi_avgs_work(struct work_struct *work)
* go - see calc_avgs() and missed_periods.
*/
if (now >= group->avg_next_update) {
- update_triggers(group, now, &update_total, PSI_AVGS);
+ update_triggers(group, now, PSI_AVGS);
group->avg_next_update = update_averages(group, now);
}
@@ -608,7 +596,7 @@ static void init_rtpoll_triggers(struct psi_group *group, u64 now)
group->rtpoll_next_update = now + group->rtpoll_min_period;
}
-/* Schedule polling if it's not already scheduled or forced. */
+/* Schedule rtpolling if it's not already scheduled or forced. */
static void psi_schedule_rtpoll_work(struct psi_group *group, unsigned long delay,
bool force)
{
@@ -640,7 +628,6 @@ static void psi_rtpoll_work(struct psi_group *group)
{
bool force_reschedule = false;
u32 changed_states;
- bool update_total;
u64 now;
mutex_lock(&group->rtpoll_trigger_lock);
@@ -649,37 +636,37 @@ static void psi_rtpoll_work(struct psi_group *group)
if (now > group->rtpoll_until) {
/*
- * We are either about to start or might stop polling if no
- * state change was recorded. Resetting poll_scheduled leaves
+ * We are either about to start or might stop rtpolling if no
+ * state change was recorded. Resetting rtpoll_scheduled leaves
* a small window for psi_group_change to sneak in and schedule
- * an immediate poll_work before we get to rescheduling. One
- * potential extra wakeup at the end of the polling window
- * should be negligible and polling_next_update still keeps
+ * an immediate rtpoll_work before we get to rescheduling. One
+ * potential extra wakeup at the end of the rtpolling window
+ * should be negligible and rtpoll_next_update still keeps
* updates correctly on schedule.
*/
atomic_set(&group->rtpoll_scheduled, 0);
/*
- * A task change can race with the poll worker that is supposed to
+ * A task change can race with the rtpoll worker that is supposed to
* report on it. To avoid missing events, ensure ordering between
- * poll_scheduled and the task state accesses, such that if the poll
- * worker misses the state update, the task change is guaranteed to
- * reschedule the poll worker:
+ * rtpoll_scheduled and the task state accesses, such that if the
+ * rtpoll worker misses the state update, the task change is
+ * guaranteed to reschedule the rtpoll worker:
*
- * poll worker:
- * atomic_set(poll_scheduled, 0)
+ * rtpoll worker:
+ * atomic_set(rtpoll_scheduled, 0)
* smp_mb()
* LOAD states
*
* task change:
* STORE states
- * if atomic_xchg(poll_scheduled, 1) == 0:
- * schedule poll worker
+ * if atomic_xchg(rtpoll_scheduled, 1) == 0:
+ * schedule rtpoll worker
*
* The atomic_xchg() implies a full barrier.
*/
smp_mb();
} else {
- /* Polling window is not over, keep rescheduling */
+ /* The rtpolling window is not over, keep rescheduling */
force_reschedule = true;
}
@@ -687,7 +674,7 @@ static void psi_rtpoll_work(struct psi_group *group)
collect_percpu_times(group, PSI_POLL, &changed_states);
if (changed_states & group->rtpoll_states) {
- /* Initialize trigger windows when entering polling mode */
+ /* Initialize trigger windows when entering rtpolling mode */
if (now > group->rtpoll_until)
init_rtpoll_triggers(group, now);
@@ -706,10 +693,12 @@ static void psi_rtpoll_work(struct psi_group *group)
}
if (now >= group->rtpoll_next_update) {
- group->rtpoll_next_update = update_triggers(group, now, &update_total, PSI_POLL);
- if (update_total)
+ if (changed_states & group->rtpoll_states) {
+ update_triggers(group, now, PSI_POLL);
memcpy(group->rtpoll_total, group->total[PSI_POLL],
sizeof(group->rtpoll_total));
+ }
+ group->rtpoll_next_update = now + group->rtpoll_min_period;
}
psi_schedule_rtpoll_work(group,
@@ -1009,6 +998,9 @@ void psi_account_irqtime(struct task_struct *task, u32 delta)
struct psi_group_cpu *groupc;
u64 now;
+ if (static_branch_likely(&psi_disabled))
+ return;
+
if (!task->pid)
return;
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 0597ba0f85ff..3261b067b67e 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -16,7 +16,7 @@ struct rt_bandwidth def_rt_bandwidth;
* period over which we measure -rt task CPU usage in us.
* default: 1s
*/
-unsigned int sysctl_sched_rt_period = 1000000;
+int sysctl_sched_rt_period = 1000000;
/*
* part of the period that we allow rt tasks to run in us.
@@ -34,9 +34,11 @@ static struct ctl_table sched_rt_sysctls[] = {
{
.procname = "sched_rt_period_us",
.data = &sysctl_sched_rt_period,
- .maxlen = sizeof(unsigned int),
+ .maxlen = sizeof(int),
.mode = 0644,
.proc_handler = sched_rt_handler,
+ .extra1 = SYSCTL_ONE,
+ .extra2 = SYSCTL_INT_MAX,
},
{
.procname = "sched_rt_runtime_us",
@@ -44,6 +46,8 @@ static struct ctl_table sched_rt_sysctls[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = sched_rt_handler,
+ .extra1 = SYSCTL_NEG_ONE,
+ .extra2 = (void *)&sysctl_sched_rt_period,
},
{
.procname = "sched_rr_timeslice_ms",
@@ -143,7 +147,6 @@ void init_rt_rq(struct rt_rq *rt_rq)
#if defined CONFIG_SMP
rt_rq->highest_prio.curr = MAX_RT_PRIO-1;
rt_rq->highest_prio.next = MAX_RT_PRIO-1;
- rt_rq->rt_nr_migratory = 0;
rt_rq->overloaded = 0;
plist_head_init(&rt_rq->pushable_tasks);
#endif /* CONFIG_SMP */
@@ -358,53 +361,6 @@ static inline void rt_clear_overload(struct rq *rq)
cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
}
-static void update_rt_migration(struct rt_rq *rt_rq)
-{
- if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) {
- if (!rt_rq->overloaded) {
- rt_set_overload(rq_of_rt_rq(rt_rq));
- rt_rq->overloaded = 1;
- }
- } else if (rt_rq->overloaded) {
- rt_clear_overload(rq_of_rt_rq(rt_rq));
- rt_rq->overloaded = 0;
- }
-}
-
-static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
-{
- struct task_struct *p;
-
- if (!rt_entity_is_task(rt_se))
- return;
-
- p = rt_task_of(rt_se);
- rt_rq = &rq_of_rt_rq(rt_rq)->rt;
-
- rt_rq->rt_nr_total++;
- if (p->nr_cpus_allowed > 1)
- rt_rq->rt_nr_migratory++;
-
- update_rt_migration(rt_rq);
-}
-
-static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
-{
- struct task_struct *p;
-
- if (!rt_entity_is_task(rt_se))
- return;
-
- p = rt_task_of(rt_se);
- rt_rq = &rq_of_rt_rq(rt_rq)->rt;
-
- rt_rq->rt_nr_total--;
- if (p->nr_cpus_allowed > 1)
- rt_rq->rt_nr_migratory--;
-
- update_rt_migration(rt_rq);
-}
-
static inline int has_pushable_tasks(struct rq *rq)
{
return !plist_head_empty(&rq->rt.pushable_tasks);
@@ -438,6 +394,11 @@ static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
/* Update the highest prio pushable task */
if (p->prio < rq->rt.highest_prio.next)
rq->rt.highest_prio.next = p->prio;
+
+ if (!rq->rt.overloaded) {
+ rt_set_overload(rq);
+ rq->rt.overloaded = 1;
+ }
}
static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
@@ -451,6 +412,11 @@ static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
rq->rt.highest_prio.next = p->prio;
} else {
rq->rt.highest_prio.next = MAX_RT_PRIO-1;
+
+ if (rq->rt.overloaded) {
+ rt_clear_overload(rq);
+ rq->rt.overloaded = 0;
+ }
}
}
@@ -464,16 +430,6 @@ static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
{
}
-static inline
-void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
-{
-}
-
-static inline
-void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
-{
-}
-
static inline void rt_queue_push_tasks(struct rq *rq)
{
}
@@ -515,7 +471,7 @@ static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu)
min_cap = uclamp_eff_value(p, UCLAMP_MIN);
max_cap = uclamp_eff_value(p, UCLAMP_MAX);
- cpu_cap = capacity_orig_of(cpu);
+ cpu_cap = arch_scale_cpu_capacity(cpu);
return cpu_cap >= min(min_cap, max_cap);
}
@@ -953,7 +909,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
/*
* When we're idle and a woken (rt) task is
- * throttled check_preempt_curr() will set
+ * throttled wakeup_preempt() will set
* skip_update and the time between the wakeup
* and this unthrottle will get accounted as
* 'runtime'.
@@ -1046,24 +1002,15 @@ static void update_curr_rt(struct rq *rq)
{
struct task_struct *curr = rq->curr;
struct sched_rt_entity *rt_se = &curr->rt;
- u64 delta_exec;
- u64 now;
+ s64 delta_exec;
if (curr->sched_class != &rt_sched_class)
return;
- now = rq_clock_task(rq);
- delta_exec = now - curr->se.exec_start;
- if (unlikely((s64)delta_exec <= 0))
+ delta_exec = update_curr_common(rq);
+ if (unlikely(delta_exec <= 0))
return;
- schedstat_set(curr->stats.exec_max,
- max(curr->stats.exec_max, delta_exec));
-
- trace_sched_stat_runtime(curr, delta_exec, 0);
-
- update_current_exec_runtime(curr, now, delta_exec);
-
if (!rt_bandwidth_enabled())
return;
@@ -1281,7 +1228,6 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
rt_rq->rr_nr_running += rt_se_rr_nr_running(rt_se);
inc_rt_prio(rt_rq, prio);
- inc_rt_migration(rt_se, rt_rq);
inc_rt_group(rt_se, rt_rq);
}
@@ -1294,7 +1240,6 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
rt_rq->rr_nr_running -= rt_se_rr_nr_running(rt_se);
dec_rt_prio(rt_rq, rt_se_prio(rt_se));
- dec_rt_migration(rt_se, rt_rq);
dec_rt_group(rt_se, rt_rq);
}
@@ -1715,7 +1660,7 @@ static int balance_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
/*
* Preempt the current task with a newly woken task if needed:
*/
-static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
+static void wakeup_preempt_rt(struct rq *rq, struct task_struct *p, int flags)
{
if (p->prio < rq->curr->prio) {
resched_curr(rq);
@@ -2109,9 +2054,11 @@ retry:
*/
push_task = get_push_task(rq);
if (push_task) {
+ preempt_disable();
raw_spin_rq_unlock(rq);
stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
push_task, &rq->push_work);
+ preempt_enable();
raw_spin_rq_lock(rq);
}
@@ -2448,9 +2395,11 @@ skip:
double_unlock_balance(this_rq, src_rq);
if (push_task) {
+ preempt_disable();
raw_spin_rq_unlock(this_rq);
stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop,
push_task, &src_rq->push_work);
+ preempt_enable();
raw_spin_rq_lock(this_rq);
}
}
@@ -2702,7 +2651,7 @@ DEFINE_SCHED_CLASS(rt) = {
.dequeue_task = dequeue_task_rt,
.yield_task = yield_task_rt,
- .check_preempt_curr = check_preempt_curr_rt,
+ .wakeup_preempt = wakeup_preempt_rt,
.pick_next_task = pick_next_task_rt,
.put_prev_task = put_prev_task_rt,
@@ -2985,9 +2934,6 @@ static int sched_rt_global_constraints(void)
#ifdef CONFIG_SYSCTL
static int sched_rt_global_validate(void)
{
- if (sysctl_sched_rt_period <= 0)
- return -EINVAL;
-
if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
((sysctl_sched_rt_runtime > sysctl_sched_rt_period) ||
((u64)sysctl_sched_rt_runtime *
@@ -3018,7 +2964,7 @@ static int sched_rt_handler(struct ctl_table *table, int write, void *buffer,
old_period = sysctl_sched_rt_period;
old_runtime = sysctl_sched_rt_runtime;
- ret = proc_dointvec(table, write, buffer, lenp, ppos);
+ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (!ret && write) {
ret = sched_rt_global_validate();
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 04846272409c..001fe047bd5d 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -74,15 +74,6 @@
#include "../workqueue_internal.h"
-#ifdef CONFIG_CGROUP_SCHED
-#include <linux/cgroup.h>
-#include <linux/psi.h>
-#endif
-
-#ifdef CONFIG_SCHED_DEBUG
-# include <linux/static_key.h>
-#endif
-
#ifdef CONFIG_PARAVIRT
# include <asm/paravirt.h>
# include <asm/paravirt_api_clock.h>
@@ -109,14 +100,12 @@ extern __read_mostly int scheduler_running;
extern unsigned long calc_load_update;
extern atomic_long_t calc_load_tasks;
-extern unsigned int sysctl_sched_child_runs_first;
-
extern void calc_global_load_tick(struct rq *this_rq);
extern long calc_load_fold_active(struct rq *this_rq, long adjust);
extern void call_trace_sched_update_nr_running(struct rq *rq, int count);
-extern unsigned int sysctl_sched_rt_period;
+extern int sysctl_sched_rt_period;
extern int sysctl_sched_rt_runtime;
extern int sched_rr_timeslice;
@@ -284,8 +273,6 @@ struct rt_bandwidth {
unsigned int rt_period_active;
};
-void __dl_clear_params(struct task_struct *p);
-
static inline int dl_bandwidth_enabled(void)
{
return sysctl_sched_rt_runtime >= 0;
@@ -326,6 +313,33 @@ extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *att
extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial);
extern int dl_bw_check_overflow(int cpu);
+/*
+ * SCHED_DEADLINE supports servers (nested scheduling) with the following
+ * interface:
+ *
+ * dl_se::rq -- runqueue we belong to.
+ *
+ * dl_se::server_has_tasks() -- used on bandwidth enforcement; we 'stop' the
+ * server when it runs out of tasks to run.
+ *
+ * dl_se::server_pick() -- nested pick_next_task(); we yield the period if this
+ * returns NULL.
+ *
+ * dl_server_update() -- called from update_curr_common(), propagates runtime
+ * to the server.
+ *
+ * dl_server_start()
+ * dl_server_stop() -- start/stop the server when it has (no) tasks.
+ *
+ * dl_server_init() -- initializes the server.
+ */
+extern void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec);
+extern void dl_server_start(struct sched_dl_entity *dl_se);
+extern void dl_server_stop(struct sched_dl_entity *dl_se);
+extern void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
+ dl_server_has_tasks_f has_tasks,
+ dl_server_pick_f pick);
+
#ifdef CONFIG_CGROUP_SCHED
struct cfs_rq;
@@ -447,10 +461,21 @@ static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
extern int tg_nop(struct task_group *tg, void *data);
+#ifdef CONFIG_FAIR_GROUP_SCHED
extern void free_fair_sched_group(struct task_group *tg);
extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent);
extern void online_fair_sched_group(struct task_group *tg);
extern void unregister_fair_sched_group(struct task_group *tg);
+#else
+static inline void free_fair_sched_group(struct task_group *tg) { }
+static inline int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
+{
+ return 1;
+}
+static inline void online_fair_sched_group(struct task_group *tg) { }
+static inline void unregister_fair_sched_group(struct task_group *tg) { }
+#endif
+
extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
struct sched_entity *se, int cpu,
struct sched_entity *parent);
@@ -594,6 +619,7 @@ struct cfs_rq {
} removed;
#ifdef CONFIG_FAIR_GROUP_SCHED
+ u64 last_update_tg_load_avg;
unsigned long tg_load_avg_contrib;
long propagate;
long prop_runnable_sum;
@@ -644,9 +670,7 @@ struct cfs_rq {
int throttled;
int throttle_count;
struct list_head throttled_list;
-#ifdef CONFIG_SMP
struct list_head throttled_csd_list;
-#endif
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */
};
@@ -675,8 +699,6 @@ struct rt_rq {
} highest_prio;
#endif
#ifdef CONFIG_SMP
- unsigned int rt_nr_migratory;
- unsigned int rt_nr_total;
int overloaded;
struct plist_head pushable_tasks;
@@ -721,7 +743,6 @@ struct dl_rq {
u64 next;
} earliest_dl;
- unsigned int dl_nr_migratory;
int overloaded;
/*
@@ -963,10 +984,6 @@ struct rq {
/* runqueue lock: */
raw_spinlock_t __lock;
- /*
- * nr_running and cpu_load should be in the same cacheline because
- * remote CPUs use both these fields when doing load calculation.
- */
unsigned int nr_running;
#ifdef CONFIG_NUMA_BALANCING
unsigned int nr_numa_running;
@@ -1048,7 +1065,6 @@ struct rq {
struct sched_domain __rcu *sd;
unsigned long cpu_capacity;
- unsigned long cpu_capacity_orig;
struct balance_callback *balance_callback;
@@ -1079,9 +1095,6 @@ struct rq {
u64 idle_stamp;
u64 avg_idle;
- unsigned long wake_stamp;
- u64 wake_avg_idle;
-
/* This is used to determine avg_idle's max value */
u64 max_idle_balance_cost;
@@ -1658,6 +1671,11 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
}
+DEFINE_LOCK_GUARD_1(task_rq_lock, struct task_struct,
+ _T->rq = task_rq_lock(_T->lock, &_T->rf),
+ task_rq_unlock(_T->rq, _T->lock, &_T->rf),
+ struct rq *rq; struct rq_flags rf)
+
static inline void
rq_lock_irqsave(struct rq *rq, struct rq_flags *rf)
__acquires(rq->lock)
@@ -1868,11 +1886,13 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc);
DECLARE_PER_CPU(int, sd_llc_size);
DECLARE_PER_CPU(int, sd_llc_id);
+DECLARE_PER_CPU(int, sd_share_id);
DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa);
DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
extern struct static_key_false sched_asym_cpucapacity;
+extern struct static_key_false sched_cluster_active;
static __always_inline bool sched_asym_cpucap_active(void)
{
@@ -2195,6 +2215,10 @@ extern const u32 sched_prio_to_wmult[40];
* MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location
* in the runqueue.
*
+ * NOCLOCK - skip the update_rq_clock() (avoids double updates)
+ *
+ * MIGRATING - p->on_rq == TASK_ON_RQ_MIGRATING (used for DEADLINE)
+ *
* ENQUEUE_HEAD - place at front of runqueue (tail if not specified)
* ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline)
* ENQUEUE_MIGRATED - the task was migrated during wakeup
@@ -2205,6 +2229,7 @@ extern const u32 sched_prio_to_wmult[40];
#define DEQUEUE_SAVE 0x02 /* Matches ENQUEUE_RESTORE */
#define DEQUEUE_MOVE 0x04 /* Matches ENQUEUE_MOVE */
#define DEQUEUE_NOCLOCK 0x08 /* Matches ENQUEUE_NOCLOCK */
+#define DEQUEUE_MIGRATING 0x100 /* Matches ENQUEUE_MIGRATING */
#define ENQUEUE_WAKEUP 0x01
#define ENQUEUE_RESTORE 0x02
@@ -2219,6 +2244,7 @@ extern const u32 sched_prio_to_wmult[40];
#define ENQUEUE_MIGRATED 0x00
#endif
#define ENQUEUE_INITIAL 0x80
+#define ENQUEUE_MIGRATING 0x100
#define RETRY_TASK ((void *)-1UL)
@@ -2228,6 +2254,8 @@ struct affinity_context {
unsigned int flags;
};
+extern s64 update_curr_common(struct rq *rq);
+
struct sched_class {
#ifdef CONFIG_UCLAMP_TASK
@@ -2239,7 +2267,7 @@ struct sched_class {
void (*yield_task) (struct rq *rq);
bool (*yield_to_task)(struct rq *rq, struct task_struct *p);
- void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags);
+ void (*wakeup_preempt)(struct rq *rq, struct task_struct *p, int flags);
struct task_struct *(*pick_next_task)(struct rq *rq);
@@ -2441,8 +2469,7 @@ extern struct rt_bandwidth def_rt_bandwidth;
extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
-extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
-extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se);
+extern void init_dl_entity(struct sched_dl_entity *dl_se);
#define BW_SHIFT 20
#define BW_UNIT (1 << BW_SHIFT)
@@ -2513,7 +2540,7 @@ static inline void sub_nr_running(struct rq *rq, unsigned count)
extern void activate_task(struct rq *rq, struct task_struct *p, int flags);
extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags);
-extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
+extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags);
#ifdef CONFIG_PREEMPT_RT
#define SCHED_NR_MIGRATE_BREAK 8
@@ -2838,6 +2865,7 @@ DEFINE_LOCK_GUARD_2(double_rq_lock, struct rq,
double_rq_lock(_T->lock, _T->lock2),
double_rq_unlock(_T->lock, _T->lock2))
+extern struct sched_entity *__pick_root_entity(struct cfs_rq *cfs_rq);
extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq);
extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq);
@@ -2977,29 +3005,14 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
#endif
#ifdef CONFIG_SMP
-static inline unsigned long capacity_orig_of(int cpu)
-{
- return cpu_rq(cpu)->cpu_capacity_orig;
-}
+unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
+ unsigned long *min,
+ unsigned long *max);
-/**
- * enum cpu_util_type - CPU utilization type
- * @FREQUENCY_UTIL: Utilization used to select frequency
- * @ENERGY_UTIL: Utilization used during energy calculation
- *
- * The utilization signals of all scheduling classes (CFS/RT/DL) and IRQ time
- * need to be aggregated differently depending on the usage made of them. This
- * enum is used within effective_cpu_util() to differentiate the types of
- * utilization expected by the callers, and adjust the aggregation accordingly.
- */
-enum cpu_util_type {
- FREQUENCY_UTIL,
- ENERGY_UTIL,
-};
+unsigned long sugov_effective_cpu_perf(int cpu, unsigned long actual,
+ unsigned long min,
+ unsigned long max);
-unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
- enum cpu_util_type type,
- struct task_struct *p);
/*
* Verify the fitness of task @p to run on @cpu taking into account the
@@ -3056,59 +3069,6 @@ static inline bool uclamp_rq_is_idle(struct rq *rq)
return rq->uclamp_flags & UCLAMP_FLAG_IDLE;
}
-/**
- * uclamp_rq_util_with - clamp @util with @rq and @p effective uclamp values.
- * @rq: The rq to clamp against. Must not be NULL.
- * @util: The util value to clamp.
- * @p: The task to clamp against. Can be NULL if you want to clamp
- * against @rq only.
- *
- * Clamps the passed @util to the max(@rq, @p) effective uclamp values.
- *
- * If sched_uclamp_used static key is disabled, then just return the util
- * without any clamping since uclamp aggregation at the rq level in the fast
- * path is disabled, rendering this operation a NOP.
- *
- * Use uclamp_eff_value() if you don't care about uclamp values at rq level. It
- * will return the correct effective uclamp value of the task even if the
- * static key is disabled.
- */
-static __always_inline
-unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util,
- struct task_struct *p)
-{
- unsigned long min_util = 0;
- unsigned long max_util = 0;
-
- if (!static_branch_likely(&sched_uclamp_used))
- return util;
-
- if (p) {
- min_util = uclamp_eff_value(p, UCLAMP_MIN);
- max_util = uclamp_eff_value(p, UCLAMP_MAX);
-
- /*
- * Ignore last runnable task's max clamp, as this task will
- * reset it. Similarly, no need to read the rq's min clamp.
- */
- if (uclamp_rq_is_idle(rq))
- goto out;
- }
-
- min_util = max_t(unsigned long, min_util, uclamp_rq_get(rq, UCLAMP_MIN));
- max_util = max_t(unsigned long, max_util, uclamp_rq_get(rq, UCLAMP_MAX));
-out:
- /*
- * Since CPU's {min,max}_util clamps are MAX aggregated considering
- * RUNNABLE tasks with _different_ clamps, we can end up with an
- * inversion. Fix it now when the clamps are applied.
- */
- if (unlikely(min_util >= max_util))
- return min_util;
-
- return clamp(util, min_util, max_util);
-}
-
/* Is the rq being capped/throttled by uclamp_max? */
static inline bool uclamp_rq_is_capped(struct rq *rq)
{
@@ -3146,13 +3106,6 @@ static inline unsigned long uclamp_eff_value(struct task_struct *p,
return SCHED_CAPACITY_SCALE;
}
-static inline
-unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util,
- struct task_struct *p)
-{
- return util;
-}
-
static inline bool uclamp_rq_is_capped(struct rq *rq) { return false; }
static inline bool uclamp_is_used(void)
@@ -3219,6 +3172,8 @@ static inline bool sched_energy_enabled(void)
return static_branch_unlikely(&sched_energy_present);
}
+extern struct cpufreq_governor schedutil_gov;
+
#else /* ! (CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */
#define perf_domain_span(pd) NULL
@@ -3280,16 +3235,6 @@ extern int sched_dynamic_mode(const char *str);
extern void sched_dynamic_update(int mode);
#endif
-static inline void update_current_exec_runtime(struct task_struct *curr,
- u64 now, u64 delta_exec)
-{
- curr->se.sum_exec_runtime += delta_exec;
- account_group_exec_runtime(curr, delta_exec);
-
- curr->se.exec_start = now;
- cgroup_account_cputime(curr, delta_exec);
-}
-
#ifdef CONFIG_SCHED_MM_CID
#define SCHED_MM_CID_PERIOD_NS (100ULL * 1000000) /* 100ms */
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 85590599b4d6..b1b8fe61c532 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -23,7 +23,7 @@ balance_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
#endif /* CONFIG_SMP */
static void
-check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
+wakeup_preempt_stop(struct rq *rq, struct task_struct *p, int flags)
{
/* we're never preempted */
}
@@ -70,18 +70,7 @@ static void yield_task_stop(struct rq *rq)
static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
{
- struct task_struct *curr = rq->curr;
- u64 now, delta_exec;
-
- now = rq_clock_task(rq);
- delta_exec = now - curr->se.exec_start;
- if (unlikely((s64)delta_exec < 0))
- delta_exec = 0;
-
- schedstat_set(curr->stats.exec_max,
- max(curr->stats.exec_max, delta_exec));
-
- update_current_exec_runtime(curr, now, delta_exec);
+ update_curr_common(rq);
}
/*
@@ -120,7 +109,7 @@ DEFINE_SCHED_CLASS(stop) = {
.dequeue_task = dequeue_task_stop,
.yield_task = yield_task_stop,
- .check_preempt_curr = check_preempt_curr_stop,
+ .wakeup_preempt = wakeup_preempt_stop,
.pick_next_task = pick_next_task_stop,
.put_prev_task = put_prev_task_stop,
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 05a5bc678c08..10d1391e7416 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -212,6 +212,69 @@ static unsigned int sysctl_sched_energy_aware = 1;
static DEFINE_MUTEX(sched_energy_mutex);
static bool sched_energy_update;
+/*
+ * Check whether EAS can be used for the CPUs in @cpu_mask.
+ *
+ * Returns false (logging the reason when sched_debug() is set) if no CPU in
+ * the mask has an sd_asym_cpucapacity domain, SMT is active, frequency
+ * invariance is unavailable, or any CPU lacks a cpufreq policy or is not
+ * governed by schedutil; returns true otherwise.
+ */
+static bool sched_is_eas_possible(const struct cpumask *cpu_mask)
+{
+ bool any_asym_capacity = false;
+ struct cpufreq_policy *policy;
+ struct cpufreq_governor *gov;
+ int i;
+
+ /* EAS is enabled for asymmetric CPU capacity topologies. */
+ for_each_cpu(i, cpu_mask) {
+ if (rcu_access_pointer(per_cpu(sd_asym_cpucapacity, i))) {
+ any_asym_capacity = true;
+ break;
+ }
+ }
+ if (!any_asym_capacity) {
+ if (sched_debug()) {
+ pr_info("rd %*pbl: Checking EAS, CPUs do not have asymmetric capacities\n",
+ cpumask_pr_args(cpu_mask));
+ }
+ return false;
+ }
+
+ /* EAS definitely does *not* handle SMT */
+ if (sched_smt_active()) {
+ if (sched_debug()) {
+ pr_info("rd %*pbl: Checking EAS, SMT is not supported\n",
+ cpumask_pr_args(cpu_mask));
+ }
+ return false;
+ }
+
+ if (!arch_scale_freq_invariant()) {
+ if (sched_debug()) {
+ pr_info("rd %*pbl: Checking EAS: frequency-invariant load tracking not yet supported\n",
+ cpumask_pr_args(cpu_mask));
+ }
+ return false;
+ }
+
+ /* Do not attempt EAS if schedutil is not being used. */
+ for_each_cpu(i, cpu_mask) {
+ policy = cpufreq_cpu_get(i);
+ if (!policy) {
+ if (sched_debug()) {
+ pr_info("rd %*pbl: Checking EAS, cpufreq policy not set for CPU: %d\n",
+ cpumask_pr_args(cpu_mask), i);
+ }
+ return false;
+ }
+ /* Only the governor pointer is compared below, so the policy reference can be dropped first. */
+ gov = policy->governor;
+ cpufreq_cpu_put(policy);
+ if (gov != &schedutil_gov) {
+ if (sched_debug()) {
+ pr_info("rd %*pbl: Checking EAS, schedutil is mandatory\n",
+ cpumask_pr_args(cpu_mask));
+ }
+ return false;
+ }
+ }
+
+ return true;
+}
+
void rebuild_sched_domains_energy(void)
{
mutex_lock(&sched_energy_mutex);
@@ -230,6 +293,15 @@ static int sched_energy_aware_handler(struct ctl_table *table, int write,
if (write && !capable(CAP_SYS_ADMIN))
return -EPERM;
+ if (!sched_is_eas_possible(cpu_active_mask)) {
+ if (write) {
+ return -EOPNOTSUPP;
+ } else {
+ *lenp = 0;
+ return 0;
+ }
+ }
+
ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (!ret && write) {
state = static_branch_unlikely(&sched_energy_present);
@@ -348,103 +420,33 @@ static void sched_energy_set(bool has_eas)
* 1. an Energy Model (EM) is available;
* 2. the SD_ASYM_CPUCAPACITY flag is set in the sched_domain hierarchy.
* 3. no SMT is detected.
- * 4. the EM complexity is low enough to keep scheduling overheads low;
- * 5. schedutil is driving the frequency of all CPUs of the rd;
- * 6. frequency invariance support is present;
- *
- * The complexity of the Energy Model is defined as:
- *
- * C = nr_pd * (nr_cpus + nr_ps)
- *
- * with parameters defined as:
- * - nr_pd: the number of performance domains
- * - nr_cpus: the number of CPUs
- * - nr_ps: the sum of the number of performance states of all performance
- * domains (for example, on a system with 2 performance domains,
- * with 10 performance states each, nr_ps = 2 * 10 = 20).
- *
- * It is generally not a good idea to use such a model in the wake-up path on
- * very complex platforms because of the associated scheduling overheads. The
- * arbitrary constraint below prevents that. It makes EAS usable up to 16 CPUs
- * with per-CPU DVFS and less than 8 performance states each, for example.
+ * 4. schedutil is driving the frequency of all CPUs of the rd;
+ * 5. frequency invariance support is present;
*/
-#define EM_MAX_COMPLEXITY 2048
-
-extern struct cpufreq_governor schedutil_gov;
static bool build_perf_domains(const struct cpumask *cpu_map)
{
- int i, nr_pd = 0, nr_ps = 0, nr_cpus = cpumask_weight(cpu_map);
+ int i;
struct perf_domain *pd = NULL, *tmp;
int cpu = cpumask_first(cpu_map);
struct root_domain *rd = cpu_rq(cpu)->rd;
- struct cpufreq_policy *policy;
- struct cpufreq_governor *gov;
if (!sysctl_sched_energy_aware)
goto free;
- /* EAS is enabled for asymmetric CPU capacity topologies. */
- if (!per_cpu(sd_asym_cpucapacity, cpu)) {
- if (sched_debug()) {
- pr_info("rd %*pbl: CPUs do not have asymmetric capacities\n",
- cpumask_pr_args(cpu_map));
- }
+ if (!sched_is_eas_possible(cpu_map))
goto free;
- }
-
- /* EAS definitely does *not* handle SMT */
- if (sched_smt_active()) {
- pr_warn("rd %*pbl: Disabling EAS, SMT is not supported\n",
- cpumask_pr_args(cpu_map));
- goto free;
- }
-
- if (!arch_scale_freq_invariant()) {
- if (sched_debug()) {
- pr_warn("rd %*pbl: Disabling EAS: frequency-invariant load tracking not yet supported",
- cpumask_pr_args(cpu_map));
- }
- goto free;
- }
for_each_cpu(i, cpu_map) {
/* Skip already covered CPUs. */
if (find_pd(pd, i))
continue;
- /* Do not attempt EAS if schedutil is not being used. */
- policy = cpufreq_cpu_get(i);
- if (!policy)
- goto free;
- gov = policy->governor;
- cpufreq_cpu_put(policy);
- if (gov != &schedutil_gov) {
- if (rd->pd)
- pr_warn("rd %*pbl: Disabling EAS, schedutil is mandatory\n",
- cpumask_pr_args(cpu_map));
- goto free;
- }
-
/* Create the new pd and add it to the local list. */
tmp = pd_init(i);
if (!tmp)
goto free;
tmp->next = pd;
pd = tmp;
-
- /*
- * Count performance domains and performance states for the
- * complexity check.
- */
- nr_pd++;
- nr_ps += em_pd_nr_perf_states(pd->em_pd);
- }
-
- /* Bail out if the Energy Model complexity is too high. */
- if (nr_pd * (nr_ps + nr_cpus) > EM_MAX_COMPLEXITY) {
- WARN(1, "rd %*pbl: Failed to start EAS, EM complexity is too high\n",
- cpumask_pr_args(cpu_map));
- goto free;
}
perf_domain_debug(cpu_map, pd);
@@ -666,11 +668,14 @@ static void destroy_sched_domains(struct sched_domain *sd)
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
DEFINE_PER_CPU(int, sd_llc_size);
DEFINE_PER_CPU(int, sd_llc_id);
+DEFINE_PER_CPU(int, sd_share_id);
DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
+
DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
+DEFINE_STATIC_KEY_FALSE(sched_cluster_active);
static void update_top_cache_domain(int cpu)
{
@@ -691,6 +696,17 @@ static void update_top_cache_domain(int cpu)
per_cpu(sd_llc_id, cpu) = id;
rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
+ sd = lowest_flag_domain(cpu, SD_CLUSTER);
+ if (sd)
+ id = cpumask_first(sched_domain_span(sd));
+
+ /*
+ * This assignment should be placed after the sd_llc_id as
+ * we want this id to equal the cluster id on cluster machines
+ * but the LLC id on non-cluster machines.
+ */
+ per_cpu(sd_share_id, cpu) = id;
+
sd = lowest_flag_domain(cpu, SD_NUMA);
rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
@@ -1117,7 +1133,7 @@ fail:
*
* - Simultaneous multithreading (SMT)
* - Multi-Core Cache (MC)
- * - Package (DIE)
+ * - Package (PKG)
*
* Where the last one more or less denotes everything up to a NUMA node.
*
@@ -1139,13 +1155,13 @@ fail:
*
* CPU 0 1 2 3 4 5 6 7
*
- * DIE [ ]
+ * PKG [ ]
* MC [ ] [ ]
* SMT [ ] [ ] [ ] [ ]
*
* - or -
*
- * DIE 0-7 0-7 0-7 0-7 0-7 0-7 0-7 0-7
+ * PKG 0-7 0-7 0-7 0-7 0-7 0-7 0-7 0-7
* MC 0-3 0-3 0-3 0-3 4-7 4-7 4-7 4-7
* SMT 0-1 0-1 2-3 2-3 4-5 4-5 6-7 6-7
*
@@ -1548,6 +1564,7 @@ static struct cpumask ***sched_domains_numa_masks;
*/
#define TOPOLOGY_SD_FLAGS \
(SD_SHARE_CPUCAPACITY | \
+ SD_CLUSTER | \
SD_SHARE_PKG_RESOURCES | \
SD_NUMA | \
SD_ASYM_PACKING)
@@ -1679,7 +1696,7 @@ static struct sched_domain_topology_level default_topology[] = {
#ifdef CONFIG_SCHED_MC
{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
#endif
- { cpu_cpu_mask, SD_INIT_NAME(DIE) },
+ { cpu_cpu_mask, SD_INIT_NAME(PKG) },
{ NULL, },
};
@@ -2112,22 +2129,31 @@ static int hop_cmp(const void *a, const void *b)
return -1;
}
-/*
- * sched_numa_find_nth_cpu() - given the NUMA topology, find the Nth next cpu
- * closest to @cpu from @cpumask.
- * cpumask: cpumask to find a cpu from
- * cpu: Nth cpu to find
- *
- * returns: cpu, or nr_cpu_ids when nothing found.
+/**
+ * sched_numa_find_nth_cpu() - given the NUMA topology, find the Nth closest CPU
+ * from @cpus to @cpu, taking into account distance
+ * from a given @node.
+ * @cpus: cpumask to find a cpu from
+ * @cpu: CPU to start searching
+ * @node: NUMA node to order CPUs by distance
+ *
+ * Return: cpu, or nr_cpu_ids when nothing found.
*/
int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node)
{
- struct __cmp_key k = { .cpus = cpus, .node = node, .cpu = cpu };
+ struct __cmp_key k = { .cpus = cpus, .cpu = cpu };
struct cpumask ***hop_masks;
int hop, ret = nr_cpu_ids;
+ if (node == NUMA_NO_NODE)
+ return cpumask_nth_and(cpu, cpus, cpu_online_mask);
+
rcu_read_lock();
+ /* CPU-less node entries are uninitialized in sched_domains_numa_masks */
+ node = numa_nearest_node(node, N_CPU);
+ k.node = node;
+
k.masks = rcu_dereference(sched_domains_numa_masks);
if (!k.masks)
goto unlock;
@@ -2362,6 +2388,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
struct rq *rq = NULL;
int i, ret = -ENOMEM;
bool has_asym = false;
+ bool has_cluster = false;
if (WARN_ON(cpumask_empty(cpu_map)))
goto error;
@@ -2479,20 +2506,29 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
/* Attach the domains */
rcu_read_lock();
for_each_cpu(i, cpu_map) {
+ unsigned long capacity;
+
rq = cpu_rq(i);
sd = *per_cpu_ptr(d.sd, i);
+ capacity = arch_scale_cpu_capacity(i);
/* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */
- if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity))
- WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig);
+ if (capacity > READ_ONCE(d.rd->max_cpu_capacity))
+ WRITE_ONCE(d.rd->max_cpu_capacity, capacity);
cpu_attach_domain(sd, d.rd, i);
+
+ if (lowest_flag_domain(i, SD_CLUSTER))
+ has_cluster = true;
}
rcu_read_unlock();
if (has_asym)
static_branch_inc_cpuslocked(&sched_asym_cpucapacity);
+ if (has_cluster)
+ static_branch_inc_cpuslocked(&sched_cluster_active);
+
if (rq && sched_debug_verbose) {
pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n",
cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
@@ -2592,6 +2628,9 @@ static void detach_destroy_domains(const struct cpumask *cpu_map)
if (rcu_access_pointer(per_cpu(sd_asym_cpucapacity, cpu)))
static_branch_dec_cpuslocked(&sched_asym_cpucapacity);
+ if (static_branch_unlikely(&sched_cluster_active))
+ static_branch_dec_cpuslocked(&sched_cluster_active);
+
rcu_read_lock();
for_each_cpu(i, cpu_map)
cpu_attach_domain(NULL, &def_root_domain, i);
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 802d98cf2de3..51e38f5f4701 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -58,13 +58,6 @@ void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry
EXPORT_SYMBOL(remove_wait_queue);
/*
- * Scan threshold to break wait queue walk.
- * This allows a waker to take a break from holding the
- * wait queue lock during the wait queue walk.
- */
-#define WAITQUEUE_WALK_BREAK_CNT 64
-
-/*
* The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
* wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
* number) then we wake that number of exclusive tasks, and potentially all
@@ -78,21 +71,13 @@ EXPORT_SYMBOL(remove_wait_queue);
* zero in this (rare) case, and we handle it by continuing to scan the queue.
*/
static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
- int nr_exclusive, int wake_flags, void *key,
- wait_queue_entry_t *bookmark)
+ int nr_exclusive, int wake_flags, void *key)
{
wait_queue_entry_t *curr, *next;
- int cnt = 0;
lockdep_assert_held(&wq_head->lock);
- if (bookmark && (bookmark->flags & WQ_FLAG_BOOKMARK)) {
- curr = list_next_entry(bookmark, entry);
-
- list_del(&bookmark->entry);
- bookmark->flags = 0;
- } else
- curr = list_first_entry(&wq_head->head, wait_queue_entry_t, entry);
+ curr = list_first_entry(&wq_head->head, wait_queue_entry_t, entry);
if (&curr->entry == &wq_head->head)
return nr_exclusive;
@@ -101,21 +86,11 @@ static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
unsigned flags = curr->flags;
int ret;
- if (flags & WQ_FLAG_BOOKMARK)
- continue;
-
ret = curr->func(curr, mode, wake_flags, key);
if (ret < 0)
break;
if (ret && (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
break;
-
- if (bookmark && (++cnt > WAITQUEUE_WALK_BREAK_CNT) &&
- (&next->entry != &wq_head->head)) {
- bookmark->flags = WQ_FLAG_BOOKMARK;
- list_add_tail(&bookmark->entry, &next->entry);
- break;
- }
}
return nr_exclusive;
@@ -125,20 +100,12 @@ static int __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int m
int nr_exclusive, int wake_flags, void *key)
{
unsigned long flags;
- wait_queue_entry_t bookmark;
- int remaining = nr_exclusive;
+ int remaining;
- bookmark.flags = 0;
- bookmark.private = NULL;
- bookmark.func = NULL;
- INIT_LIST_HEAD(&bookmark.entry);
-
- do {
- spin_lock_irqsave(&wq_head->lock, flags);
- remaining = __wake_up_common(wq_head, mode, remaining,
- wake_flags, key, &bookmark);
- spin_unlock_irqrestore(&wq_head->lock, flags);
- } while (bookmark.flags & WQ_FLAG_BOOKMARK);
+ spin_lock_irqsave(&wq_head->lock, flags);
+ remaining = __wake_up_common(wq_head, mode, nr_exclusive, wake_flags,
+ key);
+ spin_unlock_irqrestore(&wq_head->lock, flags);
return nr_exclusive - remaining;
}
@@ -171,23 +138,16 @@ void __wake_up_on_current_cpu(struct wait_queue_head *wq_head, unsigned int mode
*/
void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr)
{
- __wake_up_common(wq_head, mode, nr, 0, NULL, NULL);
+ __wake_up_common(wq_head, mode, nr, 0, NULL);
}
EXPORT_SYMBOL_GPL(__wake_up_locked);
void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key)
{
- __wake_up_common(wq_head, mode, 1, 0, key, NULL);
+ __wake_up_common(wq_head, mode, 1, 0, key);
}
EXPORT_SYMBOL_GPL(__wake_up_locked_key);
-void __wake_up_locked_key_bookmark(struct wait_queue_head *wq_head,
- unsigned int mode, void *key, wait_queue_entry_t *bookmark)
-{
- __wake_up_common(wq_head, mode, 1, 0, key, bookmark);
-}
-EXPORT_SYMBOL_GPL(__wake_up_locked_key_bookmark);
-
/**
* __wake_up_sync_key - wake up threads blocked on a waitqueue.
* @wq_head: the waitqueue
@@ -233,7 +193,7 @@ EXPORT_SYMBOL_GPL(__wake_up_sync_key);
void __wake_up_locked_sync_key(struct wait_queue_head *wq_head,
unsigned int mode, void *key)
{
- __wake_up_common(wq_head, mode, 1, WF_SYNC, key, NULL);
+ __wake_up_common(wq_head, mode, 1, WF_SYNC, key);
}
EXPORT_SYMBOL_GPL(__wake_up_locked_sync_key);
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 255999ba9190..aca7b437882e 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -1072,7 +1072,7 @@ static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd, struct seccomp_kn
*/
list_del_init(&addfd->list);
if (!addfd->setfd)
- fd = receive_fd(addfd->file, addfd->flags);
+ fd = receive_fd(addfd->file, NULL, addfd->flags);
else
fd = receive_fd_replace(addfd->fd, addfd->file, addfd->flags);
addfd->ret = fd;
diff --git a/kernel/signal.c b/kernel/signal.c
index 09019017d669..c9c57d053ce4 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -171,16 +171,6 @@ static bool recalc_sigpending_tsk(struct task_struct *t)
return false;
}
-/*
- * After recalculating TIF_SIGPENDING, we need to make sure the task wakes up.
- * This is superfluous when called on current, the wakeup is a harmless no-op.
- */
-void recalc_sigpending_and_wake(struct task_struct *t)
-{
- if (recalc_sigpending_tsk(t))
- signal_wake_up(t, 0);
-}
-
void recalc_sigpending(void)
{
if (!recalc_sigpending_tsk(current) && !freezing(current))
@@ -415,7 +405,7 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t gfp_flags,
int override_rlimit, const unsigned int sigqueue_flags)
{
struct sigqueue *q = NULL;
- struct ucounts *ucounts = NULL;
+ struct ucounts *ucounts;
long sigpending;
/*
@@ -1058,12 +1048,11 @@ static void complete_signal(int sig, struct task_struct *p, enum pid_type type)
signal->flags = SIGNAL_GROUP_EXIT;
signal->group_exit_code = sig;
signal->group_stop_count = 0;
- t = p;
- do {
+ __for_each_thread(signal, t) {
task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
sigaddset(&t->pending.signal, SIGKILL);
signal_wake_up(t, 1);
- } while_each_thread(p, t);
+ }
return;
}
}
@@ -1349,10 +1338,8 @@ force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t,
action->sa.sa_handler = SIG_DFL;
if (handler == HANDLER_EXIT)
action->sa.sa_flags |= SA_IMMUTABLE;
- if (blocked) {
+ if (blocked)
sigdelset(&t->blocked, sig);
- recalc_sigpending_and_wake(t);
- }
}
/*
* Don't clear SIGNAL_UNKILLABLE for traced tasks, users won't expect
@@ -1362,6 +1349,9 @@ force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t,
(!t->ptrace || (handler == HANDLER_EXIT)))
t->signal->flags &= ~SIGNAL_UNKILLABLE;
ret = send_signal_locked(sig, info, t, PIDTYPE_PID);
+ /* This can happen if the signal was already pending and blocked */
+ if (!task_sigpending(t))
+ signal_wake_up(t, 0);
spin_unlock_irqrestore(&t->sighand->siglock, flags);
return ret;
@@ -1377,12 +1367,12 @@ int force_sig_info(struct kernel_siginfo *info)
*/
int zap_other_threads(struct task_struct *p)
{
- struct task_struct *t = p;
+ struct task_struct *t;
int count = 0;
p->signal->group_stop_count = 0;
- while_each_thread(p, t) {
+ for_other_threads(p, t) {
task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
/* Don't require de_thread to wait for the vhost_worker */
if ((t->flags & (PF_IO_WORKER | PF_USER_WORKER)) != PF_USER_WORKER)
@@ -1471,16 +1461,21 @@ int group_send_sig_info(int sig, struct kernel_siginfo *info,
int __kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp)
{
struct task_struct *p = NULL;
- int retval, success;
+ int ret = -ESRCH;
- success = 0;
- retval = -ESRCH;
do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
int err = group_send_sig_info(sig, info, p, PIDTYPE_PGID);
- success |= !err;
- retval = err;
+ /*
+ * If group_send_sig_info() succeeds at least once ret
+ * becomes 0 and after that the code below has no effect.
+ * Otherwise we return the last err or -ESRCH if this
+ * process group is empty.
+ */
+ if (ret)
+ ret = err;
} while_each_pid_task(pgrp, PIDTYPE_PGID, p);
- return success ? 0 : retval;
+
+ return ret;
}
int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid)
@@ -1718,9 +1713,8 @@ void force_sigsegv(int sig)
force_sig(SIGSEGV);
}
-int force_sig_fault_to_task(int sig, int code, void __user *addr
- ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr)
- , struct task_struct *t)
+int force_sig_fault_to_task(int sig, int code, void __user *addr,
+ struct task_struct *t)
{
struct kernel_siginfo info;
@@ -1729,24 +1723,15 @@ int force_sig_fault_to_task(int sig, int code, void __user *addr
info.si_errno = 0;
info.si_code = code;
info.si_addr = addr;
-#ifdef __ia64__
- info.si_imm = imm;
- info.si_flags = flags;
- info.si_isr = isr;
-#endif
return force_sig_info_to_task(&info, t, HANDLER_CURRENT);
}
-int force_sig_fault(int sig, int code, void __user *addr
- ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr))
+int force_sig_fault(int sig, int code, void __user *addr)
{
- return force_sig_fault_to_task(sig, code, addr
- ___ARCH_SI_IA64(imm, flags, isr), current);
+ return force_sig_fault_to_task(sig, code, addr, current);
}
-int send_sig_fault(int sig, int code, void __user *addr
- ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr)
- , struct task_struct *t)
+int send_sig_fault(int sig, int code, void __user *addr, struct task_struct *t)
{
struct kernel_siginfo info;
@@ -1755,11 +1740,6 @@ int send_sig_fault(int sig, int code, void __user *addr
info.si_errno = 0;
info.si_code = code;
info.si_addr = addr;
-#ifdef __ia64__
- info.si_imm = imm;
- info.si_flags = flags;
- info.si_isr = isr;
-#endif
return send_sig_info(info.si_signo, &info, t);
}
@@ -2329,15 +2309,38 @@ static int ptrace_stop(int exit_code, int why, unsigned long message,
do_notify_parent_cldstop(current, false, why);
/*
- * Don't want to allow preemption here, because
- * sys_ptrace() needs this task to be inactive.
+ * The previous do_notify_parent_cldstop() invocation woke ptracer.
+ * On a PREEMPTION kernel this can result in preemption requirement
+ * which will be fulfilled after read_unlock() and the ptracer will be
+ * put on the CPU.
+ * The ptracer is in wait_task_inactive(, __TASK_TRACED) waiting for
+ * this task to wait in schedule(). If this task gets preempted then it
+ * remains enqueued on the runqueue. The ptracer will observe this and
+ * then sleep for a delay of one HZ tick. In the meantime this task
+ * gets scheduled, enters schedule() and will wait for the ptracer.
*
- * XXX: implement read_unlock_no_resched().
+ * This preemption point is not bad from a correctness point of
+ * view but extends the runtime by one HZ tick time due to the
+ * ptracer's sleep. The preempt-disable section ensures that there
+ * will be no preemption between unlock and schedule() and so
+ * improving the performance since the ptracer will observe that
+ * the tracee is scheduled out once it gets on the CPU.
+ *
+ * On PREEMPT_RT locking tasklist_lock does not disable preemption.
+ * Therefore the task can be preempted after do_notify_parent_cldstop()
+ * before unlocking tasklist_lock so there is no benefit in doing this.
+ *
+ * In fact disabling preemption is harmful on PREEMPT_RT because
+ * the spinlock_t in cgroup_enter_frozen() must not be acquired
+ * with preemption disabled due to the 'sleeping' spinlock
+ * substitution of RT.
*/
- preempt_disable();
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ preempt_disable();
read_unlock(&tasklist_lock);
cgroup_enter_frozen();
- preempt_enable_no_resched();
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ preempt_enable_no_resched();
schedule();
cgroup_leave_frozen(true);
@@ -2453,12 +2456,10 @@ static bool do_signal_stop(int signr)
sig->group_exit_code = signr;
sig->group_stop_count = 0;
-
if (task_set_jobctl_pending(current, signr | gstop))
sig->group_stop_count++;
- t = current;
- while_each_thread(current, t) {
+ for_other_threads(current, t) {
/*
* Setting state to TASK_STOPPED for a group
* stop is always done with the siglock held,
@@ -2954,8 +2955,7 @@ static void retarget_shared_pending(struct task_struct *tsk, sigset_t *which)
if (sigisemptyset(&retarget))
return;
- t = tsk;
- while_each_thread(tsk, t) {
+ for_other_threads(tsk, t) {
if (t->flags & PF_EXITING)
continue;
diff --git a/kernel/smp.c b/kernel/smp.c
index 8455a53465af..f085ebcdf9e7 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -127,7 +127,7 @@ send_call_function_ipi_mask(struct cpumask *mask)
}
static __always_inline void
-csd_do_func(smp_call_func_t func, void *info, struct __call_single_data *csd)
+csd_do_func(smp_call_func_t func, void *info, call_single_data_t *csd)
{
trace_csd_function_entry(func, csd);
func(info);
@@ -170,11 +170,13 @@ static DEFINE_PER_CPU(void *, cur_csd_info);
static ulong csd_lock_timeout = 5000; /* CSD lock timeout in milliseconds. */
module_param(csd_lock_timeout, ulong, 0444);
+static int panic_on_ipistall; /* CSD panic timeout in milliseconds, 300000 for five minutes. */
+module_param(panic_on_ipistall, int, 0444);
static atomic_t csd_bug_count = ATOMIC_INIT(0);
/* Record current CSD work for current CPU, NULL to erase. */
-static void __csd_lock_record(struct __call_single_data *csd)
+static void __csd_lock_record(call_single_data_t *csd)
{
if (!csd) {
smp_mb(); /* NULL cur_csd after unlock. */
@@ -189,13 +191,13 @@ static void __csd_lock_record(struct __call_single_data *csd)
/* Or before unlock, as the case may be. */
}
-static __always_inline void csd_lock_record(struct __call_single_data *csd)
+static __always_inline void csd_lock_record(call_single_data_t *csd)
{
if (static_branch_unlikely(&csdlock_debug_enabled))
__csd_lock_record(csd);
}
-static int csd_lock_wait_getcpu(struct __call_single_data *csd)
+static int csd_lock_wait_getcpu(call_single_data_t *csd)
{
unsigned int csd_type;
@@ -210,7 +212,7 @@ static int csd_lock_wait_getcpu(struct __call_single_data *csd)
* the CSD_TYPE_SYNC/ASYNC types provide the destination CPU,
* so waiting on other types gets much less information.
*/
-static bool csd_lock_wait_toolong(struct __call_single_data *csd, u64 ts0, u64 *ts1, int *bug_id)
+static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, int *bug_id)
{
int cpu = -1;
int cpux;
@@ -230,6 +232,7 @@ static bool csd_lock_wait_toolong(struct __call_single_data *csd, u64 ts0, u64 *
}
ts2 = sched_clock();
+ /* How long since we last checked for a stuck CSD lock. */
ts_delta = ts2 - *ts1;
if (likely(ts_delta <= csd_lock_timeout_ns || csd_lock_timeout_ns == 0))
return false;
@@ -243,9 +246,17 @@ static bool csd_lock_wait_toolong(struct __call_single_data *csd, u64 ts0, u64 *
else
cpux = cpu;
cpu_cur_csd = smp_load_acquire(&per_cpu(cur_csd, cpux)); /* Before func and info. */
+ /* How long since this CSD lock was stuck. */
+ ts_delta = ts2 - ts0;
pr_alert("csd: %s non-responsive CSD lock (#%d) on CPU#%d, waiting %llu ns for CPU#%02d %pS(%ps).\n",
- firsttime ? "Detected" : "Continued", *bug_id, raw_smp_processor_id(), ts2 - ts0,
+ firsttime ? "Detected" : "Continued", *bug_id, raw_smp_processor_id(), ts_delta,
cpu, csd->func, csd->info);
+ /*
+ * If the CSD lock has been stuck for more than panic_on_ipistall ms,
+ * it is unlikely to become unstuck. Use a signed comparison to avoid
+ * triggering on underflows when the TSC is out of sync between sockets.
+ */
+ BUG_ON(panic_on_ipistall > 0 && (s64)ts_delta > ((s64)panic_on_ipistall * NSEC_PER_MSEC));
if (cpu_cur_csd && csd != cpu_cur_csd) {
pr_alert("\tcsd: CSD lock (#%d) handling prior %pS(%ps) request.\n",
*bug_id, READ_ONCE(per_cpu(cur_csd_func, cpux)),
@@ -276,7 +287,7 @@ static bool csd_lock_wait_toolong(struct __call_single_data *csd, u64 ts0, u64 *
* previous function call. For multi-cpu calls its even more interesting
* as we'll have to ensure no other cpu is observing our csd.
*/
-static void __csd_lock_wait(struct __call_single_data *csd)
+static void __csd_lock_wait(call_single_data_t *csd)
{
int bug_id = 0;
u64 ts0, ts1;
@@ -290,7 +301,7 @@ static void __csd_lock_wait(struct __call_single_data *csd)
smp_acquire__after_ctrl_dep();
}
-static __always_inline void csd_lock_wait(struct __call_single_data *csd)
+static __always_inline void csd_lock_wait(call_single_data_t *csd)
{
if (static_branch_unlikely(&csdlock_debug_enabled)) {
__csd_lock_wait(csd);
@@ -300,17 +311,17 @@ static __always_inline void csd_lock_wait(struct __call_single_data *csd)
smp_cond_load_acquire(&csd->node.u_flags, !(VAL & CSD_FLAG_LOCK));
}
#else
-static void csd_lock_record(struct __call_single_data *csd)
+static void csd_lock_record(call_single_data_t *csd)
{
}
-static __always_inline void csd_lock_wait(struct __call_single_data *csd)
+static __always_inline void csd_lock_wait(call_single_data_t *csd)
{
smp_cond_load_acquire(&csd->node.u_flags, !(VAL & CSD_FLAG_LOCK));
}
#endif
-static __always_inline void csd_lock(struct __call_single_data *csd)
+static __always_inline void csd_lock(call_single_data_t *csd)
{
csd_lock_wait(csd);
csd->node.u_flags |= CSD_FLAG_LOCK;
@@ -323,7 +334,7 @@ static __always_inline void csd_lock(struct __call_single_data *csd)
smp_wmb();
}
-static __always_inline void csd_unlock(struct __call_single_data *csd)
+static __always_inline void csd_unlock(call_single_data_t *csd)
{
WARN_ON(!(csd->node.u_flags & CSD_FLAG_LOCK));
@@ -376,7 +387,7 @@ void __smp_call_single_queue(int cpu, struct llist_node *node)
* for execution on the given CPU. data must already have
* ->func, ->info, and ->flags set.
*/
-static int generic_exec_single(int cpu, struct __call_single_data *csd)
+static int generic_exec_single(int cpu, call_single_data_t *csd)
{
if (cpu == smp_processor_id()) {
smp_call_func_t func = csd->func;
@@ -667,7 +678,7 @@ EXPORT_SYMBOL(smp_call_function_single);
*
* Return: %0 on success or negative errno value on error
*/
-int smp_call_function_single_async(int cpu, struct __call_single_data *csd)
+int smp_call_function_single_async(int cpu, call_single_data_t *csd)
{
int err = 0;
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index f47d8f375946..1992b62e980b 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -272,8 +272,7 @@ static void smpboot_destroy_threads(struct smp_hotplug_thread *ht)
struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
if (tsk) {
- kthread_stop(tsk);
- put_task_struct(tsk);
+ kthread_stop_put(tsk);
*per_cpu_ptr(ht->store, cpu) = NULL;
}
}
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index 9ed5ce989415..afb3c116da91 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -126,7 +126,7 @@ EXPORT_SYMBOL_GPL(stack_trace_save);
/**
* stack_trace_save_tsk - Save a task stack trace into a storage array
- * @task: The task to examine
+ * @tsk: The task to examine
* @store: Pointer to storage array
* @size: Size of the storage array
* @skipnr: Number of entries to skip at the start of the stack trace
@@ -151,6 +151,7 @@ unsigned int stack_trace_save_tsk(struct task_struct *tsk, unsigned long *store,
put_task_stack(tsk);
return c.len;
}
+EXPORT_SYMBOL_GPL(stack_trace_save_tsk);
/**
* stack_trace_save_regs - Save a stack trace based on pt_regs into a storage array
@@ -301,6 +302,7 @@ unsigned int stack_trace_save_tsk(struct task_struct *task,
save_stack_trace_tsk(task, &trace);
return trace.nr_entries;
}
+EXPORT_SYMBOL_GPL(stack_trace_save_tsk);
/**
* stack_trace_save_regs - Save a stack trace based on pt_regs into a storage array
diff --git a/kernel/sys.c b/kernel/sys.c
index 2410e3999ebe..f8e543f1e38a 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1785,74 +1785,87 @@ void getrusage(struct task_struct *p, int who, struct rusage *r)
struct task_struct *t;
unsigned long flags;
u64 tgutime, tgstime, utime, stime;
- unsigned long maxrss = 0;
+ unsigned long maxrss;
+ struct mm_struct *mm;
+ struct signal_struct *sig = p->signal;
+ unsigned int seq = 0;
- memset((char *)r, 0, sizeof (*r));
+retry:
+ memset(r, 0, sizeof(*r));
utime = stime = 0;
+ maxrss = 0;
if (who == RUSAGE_THREAD) {
task_cputime_adjusted(current, &utime, &stime);
accumulate_thread_rusage(p, r);
- maxrss = p->signal->maxrss;
- goto out;
+ maxrss = sig->maxrss;
+ goto out_thread;
}
- if (!lock_task_sighand(p, &flags))
- return;
+ flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
switch (who) {
case RUSAGE_BOTH:
case RUSAGE_CHILDREN:
- utime = p->signal->cutime;
- stime = p->signal->cstime;
- r->ru_nvcsw = p->signal->cnvcsw;
- r->ru_nivcsw = p->signal->cnivcsw;
- r->ru_minflt = p->signal->cmin_flt;
- r->ru_majflt = p->signal->cmaj_flt;
- r->ru_inblock = p->signal->cinblock;
- r->ru_oublock = p->signal->coublock;
- maxrss = p->signal->cmaxrss;
+ utime = sig->cutime;
+ stime = sig->cstime;
+ r->ru_nvcsw = sig->cnvcsw;
+ r->ru_nivcsw = sig->cnivcsw;
+ r->ru_minflt = sig->cmin_flt;
+ r->ru_majflt = sig->cmaj_flt;
+ r->ru_inblock = sig->cinblock;
+ r->ru_oublock = sig->coublock;
+ maxrss = sig->cmaxrss;
if (who == RUSAGE_CHILDREN)
break;
fallthrough;
case RUSAGE_SELF:
- thread_group_cputime_adjusted(p, &tgutime, &tgstime);
- utime += tgutime;
- stime += tgstime;
- r->ru_nvcsw += p->signal->nvcsw;
- r->ru_nivcsw += p->signal->nivcsw;
- r->ru_minflt += p->signal->min_flt;
- r->ru_majflt += p->signal->maj_flt;
- r->ru_inblock += p->signal->inblock;
- r->ru_oublock += p->signal->oublock;
- if (maxrss < p->signal->maxrss)
- maxrss = p->signal->maxrss;
- t = p;
- do {
+ r->ru_nvcsw += sig->nvcsw;
+ r->ru_nivcsw += sig->nivcsw;
+ r->ru_minflt += sig->min_flt;
+ r->ru_majflt += sig->maj_flt;
+ r->ru_inblock += sig->inblock;
+ r->ru_oublock += sig->oublock;
+ if (maxrss < sig->maxrss)
+ maxrss = sig->maxrss;
+
+ rcu_read_lock();
+ __for_each_thread(sig, t)
accumulate_thread_rusage(t, r);
- } while_each_thread(p, t);
+ rcu_read_unlock();
+
break;
default:
BUG();
}
- unlock_task_sighand(p, &flags);
-out:
- r->ru_utime = ns_to_kernel_old_timeval(utime);
- r->ru_stime = ns_to_kernel_old_timeval(stime);
+ if (need_seqretry(&sig->stats_lock, seq)) {
+ seq = 1;
+ goto retry;
+ }
+ done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
- if (who != RUSAGE_CHILDREN) {
- struct mm_struct *mm = get_task_mm(p);
+ if (who == RUSAGE_CHILDREN)
+ goto out_children;
- if (mm) {
- setmax_mm_hiwater_rss(&maxrss, mm);
- mmput(mm);
- }
+ thread_group_cputime_adjusted(p, &tgutime, &tgstime);
+ utime += tgutime;
+ stime += tgstime;
+
+out_thread:
+ mm = get_task_mm(p);
+ if (mm) {
+ setmax_mm_hiwater_rss(&maxrss, mm);
+ mmput(mm);
}
+
+out_children:
r->ru_maxrss = maxrss * (PAGE_SIZE / 1024); /* convert pages to KBs */
+ r->ru_utime = ns_to_kernel_old_timeval(utime);
+ r->ru_stime = ns_to_kernel_old_timeval(stime);
}
SYSCALL_DEFINE2(getrusage, int, who, struct rusage __user *, ru)
@@ -2368,19 +2381,45 @@ static int prctl_set_vma(unsigned long opt, unsigned long start,
}
#endif /* CONFIG_ANON_VMA_NAME */
+static inline unsigned long get_current_mdwe(void)
+{
+ unsigned long ret = 0;
+
+ if (test_bit(MMF_HAS_MDWE, &current->mm->flags))
+ ret |= PR_MDWE_REFUSE_EXEC_GAIN;
+ if (test_bit(MMF_HAS_MDWE_NO_INHERIT, &current->mm->flags))
+ ret |= PR_MDWE_NO_INHERIT;
+
+ return ret;
+}
+
static inline int prctl_set_mdwe(unsigned long bits, unsigned long arg3,
unsigned long arg4, unsigned long arg5)
{
+ unsigned long current_bits;
+
if (arg3 || arg4 || arg5)
return -EINVAL;
- if (bits & ~(PR_MDWE_REFUSE_EXEC_GAIN))
+ if (bits & ~(PR_MDWE_REFUSE_EXEC_GAIN | PR_MDWE_NO_INHERIT))
return -EINVAL;
+ /* NO_INHERIT only makes sense with REFUSE_EXEC_GAIN */
+ if (bits & PR_MDWE_NO_INHERIT && !(bits & PR_MDWE_REFUSE_EXEC_GAIN))
+ return -EINVAL;
+
+ /* PARISC cannot allow mdwe as it needs writable stacks */
+ if (IS_ENABLED(CONFIG_PARISC))
+ return -EINVAL;
+
+ current_bits = get_current_mdwe();
+ if (current_bits && current_bits != bits)
+ return -EPERM; /* Cannot unset the flags */
+
+ if (bits & PR_MDWE_NO_INHERIT)
+ set_bit(MMF_HAS_MDWE_NO_INHERIT, &current->mm->flags);
if (bits & PR_MDWE_REFUSE_EXEC_GAIN)
set_bit(MMF_HAS_MDWE, &current->mm->flags);
- else if (test_bit(MMF_HAS_MDWE, &current->mm->flags))
- return -EPERM; /* Cannot unset the flag */
return 0;
}
@@ -2390,9 +2429,7 @@ static inline int prctl_get_mdwe(unsigned long arg2, unsigned long arg3,
{
if (arg2 || arg3 || arg4 || arg5)
return -EINVAL;
-
- return test_bit(MMF_HAS_MDWE, &current->mm->flags) ?
- PR_MDWE_REFUSE_EXEC_GAIN : 0;
+ return get_current_mdwe();
}
static int prctl_get_auxv(void __user *addr, unsigned long len)
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index e137c1385c56..faad00cce269 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -51,8 +51,6 @@ COND_SYSCALL_COMPAT(io_pgetevents);
COND_SYSCALL(io_uring_setup);
COND_SYSCALL(io_uring_enter);
COND_SYSCALL(io_uring_register);
-COND_SYSCALL(lookup_dcookie);
-COND_SYSCALL_COMPAT(lookup_dcookie);
COND_SYSCALL(eventfd2);
COND_SYSCALL(epoll_create1);
COND_SYSCALL(epoll_ctl);
@@ -87,6 +85,9 @@ COND_SYSCALL_COMPAT(set_robust_list);
COND_SYSCALL(get_robust_list);
COND_SYSCALL_COMPAT(get_robust_list);
COND_SYSCALL(futex_waitv);
+COND_SYSCALL(futex_wake);
+COND_SYSCALL(futex_wait);
+COND_SYSCALL(futex_requeue);
COND_SYSCALL(kexec_load);
COND_SYSCALL_COMPAT(kexec_load);
COND_SYSCALL(init_module);
@@ -170,6 +171,9 @@ COND_SYSCALL(landlock_add_rule);
COND_SYSCALL(landlock_restrict_self);
COND_SYSCALL(fadvise64_64);
COND_SYSCALL_COMPAT(fadvise64_64);
+COND_SYSCALL(lsm_get_self_attr);
+COND_SYSCALL(lsm_set_self_attr);
+COND_SYSCALL(lsm_list_modules);
/* CONFIG_MMU only */
COND_SYSCALL(swapon);
@@ -200,6 +204,20 @@ COND_SYSCALL(recvmmsg_time32);
COND_SYSCALL_COMPAT(recvmmsg_time32);
COND_SYSCALL_COMPAT(recvmmsg_time64);
+/* Posix timer syscalls may be configured out */
+COND_SYSCALL(timer_create);
+COND_SYSCALL(timer_gettime);
+COND_SYSCALL(timer_getoverrun);
+COND_SYSCALL(timer_settime);
+COND_SYSCALL(timer_delete);
+COND_SYSCALL(clock_adjtime);
+COND_SYSCALL(getitimer);
+COND_SYSCALL(setitimer);
+COND_SYSCALL(alarm);
+COND_SYSCALL_COMPAT(timer_create);
+COND_SYSCALL_COMPAT(getitimer);
+COND_SYSCALL_COMPAT(setitimer);
+
/*
* Architecture specific syscalls: see further below
*/
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 354a2d294f52..157f7ce2942d 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1939,15 +1939,6 @@ static struct ctl_table kern_table[] = {
.proc_handler = proc_dointvec,
},
#endif
-#ifdef CONFIG_IA64
- {
- .procname = "unaligned-dump-stack",
- .data = &unaligned_dump_stack,
- .maxlen = sizeof (int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
-#endif
#ifdef CONFIG_RT_MUTEXES
{
.procname = "max_lock_depth",
@@ -1983,7 +1974,7 @@ static struct ctl_table kern_table[] = {
.data = &sysctl_perf_event_sample_rate,
.maxlen = sizeof(sysctl_perf_event_sample_rate),
.mode = 0644,
- .proc_handler = perf_proc_update_handler,
+ .proc_handler = perf_event_max_sample_rate_handler,
.extra1 = SYSCTL_ONE,
},
{
diff --git a/kernel/task_work.c b/kernel/task_work.c
index 065e1ef8fc8d..95a7e1b7f1da 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -78,6 +78,7 @@ int task_work_add(struct task_struct *task, struct callback_head *work,
* task_work_cancel_match - cancel a pending work added by task_work_add()
* @task: the task which should execute the work
* @match: match function to call
+ * @data: data to be passed in to match function
*
* RETURNS:
* The found work or NULL if not found.
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 8ce3fa0c19e2..4354ea231fab 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -233,9 +233,8 @@ static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats)
else
memset(stats, 0, sizeof(*stats));
- tsk = first;
start_time = ktime_get_ns();
- do {
+ for_each_thread(first, tsk) {
if (tsk->exit_state)
continue;
/*
@@ -258,7 +257,7 @@ static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats)
stats->nvcsw += tsk->nvcsw;
stats->nivcsw += tsk->nivcsw;
- } while_each_thread(first, tsk);
+ }
unlock_task_sighand(first, &flags);
rc = 0;
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 8d9f13d847f0..4657cb8e8b1f 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -290,6 +290,17 @@ static int alarmtimer_suspend(struct device *dev)
rtc_timer_cancel(rtc, &rtctimer);
rtc_read_time(rtc, &tm);
now = rtc_tm_to_ktime(tm);
+
+ /*
+ * If the RTC alarm timer only supports a limited time offset, set the
+ * alarm time to the maximum supported value.
+ * The system may wake up earlier (possibly much earlier) than expected
+ * when the alarmtimer runs. This is the best the kernel can do if
+ * the alarmtimer exceeds the time that the rtc device can be programmed
+ * for.
+ */
+ min = rtc_bound_alarmtime(rtc, min);
+
now = ktime_add(now, min);
/* Set alarm, if in the past reject suspend briefly to handle */
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index c108ed8a9804..3052b1f1168e 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -99,6 +99,7 @@ static u64 suspend_start;
* Interval: 0.5sec.
*/
#define WATCHDOG_INTERVAL (HZ >> 1)
+#define WATCHDOG_INTERVAL_MAX_NS ((2 * WATCHDOG_INTERVAL) * (NSEC_PER_SEC / HZ))
/*
* Threshold: 0.0312s, when doubled: 0.0625s.
@@ -134,6 +135,7 @@ static DECLARE_WORK(watchdog_work, clocksource_watchdog_work);
static DEFINE_SPINLOCK(watchdog_lock);
static int watchdog_running;
static atomic_t watchdog_reset_pending;
+static int64_t watchdog_max_interval;
static inline void clocksource_watchdog_lock(unsigned long *flags)
{
@@ -399,8 +401,8 @@ static inline void clocksource_reset_watchdog(void)
static void clocksource_watchdog(struct timer_list *unused)
{
u64 csnow, wdnow, cslast, wdlast, delta;
+ int64_t wd_nsec, cs_nsec, interval;
int next_cpu, reset_pending;
- int64_t wd_nsec, cs_nsec;
struct clocksource *cs;
enum wd_read_status read_ret;
unsigned long extra_wait = 0;
@@ -470,6 +472,27 @@ static void clocksource_watchdog(struct timer_list *unused)
if (atomic_read(&watchdog_reset_pending))
continue;
+ /*
+ * The processing of timer softirqs can get delayed (usually
+ * on account of ksoftirqd not getting to run in a timely
+ * manner), which causes the watchdog interval to stretch.
+ * Skew detection may fail for longer watchdog intervals
+ * on account of fixed margins being used.
+ * Some clocksources, e.g. acpi_pm, cannot tolerate
+ * watchdog intervals longer than a few seconds.
+ */
+ interval = max(cs_nsec, wd_nsec);
+ if (unlikely(interval > WATCHDOG_INTERVAL_MAX_NS)) {
+ if (system_state > SYSTEM_SCHEDULING &&
+ interval > 2 * watchdog_max_interval) {
+ watchdog_max_interval = interval;
+ pr_warn("Long readout interval, skipping watchdog check: cs_nsec: %lld wd_nsec: %lld\n",
+ cs_nsec, wd_nsec);
+ }
+ watchdog_timer.expires = jiffies;
+ continue;
+ }
+
/* Check the deviation from the watchdog clocksource. */
md = cs->uncertainty_margin + watchdog->uncertainty_margin;
if (abs(cs_nsec - wd_nsec) > md) {
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 238262e4aba7..edb0f821dcea 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1085,6 +1085,7 @@ static int enqueue_hrtimer(struct hrtimer *timer,
enum hrtimer_mode mode)
{
debug_activate(timer, mode);
+ WARN_ON_ONCE(!base->cpu_base->online);
base->cpu_base->active_bases |= 1 << base->index;
@@ -2183,6 +2184,7 @@ int hrtimers_prepare_cpu(unsigned int cpu)
cpu_base->softirq_next_timer = NULL;
cpu_base->expires_next = KTIME_MAX;
cpu_base->softirq_expires_next = KTIME_MAX;
+ cpu_base->online = 1;
hrtimer_cpu_base_init_expiry_lock(cpu_base);
return 0;
}
@@ -2219,29 +2221,22 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
}
}
-int hrtimers_dead_cpu(unsigned int scpu)
+int hrtimers_cpu_dying(unsigned int dying_cpu)
{
struct hrtimer_cpu_base *old_base, *new_base;
- int i;
+ int i, ncpu = cpumask_first(cpu_active_mask);
- BUG_ON(cpu_online(scpu));
- tick_cancel_sched_timer(scpu);
+ tick_cancel_sched_timer(dying_cpu);
+
+ old_base = this_cpu_ptr(&hrtimer_bases);
+ new_base = &per_cpu(hrtimer_bases, ncpu);
- /*
- * this BH disable ensures that raise_softirq_irqoff() does
- * not wakeup ksoftirqd (and acquire the pi-lock) while
- * holding the cpu_base lock
- */
- local_bh_disable();
- local_irq_disable();
- old_base = &per_cpu(hrtimer_bases, scpu);
- new_base = this_cpu_ptr(&hrtimer_bases);
/*
* The caller is globally serialized and nobody else
* takes two locks at once, deadlock is not possible.
*/
- raw_spin_lock(&new_base->lock);
- raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
+ raw_spin_lock(&old_base->lock);
+ raw_spin_lock_nested(&new_base->lock, SINGLE_DEPTH_NESTING);
for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
migrate_hrtimer_list(&old_base->clock_base[i],
@@ -2252,15 +2247,14 @@ int hrtimers_dead_cpu(unsigned int scpu)
* The migration might have changed the first expiring softirq
* timer on this CPU. Update it.
*/
- hrtimer_update_softirq_timer(new_base, false);
+ __hrtimer_get_next_event(new_base, HRTIMER_ACTIVE_SOFT);
+ /* Tell the other CPU to retrigger the next event */
+ smp_call_function_single(ncpu, retrigger_next_event, NULL, 0);
- raw_spin_unlock(&old_base->lock);
raw_spin_unlock(&new_base->lock);
+ old_base->online = 0;
+ raw_spin_unlock(&old_base->lock);
- /* Check, if we got expired work to do */
- __hrtimer_peek_ahead_timers();
- local_irq_enable();
- local_bh_enable();
return 0;
}
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
index 77c0c2370b6d..9de66bbbb3d1 100644
--- a/kernel/time/posix-clock.c
+++ b/kernel/time/posix-clock.c
@@ -19,7 +19,8 @@
*/
static struct posix_clock *get_posix_clock(struct file *fp)
{
- struct posix_clock *clk = fp->private_data;
+ struct posix_clock_context *pccontext = fp->private_data;
+ struct posix_clock *clk = pccontext->clk;
down_read(&clk->rwsem);
@@ -39,6 +40,7 @@ static void put_posix_clock(struct posix_clock *clk)
static ssize_t posix_clock_read(struct file *fp, char __user *buf,
size_t count, loff_t *ppos)
{
+ struct posix_clock_context *pccontext = fp->private_data;
struct posix_clock *clk = get_posix_clock(fp);
int err = -EINVAL;
@@ -46,7 +48,7 @@ static ssize_t posix_clock_read(struct file *fp, char __user *buf,
return -ENODEV;
if (clk->ops.read)
- err = clk->ops.read(clk, fp->f_flags, buf, count);
+ err = clk->ops.read(pccontext, fp->f_flags, buf, count);
put_posix_clock(clk);
@@ -55,6 +57,7 @@ static ssize_t posix_clock_read(struct file *fp, char __user *buf,
static __poll_t posix_clock_poll(struct file *fp, poll_table *wait)
{
+ struct posix_clock_context *pccontext = fp->private_data;
struct posix_clock *clk = get_posix_clock(fp);
__poll_t result = 0;
@@ -62,7 +65,7 @@ static __poll_t posix_clock_poll(struct file *fp, poll_table *wait)
return EPOLLERR;
if (clk->ops.poll)
- result = clk->ops.poll(clk, fp, wait);
+ result = clk->ops.poll(pccontext, fp, wait);
put_posix_clock(clk);
@@ -72,6 +75,7 @@ static __poll_t posix_clock_poll(struct file *fp, poll_table *wait)
static long posix_clock_ioctl(struct file *fp,
unsigned int cmd, unsigned long arg)
{
+ struct posix_clock_context *pccontext = fp->private_data;
struct posix_clock *clk = get_posix_clock(fp);
int err = -ENOTTY;
@@ -79,7 +83,7 @@ static long posix_clock_ioctl(struct file *fp,
return -ENODEV;
if (clk->ops.ioctl)
- err = clk->ops.ioctl(clk, cmd, arg);
+ err = clk->ops.ioctl(pccontext, cmd, arg);
put_posix_clock(clk);
@@ -90,6 +94,7 @@ static long posix_clock_ioctl(struct file *fp,
static long posix_clock_compat_ioctl(struct file *fp,
unsigned int cmd, unsigned long arg)
{
+ struct posix_clock_context *pccontext = fp->private_data;
struct posix_clock *clk = get_posix_clock(fp);
int err = -ENOTTY;
@@ -97,7 +102,7 @@ static long posix_clock_compat_ioctl(struct file *fp,
return -ENODEV;
if (clk->ops.ioctl)
- err = clk->ops.ioctl(clk, cmd, arg);
+ err = clk->ops.ioctl(pccontext, cmd, arg);
put_posix_clock(clk);
@@ -110,6 +115,7 @@ static int posix_clock_open(struct inode *inode, struct file *fp)
int err;
struct posix_clock *clk =
container_of(inode->i_cdev, struct posix_clock, cdev);
+ struct posix_clock_context *pccontext;
down_read(&clk->rwsem);
@@ -117,14 +123,20 @@ static int posix_clock_open(struct inode *inode, struct file *fp)
err = -ENODEV;
goto out;
}
+ pccontext = kzalloc(sizeof(*pccontext), GFP_KERNEL);
+ if (!pccontext) {
+ err = -ENOMEM;
+ goto out;
+ }
+ pccontext->clk = clk;
+ fp->private_data = pccontext;
if (clk->ops.open)
- err = clk->ops.open(clk, fp->f_mode);
+ err = clk->ops.open(pccontext, fp->f_mode);
else
err = 0;
if (!err) {
get_device(clk->dev);
- fp->private_data = clk;
}
out:
up_read(&clk->rwsem);
@@ -133,14 +145,20 @@ out:
static int posix_clock_release(struct inode *inode, struct file *fp)
{
- struct posix_clock *clk = fp->private_data;
+ struct posix_clock_context *pccontext = fp->private_data;
+ struct posix_clock *clk;
int err = 0;
+ if (!pccontext)
+ return -ENODEV;
+ clk = pccontext->clk;
+
if (clk->ops.release)
- err = clk->ops.release(clk);
+ err = clk->ops.release(pccontext);
put_device(clk->dev);
+ kfree(pccontext);
fp->private_data = NULL;
return err;
diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c
index 828aeecbd1e8..9b6fcb8d85e7 100644
--- a/kernel/time/posix-stubs.c
+++ b/kernel/time/posix-stubs.c
@@ -17,40 +17,6 @@
#include <linux/time_namespace.h>
#include <linux/compat.h>
-#ifdef CONFIG_ARCH_HAS_SYSCALL_WRAPPER
-/* Architectures may override SYS_NI and COMPAT_SYS_NI */
-#include <asm/syscall_wrapper.h>
-#endif
-
-asmlinkage long sys_ni_posix_timers(void)
-{
- pr_err_once("process %d (%s) attempted a POSIX timer syscall "
- "while CONFIG_POSIX_TIMERS is not set\n",
- current->pid, current->comm);
- return -ENOSYS;
-}
-
-#ifndef SYS_NI
-#define SYS_NI(name) SYSCALL_ALIAS(sys_##name, sys_ni_posix_timers)
-#endif
-
-#ifndef COMPAT_SYS_NI
-#define COMPAT_SYS_NI(name) SYSCALL_ALIAS(compat_sys_##name, sys_ni_posix_timers)
-#endif
-
-SYS_NI(timer_create);
-SYS_NI(timer_gettime);
-SYS_NI(timer_getoverrun);
-SYS_NI(timer_settime);
-SYS_NI(timer_delete);
-SYS_NI(clock_adjtime);
-SYS_NI(getitimer);
-SYS_NI(setitimer);
-SYS_NI(clock_adjtime32);
-#ifdef __ARCH_WANT_SYS_ALARM
-SYS_NI(alarm);
-#endif
-
/*
* We preserve minimal support for CLOCK_REALTIME and CLOCK_MONOTONIC
* as it is easy to remain compatible with little code. CLOCK_BOOTTIME
@@ -158,18 +124,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
which_clock);
}
-#ifdef CONFIG_COMPAT
-COMPAT_SYS_NI(timer_create);
-#endif
-
-#if defined(CONFIG_COMPAT) || defined(CONFIG_ALPHA)
-COMPAT_SYS_NI(getitimer);
-COMPAT_SYS_NI(setitimer);
-#endif
-
#ifdef CONFIG_COMPAT_32BIT_TIME
-SYS_NI(timer_settime32);
-SYS_NI(timer_gettime32);
SYSCALL_DEFINE2(clock_settime32, const clockid_t, which_clock,
struct old_timespec32 __user *, tp)
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 649f2b48e8f0..481b7ab65e2c 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -56,7 +56,6 @@ extern int clockevents_program_event(struct clock_event_device *dev,
ktime_t expires, bool force);
extern void clockevents_handle_noop(struct clock_event_device *dev);
extern int __clockevents_update_freq(struct clock_event_device *dev, u32 freq);
-extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt);
/* Broadcasting support */
# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
@@ -197,3 +196,5 @@ void hrtimers_resume_local(void);
#else
#define JIFFIES_SHIFT 8
#endif
+
+extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 87015e9deacc..01fb50c1b17e 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -4,7 +4,7 @@
* Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
* Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner
*
- * No idle tick implementation for low and high resolution timers
+ * NOHZ implementation for low and high resolution timers
*
* Started by: Thomas Gleixner and Ingo Molnar
*/
@@ -45,7 +45,7 @@ struct tick_sched *tick_get_tick_sched(int cpu)
#if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS)
/*
- * The time, when the last jiffy update happened. Write access must hold
+ * The time when the last jiffy update happened. Write access must hold
* jiffies_lock and jiffies_seq. tick_nohz_next_event() needs to get a
* consistent view of jiffies and last_jiffies_update.
*/
@@ -60,13 +60,13 @@ static void tick_do_update_jiffies64(ktime_t now)
ktime_t delta, nextp;
/*
- * 64bit can do a quick check without holding jiffies lock and
+ * 64-bit can do a quick check without holding the jiffies lock and
* without looking at the sequence count. The smp_load_acquire()
* pairs with the update done later in this function.
*
- * 32bit cannot do that because the store of tick_next_period
- * consists of two 32bit stores and the first store could move it
- * to a random point in the future.
+ * 32-bit cannot do that because the store of 'tick_next_period'
+ * consists of two 32-bit stores, and the first store alone could
+ * move it to a random point in the future.
*/
if (IS_ENABLED(CONFIG_64BIT)) {
if (ktime_before(now, smp_load_acquire(&tick_next_period)))
@@ -75,7 +75,7 @@ static void tick_do_update_jiffies64(ktime_t now)
unsigned int seq;
/*
- * Avoid contention on jiffies_lock and protect the quick
+ * Avoid contention on 'jiffies_lock' and protect the quick
* check with the sequence count.
*/
do {
@@ -90,7 +90,7 @@ static void tick_do_update_jiffies64(ktime_t now)
/* Quick check failed, i.e. update is required. */
raw_spin_lock(&jiffies_lock);
/*
- * Reevaluate with the lock held. Another CPU might have done the
+ * Re-evaluate with the lock held. Another CPU might have done the
* update already.
*/
if (ktime_before(now, tick_next_period)) {
@@ -114,25 +114,23 @@ static void tick_do_update_jiffies64(ktime_t now)
TICK_NSEC);
}
- /* Advance jiffies to complete the jiffies_seq protected job */
+ /* Advance jiffies to complete the 'jiffies_seq' protected job */
jiffies_64 += ticks;
- /*
- * Keep the tick_next_period variable up to date.
- */
+ /* Keep the tick_next_period variable up to date */
nextp = ktime_add_ns(last_jiffies_update, TICK_NSEC);
if (IS_ENABLED(CONFIG_64BIT)) {
/*
* Pairs with smp_load_acquire() in the lockless quick
- * check above and ensures that the update to jiffies_64 is
- * not reordered vs. the store to tick_next_period, neither
+ * check above, and ensures that the update to 'jiffies_64' is
+ * not reordered vs. the store to 'tick_next_period', neither
* by the compiler nor by the CPU.
*/
smp_store_release(&tick_next_period, nextp);
} else {
/*
- * A plain store is good enough on 32bit as the quick check
+ * A plain store is good enough on 32-bit, as the quick check
* above is protected by the sequence count.
*/
tick_next_period = nextp;
@@ -140,7 +138,7 @@ static void tick_do_update_jiffies64(ktime_t now)
/*
* Release the sequence count. calc_global_load() below is not
- * protected by it, but jiffies_lock needs to be held to prevent
+ * protected by it, but 'jiffies_lock' needs to be held to prevent
* concurrent invocations.
*/
write_seqcount_end(&jiffies_seq);
@@ -160,7 +158,8 @@ static ktime_t tick_init_jiffy_update(void)
raw_spin_lock(&jiffies_lock);
write_seqcount_begin(&jiffies_seq);
- /* Did we start the jiffies update yet ? */
+
+ /* Have we started the jiffies update yet? */
if (last_jiffies_update == 0) {
u32 rem;
@@ -175,8 +174,10 @@ static ktime_t tick_init_jiffy_update(void)
last_jiffies_update = tick_next_period;
}
period = last_jiffies_update;
+
write_seqcount_end(&jiffies_seq);
raw_spin_unlock(&jiffies_lock);
+
return period;
}
@@ -192,10 +193,10 @@ static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now)
* concurrency: This happens only when the CPU in charge went
* into a long sleep. If two CPUs happen to assign themselves to
* this duty, then the jiffies update is still serialized by
- * jiffies_lock.
+ * 'jiffies_lock'.
*
* If nohz_full is enabled, this should not happen because the
- * tick_do_timer_cpu never relinquishes.
+ * 'tick_do_timer_cpu' CPU never relinquishes this duty.
*/
if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) {
#ifdef CONFIG_NO_HZ_FULL
@@ -205,12 +206,12 @@ static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now)
}
#endif
- /* Check, if the jiffies need an update */
+ /* Check if jiffies need an update */
if (tick_do_timer_cpu == cpu)
tick_do_update_jiffies64(now);
/*
- * If jiffies update stalled for too long (timekeeper in stop_machine()
+ * If the jiffies update stalled for too long (timekeeper in stop_machine()
* or VMEXIT'ed for several msecs), force an update.
*/
if (ts->last_tick_jiffies != jiffies) {
@@ -234,10 +235,10 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
/*
* When we are idle and the tick is stopped, we have to touch
* the watchdog as we might not schedule for a really long
- * time. This happens on complete idle SMP systems while
+ * time. This happens on completely idle SMP systems while
* waiting on the login prompt. We also increment the "start of
* idle" jiffy stamp so the idle accounting adjustment we do
- * when we go busy again does not account too much ticks.
+ * when we go busy again does not account too many ticks.
*/
if (ts->tick_stopped) {
touch_softlockup_watchdog_sched();
@@ -362,7 +363,7 @@ static void tick_nohz_kick_task(struct task_struct *tsk)
/*
* If the task is not running, run_posix_cpu_timers()
- * has nothing to elapse, IPI can then be spared.
+ * has nothing to elapse, and an IPI can then be optimized out.
*
* activate_task() STORE p->tick_dep_mask
* STORE p->on_rq
@@ -425,7 +426,7 @@ static void tick_nohz_dep_set_all(atomic_t *dep,
/*
* Set a global tick dependency. Used by perf events that rely on freq and
- * by unstable clock.
+ * by unstable clocks.
*/
void tick_nohz_dep_set(enum tick_dep_bits bit)
{
@@ -439,7 +440,7 @@ void tick_nohz_dep_clear(enum tick_dep_bits bit)
/*
* Set per-CPU tick dependency. Used by scheduler and perf events in order to
- * manage events throttling.
+ * manage event-throttling.
*/
void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit)
{
@@ -455,7 +456,7 @@ void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit)
if (cpu == smp_processor_id()) {
tick_nohz_full_kick();
} else {
- /* Remote irq work not NMI-safe */
+ /* Remote IRQ work not NMI-safe */
if (!WARN_ON_ONCE(in_nmi()))
tick_nohz_full_kick_cpu(cpu);
}
@@ -473,7 +474,7 @@ void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit)
EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_cpu);
/*
- * Set a per-task tick dependency. RCU need this. Also posix CPU timers
+ * Set a per-task tick dependency. RCU needs this. Also posix CPU timers
* in order to elapse per task timers.
*/
void tick_nohz_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit)
@@ -546,7 +547,7 @@ void __init tick_nohz_full_setup(cpumask_var_t cpumask)
bool tick_nohz_cpu_hotpluggable(unsigned int cpu)
{
/*
- * The tick_do_timer_cpu CPU handles housekeeping duty (unbound
+ * The 'tick_do_timer_cpu' CPU handles housekeeping duty (unbound
* timers, workqueues, timekeeping, ...) on behalf of full dynticks
* CPUs. It must remain online when nohz full is enabled.
*/
@@ -568,12 +569,12 @@ void __init tick_nohz_init(void)
return;
/*
- * Full dynticks uses irq work to drive the tick rescheduling on safe
- * locking contexts. But then we need irq work to raise its own
- * interrupts to avoid circular dependency on the tick
+ * Full dynticks uses IRQ work to drive the tick rescheduling on safe
+ * locking contexts. But then we need IRQ work to raise its own
+ * interrupts to avoid circular dependency on the tick.
*/
if (!arch_irq_work_has_interrupt()) {
- pr_warn("NO_HZ: Can't run full dynticks because arch doesn't support irq work self-IPIs\n");
+ pr_warn("NO_HZ: Can't run full dynticks because arch doesn't support IRQ work self-IPIs\n");
cpumask_clear(tick_nohz_full_mask);
tick_nohz_full_running = false;
return;
@@ -643,7 +644,7 @@ bool tick_nohz_tick_stopped_cpu(int cpu)
* In case the sched_tick was stopped on this CPU, we have to check if jiffies
* must be updated. Otherwise an interrupt handler could use a stale jiffy
* value. We do this unconditionally on any CPU, as we don't know whether the
- * CPU, which has the update task assigned is in a long sleep.
+ * CPU, which has the update task assigned, is in a long sleep.
*/
static void tick_nohz_update_jiffies(ktime_t now)
{
@@ -726,7 +727,7 @@ static u64 get_cpu_sleep_time_us(struct tick_sched *ts, ktime_t *sleeptime,
* counters if NULL.
*
* Return the cumulative idle time (since boot) for a given
- * CPU, in microseconds. Note this is partially broken due to
+ * CPU, in microseconds. Note that this is partially broken due to
* the counter of iowait tasks that can be remotely updated without
* any synchronization. Therefore it is possible to observe backward
* values within two consecutive reads.
@@ -787,7 +788,7 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
}
/*
- * Reset to make sure next tick stop doesn't get fooled by past
+ * Reset to make sure the next tick stop doesn't get fooled by past
* cached clock deadline.
*/
ts->next_tick = 0;
@@ -816,11 +817,11 @@ static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu)
/*
* Keep the periodic tick, when RCU, architecture or irq_work
* requests it.
- * Aside of that check whether the local timer softirq is
- * pending. If so its a bad idea to call get_next_timer_interrupt()
+ * Aside from that, check whether the local timer softirq is
+ * pending. If so, it's a bad idea to call get_next_timer_interrupt(),
* because there is an already expired timer, so it will request
* immediate expiry, which rearms the hardware timer with a
- * minimal delta which brings us back to this place
+ * minimal delta, which brings us back to this place
* immediately. Lather, rinse and repeat...
*/
if (rcu_needs_cpu() || arch_needs_cpu() ||
@@ -838,6 +839,10 @@ static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu)
ts->next_timer = next_tick;
}
+ /* Make sure next_tick is never before basemono! */
+ if (WARN_ON_ONCE(basemono > next_tick))
+ next_tick = basemono;
+
/*
* If the tick is due in the next period, keep it ticking or
* force prod the timer.
@@ -861,7 +866,7 @@ static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu)
/*
* If this CPU is the one which had the do_timer() duty last, we limit
- * the sleep time to the timekeeping max_deferment value.
+ * the sleep time to the timekeeping 'max_deferment' value.
* Otherwise we can sleep as long as we want.
*/
delta = timekeeping_max_deferment();
@@ -886,7 +891,6 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu)
struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
u64 basemono = ts->timer_expires_base;
u64 expires = ts->timer_expires;
- ktime_t tick = expires;
/* Make sure we won't be trying to stop it twice in a row. */
ts->timer_expires_base = 0;
@@ -895,8 +899,8 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu)
* If this CPU is the one which updates jiffies, then give up
* the assignment and let it be taken by the CPU which runs
* the tick timer next, which might be this CPU as well. If we
- * don't drop this here the jiffies might be stale and
- * do_timer() never invoked. Keep track of the fact that it
+ * don't drop this here, the jiffies might be stale and
+ * do_timer() never gets invoked. Keep track of the fact that it
* was the one which had the do_timer() duty last.
*/
if (cpu == tick_do_timer_cpu) {
@@ -906,10 +910,10 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu)
ts->do_timer_last = 0;
}
- /* Skip reprogram of event if its not changed */
+ /* Skip reprogram of event if it's not changed */
if (ts->tick_stopped && (expires == ts->next_tick)) {
/* Sanity check: make sure clockevent is actually programmed */
- if (tick == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer))
+ if (expires == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer))
return;
WARN_ON_ONCE(1);
@@ -919,11 +923,11 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu)
}
/*
- * nohz_stop_sched_tick can be called several times before
- * the nohz_restart_sched_tick is called. This happens when
- * interrupts arrive which do not cause a reschedule. In the
- * first call we save the current tick time, so we can restart
- * the scheduler tick in nohz_restart_sched_tick.
+ * tick_nohz_stop_tick() can be called several times before
+ * tick_nohz_restart_sched_tick() is called. This happens when
+ * interrupts arrive which do not cause a reschedule. In the first
+ * call we save the current tick time, so we can restart the
+ * scheduler tick in tick_nohz_restart_sched_tick().
*/
if (!ts->tick_stopped) {
calc_load_nohz_start();
@@ -934,7 +938,7 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu)
trace_tick_stop(1, TICK_DEP_MASK_NONE);
}
- ts->next_tick = tick;
+ ts->next_tick = expires;
/*
* If the expiration time == KTIME_MAX, then we simply stop
@@ -949,11 +953,11 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu)
}
if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
- hrtimer_start(&ts->sched_timer, tick,
+ hrtimer_start(&ts->sched_timer, expires,
HRTIMER_MODE_ABS_PINNED_HARD);
} else {
- hrtimer_set_expires(&ts->sched_timer, tick);
- tick_program_event(tick, 1);
+ hrtimer_set_expires(&ts->sched_timer, expires);
+ tick_program_event(expires, 1);
}
}
@@ -985,9 +989,8 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
calc_load_nohz_stop();
touch_softlockup_watchdog_sched();
- /*
- * Cancel the scheduled timer and restore the tick
- */
+
+ /* Cancel the scheduled timer and restore the tick: */
ts->tick_stopped = 0;
tick_nohz_restart(ts, now);
}
@@ -1019,11 +1022,11 @@ static void tick_nohz_full_update_tick(struct tick_sched *ts)
/*
* A pending softirq outside an IRQ (or softirq disabled section) context
* should be waiting for ksoftirqd to handle it. Therefore we shouldn't
- * reach here due to the need_resched() early check in can_stop_idle_tick().
+ * reach this code due to the need_resched() early check in can_stop_idle_tick().
*
* However if we are between CPUHP_AP_SMPBOOT_THREADS and CPU_TEARDOWN_CPU on the
* cpu_down() process, softirqs can still be raised while ksoftirqd is parked,
- * triggering the below since wakep_softirqd() is ignored.
+ * triggering the code below, since wakep_softirqd() is ignored.
*
*/
static bool report_idle_softirq(void)
@@ -1044,7 +1047,7 @@ static bool report_idle_softirq(void)
if (ratelimit >= 10)
return false;
- /* On RT, softirqs handling may be waiting on some lock */
+ /* On RT, softirq handling may be waiting on some lock */
if (local_bh_blocked())
return false;
@@ -1061,8 +1064,8 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
* If this CPU is offline and it is the one which updates
* jiffies, then give up the assignment and let it be taken by
* the CPU which runs the tick timer next. If we don't drop
- * this here the jiffies might be stale and do_timer() never
- * invoked.
+ * this here, the jiffies might be stale and do_timer() never
+ * gets invoked.
*/
if (unlikely(!cpu_online(cpu))) {
if (cpu == tick_do_timer_cpu)
@@ -1175,12 +1178,23 @@ void tick_nohz_idle_enter(void)
}
/**
- * tick_nohz_irq_exit - update next tick event from interrupt exit
+ * tick_nohz_irq_exit - Notify the tick about IRQ exit
+ *
+ * A timer may have been added/modified/deleted either by the current IRQ,
+ * or by another place using this IRQ as a notification. This IRQ may have
+ * also updated the RCU callback list. These events may require a
+ * re-evaluation of the next tick. Depending on the context:
+ *
+ * 1) If the CPU is idle and no resched is pending, just proceed with idle
+ * time accounting. The next tick will be re-evaluated on the next idle
+ * loop iteration.
*
- * When an interrupt fires while we are idle and it doesn't cause
- * a reschedule, it may still add, modify or delete a timer, enqueue
- * an RCU callback, etc...
- * So we need to re-calculate and reprogram the next tick event.
+ * 2) If the CPU is nohz_full:
+ *
+ * 2.1) If there is any tick dependency, restart the tick if stopped.
+ *
+ * 2.2) If there is no tick dependency, (re-)evaluate the next tick and
+ * stop/update it accordingly.
*/
void tick_nohz_irq_exit(void)
{
@@ -1208,7 +1222,7 @@ bool tick_nohz_idle_got_tick(void)
/**
* tick_nohz_get_next_hrtimer - return the next expiration time for the hrtimer
- * or the tick, whatever that expires first. Note that, if the tick has been
+ * or the tick, whichever expires first. Note that, if the tick has been
* stopped, it returns the next hrtimer.
*
* Called from power state control code with interrupts disabled
@@ -1252,7 +1266,7 @@ ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next)
return *delta_next;
/*
- * If the next highres timer to expire is earlier than next_event, the
+ * If the next highres timer to expire is earlier than 'next_event', the
* idle governor needs to know that.
*/
next_event = min_t(u64, next_event,
@@ -1296,9 +1310,9 @@ static void tick_nohz_account_idle_time(struct tick_sched *ts,
if (vtime_accounting_enabled_this_cpu())
return;
/*
- * We stopped the tick in idle. Update process times would miss the
- * time we slept as update_process_times does only a 1 tick
- * accounting. Enforce that this is accounted to idle !
+ * We stopped the tick in idle. update_process_times() would miss the
+ * time we slept, as it does only a 1 tick accounting.
+ * Enforce that this is accounted to idle !
*/
ticks = jiffies - ts->idle_jiffies;
/*
@@ -1330,11 +1344,20 @@ static void tick_nohz_idle_update_tick(struct tick_sched *ts, ktime_t now)
}
/**
- * tick_nohz_idle_exit - restart the idle tick from the idle task
+ * tick_nohz_idle_exit - Update the tick upon idle task exit
+ *
+ * When the idle task exits, update the tick depending on the
+ * following situations:
+ *
+ * 1) If the CPU is not in nohz_full mode (most cases), then
+ * restart the tick.
+ *
+ * 2) If the CPU is in nohz_full mode (corner case):
+ * 2.1) If the tick can be kept stopped (no tick dependencies)
+ * then re-evaluate the next tick and try to keep it stopped
+ * as long as possible.
+ * 2.2) If the tick has dependencies, restart the tick.
*
- * Restart the idle tick when the CPU is woken up from idle
- * This also exit the RCU extended quiescent state. The CPU
- * can use RCU again after this function is called.
*/
void tick_nohz_idle_exit(void)
{
@@ -1364,9 +1387,15 @@ void tick_nohz_idle_exit(void)
}
/*
- * The nohz low res interrupt handler
+ * In low-resolution mode, the tick handler must be implemented directly
+ * at the clockevent level. hrtimer can't be used instead, because its
+ * infrastructure actually relies on the tick itself as a backend in
+ * low-resolution mode (see hrtimer_run_queues()).
+ *
+ * This low-resolution handler nevertheless makes use of some hrtimer APIs,
+ * for convenience with expiration calculation and forwarding.
*/
-static void tick_nohz_handler(struct clock_event_device *dev)
+static void tick_nohz_lowres_handler(struct clock_event_device *dev)
{
struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
struct pt_regs *regs = get_irq_regs();
@@ -1377,18 +1406,16 @@ static void tick_nohz_handler(struct clock_event_device *dev)
tick_sched_do_timer(ts, now);
tick_sched_handle(ts, regs);
- if (unlikely(ts->tick_stopped)) {
- /*
- * The clockevent device is not reprogrammed, so change the
- * clock event device to ONESHOT_STOPPED to avoid spurious
- * interrupts on devices which might not be truly one shot.
- */
- tick_program_event(KTIME_MAX, 1);
- return;
+ /*
+ * In dynticks mode, tick reprogram is deferred:
+ * - to the idle task if in dynticks-idle
+ * - to IRQ exit if in full-dynticks.
+ */
+ if (likely(!ts->tick_stopped)) {
+ hrtimer_forward(&ts->sched_timer, now, TICK_NSEC);
+ tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
}
- hrtimer_forward(&ts->sched_timer, now, TICK_NSEC);
- tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
}
static inline void tick_nohz_activate(struct tick_sched *ts, int mode)
@@ -1402,7 +1429,7 @@ static inline void tick_nohz_activate(struct tick_sched *ts, int mode)
}
/**
- * tick_nohz_switch_to_nohz - switch to nohz mode
+ * tick_nohz_switch_to_nohz - switch to NOHZ mode
*/
static void tick_nohz_switch_to_nohz(void)
{
@@ -1412,12 +1439,12 @@ static void tick_nohz_switch_to_nohz(void)
if (!tick_nohz_enabled)
return;
- if (tick_switch_to_oneshot(tick_nohz_handler))
+ if (tick_switch_to_oneshot(tick_nohz_lowres_handler))
return;
/*
- * Recycle the hrtimer in ts, so we can share the
- * hrtimer_forward with the highres code.
+ * Recycle the hrtimer in 'ts', so we can share the
+ * hrtimer_forward_now() function with the highres code.
*/
hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
/* Get the next period */
@@ -1440,7 +1467,7 @@ static inline void tick_nohz_irq_enter(void)
if (ts->idle_active)
tick_nohz_stop_idle(ts, now);
/*
- * If all CPUs are idle. We may need to update a stale jiffies value.
+ * If all CPUs are idle we may need to update a stale jiffies value.
* Note nohz_full is a special case: a timekeeper is guaranteed to stay
* alive but it might be busy looping with interrupts disabled in some
* rare case (typically stop machine). So we must make sure we have a
@@ -1459,7 +1486,7 @@ static inline void tick_nohz_activate(struct tick_sched *ts, int mode) { }
#endif /* CONFIG_NO_HZ_COMMON */
/*
- * Called from irq_enter to notify about the possible interruption of idle()
+ * Called from irq_enter() to notify about the possible interruption of idle()
*/
void tick_irq_enter(void)
{
@@ -1475,7 +1502,7 @@ void tick_irq_enter(void)
* We rearm the timer until we get disabled by the idle code.
* Called with interrupts disabled.
*/
-static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
+static enum hrtimer_restart tick_nohz_highres_handler(struct hrtimer *timer)
{
struct tick_sched *ts =
container_of(timer, struct tick_sched, sched_timer);
@@ -1485,15 +1512,19 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
tick_sched_do_timer(ts, now);
/*
- * Do not call, when we are not in irq context and have
- * no valid regs pointer
+ * Do not call when we are not in IRQ context and have
+ * no valid 'regs' pointer
*/
if (regs)
tick_sched_handle(ts, regs);
else
ts->next_tick = 0;
- /* No need to reprogram if we are in idle or full dynticks mode */
+ /*
+ * In dynticks mode, tick reprogram is deferred:
+ * - to the idle task if in dynticks-idle
+ * - to IRQ exit if in full-dynticks.
+ */
if (unlikely(ts->tick_stopped))
return HRTIMER_NORESTART;
@@ -1520,16 +1551,14 @@ void tick_setup_sched_timer(void)
struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
ktime_t now = ktime_get();
- /*
- * Emulate tick processing via per-CPU hrtimers:
- */
+ /* Emulate tick processing via per-CPU hrtimers: */
hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
- ts->sched_timer.function = tick_sched_timer;
+ ts->sched_timer.function = tick_nohz_highres_handler;
/* Get the next period (per-CPU) */
hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
- /* Offset the tick to avert jiffies_lock contention. */
+ /* Offset the tick to avert 'jiffies_lock' contention. */
if (sched_skew_tick) {
u64 offset = TICK_NSEC >> 1;
do_div(offset, num_possible_cpus());
@@ -1547,13 +1576,23 @@ void tick_setup_sched_timer(void)
void tick_cancel_sched_timer(int cpu)
{
struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+ ktime_t idle_sleeptime, iowait_sleeptime;
+ unsigned long idle_calls, idle_sleeps;
# ifdef CONFIG_HIGH_RES_TIMERS
if (ts->sched_timer.base)
hrtimer_cancel(&ts->sched_timer);
# endif
+ idle_sleeptime = ts->idle_sleeptime;
+ iowait_sleeptime = ts->iowait_sleeptime;
+ idle_calls = ts->idle_calls;
+ idle_sleeps = ts->idle_sleeps;
memset(ts, 0, sizeof(*ts));
+ ts->idle_sleeptime = idle_sleeptime;
+ ts->iowait_sleeptime = iowait_sleeptime;
+ ts->idle_calls = idle_calls;
+ ts->idle_sleeps = idle_sleeps;
}
#endif
@@ -1579,10 +1618,10 @@ void tick_oneshot_notify(void)
}
/*
- * Check, if a change happened, which makes oneshot possible.
+ * Check if a change happened, which makes oneshot possible.
*
- * Called cyclic from the hrtimer softirq (driven by the timer
- * softirq) allow_nohz signals, that we can switch into low-res nohz
+ * Called cyclically from the hrtimer softirq (driven by the timer
+ * softirq). 'allow_nohz' signals that we can switch into low-res NOHZ
* mode, because high resolution timers are disabled (either compile
* or runtime). Called with interrupts disabled.
*/
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 63a8ce7177dd..352b161113cd 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -571,18 +571,15 @@ static int calc_wheel_index(unsigned long expires, unsigned long clk,
static void
trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer)
{
- if (!is_timers_nohz_active())
- return;
-
/*
- * TODO: This wants some optimizing similar to the code below, but we
- * will do that when we switch from push to pull for deferrable timers.
+ * Deferrable timers do not prevent the CPU from entering dynticks and
+ * are not taken into account on the idle/nohz_full path. An IPI when a
+ * new deferrable timer is enqueued will wake up the remote CPU but
+ * nothing will be done with the deferrable timer base. Therefore skip
+ * the remote IPI for deferrable timers completely.
*/
- if (timer->flags & TIMER_DEFERRABLE) {
- if (tick_nohz_full_cpu(base->cpu))
- wake_up_nohz_cpu(base->cpu);
+ if (!is_timers_nohz_active() || timer->flags & TIMER_DEFERRABLE)
return;
- }
/*
* We might have to IPI the remote CPU if the base is idle and the
@@ -606,7 +603,7 @@ static void enqueue_timer(struct timer_base *base, struct timer_list *timer,
__set_bit(idx, base->pending_map);
timer_set_idx(timer, idx);
- trace_timer_start(timer, timer->expires, timer->flags);
+ trace_timer_start(timer, bucket_expiry);
/*
* Check whether this is the new first expiring timer. The
@@ -942,31 +939,34 @@ get_target_base(struct timer_base *base, unsigned tflags)
return get_timer_this_cpu_base(tflags);
}
-static inline void forward_timer_base(struct timer_base *base)
+static inline void __forward_timer_base(struct timer_base *base,
+ unsigned long basej)
{
- unsigned long jnow = READ_ONCE(jiffies);
-
/*
- * No need to forward if we are close enough below jiffies.
- * Also while executing timers, base->clk is 1 offset ahead
- * of jiffies to avoid endless requeuing to current jiffies.
+ * Check whether we can forward the base. We can only do that when
+ * @basej is past base->clk, otherwise we might rewind base->clk.
*/
- if ((long)(jnow - base->clk) < 1)
+ if (time_before_eq(basej, base->clk))
return;
/*
* If the next expiry value is > jiffies, then we fast forward to
* jiffies otherwise we forward to the next expiry value.
*/
- if (time_after(base->next_expiry, jnow)) {
- base->clk = jnow;
+ if (time_after(base->next_expiry, basej)) {
+ base->clk = basej;
} else {
if (WARN_ON_ONCE(time_before(base->next_expiry, base->clk)))
return;
base->clk = base->next_expiry;
}
+
}
+static inline void forward_timer_base(struct timer_base *base)
+{
+ __forward_timer_base(base, READ_ONCE(jiffies));
+}
/*
* We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means
@@ -1803,8 +1803,10 @@ static int next_pending_bucket(struct timer_base *base, unsigned offset,
/*
* Search the first expiring timer in the various clock levels. Caller must
* hold base->lock.
+ *
+ * Store next expiry time in base->next_expiry.
*/
-static unsigned long __next_timer_interrupt(struct timer_base *base)
+static void next_expiry_recalc(struct timer_base *base)
{
unsigned long clk, next, adj;
unsigned lvl, offset = 0;
@@ -1870,10 +1872,9 @@ static unsigned long __next_timer_interrupt(struct timer_base *base)
clk += adj;
}
+ base->next_expiry = next;
base->next_expiry_recalc = false;
base->timers_pending = !(next == base->clk + NEXT_TIMER_MAX_DELTA);
-
- return next;
}
#ifdef CONFIG_NO_HZ_COMMON
@@ -1921,8 +1922,9 @@ static u64 cmp_next_hrtimer_event(u64 basem, u64 expires)
u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
{
struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
+ unsigned long nextevt = basej + NEXT_TIMER_MAX_DELTA;
u64 expires = KTIME_MAX;
- unsigned long nextevt;
+ bool was_idle;
/*
* Pretend that there is no timer pending if the cpu is offline.
@@ -1933,37 +1935,44 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
raw_spin_lock(&base->lock);
if (base->next_expiry_recalc)
- base->next_expiry = __next_timer_interrupt(base);
- nextevt = base->next_expiry;
+ next_expiry_recalc(base);
/*
* We have a fresh next event. Check whether we can forward the
- * base. We can only do that when @basej is past base->clk
- * otherwise we might rewind base->clk.
+ * base.
*/
- if (time_after(basej, base->clk)) {
- if (time_after(nextevt, basej))
- base->clk = basej;
- else if (time_after(nextevt, base->clk))
- base->clk = nextevt;
- }
+ __forward_timer_base(base, basej);
- if (time_before_eq(nextevt, basej)) {
- expires = basem;
- base->is_idle = false;
+ if (base->timers_pending) {
+ nextevt = base->next_expiry;
+
+ /* If we missed a tick already, force 0 delta */
+ if (time_before(nextevt, basej))
+ nextevt = basej;
+ expires = basem + (u64)(nextevt - basej) * TICK_NSEC;
} else {
- if (base->timers_pending)
- expires = basem + (u64)(nextevt - basej) * TICK_NSEC;
/*
- * If we expect to sleep more than a tick, mark the base idle.
- * Also the tick is stopped so any added timer must forward
- * the base clk itself to keep granularity small. This idle
- * logic is only maintained for the BASE_STD base, deferrable
- * timers may still see large granularity skew (by design).
+ * Move next_expiry for the empty base into the future to
+ * prevent an unnecessary raise of the timer softirq when the
+ * next_expiry value will be reached even if there is no timer
+ * pending.
*/
- if ((expires - basem) > TICK_NSEC)
- base->is_idle = true;
+ base->next_expiry = nextevt;
}
+
+ /*
+ * Base is idle if the next event is more than a tick away.
+ *
+ * If the base is marked idle then any timer add operation must forward
+ * the base clk itself to keep granularity small. This idle logic is
+ * only maintained for the BASE_STD base, deferrable timers may still
+ * see large granularity skew (by design).
+ */
+ was_idle = base->is_idle;
+ base->is_idle = time_after(nextevt, basej + 1);
+ if (was_idle != base->is_idle)
+ trace_timer_base_idle(base->is_idle, base->cpu);
+
raw_spin_unlock(&base->lock);
return cmp_next_hrtimer_event(basem, expires);
@@ -1984,7 +1993,10 @@ void timer_clear_idle(void)
* sending the IPI a few instructions smaller for the cost of taking
* the lock in the exit from idle path.
*/
- base->is_idle = false;
+ if (base->is_idle) {
+ base->is_idle = false;
+ trace_timer_base_idle(false, smp_processor_id());
+ }
}
#endif
@@ -2015,8 +2027,12 @@ static inline void __run_timers(struct timer_base *base)
*/
WARN_ON_ONCE(!levels && !base->next_expiry_recalc
&& base->timers_pending);
+ /*
+ * While executing timers, base->clk is set 1 offset ahead of
+ * jiffies to avoid endless requeuing to current jiffies.
+ */
base->clk++;
- base->next_expiry = __next_timer_interrupt(base);
+ next_expiry_recalc(base);
while (levels--)
expire_timers(base, heads + levels);
diff --git a/kernel/torture.c b/kernel/torture.c
index b28b05bbef02..c72ab2d251f4 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -87,14 +87,15 @@ EXPORT_SYMBOL_GPL(verbose_torout_sleep);
* nanosecond random fuzz. This function and its friends desynchronize
* testing from the timer wheel.
*/
-int torture_hrtimeout_ns(ktime_t baset_ns, u32 fuzzt_ns, struct torture_random_state *trsp)
+int torture_hrtimeout_ns(ktime_t baset_ns, u32 fuzzt_ns, const enum hrtimer_mode mode,
+ struct torture_random_state *trsp)
{
ktime_t hto = baset_ns;
if (trsp)
hto += torture_random(trsp) % fuzzt_ns;
set_current_state(TASK_IDLE);
- return schedule_hrtimeout(&hto, HRTIMER_MODE_REL);
+ return schedule_hrtimeout(&hto, mode);
}
EXPORT_SYMBOL_GPL(torture_hrtimeout_ns);
@@ -106,7 +107,7 @@ int torture_hrtimeout_us(u32 baset_us, u32 fuzzt_ns, struct torture_random_state
{
ktime_t baset_ns = baset_us * NSEC_PER_USEC;
- return torture_hrtimeout_ns(baset_ns, fuzzt_ns, trsp);
+ return torture_hrtimeout_ns(baset_ns, fuzzt_ns, HRTIMER_MODE_REL, trsp);
}
EXPORT_SYMBOL_GPL(torture_hrtimeout_us);
@@ -123,7 +124,7 @@ int torture_hrtimeout_ms(u32 baset_ms, u32 fuzzt_us, struct torture_random_state
fuzzt_ns = (u32)~0U;
else
fuzzt_ns = fuzzt_us * NSEC_PER_USEC;
- return torture_hrtimeout_ns(baset_ns, fuzzt_ns, trsp);
+ return torture_hrtimeout_ns(baset_ns, fuzzt_ns, HRTIMER_MODE_REL, trsp);
}
EXPORT_SYMBOL_GPL(torture_hrtimeout_ms);
@@ -136,7 +137,7 @@ int torture_hrtimeout_jiffies(u32 baset_j, struct torture_random_state *trsp)
{
ktime_t baset_ns = jiffies_to_nsecs(baset_j);
- return torture_hrtimeout_ns(baset_ns, jiffies_to_nsecs(1), trsp);
+ return torture_hrtimeout_ns(baset_ns, jiffies_to_nsecs(1), HRTIMER_MODE_REL, trsp);
}
EXPORT_SYMBOL_GPL(torture_hrtimeout_jiffies);
@@ -153,7 +154,7 @@ int torture_hrtimeout_s(u32 baset_s, u32 fuzzt_ms, struct torture_random_state *
fuzzt_ns = (u32)~0U;
else
fuzzt_ns = fuzzt_ms * NSEC_PER_MSEC;
- return torture_hrtimeout_ns(baset_ns, fuzzt_ns, trsp);
+ return torture_hrtimeout_ns(baset_ns, fuzzt_ns, HRTIMER_MODE_REL, trsp);
}
EXPORT_SYMBOL_GPL(torture_hrtimeout_s);
@@ -520,9 +521,8 @@ static void torture_shuffle_task_unregister_all(void)
* A special case is when shuffle_idle_cpu = -1, in which case we allow
* the tasks to run on all CPUs.
*/
-static void torture_shuffle_tasks(void)
+static void torture_shuffle_tasks(struct torture_random_state *trp)
{
- DEFINE_TORTURE_RANDOM(rand);
struct shuffle_task *stp;
cpumask_setall(shuffle_tmp_mask);
@@ -543,7 +543,7 @@ static void torture_shuffle_tasks(void)
mutex_lock(&shuffle_task_mutex);
list_for_each_entry(stp, &shuffle_task_list, st_l) {
- if (!random_shuffle || torture_random(&rand) & 0x1)
+ if (!random_shuffle || torture_random(trp) & 0x1)
set_cpus_allowed_ptr(stp->st_t, shuffle_tmp_mask);
}
mutex_unlock(&shuffle_task_mutex);
@@ -562,7 +562,7 @@ static int torture_shuffle(void *arg)
VERBOSE_TOROUT_STRING("torture_shuffle task started");
do {
torture_hrtimeout_jiffies(shuffle_interval, &rand);
- torture_shuffle_tasks();
+ torture_shuffle_tasks(&rand);
torture_shutdown_absorb("torture_shuffle");
} while (!torture_must_stop());
torture_kthread_stopping("torture_shuffle");
@@ -673,7 +673,7 @@ int torture_shutdown_init(int ssecs, void (*cleanup)(void))
if (ssecs > 0) {
shutdown_time = ktime_add(ktime_get(), ktime_set(ssecs, 0));
return torture_create_kthread(torture_shutdown, NULL,
- shutdown_task);
+ shutdown_task);
}
return 0;
}
@@ -720,7 +720,7 @@ static void torture_shutdown_cleanup(void)
* suddenly applied to or removed from the system.
*/
static struct task_struct *stutter_task;
-static int stutter_pause_test;
+static ktime_t stutter_till_abs_time;
static int stutter;
static int stutter_gap;
@@ -730,30 +730,16 @@ static int stutter_gap;
*/
bool stutter_wait(const char *title)
{
- unsigned int i = 0;
bool ret = false;
- int spt;
+ ktime_t till_ns;
cond_resched_tasks_rcu_qs();
- spt = READ_ONCE(stutter_pause_test);
- for (; spt; spt = READ_ONCE(stutter_pause_test)) {
- if (!ret && !rt_task(current)) {
- sched_set_normal(current, MAX_NICE);
- ret = true;
- }
- if (spt == 1) {
- torture_hrtimeout_jiffies(1, NULL);
- } else if (spt == 2) {
- while (READ_ONCE(stutter_pause_test)) {
- if (!(i++ & 0xffff))
- torture_hrtimeout_us(10, 0, NULL);
- cond_resched();
- }
- } else {
- torture_hrtimeout_jiffies(round_jiffies_relative(HZ), NULL);
- }
- torture_shutdown_absorb(title);
+ till_ns = READ_ONCE(stutter_till_abs_time);
+ if (till_ns && ktime_before(ktime_get(), till_ns)) {
+ torture_hrtimeout_ns(till_ns, 0, HRTIMER_MODE_ABS, NULL);
+ ret = true;
}
+ torture_shutdown_absorb(title);
return ret;
}
EXPORT_SYMBOL_GPL(stutter_wait);
@@ -764,23 +750,16 @@ EXPORT_SYMBOL_GPL(stutter_wait);
*/
static int torture_stutter(void *arg)
{
- DEFINE_TORTURE_RANDOM(rand);
- int wtime;
+ ktime_t till_ns;
VERBOSE_TOROUT_STRING("torture_stutter task started");
do {
if (!torture_must_stop() && stutter > 1) {
- wtime = stutter;
- if (stutter > 2) {
- WRITE_ONCE(stutter_pause_test, 1);
- wtime = stutter - 3;
- torture_hrtimeout_jiffies(wtime, &rand);
- wtime = 2;
- }
- WRITE_ONCE(stutter_pause_test, 2);
- torture_hrtimeout_jiffies(wtime, NULL);
+ till_ns = ktime_add_ns(ktime_get(),
+ jiffies_to_nsecs(stutter));
+ WRITE_ONCE(stutter_till_abs_time, till_ns);
+ torture_hrtimeout_jiffies(stutter - 1, NULL);
}
- WRITE_ONCE(stutter_pause_test, 0);
if (!torture_must_stop())
torture_hrtimeout_jiffies(stutter_gap, NULL);
torture_shutdown_absorb("torture_stutter");
@@ -812,6 +791,13 @@ static void torture_stutter_cleanup(void)
stutter_task = NULL;
}
+static void
+torture_print_module_parms(void)
+{
+ pr_alert("torture module --- %s: disable_onoff_at_boot=%d ftrace_dump_at_shutdown=%d verbose_sleep_frequency=%d verbose_sleep_duration=%d random_shuffle=%d\n",
+ torture_type, disable_onoff_at_boot, ftrace_dump_at_shutdown, verbose_sleep_frequency, verbose_sleep_duration, random_shuffle);
+}
+
/*
* Initialize torture module. Please note that this is -not- invoked via
* the usual module_init() mechanism, but rather by an explicit call from
@@ -834,6 +820,7 @@ bool torture_init_begin(char *ttype, int v)
torture_type = ttype;
verbose = v;
fullstop = FULLSTOP_DONTSTOP;
+ torture_print_module_parms();
return true;
}
EXPORT_SYMBOL_GPL(torture_init_begin);
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index a7264b2c17ad..7ac6c52b25eb 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -24,6 +24,7 @@
#include <linux/key.h>
#include <linux/verification.h>
#include <linux/namei.h>
+#include <linux/fileattr.h>
#include <net/bpf_sk_storage.h>
@@ -41,6 +42,9 @@
#define bpf_event_rcu_dereference(p) \
rcu_dereference_protected(p, lockdep_is_held(&bpf_event_mutex))
+#define MAX_UPROBE_MULTI_CNT (1U << 20)
+#define MAX_KPROBE_MULTI_CNT (1U << 20)
+
#ifdef CONFIG_MODULES
struct bpf_trace_module {
struct module *module;
@@ -117,6 +121,9 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
* and don't send kprobe event into ring-buffer,
* so return zero here
*/
+ rcu_read_lock();
+ bpf_prog_inc_misses_counters(rcu_dereference(call->prog_array));
+ rcu_read_unlock();
ret = 0;
goto out;
}
@@ -1249,9 +1256,7 @@ static const struct bpf_func_proto bpf_get_func_arg_cnt_proto = {
};
#ifdef CONFIG_KEYS
-__diag_push();
-__diag_ignore_all("-Wmissing-prototypes",
- "kfuncs which will be used in BPF programs");
+__bpf_kfunc_start_defs();
/**
* bpf_lookup_user_key - lookup a key by its serial
@@ -1375,6 +1380,8 @@ __bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr_kern *data_ptr,
struct bpf_dynptr_kern *sig_ptr,
struct bpf_key *trusted_keyring)
{
+ const void *data, *sig;
+ u32 data_len, sig_len;
int ret;
if (trusted_keyring->has_ref) {
@@ -1391,17 +1398,19 @@ __bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr_kern *data_ptr,
return ret;
}
- return verify_pkcs7_signature(data_ptr->data,
- __bpf_dynptr_size(data_ptr),
- sig_ptr->data,
- __bpf_dynptr_size(sig_ptr),
+ data_len = __bpf_dynptr_size(data_ptr);
+ data = __bpf_dynptr_data(data_ptr, data_len);
+ sig_len = __bpf_dynptr_size(sig_ptr);
+ sig = __bpf_dynptr_data(sig_ptr, sig_len);
+
+ return verify_pkcs7_signature(data, data_len, sig, sig_len,
trusted_keyring->key,
VERIFYING_UNSPECIFIED_SIGNATURE, NULL,
NULL);
}
#endif /* CONFIG_SYSTEM_DATA_VERIFICATION */
-__diag_pop();
+__bpf_kfunc_end_defs();
BTF_SET8_START(key_sig_kfunc_set)
BTF_ID_FLAGS(func, bpf_lookup_user_key, KF_ACQUIRE | KF_RET_NULL | KF_SLEEPABLE)
@@ -1426,6 +1435,72 @@ static int __init bpf_key_sig_kfuncs_init(void)
late_initcall(bpf_key_sig_kfuncs_init);
#endif /* CONFIG_KEYS */
+/* filesystem kfuncs */
+__bpf_kfunc_start_defs();
+
+/**
+ * bpf_get_file_xattr - get xattr of a file
+ * @file: file to get xattr from
+ * @name__str: name of the xattr
+ * @value_ptr: output buffer of the xattr value
+ *
+ * Get xattr *name__str* of *file* and store the output in *value_ptr*.
+ *
+ * For security reasons, only *name__str* with prefix "user." is allowed.
+ *
+ * Return: 0 on success, a negative value on error.
+ */
+__bpf_kfunc int bpf_get_file_xattr(struct file *file, const char *name__str,
+ struct bpf_dynptr_kern *value_ptr)
+{
+ struct dentry *dentry;
+ u32 value_len;
+ void *value;
+ int ret;
+
+ if (strncmp(name__str, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN))
+ return -EPERM;
+
+ value_len = __bpf_dynptr_size(value_ptr);
+ value = __bpf_dynptr_data_rw(value_ptr, value_len);
+ if (!value)
+ return -EINVAL;
+
+ dentry = file_dentry(file);
+ ret = inode_permission(&nop_mnt_idmap, dentry->d_inode, MAY_READ);
+ if (ret)
+ return ret;
+ return __vfs_getxattr(dentry, dentry->d_inode, name__str, value, value_len);
+}
+
+__bpf_kfunc_end_defs();
+
+BTF_SET8_START(fs_kfunc_set_ids)
+BTF_ID_FLAGS(func, bpf_get_file_xattr, KF_SLEEPABLE | KF_TRUSTED_ARGS)
+BTF_SET8_END(fs_kfunc_set_ids)
+
+static int bpf_get_file_xattr_filter(const struct bpf_prog *prog, u32 kfunc_id)
+{
+ if (!btf_id_set8_contains(&fs_kfunc_set_ids, kfunc_id))
+ return 0;
+
+ /* Only allow to attach from LSM hooks, to avoid recursion */
+ return prog->type != BPF_PROG_TYPE_LSM ? -EACCES : 0;
+}
+
+static const struct btf_kfunc_id_set bpf_fs_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &fs_kfunc_set_ids,
+ .filter = bpf_get_file_xattr_filter,
+};
+
+static int __init bpf_fs_kfuncs_init(void)
+{
+ return register_btf_kfunc_id_set(BPF_PROG_TYPE_LSM, &bpf_fs_kfunc_set);
+}
+
+late_initcall(bpf_fs_kfuncs_init);
+
static const struct bpf_func_proto *
bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
@@ -2384,7 +2459,8 @@ int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog)
int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
u32 *fd_type, const char **buf,
- u64 *probe_offset, u64 *probe_addr)
+ u64 *probe_offset, u64 *probe_addr,
+ unsigned long *missed)
{
bool is_tracepoint, is_syscall_tp;
struct bpf_prog *prog;
@@ -2419,7 +2495,7 @@ int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
#ifdef CONFIG_KPROBE_EVENTS
if (flags & TRACE_EVENT_FL_KPROBE)
err = bpf_get_kprobe_info(event, fd_type, buf,
- probe_offset, probe_addr,
+ probe_offset, probe_addr, missed,
event->attr.type == PERF_TYPE_TRACEPOINT);
#endif
#ifdef CONFIG_UPROBE_EVENTS
@@ -2614,6 +2690,7 @@ static int bpf_kprobe_multi_link_fill_link_info(const struct bpf_link *link,
kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link);
info->kprobe_multi.count = kmulti_link->cnt;
info->kprobe_multi.flags = kmulti_link->flags;
+ info->kprobe_multi.missed = kmulti_link->fp.nmissed;
if (!uaddrs)
return 0;
@@ -2710,6 +2787,7 @@ kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link,
int err;
if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) {
+ bpf_prog_inc_misses_counter(link->link.prog);
err = 0;
goto out;
}
@@ -2853,6 +2931,17 @@ static int get_modules_for_addrs(struct module ***mods, unsigned long *addrs, u3
return arr.mods_cnt;
}
+static int addrs_check_error_injection_list(unsigned long *addrs, u32 cnt)
+{
+ u32 i;
+
+ for (i = 0; i < cnt; i++) {
+ if (!within_error_injection_list(addrs[i]))
+ return -EINVAL;
+ }
+ return 0;
+}
+
int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
{
struct bpf_kprobe_multi_link *link = NULL;
@@ -2884,6 +2973,8 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
cnt = attr->link_create.kprobe_multi.cnt;
if (!cnt)
return -EINVAL;
+ if (cnt > MAX_KPROBE_MULTI_CNT)
+ return -E2BIG;
size = cnt * sizeof(*addrs);
addrs = kvmalloc_array(cnt, sizeof(*addrs), GFP_KERNEL);
@@ -2930,6 +3021,11 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
goto error;
}
+ if (prog->kprobe_override && addrs_check_error_injection_list(addrs, cnt)) {
+ err = -EINVAL;
+ goto error;
+ }
+
link = kzalloc(sizeof(*link), GFP_KERNEL);
if (!link) {
err = -ENOMEM;
@@ -3009,6 +3105,7 @@ struct bpf_uprobe_multi_link;
struct bpf_uprobe {
struct bpf_uprobe_multi_link *link;
loff_t offset;
+ unsigned long ref_ctr_offset;
u64 cookie;
struct uprobe_consumer consumer;
};
@@ -3017,6 +3114,7 @@ struct bpf_uprobe_multi_link {
struct path path;
struct bpf_link link;
u32 cnt;
+ u32 flags;
struct bpf_uprobe *uprobes;
struct task_struct *task;
};
@@ -3058,9 +3156,79 @@ static void bpf_uprobe_multi_link_dealloc(struct bpf_link *link)
kfree(umulti_link);
}
+static int bpf_uprobe_multi_link_fill_link_info(const struct bpf_link *link,
+ struct bpf_link_info *info)
+{
+ u64 __user *uref_ctr_offsets = u64_to_user_ptr(info->uprobe_multi.ref_ctr_offsets);
+ u64 __user *ucookies = u64_to_user_ptr(info->uprobe_multi.cookies);
+ u64 __user *uoffsets = u64_to_user_ptr(info->uprobe_multi.offsets);
+ u64 __user *upath = u64_to_user_ptr(info->uprobe_multi.path);
+ u32 upath_size = info->uprobe_multi.path_size;
+ struct bpf_uprobe_multi_link *umulti_link;
+ u32 ucount = info->uprobe_multi.count;
+ int err = 0, i;
+ long left;
+
+ if (!upath ^ !upath_size)
+ return -EINVAL;
+
+ if ((uoffsets || uref_ctr_offsets || ucookies) && !ucount)
+ return -EINVAL;
+
+ umulti_link = container_of(link, struct bpf_uprobe_multi_link, link);
+ info->uprobe_multi.count = umulti_link->cnt;
+ info->uprobe_multi.flags = umulti_link->flags;
+ info->uprobe_multi.pid = umulti_link->task ?
+ task_pid_nr_ns(umulti_link->task, task_active_pid_ns(current)) : 0;
+
+ if (upath) {
+ char *p, *buf;
+
+ upath_size = min_t(u32, upath_size, PATH_MAX);
+
+ buf = kmalloc(upath_size, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+ p = d_path(&umulti_link->path, buf, upath_size);
+ if (IS_ERR(p)) {
+ kfree(buf);
+ return PTR_ERR(p);
+ }
+ upath_size = buf + upath_size - p;
+ left = copy_to_user(upath, p, upath_size);
+ kfree(buf);
+ if (left)
+ return -EFAULT;
+ info->uprobe_multi.path_size = upath_size;
+ }
+
+ if (!uoffsets && !ucookies && !uref_ctr_offsets)
+ return 0;
+
+ if (ucount < umulti_link->cnt)
+ err = -ENOSPC;
+ else
+ ucount = umulti_link->cnt;
+
+ for (i = 0; i < ucount; i++) {
+ if (uoffsets &&
+ put_user(umulti_link->uprobes[i].offset, uoffsets + i))
+ return -EFAULT;
+ if (uref_ctr_offsets &&
+ put_user(umulti_link->uprobes[i].ref_ctr_offset, uref_ctr_offsets + i))
+ return -EFAULT;
+ if (ucookies &&
+ put_user(umulti_link->uprobes[i].cookie, ucookies + i))
+ return -EFAULT;
+ }
+
+ return err;
+}
+
static const struct bpf_link_ops bpf_uprobe_multi_link_lops = {
.release = bpf_uprobe_multi_link_release,
.dealloc = bpf_uprobe_multi_link_dealloc,
+ .fill_link_info = bpf_uprobe_multi_link_fill_link_info,
};
static int uprobe_prog_run(struct bpf_uprobe *uprobe,
@@ -3148,7 +3316,6 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
{
struct bpf_uprobe_multi_link *link = NULL;
unsigned long __user *uref_ctr_offsets;
- unsigned long *ref_ctr_offsets = NULL;
struct bpf_link_primer link_primer;
struct bpf_uprobe *uprobes = NULL;
struct task_struct *task = NULL;
@@ -3182,6 +3349,8 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
if (!upath || !uoffsets || !cnt)
return -EINVAL;
+ if (cnt > MAX_UPROBE_MULTI_CNT)
+ return -E2BIG;
uref_ctr_offsets = u64_to_user_ptr(attr->link_create.uprobe_multi.ref_ctr_offsets);
ucookies = u64_to_user_ptr(attr->link_create.uprobe_multi.cookies);
@@ -3207,8 +3376,10 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
rcu_read_lock();
task = get_pid_task(find_vpid(pid), PIDTYPE_PID);
rcu_read_unlock();
- if (!task)
+ if (!task) {
+ err = -ESRCH;
goto error_path_put;
+ }
}
err = -ENOMEM;
@@ -3219,22 +3390,20 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
if (!uprobes || !link)
goto error_free;
- if (uref_ctr_offsets) {
- ref_ctr_offsets = kvcalloc(cnt, sizeof(*ref_ctr_offsets), GFP_KERNEL);
- if (!ref_ctr_offsets)
- goto error_free;
- }
-
for (i = 0; i < cnt; i++) {
- if (ucookies && __get_user(uprobes[i].cookie, ucookies + i)) {
+ if (__get_user(uprobes[i].offset, uoffsets + i)) {
err = -EFAULT;
goto error_free;
}
- if (uref_ctr_offsets && __get_user(ref_ctr_offsets[i], uref_ctr_offsets + i)) {
+ if (uprobes[i].offset < 0) {
+ err = -EINVAL;
+ goto error_free;
+ }
+ if (uref_ctr_offsets && __get_user(uprobes[i].ref_ctr_offset, uref_ctr_offsets + i)) {
err = -EFAULT;
goto error_free;
}
- if (__get_user(uprobes[i].offset, uoffsets + i)) {
+ if (ucookies && __get_user(uprobes[i].cookie, ucookies + i)) {
err = -EFAULT;
goto error_free;
}
@@ -3254,6 +3423,7 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
link->uprobes = uprobes;
link->path = path;
link->task = task;
+ link->flags = flags;
bpf_link_init(&link->link, BPF_LINK_TYPE_UPROBE_MULTI,
&bpf_uprobe_multi_link_lops, prog);
@@ -3261,7 +3431,7 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
for (i = 0; i < cnt; i++) {
err = uprobe_register_refctr(d_real_inode(link->path.dentry),
uprobes[i].offset,
- ref_ctr_offsets ? ref_ctr_offsets[i] : 0,
+ uprobes[i].ref_ctr_offset,
&uprobes[i].consumer);
if (err) {
bpf_uprobe_unregister(&path, uprobes, i);
@@ -3273,11 +3443,9 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
if (err)
goto error_free;
- kvfree(ref_ctr_offsets);
return bpf_link_settle(&link_primer);
error_free:
- kvfree(ref_ctr_offsets);
kvfree(uprobes);
kfree(link);
if (task)
diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
index 3b21f4063258..6cd2a4e3afb8 100644
--- a/kernel/trace/fprobe.c
+++ b/kernel/trace/fprobe.c
@@ -187,9 +187,9 @@ static void fprobe_init(struct fprobe *fp)
static int fprobe_init_rethook(struct fprobe *fp, int num)
{
- int i, size;
+ int size;
- if (num < 0)
+ if (num <= 0)
return -EINVAL;
if (!fp->exit_handler) {
@@ -202,29 +202,21 @@ static int fprobe_init_rethook(struct fprobe *fp, int num)
size = fp->nr_maxactive;
else
size = num * num_possible_cpus() * 2;
- if (size < 0)
- return -E2BIG;
-
- fp->rethook = rethook_alloc((void *)fp, fprobe_exit_handler);
- if (!fp->rethook)
- return -ENOMEM;
- for (i = 0; i < size; i++) {
- struct fprobe_rethook_node *node;
-
- node = kzalloc(sizeof(*node) + fp->entry_data_size, GFP_KERNEL);
- if (!node) {
- rethook_free(fp->rethook);
- fp->rethook = NULL;
- return -ENOMEM;
- }
- rethook_add_node(fp->rethook, &node->node);
- }
+ if (size <= 0)
+ return -EINVAL;
+
+ /* Initialize rethook */
+ fp->rethook = rethook_alloc((void *)fp, fprobe_exit_handler,
+ sizeof(struct fprobe_rethook_node), size);
+ if (IS_ERR(fp->rethook))
+ return PTR_ERR(fp->rethook);
+
return 0;
}
static void fprobe_fail_cleanup(struct fprobe *fp)
{
- if (fp->rethook) {
+ if (!IS_ERR_OR_NULL(fp->rethook)) {
/* Don't need to cleanup rethook->handler because this is not used. */
rethook_free(fp->rethook);
fp->rethook = NULL;
@@ -379,14 +371,14 @@ int unregister_fprobe(struct fprobe *fp)
if (!fprobe_is_registered(fp))
return -EINVAL;
- if (fp->rethook)
+ if (!IS_ERR_OR_NULL(fp->rethook))
rethook_stop(fp->rethook);
ret = unregister_ftrace_function(&fp->ops);
if (ret < 0)
return ret;
- if (fp->rethook)
+ if (!IS_ERR_OR_NULL(fp->rethook))
rethook_free(fp->rethook);
ftrace_free_filter(&fp->ops);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 8de8bec5f366..c060d5b47910 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1183,18 +1183,19 @@ static void __add_hash_entry(struct ftrace_hash *hash,
hash->count++;
}
-static int add_hash_entry(struct ftrace_hash *hash, unsigned long ip)
+static struct ftrace_func_entry *
+add_hash_entry(struct ftrace_hash *hash, unsigned long ip)
{
struct ftrace_func_entry *entry;
entry = kmalloc(sizeof(*entry), GFP_KERNEL);
if (!entry)
- return -ENOMEM;
+ return NULL;
entry->ip = ip;
__add_hash_entry(hash, entry);
- return 0;
+ return entry;
}
static void
@@ -1349,7 +1350,6 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
struct ftrace_func_entry *entry;
struct ftrace_hash *new_hash;
int size;
- int ret;
int i;
new_hash = alloc_ftrace_hash(size_bits);
@@ -1366,8 +1366,7 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
size = 1 << hash->size_bits;
for (i = 0; i < size; i++) {
hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
- ret = add_hash_entry(new_hash, entry->ip);
- if (ret < 0)
+ if (add_hash_entry(new_hash, entry->ip) == NULL)
goto free_hash;
}
}
@@ -2536,7 +2535,7 @@ ftrace_find_unique_ops(struct dyn_ftrace *rec)
#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
/* Protected by rcu_tasks for reading, and direct_mutex for writing */
-static struct ftrace_hash *direct_functions = EMPTY_HASH;
+static struct ftrace_hash __rcu *direct_functions = EMPTY_HASH;
static DEFINE_MUTEX(direct_mutex);
int ftrace_direct_func_count;
@@ -2555,39 +2554,6 @@ unsigned long ftrace_find_rec_direct(unsigned long ip)
return entry->direct;
}
-static struct ftrace_func_entry*
-ftrace_add_rec_direct(unsigned long ip, unsigned long addr,
- struct ftrace_hash **free_hash)
-{
- struct ftrace_func_entry *entry;
-
- if (ftrace_hash_empty(direct_functions) ||
- direct_functions->count > 2 * (1 << direct_functions->size_bits)) {
- struct ftrace_hash *new_hash;
- int size = ftrace_hash_empty(direct_functions) ? 0 :
- direct_functions->count + 1;
-
- if (size < 32)
- size = 32;
-
- new_hash = dup_hash(direct_functions, size);
- if (!new_hash)
- return NULL;
-
- *free_hash = direct_functions;
- direct_functions = new_hash;
- }
-
- entry = kmalloc(sizeof(*entry), GFP_KERNEL);
- if (!entry)
- return NULL;
-
- entry->ip = ip;
- entry->direct = addr;
- __add_hash_entry(direct_functions, entry);
- return entry;
-}
-
static void call_direct_funcs(unsigned long ip, unsigned long pip,
struct ftrace_ops *ops, struct ftrace_regs *fregs)
{
@@ -4223,8 +4189,8 @@ enter_record(struct ftrace_hash *hash, struct dyn_ftrace *rec, int clear_filter)
/* Do nothing if it exists */
if (entry)
return 0;
-
- ret = add_hash_entry(hash, rec->ip);
+ if (add_hash_entry(hash, rec->ip) == NULL)
+ ret = -ENOMEM;
}
return ret;
}
@@ -5266,7 +5232,8 @@ __ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove)
return 0;
}
- return add_hash_entry(hash, ip);
+ entry = add_hash_entry(hash, ip);
+ return entry ? 0 : -ENOMEM;
}
static int
@@ -5358,7 +5325,17 @@ static LIST_HEAD(ftrace_direct_funcs);
static int register_ftrace_function_nolock(struct ftrace_ops *ops);
+/*
+ * If there are multiple ftrace_ops, use SAVE_REGS by default, so that direct
+ * call will be jumped from ftrace_regs_caller. Only if the architecture does
+ * not support ftrace_regs_caller but direct_call, use SAVE_ARGS so that it
+ * jumps from ftrace_caller for multiple ftrace_ops.
+ */
+#ifndef HAVE_DYNAMIC_FTRACE_WITH_REGS
#define MULTI_FLAGS (FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_ARGS)
+#else
+#define MULTI_FLAGS (FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_REGS)
+#endif
static int check_direct_multi(struct ftrace_ops *ops)
{
@@ -5410,7 +5387,7 @@ static void remove_direct_functions_hash(struct ftrace_hash *hash, unsigned long
*/
int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
{
- struct ftrace_hash *hash, *free_hash = NULL;
+ struct ftrace_hash *hash, *new_hash = NULL, *free_hash = NULL;
struct ftrace_func_entry *entry, *new;
int err = -EBUSY, size, i;
@@ -5436,17 +5413,44 @@ int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
}
}
- /* ... and insert them to direct_functions hash. */
err = -ENOMEM;
+
+ /* Make a copy hash to place the new and the old entries in */
+ size = hash->count + direct_functions->count;
+ if (size > 32)
+ size = 32;
+ new_hash = alloc_ftrace_hash(fls(size));
+ if (!new_hash)
+ goto out_unlock;
+
+ /* Now copy over the existing direct entries */
+ size = 1 << direct_functions->size_bits;
+ for (i = 0; i < size; i++) {
+ hlist_for_each_entry(entry, &direct_functions->buckets[i], hlist) {
+ new = add_hash_entry(new_hash, entry->ip);
+ if (!new)
+ goto out_unlock;
+ new->direct = entry->direct;
+ }
+ }
+
+ /* ... and add the new entries */
+ size = 1 << hash->size_bits;
for (i = 0; i < size; i++) {
hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
- new = ftrace_add_rec_direct(entry->ip, addr, &free_hash);
+ new = add_hash_entry(new_hash, entry->ip);
if (!new)
- goto out_remove;
+ goto out_unlock;
+ /* Update both the copy and the hash entry */
+ new->direct = addr;
entry->direct = addr;
}
}
+ free_hash = direct_functions;
+ rcu_assign_pointer(direct_functions, new_hash);
+ new_hash = NULL;
+
ops->func = call_direct_funcs;
ops->flags = MULTI_FLAGS;
ops->trampoline = FTRACE_REGS_ADDR;
@@ -5454,17 +5458,17 @@ int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
err = register_ftrace_function_nolock(ops);
- out_remove:
- if (err)
- remove_direct_functions_hash(hash, addr);
-
out_unlock:
mutex_unlock(&direct_mutex);
- if (free_hash) {
+ if (free_hash && free_hash != EMPTY_HASH) {
synchronize_rcu_tasks();
free_ftrace_hash(free_hash);
}
+
+ if (new_hash)
+ free_ftrace_hash(new_hash);
+
return err;
}
EXPORT_SYMBOL_GPL(register_ftrace_direct);
@@ -6309,7 +6313,7 @@ ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer)
if (entry)
continue;
- if (add_hash_entry(hash, rec->ip) < 0)
+ if (add_hash_entry(hash, rec->ip) == NULL)
goto out;
} else {
if (entry) {
diff --git a/kernel/trace/rethook.c b/kernel/trace/rethook.c
index 5eb9b598f4e9..fa03094e9e69 100644
--- a/kernel/trace/rethook.c
+++ b/kernel/trace/rethook.c
@@ -8,7 +8,6 @@
#include <linux/preempt.h>
#include <linux/rethook.h>
#include <linux/slab.h>
-#include <linux/sort.h>
/* Return hook list (shadow stack by list) */
@@ -36,21 +35,7 @@ void rethook_flush_task(struct task_struct *tk)
static void rethook_free_rcu(struct rcu_head *head)
{
struct rethook *rh = container_of(head, struct rethook, rcu);
- struct rethook_node *rhn;
- struct freelist_node *node;
- int count = 1;
-
- node = rh->pool.head;
- while (node) {
- rhn = container_of(node, struct rethook_node, freelist);
- node = node->next;
- kfree(rhn);
- count++;
- }
-
- /* The rh->ref is the number of pooled node + 1 */
- if (refcount_sub_and_test(count, &rh->ref))
- kfree(rh);
+ objpool_fini(&rh->pool);
}
/**
@@ -63,7 +48,7 @@ static void rethook_free_rcu(struct rcu_head *head)
*/
void rethook_stop(struct rethook *rh)
{
- WRITE_ONCE(rh->handler, NULL);
+ rcu_assign_pointer(rh->handler, NULL);
}
/**
@@ -78,59 +63,73 @@ void rethook_stop(struct rethook *rh)
*/
void rethook_free(struct rethook *rh)
{
- WRITE_ONCE(rh->handler, NULL);
+ rethook_stop(rh);
call_rcu(&rh->rcu, rethook_free_rcu);
}
+static int rethook_init_node(void *nod, void *context)
+{
+ struct rethook_node *node = nod;
+
+ node->rethook = context;
+ return 0;
+}
+
+static int rethook_fini_pool(struct objpool_head *head, void *context)
+{
+ kfree(context);
+ return 0;
+}
+
+static inline rethook_handler_t rethook_get_handler(struct rethook *rh)
+{
+ return (rethook_handler_t)rcu_dereference_check(rh->handler,
+ rcu_read_lock_any_held());
+}
+
/**
* rethook_alloc() - Allocate struct rethook.
* @data: a data to pass the @handler when hooking the return.
- * @handler: the return hook callback function.
+ * @handler: the return hook callback function, must NOT be NULL
+ * @size: node size: rethook node and additional data
+ * @num: number of rethook nodes to be preallocated
*
* Allocate and initialize a new rethook with @data and @handler.
- * Return NULL if memory allocation fails or @handler is NULL.
+ * Return pointer of new rethook, or error codes for failures.
+ *
* Note that @handler == NULL means this rethook is going to be freed.
*/
-struct rethook *rethook_alloc(void *data, rethook_handler_t handler)
+struct rethook *rethook_alloc(void *data, rethook_handler_t handler,
+ int size, int num)
{
- struct rethook *rh = kzalloc(sizeof(struct rethook), GFP_KERNEL);
+ struct rethook *rh;
- if (!rh || !handler) {
- kfree(rh);
- return NULL;
- }
+ if (!handler || num <= 0 || size < sizeof(struct rethook_node))
+ return ERR_PTR(-EINVAL);
+
+ rh = kzalloc(sizeof(struct rethook), GFP_KERNEL);
+ if (!rh)
+ return ERR_PTR(-ENOMEM);
rh->data = data;
- rh->handler = handler;
- rh->pool.head = NULL;
- refcount_set(&rh->ref, 1);
+ rcu_assign_pointer(rh->handler, handler);
+ /* initialize the objpool for rethook nodes */
+ if (objpool_init(&rh->pool, num, size, GFP_KERNEL, rh,
+ rethook_init_node, rethook_fini_pool)) {
+ kfree(rh);
+ return ERR_PTR(-ENOMEM);
+ }
return rh;
}
-/**
- * rethook_add_node() - Add a new node to the rethook.
- * @rh: the struct rethook.
- * @node: the struct rethook_node to be added.
- *
- * Add @node to @rh. User must allocate @node (as a part of user's
- * data structure.) The @node fields are initialized in this function.
- */
-void rethook_add_node(struct rethook *rh, struct rethook_node *node)
-{
- node->rethook = rh;
- freelist_add(&node->freelist, &rh->pool);
- refcount_inc(&rh->ref);
-}
-
static void free_rethook_node_rcu(struct rcu_head *head)
{
struct rethook_node *node = container_of(head, struct rethook_node, rcu);
+ struct rethook *rh = node->rethook;
- if (refcount_dec_and_test(&node->rethook->ref))
- kfree(node->rethook);
- kfree(node);
+ objpool_drop(node, &rh->pool);
}
/**
@@ -142,10 +141,11 @@ static void free_rethook_node_rcu(struct rcu_head *head)
*/
void rethook_recycle(struct rethook_node *node)
{
- lockdep_assert_preemption_disabled();
+ rethook_handler_t handler;
- if (likely(READ_ONCE(node->rethook->handler)))
- freelist_add(&node->freelist, &node->rethook->pool);
+ handler = rethook_get_handler(node->rethook);
+ if (likely(handler))
+ objpool_push(node, &node->rethook->pool);
else
call_rcu(&node->rcu, free_rethook_node_rcu);
}
@@ -160,10 +160,7 @@ NOKPROBE_SYMBOL(rethook_recycle);
*/
struct rethook_node *rethook_try_get(struct rethook *rh)
{
- rethook_handler_t handler = READ_ONCE(rh->handler);
- struct freelist_node *fn;
-
- lockdep_assert_preemption_disabled();
+ rethook_handler_t handler = rethook_get_handler(rh);
/* Check whether @rh is going to be freed. */
if (unlikely(!handler))
@@ -178,11 +175,7 @@ struct rethook_node *rethook_try_get(struct rethook *rh)
if (unlikely(!rcu_is_watching()))
return NULL;
- fn = freelist_try_get(&rh->pool);
- if (!fn)
- return NULL;
-
- return container_of(fn, struct rethook_node, freelist);
+ return (struct rethook_node *)objpool_pop(&rh->pool);
}
NOKPROBE_SYMBOL(rethook_try_get);
@@ -312,7 +305,7 @@ unsigned long rethook_trampoline_handler(struct pt_regs *regs,
rhn = container_of(first, struct rethook_node, llist);
if (WARN_ON_ONCE(rhn->frame != frame))
break;
- handler = READ_ONCE(rhn->rethook->handler);
+ handler = rethook_get_handler(rhn->rethook);
if (handler)
handler(rhn, rhn->rethook->data,
correct_ret_addr, regs);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 78502d4c7214..fd4bfe3ecf01 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -27,6 +27,7 @@
#include <linux/cpu.h>
#include <linux/oom.h>
+#include <asm/local64.h>
#include <asm/local.h>
/*
@@ -317,6 +318,11 @@ struct buffer_data_page {
unsigned char data[] RB_ALIGN_DATA; /* data of buffer page */
};
+struct buffer_data_read_page {
+ unsigned order; /* order of the page */
+ struct buffer_data_page *data; /* actual data, stored in this page */
+};
+
/*
* Note, the buffer_page list must be first. The buffer pages
* are allocated in cache lines, which means that each buffer
@@ -331,6 +337,7 @@ struct buffer_page {
unsigned read; /* index for next read */
local_t entries; /* entries on this page */
unsigned long real_end; /* real end of data */
+ unsigned order; /* order of the page */
struct buffer_data_page *page; /* Actual data page */
};
@@ -354,9 +361,14 @@ static void rb_init_page(struct buffer_data_page *bpage)
local_set(&bpage->commit, 0);
}
+static __always_inline unsigned int rb_page_commit(struct buffer_page *bpage)
+{
+ return local_read(&bpage->page->commit);
+}
+
static void free_buffer_page(struct buffer_page *bpage)
{
- free_page((unsigned long)bpage->page);
+ free_pages((unsigned long)bpage->page, bpage->order);
kfree(bpage);
}
@@ -368,41 +380,6 @@ static inline bool test_time_stamp(u64 delta)
return !!(delta & TS_DELTA_TEST);
}
-#define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE)
-
-/* Max payload is BUF_PAGE_SIZE - header (8bytes) */
-#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2))
-
-int ring_buffer_print_page_header(struct trace_seq *s)
-{
- struct buffer_data_page field;
-
- trace_seq_printf(s, "\tfield: u64 timestamp;\t"
- "offset:0;\tsize:%u;\tsigned:%u;\n",
- (unsigned int)sizeof(field.time_stamp),
- (unsigned int)is_signed_type(u64));
-
- trace_seq_printf(s, "\tfield: local_t commit;\t"
- "offset:%u;\tsize:%u;\tsigned:%u;\n",
- (unsigned int)offsetof(typeof(field), commit),
- (unsigned int)sizeof(field.commit),
- (unsigned int)is_signed_type(long));
-
- trace_seq_printf(s, "\tfield: int overwrite;\t"
- "offset:%u;\tsize:%u;\tsigned:%u;\n",
- (unsigned int)offsetof(typeof(field), commit),
- 1,
- (unsigned int)is_signed_type(long));
-
- trace_seq_printf(s, "\tfield: char data;\t"
- "offset:%u;\tsize:%u;\tsigned:%u;\n",
- (unsigned int)offsetof(typeof(field), data),
- (unsigned int)BUF_PAGE_SIZE,
- (unsigned int)is_signed_type(char));
-
- return !trace_seq_has_overflowed(s);
-}
-
struct rb_irq_work {
struct irq_work work;
wait_queue_head_t waiters;
@@ -458,27 +435,9 @@ enum {
RB_CTX_MAX
};
-#if BITS_PER_LONG == 32
-#define RB_TIME_32
-#endif
-
-/* To test on 64 bit machines */
-//#define RB_TIME_32
-
-#ifdef RB_TIME_32
-
-struct rb_time_struct {
- local_t cnt;
- local_t top;
- local_t bottom;
- local_t msb;
-};
-#else
-#include <asm/local64.h>
struct rb_time_struct {
local64_t time;
};
-#endif
typedef struct rb_time_struct rb_time_t;
#define MAX_NEST 5
@@ -552,6 +511,10 @@ struct trace_buffer {
struct rb_irq_work irq_work;
bool time_stamp_abs;
+
+ unsigned int subbuf_size;
+ unsigned int subbuf_order;
+ unsigned int max_data_size;
};
struct ring_buffer_iter {
@@ -565,194 +528,49 @@ struct ring_buffer_iter {
u64 read_stamp;
u64 page_stamp;
struct ring_buffer_event *event;
+ size_t event_size;
int missed_events;
};
-#ifdef RB_TIME_32
-
-/*
- * On 32 bit machines, local64_t is very expensive. As the ring
- * buffer doesn't need all the features of a true 64 bit atomic,
- * on 32 bit, it uses these functions (64 still uses local64_t).
- *
- * For the ring buffer, 64 bit required operations for the time is
- * the following:
- *
- * - Reads may fail if it interrupted a modification of the time stamp.
- * It will succeed if it did not interrupt another write even if
- * the read itself is interrupted by a write.
- * It returns whether it was successful or not.
- *
- * - Writes always succeed and will overwrite other writes and writes
- * that were done by events interrupting the current write.
- *
- * - A write followed by a read of the same time stamp will always succeed,
- * but may not contain the same value.
- *
- * - A cmpxchg will fail if it interrupted another write or cmpxchg.
- * Other than that, it acts like a normal cmpxchg.
- *
- * The 60 bit time stamp is broken up by 30 bits in a top and bottom half
- * (bottom being the least significant 30 bits of the 60 bit time stamp).
- *
- * The two most significant bits of each half holds a 2 bit counter (0-3).
- * Each update will increment this counter by one.
- * When reading the top and bottom, if the two counter bits match then the
- * top and bottom together make a valid 60 bit number.
- */
-#define RB_TIME_SHIFT 30
-#define RB_TIME_VAL_MASK ((1 << RB_TIME_SHIFT) - 1)
-#define RB_TIME_MSB_SHIFT 60
-
-static inline int rb_time_cnt(unsigned long val)
-{
- return (val >> RB_TIME_SHIFT) & 3;
-}
-
-static inline u64 rb_time_val(unsigned long top, unsigned long bottom)
-{
- u64 val;
-
- val = top & RB_TIME_VAL_MASK;
- val <<= RB_TIME_SHIFT;
- val |= bottom & RB_TIME_VAL_MASK;
-
- return val;
-}
-
-static inline bool __rb_time_read(rb_time_t *t, u64 *ret, unsigned long *cnt)
-{
- unsigned long top, bottom, msb;
- unsigned long c;
-
- /*
- * If the read is interrupted by a write, then the cnt will
- * be different. Loop until both top and bottom have been read
- * without interruption.
- */
- do {
- c = local_read(&t->cnt);
- top = local_read(&t->top);
- bottom = local_read(&t->bottom);
- msb = local_read(&t->msb);
- } while (c != local_read(&t->cnt));
-
- *cnt = rb_time_cnt(top);
-
- /* If top and bottom counts don't match, this interrupted a write */
- if (*cnt != rb_time_cnt(bottom))
- return false;
-
- /* The shift to msb will lose its cnt bits */
- *ret = rb_time_val(top, bottom) | ((u64)msb << RB_TIME_MSB_SHIFT);
- return true;
-}
-
-static bool rb_time_read(rb_time_t *t, u64 *ret)
-{
- unsigned long cnt;
-
- return __rb_time_read(t, ret, &cnt);
-}
-
-static inline unsigned long rb_time_val_cnt(unsigned long val, unsigned long cnt)
-{
- return (val & RB_TIME_VAL_MASK) | ((cnt & 3) << RB_TIME_SHIFT);
-}
-
-static inline void rb_time_split(u64 val, unsigned long *top, unsigned long *bottom,
- unsigned long *msb)
-{
- *top = (unsigned long)((val >> RB_TIME_SHIFT) & RB_TIME_VAL_MASK);
- *bottom = (unsigned long)(val & RB_TIME_VAL_MASK);
- *msb = (unsigned long)(val >> RB_TIME_MSB_SHIFT);
-}
-
-static inline void rb_time_val_set(local_t *t, unsigned long val, unsigned long cnt)
-{
- val = rb_time_val_cnt(val, cnt);
- local_set(t, val);
-}
-
-static void rb_time_set(rb_time_t *t, u64 val)
+int ring_buffer_print_page_header(struct trace_buffer *buffer, struct trace_seq *s)
{
- unsigned long cnt, top, bottom, msb;
-
- rb_time_split(val, &top, &bottom, &msb);
-
- /* Writes always succeed with a valid number even if it gets interrupted. */
- do {
- cnt = local_inc_return(&t->cnt);
- rb_time_val_set(&t->top, top, cnt);
- rb_time_val_set(&t->bottom, bottom, cnt);
- rb_time_val_set(&t->msb, val >> RB_TIME_MSB_SHIFT, cnt);
- } while (cnt != local_read(&t->cnt));
-}
-
-static inline bool
-rb_time_read_cmpxchg(local_t *l, unsigned long expect, unsigned long set)
-{
- return local_try_cmpxchg(l, &expect, set);
-}
-
-static bool rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set)
-{
- unsigned long cnt, top, bottom, msb;
- unsigned long cnt2, top2, bottom2, msb2;
- u64 val;
-
- /* The cmpxchg always fails if it interrupted an update */
- if (!__rb_time_read(t, &val, &cnt2))
- return false;
-
- if (val != expect)
- return false;
+ struct buffer_data_page field;
- cnt = local_read(&t->cnt);
- if ((cnt & 3) != cnt2)
- return false;
+ trace_seq_printf(s, "\tfield: u64 timestamp;\t"
+ "offset:0;\tsize:%u;\tsigned:%u;\n",
+ (unsigned int)sizeof(field.time_stamp),
+ (unsigned int)is_signed_type(u64));
- cnt2 = cnt + 1;
+ trace_seq_printf(s, "\tfield: local_t commit;\t"
+ "offset:%u;\tsize:%u;\tsigned:%u;\n",
+ (unsigned int)offsetof(typeof(field), commit),
+ (unsigned int)sizeof(field.commit),
+ (unsigned int)is_signed_type(long));
- rb_time_split(val, &top, &bottom, &msb);
- top = rb_time_val_cnt(top, cnt);
- bottom = rb_time_val_cnt(bottom, cnt);
+ trace_seq_printf(s, "\tfield: int overwrite;\t"
+ "offset:%u;\tsize:%u;\tsigned:%u;\n",
+ (unsigned int)offsetof(typeof(field), commit),
+ 1,
+ (unsigned int)is_signed_type(long));
- rb_time_split(set, &top2, &bottom2, &msb2);
- top2 = rb_time_val_cnt(top2, cnt2);
- bottom2 = rb_time_val_cnt(bottom2, cnt2);
+ trace_seq_printf(s, "\tfield: char data;\t"
+ "offset:%u;\tsize:%u;\tsigned:%u;\n",
+ (unsigned int)offsetof(typeof(field), data),
+ (unsigned int)buffer->subbuf_size,
+ (unsigned int)is_signed_type(char));
- if (!rb_time_read_cmpxchg(&t->cnt, cnt, cnt2))
- return false;
- if (!rb_time_read_cmpxchg(&t->msb, msb, msb2))
- return false;
- if (!rb_time_read_cmpxchg(&t->top, top, top2))
- return false;
- if (!rb_time_read_cmpxchg(&t->bottom, bottom, bottom2))
- return false;
- return true;
+ return !trace_seq_has_overflowed(s);
}
-#else /* 64 bits */
-
-/* local64_t always succeeds */
-
-static inline bool rb_time_read(rb_time_t *t, u64 *ret)
+static inline void rb_time_read(rb_time_t *t, u64 *ret)
{
*ret = local64_read(&t->time);
- return true;
}
static void rb_time_set(rb_time_t *t, u64 val)
{
local64_set(&t->time, val);
}
-static bool rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set)
-{
- return local64_try_cmpxchg(&t->time, &expect, set);
-}
-#endif
-
/*
* Enable this to make sure that the event passed to
* ring_buffer_event_time_stamp() is not committed and also
@@ -858,10 +676,7 @@ u64 ring_buffer_event_time_stamp(struct trace_buffer *buffer,
WARN_ONCE(1, "nest (%d) greater than max", nest);
fail:
- /* Can only fail on 32 bit */
- if (!rb_time_read(&cpu_buffer->write_stamp, &ts))
- /* Screw it, just read the current time */
- ts = rb_time_stamp(cpu_buffer->buffer);
+ rb_time_read(&cpu_buffer->write_stamp, &ts);
return ts;
}
@@ -919,9 +734,14 @@ static __always_inline bool full_hit(struct trace_buffer *buffer, int cpu, int f
if (!nr_pages || !full)
return true;
- dirty = ring_buffer_nr_dirty_pages(buffer, cpu);
+ /*
+ * Add one as dirty will never equal nr_pages, as the sub-buffer
+ * that the writer is on is not counted as dirty.
+ * This is needed if "buffer_percent" is set to 100.
+ */
+ dirty = ring_buffer_nr_dirty_pages(buffer, cpu) + 1;
- return (dirty * 100) > (full * nr_pages);
+ return (dirty * 100) >= (full * nr_pages);
}
/*
@@ -982,7 +802,8 @@ void ring_buffer_wake_waiters(struct trace_buffer *buffer, int cpu)
/* make sure the waiters see the new index */
smp_wmb();
- rb_wake_up_waiters(&rbwork->work);
+ /* This can be called in any context */
+ irq_work_queue(&rbwork->work);
}
/**
@@ -1123,7 +944,7 @@ __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu,
full = 0;
} else {
if (!cpumask_test_cpu(cpu, buffer->cpumask))
- return -EINVAL;
+ return EPOLLERR;
cpu_buffer = buffer->buffers[cpu];
work = &cpu_buffer->irq_work;
@@ -1132,6 +953,9 @@ __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu,
if (full) {
poll_wait(filp, &work->full_waiters, poll_table);
work->full_waiters_pending = true;
+ if (!cpu_buffer->shortest_full ||
+ cpu_buffer->shortest_full > full)
+ cpu_buffer->shortest_full = full;
} else {
poll_wait(filp, &work->waiters, poll_table);
work->waiters_pending = true;
@@ -1648,10 +1472,12 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
list_add(&bpage->list, pages);
- page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), mflags, 0);
+ page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), mflags,
+ cpu_buffer->buffer->subbuf_order);
if (!page)
goto free_pages;
bpage->page = page_address(page);
+ bpage->order = cpu_buffer->buffer->subbuf_order;
rb_init_page(bpage->page);
if (user_thread && fatal_signal_pending(current))
@@ -1730,7 +1556,8 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu)
rb_check_bpage(cpu_buffer, bpage);
cpu_buffer->reader_page = bpage;
- page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, 0);
+
+ page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, cpu_buffer->buffer->subbuf_order);
if (!page)
goto fail_free_reader;
bpage->page = page_address(page);
@@ -1779,6 +1606,8 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
free_buffer_page(bpage);
}
+ free_page((unsigned long)cpu_buffer->free_page);
+
kfree(cpu_buffer);
}
@@ -1811,7 +1640,14 @@ struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
if (!zalloc_cpumask_var(&buffer->cpumask, GFP_KERNEL))
goto fail_free_buffer;
- nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
+ /* Default buffer page size - one system page */
+ buffer->subbuf_order = 0;
+ buffer->subbuf_size = PAGE_SIZE - BUF_PAGE_HDR_SIZE;
+
+ /* Max payload is buffer page size - header (8bytes) */
+ buffer->max_data_size = buffer->subbuf_size - (sizeof(u32) * 2);
+
+ nr_pages = DIV_ROUND_UP(size, buffer->subbuf_size);
buffer->flags = flags;
buffer->clock = trace_clock_local;
buffer->reader_lock_key = key;
@@ -2003,7 +1839,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages)
* Increment overrun to account for the lost events.
*/
local_add(page_entries, &cpu_buffer->overrun);
- local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes);
+ local_sub(rb_page_commit(to_remove_page), &cpu_buffer->entries_bytes);
local_inc(&cpu_buffer->pages_lost);
}
@@ -2048,7 +1884,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer)
retries = 10;
success = false;
while (retries--) {
- struct list_head *head_page, *prev_page, *r;
+ struct list_head *head_page, *prev_page;
struct list_head *last_page, *first_page;
struct list_head *head_page_with_bit;
struct buffer_page *hpage = rb_set_head_page(cpu_buffer);
@@ -2067,9 +1903,9 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer)
last_page->next = head_page_with_bit;
first_page->prev = prev_page;
- r = cmpxchg(&prev_page->next, head_page_with_bit, first_page);
-
- if (r == head_page_with_bit) {
+ /* caution: head_page_with_bit gets updated on cmpxchg failure */
+ if (try_cmpxchg(&prev_page->next,
+ &head_page_with_bit, first_page)) {
/*
* yay, we replaced the page pointer to our new list,
* now, we just have to update to head page's prev
@@ -2130,7 +1966,7 @@ static void update_pages_handler(struct work_struct *work)
* @size: the new size.
* @cpu_id: the cpu buffer to resize
*
- * Minimum size is 2 * BUF_PAGE_SIZE.
+ * Minimum size is 2 * buffer->subbuf_size.
*
* Returns 0 on success and < 0 on failure.
*/
@@ -2152,7 +1988,7 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
!cpumask_test_cpu(cpu_id, buffer->cpumask))
return 0;
- nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
+ nr_pages = DIV_ROUND_UP(size, buffer->subbuf_size);
/* we need a minimum of two pages */
if (nr_pages < 2)
@@ -2198,6 +2034,8 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
err = -ENOMEM;
goto out_err;
}
+
+ cond_resched();
}
cpus_read_lock();
@@ -2365,11 +2203,6 @@ rb_reader_event(struct ring_buffer_per_cpu *cpu_buffer)
cpu_buffer->reader_page->read);
}
-static __always_inline unsigned rb_page_commit(struct buffer_page *bpage)
-{
- return local_read(&bpage->page->commit);
-}
-
static struct ring_buffer_event *
rb_iter_head_event(struct ring_buffer_iter *iter)
{
@@ -2388,6 +2221,11 @@ rb_iter_head_event(struct ring_buffer_iter *iter)
*/
commit = rb_page_commit(iter_head_page);
smp_rmb();
+
+ /* An event needs to be at least 8 bytes in size */
+ if (iter->head > commit - 8)
+ goto reset;
+
event = __rb_page_index(iter_head_page, iter->head);
length = rb_event_length(event);
@@ -2397,7 +2235,7 @@ rb_iter_head_event(struct ring_buffer_iter *iter)
*/
barrier();
- if ((iter->head + length) > commit || length > BUF_MAX_DATA_SIZE)
+ if ((iter->head + length) > commit || length > iter->event_size)
/* Writer corrupted the read? */
goto reset;
@@ -2437,11 +2275,13 @@ rb_commit_index(struct ring_buffer_per_cpu *cpu_buffer)
}
static __always_inline unsigned
-rb_event_index(struct ring_buffer_event *event)
+rb_event_index(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event)
{
unsigned long addr = (unsigned long)event;
- return (addr & ~PAGE_MASK) - BUF_PAGE_HDR_SIZE;
+ addr &= (PAGE_SIZE << cpu_buffer->buffer->subbuf_order) - 1;
+
+ return addr - BUF_PAGE_HDR_SIZE;
}
static void rb_inc_iter(struct ring_buffer_iter *iter)
@@ -2510,7 +2350,7 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer,
* the counters.
*/
local_add(entries, &cpu_buffer->overrun);
- local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes);
+ local_sub(rb_page_commit(next_page), &cpu_buffer->entries_bytes);
local_inc(&cpu_buffer->pages_lost);
/*
@@ -2630,6 +2470,7 @@ static inline void
rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
unsigned long tail, struct rb_event_info *info)
{
+ unsigned long bsize = READ_ONCE(cpu_buffer->buffer->subbuf_size);
struct buffer_page *tail_page = info->tail_page;
struct ring_buffer_event *event;
unsigned long length = info->length;
@@ -2638,13 +2479,13 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
* Only the event that crossed the page boundary
* must fill the old tail_page with padding.
*/
- if (tail >= BUF_PAGE_SIZE) {
+ if (tail >= bsize) {
/*
* If the page was filled, then we still need
* to update the real_end. Reset it to zero
* and the reader will ignore it.
*/
- if (tail == BUF_PAGE_SIZE)
+ if (tail == bsize)
tail_page->real_end = 0;
local_sub(length, &tail_page->write);
@@ -2653,9 +2494,6 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
event = __rb_page_index(tail_page, tail);
- /* account for padding bytes */
- local_add(BUF_PAGE_SIZE - tail, &cpu_buffer->entries_bytes);
-
/*
* Save the original length to the meta data.
* This will be used by the reader to add lost event
@@ -2669,12 +2507,13 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
* write counter enough to allow another writer to slip
* in on this page.
* We put in a discarded commit instead, to make sure
- * that this space is not used again.
+ * that this space is not used again, and this space will
+ * not be accounted into 'entries_bytes'.
*
* If we are less than the minimum size, we don't need to
* worry about it.
*/
- if (tail > (BUF_PAGE_SIZE - RB_EVNT_MIN_SIZE)) {
+ if (tail > (bsize - RB_EVNT_MIN_SIZE)) {
/* No room for any events */
/* Mark the rest of the page with padding */
@@ -2689,16 +2528,19 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
}
/* Put in a discarded event */
- event->array[0] = (BUF_PAGE_SIZE - tail) - RB_EVNT_HDR_SIZE;
+ event->array[0] = (bsize - tail) - RB_EVNT_HDR_SIZE;
event->type_len = RINGBUF_TYPE_PADDING;
/* time delta must be non zero */
event->time_delta = 1;
+ /* account for padding bytes */
+ local_add(bsize - tail, &cpu_buffer->entries_bytes);
+
/* Make sure the padding is visible before the tail_page->write update */
smp_wmb();
/* Set write to end of buffer */
- length = (tail + length) - BUF_PAGE_SIZE;
+ length = (tail + length) - bsize;
local_sub(length, &tail_page->write);
}
@@ -2812,7 +2654,8 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
/* Slow path */
static struct ring_buffer_event *
-rb_add_time_stamp(struct ring_buffer_event *event, u64 delta, bool abs)
+rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
+ struct ring_buffer_event *event, u64 delta, bool abs)
{
if (abs)
event->type_len = RINGBUF_TYPE_TIME_STAMP;
@@ -2820,7 +2663,7 @@ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta, bool abs)
event->type_len = RINGBUF_TYPE_TIME_EXTEND;
/* Not the first event on the page, or not delta? */
- if (abs || rb_event_index(event)) {
+ if (abs || rb_event_index(cpu_buffer, event)) {
event->time_delta = delta & TS_MASK;
event->array[0] = delta >> TS_SHIFT;
} else {
@@ -2850,7 +2693,7 @@ rb_check_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
(unsigned long long)info->ts,
(unsigned long long)info->before,
(unsigned long long)info->after,
- (unsigned long long)(rb_time_read(&cpu_buffer->write_stamp, &write_stamp) ? write_stamp : 0),
+ (unsigned long long)({rb_time_read(&cpu_buffer->write_stamp, &write_stamp); write_stamp;}),
sched_clock_stable() ? "" :
"If you just came from a suspend/resume,\n"
"please switch to the trace global clock:\n"
@@ -2894,7 +2737,7 @@ static void rb_add_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
if (!abs)
info->delta = 0;
}
- *event = rb_add_time_stamp(*event, info->delta, abs);
+ *event = rb_add_time_stamp(cpu_buffer, *event, info->delta, abs);
*length -= RB_LEN_TIME_EXTEND;
*delta = 0;
}
@@ -2970,25 +2813,6 @@ static unsigned rb_calculate_event_length(unsigned length)
return length;
}
-static u64 rb_time_delta(struct ring_buffer_event *event)
-{
- switch (event->type_len) {
- case RINGBUF_TYPE_PADDING:
- return 0;
-
- case RINGBUF_TYPE_TIME_EXTEND:
- return rb_event_time_stamp(event);
-
- case RINGBUF_TYPE_TIME_STAMP:
- return 0;
-
- case RINGBUF_TYPE_DATA:
- return event->time_delta;
- default:
- return 0;
- }
-}
-
static inline bool
rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
struct ring_buffer_event *event)
@@ -2996,51 +2820,42 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
unsigned long new_index, old_index;
struct buffer_page *bpage;
unsigned long addr;
- u64 write_stamp;
- u64 delta;
- new_index = rb_event_index(event);
+ new_index = rb_event_index(cpu_buffer, event);
old_index = new_index + rb_event_ts_length(event);
addr = (unsigned long)event;
- addr &= PAGE_MASK;
+ addr &= ~((PAGE_SIZE << cpu_buffer->buffer->subbuf_order) - 1);
bpage = READ_ONCE(cpu_buffer->tail_page);
- delta = rb_time_delta(event);
-
- if (!rb_time_read(&cpu_buffer->write_stamp, &write_stamp))
- return false;
-
- /* Make sure the write stamp is read before testing the location */
- barrier();
-
+ /*
+ * Make sure the tail_page is still the same and
+ * the next write location is the end of this event
+ */
if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) {
unsigned long write_mask =
local_read(&bpage->write) & ~RB_WRITE_MASK;
unsigned long event_length = rb_event_length(event);
- /* Something came in, can't discard */
- if (!rb_time_cmpxchg(&cpu_buffer->write_stamp,
- write_stamp, write_stamp - delta))
- return false;
-
/*
- * It's possible that the event time delta is zero
- * (has the same time stamp as the previous event)
- * in which case write_stamp and before_stamp could
- * be the same. In such a case, force before_stamp
- * to be different than write_stamp. It doesn't
- * matter what it is, as long as its different.
+ * Force the before_stamp to be different than the write_stamp
+ * to make sure that the next event adds an absolute
+ * value and does not rely on the saved write stamp, which
+ * is now going to be bogus.
+ *
+ * By setting the before_stamp to zero, the next event
+ * is not going to use the write_stamp and will instead
+ * create an absolute timestamp. This means there's no
+ * reason to update the write_stamp!
*/
- if (!delta)
- rb_time_set(&cpu_buffer->before_stamp, 0);
+ rb_time_set(&cpu_buffer->before_stamp, 0);
/*
* If an event were to come in now, it would see that the
* write_stamp and the before_stamp are different, and assume
* that this event just added itself before updating
* the write stamp. The interrupting event will fix the
- * write stamp for us, and use the before stamp as its delta.
+ * write stamp for us, and use an absolute timestamp.
*/
/*
@@ -3396,6 +3211,76 @@ EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
#define CHECK_FULL_PAGE 1L
#ifdef CONFIG_RING_BUFFER_VALIDATE_TIME_DELTAS
+
+static const char *show_irq_str(int bits)
+{
+ const char *type[] = {
+ ".", // 0
+ "s", // 1
+ "h", // 2
+ "Hs", // 3
+ "n", // 4
+ "Ns", // 5
+ "Nh", // 6
+ "NHs", // 7
+ };
+
+ return type[bits];
+}
+
+/* Assume this is a trace event */
+static const char *show_flags(struct ring_buffer_event *event)
+{
+ struct trace_entry *entry;
+ int bits = 0;
+
+ if (rb_event_data_length(event) - RB_EVNT_HDR_SIZE < sizeof(*entry))
+ return "X";
+
+ entry = ring_buffer_event_data(event);
+
+ if (entry->flags & TRACE_FLAG_SOFTIRQ)
+ bits |= 1;
+
+ if (entry->flags & TRACE_FLAG_HARDIRQ)
+ bits |= 2;
+
+ if (entry->flags & TRACE_FLAG_NMI)
+ bits |= 4;
+
+ return show_irq_str(bits);
+}
+
+static const char *show_irq(struct ring_buffer_event *event)
+{
+ struct trace_entry *entry;
+
+ if (rb_event_data_length(event) - RB_EVNT_HDR_SIZE < sizeof(*entry))
+ return "";
+
+ entry = ring_buffer_event_data(event);
+ if (entry->flags & TRACE_FLAG_IRQS_OFF)
+ return "d";
+ return "";
+}
+
+static const char *show_interrupt_level(void)
+{
+ unsigned long pc = preempt_count();
+ unsigned char level = 0;
+
+ if (pc & SOFTIRQ_OFFSET)
+ level |= 1;
+
+ if (pc & HARDIRQ_MASK)
+ level |= 2;
+
+ if (pc & NMI_MASK)
+ level |= 4;
+
+ return show_irq_str(level);
+}
+
static void dump_buffer_page(struct buffer_data_page *bpage,
struct rb_event_info *info,
unsigned long tail)
@@ -3416,34 +3301,57 @@ static void dump_buffer_page(struct buffer_data_page *bpage,
case RINGBUF_TYPE_TIME_EXTEND:
delta = rb_event_time_stamp(event);
ts += delta;
- pr_warn(" [%lld] delta:%lld TIME EXTEND\n", ts, delta);
+ pr_warn(" 0x%x: [%lld] delta:%lld TIME EXTEND\n",
+ e, ts, delta);
break;
case RINGBUF_TYPE_TIME_STAMP:
delta = rb_event_time_stamp(event);
ts = rb_fix_abs_ts(delta, ts);
- pr_warn(" [%lld] absolute:%lld TIME STAMP\n", ts, delta);
+ pr_warn(" 0x%x: [%lld] absolute:%lld TIME STAMP\n",
+ e, ts, delta);
break;
case RINGBUF_TYPE_PADDING:
ts += event->time_delta;
- pr_warn(" [%lld] delta:%d PADDING\n", ts, event->time_delta);
+ pr_warn(" 0x%x: [%lld] delta:%d PADDING\n",
+ e, ts, event->time_delta);
break;
case RINGBUF_TYPE_DATA:
ts += event->time_delta;
- pr_warn(" [%lld] delta:%d\n", ts, event->time_delta);
+ pr_warn(" 0x%x: [%lld] delta:%d %s%s\n",
+ e, ts, event->time_delta,
+ show_flags(event), show_irq(event));
break;
default:
break;
}
}
+ pr_warn("expected end:0x%lx last event actually ended at:0x%x\n", tail, e);
}
static DEFINE_PER_CPU(atomic_t, checking);
static atomic_t ts_dump;
+#define buffer_warn_return(fmt, ...) \
+ do { \
+ /* If another report is happening, ignore this one */ \
+ if (atomic_inc_return(&ts_dump) != 1) { \
+ atomic_dec(&ts_dump); \
+ goto out; \
+ } \
+ atomic_inc(&cpu_buffer->record_disabled); \
+ pr_warn(fmt, ##__VA_ARGS__); \
+ dump_buffer_page(bpage, info, tail); \
+ atomic_dec(&ts_dump); \
+ /* There's some cases in boot up that this can happen */ \
+ if (WARN_ON_ONCE(system_state != SYSTEM_BOOTING)) \
+ /* Do not re-enable checking */ \
+ return; \
+ } while (0)
+
/*
* Check if the current event time stamp matches the deltas on
* the buffer page.
@@ -3477,7 +3385,7 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer,
return;
/*
- * If this interrupted another event,
+ * If this interrupted another event,
*/
if (atomic_inc_return(this_cpu_ptr(&checking)) != 1)
goto out;
@@ -3497,7 +3405,12 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer,
case RINGBUF_TYPE_TIME_STAMP:
delta = rb_event_time_stamp(event);
- ts = rb_fix_abs_ts(delta, ts);
+ delta = rb_fix_abs_ts(delta, ts);
+ if (delta < ts) {
+ buffer_warn_return("[CPU: %d]ABSOLUTE TIME WENT BACKWARDS: last ts: %lld absolute ts: %lld\n",
+ cpu_buffer->cpu, ts, delta);
+ }
+ ts = delta;
break;
case RINGBUF_TYPE_PADDING:
@@ -3514,23 +3427,11 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer,
}
if ((full && ts > info->ts) ||
(!full && ts + info->delta != info->ts)) {
- /* If another report is happening, ignore this one */
- if (atomic_inc_return(&ts_dump) != 1) {
- atomic_dec(&ts_dump);
- goto out;
- }
- atomic_inc(&cpu_buffer->record_disabled);
- /* There's some cases in boot up that this can happen */
- WARN_ON_ONCE(system_state != SYSTEM_BOOTING);
- pr_warn("[CPU: %d]TIME DOES NOT MATCH expected:%lld actual:%lld delta:%lld before:%lld after:%lld%s\n",
- cpu_buffer->cpu,
- ts + info->delta, info->ts, info->delta,
- info->before, info->after,
- full ? " (full)" : "");
- dump_buffer_page(bpage, info, tail);
- atomic_dec(&ts_dump);
- /* Do not re-enable checking */
- return;
+ buffer_warn_return("[CPU: %d]TIME DOES NOT MATCH expected:%lld actual:%lld delta:%lld before:%lld after:%lld%s context:%s\n",
+ cpu_buffer->cpu,
+ ts + info->delta, info->ts, info->delta,
+ info->before, info->after,
+ full ? " (full)" : "", show_interrupt_level());
}
out:
atomic_dec(this_cpu_ptr(&checking));
@@ -3550,16 +3451,14 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
struct ring_buffer_event *event;
struct buffer_page *tail_page;
unsigned long tail, write, w;
- bool a_ok;
- bool b_ok;
/* Don't let the compiler play games with cpu_buffer->tail_page */
tail_page = info->tail_page = READ_ONCE(cpu_buffer->tail_page);
/*A*/ w = local_read(&tail_page->write) & RB_WRITE_MASK;
barrier();
- b_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before);
- a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
+ rb_time_read(&cpu_buffer->before_stamp, &info->before);
+ rb_time_read(&cpu_buffer->write_stamp, &info->after);
barrier();
info->ts = rb_time_stamp(cpu_buffer->buffer);
@@ -3571,7 +3470,10 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
* absolute timestamp.
* Don't bother if this is the start of a new page (w == 0).
*/
- if (unlikely(!a_ok || !b_ok || (info->before != info->after && w))) {
+ if (!w) {
+ /* Use the sub-buffer timestamp */
+ info->delta = 0;
+ } else if (unlikely(info->before != info->after)) {
info->add_timestamp |= RB_ADD_STAMP_FORCE | RB_ADD_STAMP_EXTEND;
info->length += RB_LEN_TIME_EXTEND;
} else {
@@ -3593,27 +3495,20 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
tail = write - info->length;
/* See if we shot pass the end of this buffer page */
- if (unlikely(write > BUF_PAGE_SIZE)) {
- /* before and after may now different, fix it up*/
- b_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before);
- a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
- if (a_ok && b_ok && info->before != info->after)
- (void)rb_time_cmpxchg(&cpu_buffer->before_stamp,
- info->before, info->after);
- if (a_ok && b_ok)
- check_buffer(cpu_buffer, info, CHECK_FULL_PAGE);
+ if (unlikely(write > cpu_buffer->buffer->subbuf_size)) {
+ check_buffer(cpu_buffer, info, CHECK_FULL_PAGE);
return rb_move_tail(cpu_buffer, tail, info);
}
if (likely(tail == w)) {
- u64 save_before;
- bool s_ok;
-
/* Nothing interrupted us between A and C */
/*D*/ rb_time_set(&cpu_buffer->write_stamp, info->ts);
- barrier();
- /*E*/ s_ok = rb_time_read(&cpu_buffer->before_stamp, &save_before);
- RB_WARN_ON(cpu_buffer, !s_ok);
+ /*
+ * If something came in between C and D, the write stamp
+ * may now not be in sync. But that's fine as the before_stamp
+ * will be different and then the next event will just be forced
+ * to use an absolute timestamp.
+ */
if (likely(!(info->add_timestamp &
(RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE))))
/* This did not interrupt any time update */
@@ -3621,41 +3516,37 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
else
/* Just use full timestamp for interrupting event */
info->delta = info->ts;
- barrier();
check_buffer(cpu_buffer, info, tail);
- if (unlikely(info->ts != save_before)) {
- /* SLOW PATH - Interrupted between C and E */
-
- a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
- RB_WARN_ON(cpu_buffer, !a_ok);
-
- /* Write stamp must only go forward */
- if (save_before > info->after) {
- /*
- * We do not care about the result, only that
- * it gets updated atomically.
- */
- (void)rb_time_cmpxchg(&cpu_buffer->write_stamp,
- info->after, save_before);
- }
- }
} else {
u64 ts;
/* SLOW PATH - Interrupted between A and C */
- a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
- /* Was interrupted before here, write_stamp must be valid */
- RB_WARN_ON(cpu_buffer, !a_ok);
+
+ /* Save the old before_stamp */
+ rb_time_read(&cpu_buffer->before_stamp, &info->before);
+
+ /*
+ * Read a new timestamp and update the before_stamp to make
+ * the next event after this one force using an absolute
+ * timestamp. This is in case an interrupt were to come in
+ * between E and F.
+ */
ts = rb_time_stamp(cpu_buffer->buffer);
+ rb_time_set(&cpu_buffer->before_stamp, ts);
+
barrier();
- /*E*/ if (write == (local_read(&tail_page->write) & RB_WRITE_MASK) &&
- info->after < ts &&
- rb_time_cmpxchg(&cpu_buffer->write_stamp,
- info->after, ts)) {
- /* Nothing came after this event between C and E */
+ /*E*/ rb_time_read(&cpu_buffer->write_stamp, &info->after);
+ barrier();
+ /*F*/ if (write == (local_read(&tail_page->write) & RB_WRITE_MASK) &&
+ info->after == info->before && info->after < ts) {
+ /*
+ * Nothing came after this event between C and F, it is
+ * safe to use info->after for the delta as it
+ * matched info->before and is still valid.
+ */
info->delta = ts - info->after;
} else {
/*
- * Interrupted between C and E:
+ * Interrupted between C and F:
* Lost the previous events time stamp. Just set the
* delta to zero, and this will be the same time as
* the event this event interrupted. And the events that
@@ -3706,6 +3597,12 @@ rb_reserve_next_event(struct trace_buffer *buffer,
int nr_loops = 0;
int add_ts_default;
+ /* ring buffer does cmpxchg, make sure it is safe in NMI context */
+ if (!IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) &&
+ (unlikely(in_nmi()))) {
+ return NULL;
+ }
+
rb_start_commit(cpu_buffer);
/* The commit page can not change after this */
@@ -3729,6 +3626,8 @@ rb_reserve_next_event(struct trace_buffer *buffer,
if (ring_buffer_time_stamp_abs(cpu_buffer->buffer)) {
add_ts_default = RB_ADD_STAMP_ABSOLUTE;
info.length += RB_LEN_TIME_EXTEND;
+ if (info.length > cpu_buffer->buffer->max_data_size)
+ goto out_fail;
} else {
add_ts_default = RB_ADD_STAMP_NONE;
}
@@ -3802,7 +3701,7 @@ ring_buffer_lock_reserve(struct trace_buffer *buffer, unsigned long length)
if (unlikely(atomic_read(&cpu_buffer->record_disabled)))
goto out;
- if (unlikely(length > BUF_MAX_DATA_SIZE))
+ if (unlikely(length > buffer->max_data_size))
goto out;
if (unlikely(trace_recursive_lock(cpu_buffer)))
@@ -3836,7 +3735,7 @@ rb_decrement_entry(struct ring_buffer_per_cpu *cpu_buffer,
struct buffer_page *bpage = cpu_buffer->commit_page;
struct buffer_page *start;
- addr &= PAGE_MASK;
+ addr &= ~((PAGE_SIZE << cpu_buffer->buffer->subbuf_order) - 1);
/* Do the likely case first */
if (likely(bpage->page == (void *)addr)) {
@@ -3952,7 +3851,7 @@ int ring_buffer_write(struct trace_buffer *buffer,
if (atomic_read(&cpu_buffer->record_disabled))
goto out;
- if (length > BUF_MAX_DATA_SIZE)
+ if (length > buffer->max_data_size)
goto out;
if (unlikely(trace_recursive_lock(cpu_buffer)))
@@ -4208,7 +4107,7 @@ u64 ring_buffer_oldest_event_ts(struct trace_buffer *buffer, int cpu)
EXPORT_SYMBOL_GPL(ring_buffer_oldest_event_ts);
/**
- * ring_buffer_bytes_cpu - get the number of bytes consumed in a cpu buffer
+ * ring_buffer_bytes_cpu - get the number of bytes unconsumed in a cpu buffer
* @buffer: The ring buffer
* @cpu: The per CPU buffer to read from.
*/
@@ -4532,6 +4431,7 @@ static struct buffer_page *
rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
{
struct buffer_page *reader = NULL;
+ unsigned long bsize = READ_ONCE(cpu_buffer->buffer->subbuf_size);
unsigned long overwrite;
unsigned long flags;
int nr_loops = 0;
@@ -4667,7 +4567,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
#define USECS_WAIT 1000000
for (nr_loops = 0; nr_loops < USECS_WAIT; nr_loops++) {
/* If the write is past the end of page, a writer is still updating it */
- if (likely(!reader || rb_page_write(reader) <= BUF_PAGE_SIZE))
+ if (likely(!reader || rb_page_write(reader) <= bsize))
break;
udelay(1);
@@ -4716,6 +4616,7 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer)
length = rb_event_length(event);
cpu_buffer->reader_page->read += length;
+ cpu_buffer->read_bytes += length;
}
static void rb_advance_iter(struct ring_buffer_iter *iter)
@@ -5109,7 +5010,9 @@ ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags)
if (!iter)
return NULL;
- iter->event = kmalloc(BUF_MAX_DATA_SIZE, flags);
+ /* Holds the entire event: data and meta data */
+ iter->event_size = buffer->subbuf_size;
+ iter->event = kmalloc(iter->event_size, flags);
if (!iter->event) {
kfree(iter);
return NULL;
@@ -5225,19 +5128,28 @@ EXPORT_SYMBOL_GPL(ring_buffer_iter_advance);
*/
unsigned long ring_buffer_size(struct trace_buffer *buffer, int cpu)
{
- /*
- * Earlier, this method returned
- * BUF_PAGE_SIZE * buffer->nr_pages
- * Since the nr_pages field is now removed, we have converted this to
- * return the per cpu buffer value.
- */
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return 0;
- return BUF_PAGE_SIZE * buffer->buffers[cpu]->nr_pages;
+ return buffer->subbuf_size * buffer->buffers[cpu]->nr_pages;
}
EXPORT_SYMBOL_GPL(ring_buffer_size);
+/**
+ * ring_buffer_max_event_size - return the max data size of an event
+ * @buffer: The ring buffer.
+ *
+ * Returns the maximum size an event can be.
+ */
+unsigned long ring_buffer_max_event_size(struct trace_buffer *buffer)
+{
+ /* If abs timestamp is requested, events have a timestamp too */
+ if (ring_buffer_time_stamp_abs(buffer))
+ return buffer->max_data_size - RB_LEN_TIME_EXTEND;
+ return buffer->max_data_size;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_max_event_size);
+
static void rb_clear_buffer_page(struct buffer_page *page)
{
local_set(&page->write, 0);
@@ -5508,6 +5420,9 @@ int ring_buffer_swap_cpu(struct trace_buffer *buffer_a,
if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages)
goto out;
+ if (buffer_a->subbuf_order != buffer_b->subbuf_order)
+ goto out;
+
ret = -EAGAIN;
if (atomic_read(&buffer_a->record_disabled))
@@ -5579,40 +5494,48 @@ EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
* Returns:
* The page allocated, or ERR_PTR
*/
-void *ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu)
+struct buffer_data_read_page *
+ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer;
- struct buffer_data_page *bpage = NULL;
+ struct buffer_data_read_page *bpage = NULL;
unsigned long flags;
struct page *page;
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return ERR_PTR(-ENODEV);
+ bpage = kzalloc(sizeof(*bpage), GFP_KERNEL);
+ if (!bpage)
+ return ERR_PTR(-ENOMEM);
+
+ bpage->order = buffer->subbuf_order;
cpu_buffer = buffer->buffers[cpu];
local_irq_save(flags);
arch_spin_lock(&cpu_buffer->lock);
if (cpu_buffer->free_page) {
- bpage = cpu_buffer->free_page;
+ bpage->data = cpu_buffer->free_page;
cpu_buffer->free_page = NULL;
}
arch_spin_unlock(&cpu_buffer->lock);
local_irq_restore(flags);
- if (bpage)
+ if (bpage->data)
goto out;
- page = alloc_pages_node(cpu_to_node(cpu),
- GFP_KERNEL | __GFP_NORETRY, 0);
- if (!page)
+ page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_NORETRY,
+ cpu_buffer->buffer->subbuf_order);
+ if (!page) {
+ kfree(bpage);
return ERR_PTR(-ENOMEM);
+ }
- bpage = page_address(page);
+ bpage->data = page_address(page);
out:
- rb_init_page(bpage);
+ rb_init_page(bpage->data);
return bpage;
}
@@ -5622,14 +5545,15 @@ EXPORT_SYMBOL_GPL(ring_buffer_alloc_read_page);
* ring_buffer_free_read_page - free an allocated read page
* @buffer: the buffer the page was allocate for
* @cpu: the cpu buffer the page came from
- * @data: the page to free
+ * @data_page: the page to free
*
* Free a page allocated from ring_buffer_alloc_read_page.
*/
-void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, void *data)
+void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu,
+ struct buffer_data_read_page *data_page)
{
struct ring_buffer_per_cpu *cpu_buffer;
- struct buffer_data_page *bpage = data;
+ struct buffer_data_page *bpage = data_page->data;
struct page *page = virt_to_page(bpage);
unsigned long flags;
@@ -5638,8 +5562,12 @@ void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, void *data
cpu_buffer = buffer->buffers[cpu];
- /* If the page is still in use someplace else, we can't reuse it */
- if (page_ref_count(page) > 1)
+ /*
+ * If the page is still in use someplace else, or order of the page
+ * is different from the subbuffer order of the buffer -
+ * we can't reuse it
+ */
+ if (page_ref_count(page) > 1 || data_page->order != buffer->subbuf_order)
goto out;
local_irq_save(flags);
@@ -5654,7 +5582,8 @@ void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, void *data
local_irq_restore(flags);
out:
- free_page((unsigned long)bpage);
+ free_pages((unsigned long)bpage, data_page->order);
+ kfree(data_page);
}
EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);
@@ -5675,9 +5604,10 @@ EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);
* rpage = ring_buffer_alloc_read_page(buffer, cpu);
* if (IS_ERR(rpage))
* return PTR_ERR(rpage);
- * ret = ring_buffer_read_page(buffer, &rpage, len, cpu, 0);
+ * ret = ring_buffer_read_page(buffer, rpage, len, cpu, 0);
* if (ret >= 0)
- * process_page(rpage, ret);
+ * process_page(ring_buffer_read_page_data(rpage), ret);
+ * ring_buffer_free_read_page(buffer, cpu, rpage);
*
* When @full is set, the function will not return true unless
* the writer is off the reader page.
@@ -5692,7 +5622,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);
* <0 if no data has been transferred.
*/
int ring_buffer_read_page(struct trace_buffer *buffer,
- void **data_page, size_t len, int cpu, int full)
+ struct buffer_data_read_page *data_page,
+ size_t len, int cpu, int full)
{
struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
struct ring_buffer_event *event;
@@ -5717,10 +5648,12 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
len -= BUF_PAGE_HDR_SIZE;
- if (!data_page)
+ if (!data_page || !data_page->data)
+ goto out;
+ if (data_page->order != buffer->subbuf_order)
goto out;
- bpage = *data_page;
+ bpage = data_page->data;
if (!bpage)
goto out;
@@ -5809,16 +5742,16 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
} else {
/* update the entry counter */
cpu_buffer->read += rb_page_entries(reader);
- cpu_buffer->read_bytes += BUF_PAGE_SIZE;
+ cpu_buffer->read_bytes += rb_page_commit(reader);
/* swap the pages */
rb_init_page(bpage);
bpage = reader->page;
- reader->page = *data_page;
+ reader->page = data_page->data;
local_set(&reader->write, 0);
local_set(&reader->entries, 0);
reader->read = 0;
- *data_page = bpage;
+ data_page->data = bpage;
/*
* Use the real_end for the data size,
@@ -5840,7 +5773,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
/* If there is room at the end of the page to save the
* missed events, then record it there.
*/
- if (BUF_PAGE_SIZE - commit >= sizeof(missed_events)) {
+ if (buffer->subbuf_size - commit >= sizeof(missed_events)) {
memcpy(&bpage->data[commit], &missed_events,
sizeof(missed_events));
local_add(RB_MISSED_STORED, &bpage->commit);
@@ -5852,8 +5785,8 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
/*
* This page may be off to user land. Zero it out here.
*/
- if (commit < BUF_PAGE_SIZE)
- memset(&bpage->data[commit], 0, BUF_PAGE_SIZE - commit);
+ if (commit < buffer->subbuf_size)
+ memset(&bpage->data[commit], 0, buffer->subbuf_size - commit);
out_unlock:
raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
@@ -5863,6 +5796,209 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
}
EXPORT_SYMBOL_GPL(ring_buffer_read_page);
+/**
+ * ring_buffer_read_page_data - get pointer to the data in the page.
+ * @page: the page to get the data from
+ *
+ * Returns pointer to the actual data in this page.
+ */
+void *ring_buffer_read_page_data(struct buffer_data_read_page *page)
+{
+ return page->data;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_read_page_data);
+
+/**
+ * ring_buffer_subbuf_size_get - get size of the sub buffer.
+ * @buffer: the buffer to get the sub buffer size from
+ *
+ * Returns size of the sub buffer, in bytes.
+ */
+int ring_buffer_subbuf_size_get(struct trace_buffer *buffer)
+{
+ return buffer->subbuf_size + BUF_PAGE_HDR_SIZE;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_subbuf_size_get);
+
+/**
+ * ring_buffer_subbuf_order_get - get order of system sub pages in one buffer page.
+ * @buffer: The ring_buffer to get the system sub page order from
+ *
+ * By default, one ring buffer sub page is equal to one system page. This parameter
+ * is configurable, per ring buffer. The size of the ring buffer sub page can be
+ * extended, but must be an order of system page size.
+ *
+ * Returns the order of buffer sub page size, in system pages:
+ * 0 means the sub buffer size is 1 system page and so forth.
+ * In case of an error < 0 is returned.
+ */
+int ring_buffer_subbuf_order_get(struct trace_buffer *buffer)
+{
+ if (!buffer)
+ return -EINVAL;
+
+ return buffer->subbuf_order;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_subbuf_order_get);
+
+/**
+ * ring_buffer_subbuf_order_set - set the size of ring buffer sub page.
+ * @buffer: The ring_buffer to set the new page size.
+ * @order: Order of the system pages in one sub buffer page
+ *
+ * By default, one ring buffer page is equal to one system page. This API can be
+ * used to set new size of the ring buffer page. The size must be order of
+ * system page size, that's why the input parameter @order is the order of
+ * system pages that are allocated for one ring buffer page:
+ * 0 - 1 system page
+ * 1 - 2 system pages
+ * 2 - 4 system pages
+ * ...
+ *
+ * Returns 0 on success or < 0 in case of an error.
+ */
+int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct buffer_page *bpage, *tmp;
+ int old_order, old_size;
+ int nr_pages;
+ int psize;
+ int err;
+ int cpu;
+
+ if (!buffer || order < 0)
+ return -EINVAL;
+
+ if (buffer->subbuf_order == order)
+ return 0;
+
+ psize = (1 << order) * PAGE_SIZE;
+ if (psize <= BUF_PAGE_HDR_SIZE)
+ return -EINVAL;
+
+ old_order = buffer->subbuf_order;
+ old_size = buffer->subbuf_size;
+
+ /* prevent another thread from changing buffer sizes */
+ mutex_lock(&buffer->mutex);
+ atomic_inc(&buffer->record_disabled);
+
+ /* Make sure all commits have finished */
+ synchronize_rcu();
+
+ buffer->subbuf_order = order;
+ buffer->subbuf_size = psize - BUF_PAGE_HDR_SIZE;
+
+ /* Make sure all new buffers are allocated, before deleting the old ones */
+ for_each_buffer_cpu(buffer, cpu) {
+
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ continue;
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ /* Update the number of pages to match the new size */
+ nr_pages = old_size * buffer->buffers[cpu]->nr_pages;
+ nr_pages = DIV_ROUND_UP(nr_pages, buffer->subbuf_size);
+
+ /* we need a minimum of two pages */
+ if (nr_pages < 2)
+ nr_pages = 2;
+
+ cpu_buffer->nr_pages_to_update = nr_pages;
+
+ /* Include the reader page */
+ nr_pages++;
+
+ /* Allocate the new size buffer */
+ INIT_LIST_HEAD(&cpu_buffer->new_pages);
+ if (__rb_allocate_pages(cpu_buffer, nr_pages,
+ &cpu_buffer->new_pages)) {
+ /* not enough memory for new pages */
+ err = -ENOMEM;
+ goto error;
+ }
+ }
+
+ for_each_buffer_cpu(buffer, cpu) {
+
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ continue;
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ /* Clear the head bit to make the link list normal to read */
+ rb_head_page_deactivate(cpu_buffer);
+
+ /* Now walk the list and free all the old sub buffers */
+ list_for_each_entry_safe(bpage, tmp, cpu_buffer->pages, list) {
+ list_del_init(&bpage->list);
+ free_buffer_page(bpage);
+ }
+ /* The above loop stopped at the last page needing to be freed */
+ bpage = list_entry(cpu_buffer->pages, struct buffer_page, list);
+ free_buffer_page(bpage);
+
+ /* Free the current reader page */
+ free_buffer_page(cpu_buffer->reader_page);
+
+ /* One page was allocated for the reader page */
+ cpu_buffer->reader_page = list_entry(cpu_buffer->new_pages.next,
+ struct buffer_page, list);
+ list_del_init(&cpu_buffer->reader_page->list);
+
+ /* The cpu_buffer pages are a link list with no head */
+ cpu_buffer->pages = cpu_buffer->new_pages.next;
+ cpu_buffer->new_pages.next->prev = cpu_buffer->new_pages.prev;
+ cpu_buffer->new_pages.prev->next = cpu_buffer->new_pages.next;
+
+ /* Clear the new_pages list */
+ INIT_LIST_HEAD(&cpu_buffer->new_pages);
+
+ cpu_buffer->head_page
+ = list_entry(cpu_buffer->pages, struct buffer_page, list);
+ cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page;
+
+ cpu_buffer->nr_pages = cpu_buffer->nr_pages_to_update;
+ cpu_buffer->nr_pages_to_update = 0;
+
+ free_pages((unsigned long)cpu_buffer->free_page, old_order);
+ cpu_buffer->free_page = NULL;
+
+ rb_head_page_activate(cpu_buffer);
+
+ rb_check_pages(cpu_buffer);
+ }
+
+ atomic_dec(&buffer->record_disabled);
+ mutex_unlock(&buffer->mutex);
+
+ return 0;
+
+error:
+ buffer->subbuf_order = old_order;
+ buffer->subbuf_size = old_size;
+
+ atomic_dec(&buffer->record_disabled);
+ mutex_unlock(&buffer->mutex);
+
+ for_each_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
+
+ if (!cpu_buffer->nr_pages_to_update)
+ continue;
+
+ list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages, list) {
+ list_del_init(&bpage->list);
+ free_buffer_page(bpage);
+ }
+ }
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_subbuf_order_set);
+
/*
* We only allocate new buffers, never free them if the CPU goes down.
* If we were to free the buffer, then the user would lose any trace that was in
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index aef34673d79d..008187ebd7fe 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -104,10 +104,11 @@ static enum event_status read_event(int cpu)
static enum event_status read_page(int cpu)
{
+ struct buffer_data_read_page *bpage;
struct ring_buffer_event *event;
struct rb_page *rpage;
unsigned long commit;
- void *bpage;
+ int page_size;
int *entry;
int ret;
int inc;
@@ -117,14 +118,15 @@ static enum event_status read_page(int cpu)
if (IS_ERR(bpage))
return EVENT_DROPPED;
- ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1);
+ page_size = ring_buffer_subbuf_size_get(buffer);
+ ret = ring_buffer_read_page(buffer, bpage, page_size, cpu, 1);
if (ret >= 0) {
- rpage = bpage;
+ rpage = ring_buffer_read_page_data(bpage);
/* The commit may have missed event flags set, clear them */
commit = local_read(&rpage->commit) & 0xfffff;
for (i = 0; i < commit && !test_error ; i += inc) {
- if (i >= (PAGE_SIZE - offsetof(struct rb_page, data))) {
+ if (i >= (page_size - offsetof(struct rb_page, data))) {
TEST_ERROR();
break;
}
diff --git a/kernel/trace/synth_event_gen_test.c b/kernel/trace/synth_event_gen_test.c
index 8dfe85499d4a..354c2117be43 100644
--- a/kernel/trace/synth_event_gen_test.c
+++ b/kernel/trace/synth_event_gen_test.c
@@ -477,6 +477,17 @@ static int __init synth_event_gen_test_init(void)
ret = test_trace_synth_event();
WARN_ON(ret);
+
+ /* Disable when done */
+ trace_array_set_clr_event(gen_synth_test->tr,
+ "synthetic",
+ "gen_synth_test", false);
+ trace_array_set_clr_event(empty_synth_test->tr,
+ "synthetic",
+ "empty_synth_test", false);
+ trace_array_set_clr_event(create_synth_test->tr,
+ "synthetic",
+ "create_synth_test", false);
out:
return ret;
}
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 2b4ded753367..9ff8a439d674 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -54,12 +54,6 @@
#include "trace.h"
#include "trace_output.h"
-/*
- * On boot up, the ring buffer is set to the minimum size, so that
- * we do not waste memory on systems that are not using tracing.
- */
-bool ring_buffer_expanded;
-
#ifdef CONFIG_FTRACE_STARTUP_TEST
/*
* We need to change this state when a selftest is running.
@@ -202,7 +196,7 @@ static int __init set_cmdline_ftrace(char *str)
strscpy(bootup_tracer_buf, str, MAX_TRACER_SIZE);
default_bootup_tracer = bootup_tracer_buf;
/* We are using ftrace early, expand it */
- ring_buffer_expanded = true;
+ trace_set_ring_buffer_expanded(NULL);
return 1;
}
__setup("ftrace=", set_cmdline_ftrace);
@@ -247,7 +241,7 @@ static int __init boot_alloc_snapshot(char *str)
} else {
allocate_snapshot = true;
/* We also need the main ring buffer expanded */
- ring_buffer_expanded = true;
+ trace_set_ring_buffer_expanded(NULL);
}
return 1;
}
@@ -490,6 +484,13 @@ static struct trace_array global_trace = {
.trace_flags = TRACE_DEFAULT_FLAGS,
};
+void trace_set_ring_buffer_expanded(struct trace_array *tr)
+{
+ if (!tr)
+ tr = &global_trace;
+ tr->ring_buffer_expanded = true;
+}
+
LIST_HEAD(ftrace_trace_arrays);
int trace_array_get(struct trace_array *this_tr)
@@ -1262,10 +1263,17 @@ static void set_buffer_entries(struct array_buffer *buf, unsigned long val);
int tracing_alloc_snapshot_instance(struct trace_array *tr)
{
+ int order;
int ret;
if (!tr->allocated_snapshot) {
+ /* Make the snapshot buffer have the same order as main buffer */
+ order = ring_buffer_subbuf_order_get(tr->array_buffer.buffer);
+ ret = ring_buffer_subbuf_order_set(tr->max_buffer.buffer, order);
+ if (ret < 0)
+ return ret;
+
/* allocate spare buffer */
ret = resize_buffer_duplicate_size(&tr->max_buffer,
&tr->array_buffer, RING_BUFFER_ALL_CPUS);
@@ -1285,6 +1293,7 @@ static void free_snapshot(struct trace_array *tr)
* The max_tr ring buffer has some state (e.g. ring->clock) and
* we want to preserve it.
*/
+ ring_buffer_subbuf_order_set(tr->max_buffer.buffer, 0);
ring_buffer_resize(tr->max_buffer.buffer, 1, RING_BUFFER_ALL_CPUS);
set_buffer_entries(&tr->max_buffer, 1);
tracing_reset_online_cpus(&tr->max_buffer);
@@ -1730,15 +1739,15 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
{
int len;
- if (trace_seq_used(s) <= s->seq.readpos)
+ if (trace_seq_used(s) <= s->readpos)
return -EBUSY;
- len = trace_seq_used(s) - s->seq.readpos;
+ len = trace_seq_used(s) - s->readpos;
if (cnt > len)
cnt = len;
- memcpy(buf, s->buffer + s->seq.readpos, cnt);
+ memcpy(buf, s->buffer + s->readpos, cnt);
- s->seq.readpos += cnt;
+ s->readpos += cnt;
return cnt;
}
@@ -1772,7 +1781,7 @@ static void trace_create_maxlat_file(struct trace_array *tr,
init_irq_work(&tr->fsnotify_irqwork, latency_fsnotify_workfn_irq);
tr->d_max_latency = trace_create_file("tracing_max_latency",
TRACE_MODE_WRITE,
- d_tracer, &tr->max_latency,
+ d_tracer, tr,
&tracing_max_lat_fops);
}
@@ -1805,7 +1814,7 @@ void latency_fsnotify(struct trace_array *tr)
#define trace_create_maxlat_file(tr, d_tracer) \
trace_create_file("tracing_max_latency", TRACE_MODE_WRITE, \
- d_tracer, &tr->max_latency, &tracing_max_lat_fops)
+ d_tracer, tr, &tracing_max_lat_fops)
#endif
@@ -1893,6 +1902,9 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu,
__update_max_tr(tr, tsk, cpu);
arch_spin_unlock(&tr->max_lock);
+
+ /* Any waiters on the old snapshot buffer need to wake up */
+ ring_buffer_wake_waiters(tr->array_buffer.buffer, RING_BUFFER_ALL_CPUS);
}
/**
@@ -1944,12 +1956,23 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
static int wait_on_pipe(struct trace_iterator *iter, int full)
{
+ int ret;
+
/* Iterators are static, they should be filled or empty */
if (trace_buffer_iter(iter, iter->cpu_file))
return 0;
- return ring_buffer_wait(iter->array_buffer->buffer, iter->cpu_file,
- full);
+ ret = ring_buffer_wait(iter->array_buffer->buffer, iter->cpu_file, full);
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+ /*
+ * Make sure this is still the snapshot buffer, as if a snapshot were
+ * to happen, this would now be the main buffer.
+ */
+ if (iter->snapshot)
+ iter->array_buffer = &iter->tr->max_buffer;
+#endif
+ return ret;
}
#ifdef CONFIG_FTRACE_STARTUP_TEST
@@ -2012,7 +2035,7 @@ static int run_tracer_selftest(struct tracer *type)
#ifdef CONFIG_TRACER_MAX_TRACE
if (type->use_max_tr) {
/* If we expanded the buffers, make sure the max is expanded too */
- if (ring_buffer_expanded)
+ if (tr->ring_buffer_expanded)
ring_buffer_resize(tr->max_buffer.buffer, trace_buf_size,
RING_BUFFER_ALL_CPUS);
tr->allocated_snapshot = true;
@@ -2038,7 +2061,7 @@ static int run_tracer_selftest(struct tracer *type)
tr->allocated_snapshot = false;
/* Shrink the max buffer again */
- if (ring_buffer_expanded)
+ if (tr->ring_buffer_expanded)
ring_buffer_resize(tr->max_buffer.buffer, 1,
RING_BUFFER_ALL_CPUS);
}
@@ -2297,7 +2320,7 @@ struct saved_cmdlines_buffer {
unsigned *map_cmdline_to_pid;
unsigned cmdline_num;
int cmdline_idx;
- char *saved_cmdlines;
+ char saved_cmdlines[];
};
static struct saved_cmdlines_buffer *savedcmd;
@@ -2311,47 +2334,58 @@ static inline void set_cmdline(int idx, const char *cmdline)
strncpy(get_saved_cmdlines(idx), cmdline, TASK_COMM_LEN);
}
-static int allocate_cmdlines_buffer(unsigned int val,
- struct saved_cmdlines_buffer *s)
+static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s)
{
+ int order = get_order(sizeof(*s) + s->cmdline_num * TASK_COMM_LEN);
+
+ kfree(s->map_cmdline_to_pid);
+ free_pages((unsigned long)s, order);
+}
+
+static struct saved_cmdlines_buffer *allocate_cmdlines_buffer(unsigned int val)
+{
+ struct saved_cmdlines_buffer *s;
+ struct page *page;
+ int orig_size, size;
+ int order;
+
+ /* Figure out how much is needed to hold the given number of cmdlines */
+ orig_size = sizeof(*s) + val * TASK_COMM_LEN;
+ order = get_order(orig_size);
+ size = 1 << (order + PAGE_SHIFT);
+ page = alloc_pages(GFP_KERNEL, order);
+ if (!page)
+ return NULL;
+
+ s = page_address(page);
+ memset(s, 0, sizeof(*s));
+
+ /* Round up to actual allocation */
+ val = (size - sizeof(*s)) / TASK_COMM_LEN;
+ s->cmdline_num = val;
+
s->map_cmdline_to_pid = kmalloc_array(val,
sizeof(*s->map_cmdline_to_pid),
GFP_KERNEL);
- if (!s->map_cmdline_to_pid)
- return -ENOMEM;
-
- s->saved_cmdlines = kmalloc_array(TASK_COMM_LEN, val, GFP_KERNEL);
- if (!s->saved_cmdlines) {
- kfree(s->map_cmdline_to_pid);
- return -ENOMEM;
+ if (!s->map_cmdline_to_pid) {
+ free_saved_cmdlines_buffer(s);
+ return NULL;
}
s->cmdline_idx = 0;
- s->cmdline_num = val;
memset(&s->map_pid_to_cmdline, NO_CMDLINE_MAP,
sizeof(s->map_pid_to_cmdline));
memset(s->map_cmdline_to_pid, NO_CMDLINE_MAP,
val * sizeof(*s->map_cmdline_to_pid));
- return 0;
+ return s;
}
static int trace_create_savedcmd(void)
{
- int ret;
-
- savedcmd = kmalloc(sizeof(*savedcmd), GFP_KERNEL);
- if (!savedcmd)
- return -ENOMEM;
+ savedcmd = allocate_cmdlines_buffer(SAVED_CMDLINES_DEFAULT);
- ret = allocate_cmdlines_buffer(SAVED_CMDLINES_DEFAULT, savedcmd);
- if (ret < 0) {
- kfree(savedcmd);
- savedcmd = NULL;
- return -ENOMEM;
- }
-
- return 0;
+ return savedcmd ? 0 : -ENOMEM;
}
int is_tracing_stopped(void)
@@ -2359,13 +2393,7 @@ int is_tracing_stopped(void)
return global_trace.stop_count;
}
-/**
- * tracing_start - quick start of the tracer
- *
- * If tracing is enabled but was stopped by tracing_stop,
- * this will start the tracer back up.
- */
-void tracing_start(void)
+static void tracing_start_tr(struct trace_array *tr)
{
struct trace_buffer *buffer;
unsigned long flags;
@@ -2373,119 +2401,83 @@ void tracing_start(void)
if (tracing_disabled)
return;
- raw_spin_lock_irqsave(&global_trace.start_lock, flags);
- if (--global_trace.stop_count) {
- if (global_trace.stop_count < 0) {
+ raw_spin_lock_irqsave(&tr->start_lock, flags);
+ if (--tr->stop_count) {
+ if (WARN_ON_ONCE(tr->stop_count < 0)) {
/* Someone screwed up their debugging */
- WARN_ON_ONCE(1);
- global_trace.stop_count = 0;
+ tr->stop_count = 0;
}
goto out;
}
/* Prevent the buffers from switching */
- arch_spin_lock(&global_trace.max_lock);
+ arch_spin_lock(&tr->max_lock);
- buffer = global_trace.array_buffer.buffer;
+ buffer = tr->array_buffer.buffer;
if (buffer)
ring_buffer_record_enable(buffer);
#ifdef CONFIG_TRACER_MAX_TRACE
- buffer = global_trace.max_buffer.buffer;
+ buffer = tr->max_buffer.buffer;
if (buffer)
ring_buffer_record_enable(buffer);
#endif
- arch_spin_unlock(&global_trace.max_lock);
-
- out:
- raw_spin_unlock_irqrestore(&global_trace.start_lock, flags);
-}
-
-static void tracing_start_tr(struct trace_array *tr)
-{
- struct trace_buffer *buffer;
- unsigned long flags;
-
- if (tracing_disabled)
- return;
-
- /* If global, we need to also start the max tracer */
- if (tr->flags & TRACE_ARRAY_FL_GLOBAL)
- return tracing_start();
-
- raw_spin_lock_irqsave(&tr->start_lock, flags);
-
- if (--tr->stop_count) {
- if (tr->stop_count < 0) {
- /* Someone screwed up their debugging */
- WARN_ON_ONCE(1);
- tr->stop_count = 0;
- }
- goto out;
- }
-
- buffer = tr->array_buffer.buffer;
- if (buffer)
- ring_buffer_record_enable(buffer);
+ arch_spin_unlock(&tr->max_lock);
out:
raw_spin_unlock_irqrestore(&tr->start_lock, flags);
}
/**
- * tracing_stop - quick stop of the tracer
+ * tracing_start - quick start of the tracer
*
- * Light weight way to stop tracing. Use in conjunction with
- * tracing_start.
+ * If tracing is enabled but was stopped by tracing_stop,
+ * this will start the tracer back up.
*/
-void tracing_stop(void)
+void tracing_start(void)
+
+{
+ return tracing_start_tr(&global_trace);
+}
+
+static void tracing_stop_tr(struct trace_array *tr)
{
struct trace_buffer *buffer;
unsigned long flags;
- raw_spin_lock_irqsave(&global_trace.start_lock, flags);
- if (global_trace.stop_count++)
+ raw_spin_lock_irqsave(&tr->start_lock, flags);
+ if (tr->stop_count++)
goto out;
/* Prevent the buffers from switching */
- arch_spin_lock(&global_trace.max_lock);
+ arch_spin_lock(&tr->max_lock);
- buffer = global_trace.array_buffer.buffer;
+ buffer = tr->array_buffer.buffer;
if (buffer)
ring_buffer_record_disable(buffer);
#ifdef CONFIG_TRACER_MAX_TRACE
- buffer = global_trace.max_buffer.buffer;
+ buffer = tr->max_buffer.buffer;
if (buffer)
ring_buffer_record_disable(buffer);
#endif
- arch_spin_unlock(&global_trace.max_lock);
+ arch_spin_unlock(&tr->max_lock);
out:
- raw_spin_unlock_irqrestore(&global_trace.start_lock, flags);
+ raw_spin_unlock_irqrestore(&tr->start_lock, flags);
}
-static void tracing_stop_tr(struct trace_array *tr)
+/**
+ * tracing_stop - quick stop of the tracer
+ *
+ * Light weight way to stop tracing. Use in conjunction with
+ * tracing_start.
+ */
+void tracing_stop(void)
{
- struct trace_buffer *buffer;
- unsigned long flags;
-
- /* If global, we need to also stop the max tracer */
- if (tr->flags & TRACE_ARRAY_FL_GLOBAL)
- return tracing_stop();
-
- raw_spin_lock_irqsave(&tr->start_lock, flags);
- if (tr->stop_count++)
- goto out;
-
- buffer = tr->array_buffer.buffer;
- if (buffer)
- ring_buffer_record_disable(buffer);
-
- out:
- raw_spin_unlock_irqrestore(&tr->start_lock, flags);
+ return tracing_stop_tr(&global_trace);
}
static int trace_save_cmdline(struct task_struct *tsk)
@@ -2769,8 +2761,11 @@ void trace_buffered_event_enable(void)
for_each_tracing_cpu(cpu) {
page = alloc_pages_node(cpu_to_node(cpu),
GFP_KERNEL | __GFP_NORETRY, 0);
- if (!page)
- goto failed;
+ /* This is just an optimization and can handle failures */
+ if (!page) {
+ pr_err("Failed to allocate event buffer\n");
+ break;
+ }
event = page_address(page);
memset(event, 0, sizeof(*event));
@@ -2784,10 +2779,6 @@ void trace_buffered_event_enable(void)
WARN_ON_ONCE(1);
preempt_enable();
}
-
- return;
- failed:
- trace_buffered_event_disable();
}
static void enable_trace_buffered_event(void *data)
@@ -2822,11 +2813,9 @@ void trace_buffered_event_disable(void)
if (--trace_buffered_event_ref)
return;
- preempt_disable();
/* For each CPU, set the buffer as used. */
- smp_call_function_many(tracing_buffer_mask,
- disable_trace_buffered_event, NULL, 1);
- preempt_enable();
+ on_each_cpu_mask(tracing_buffer_mask, disable_trace_buffered_event,
+ NULL, true);
/* Wait for all current users to finish */
synchronize_rcu();
@@ -2835,17 +2824,19 @@ void trace_buffered_event_disable(void)
free_page((unsigned long)per_cpu(trace_buffered_event, cpu));
per_cpu(trace_buffered_event, cpu) = NULL;
}
+
/*
- * Make sure trace_buffered_event is NULL before clearing
- * trace_buffered_event_cnt.
+ * Wait for all CPUs that potentially started checking if they can use
+ * their event buffer only after the previous synchronize_rcu() call and
+ * they still read a valid pointer from trace_buffered_event. It must be
+ * ensured they don't see cleared trace_buffered_event_cnt else they
+ * could wrongly decide to use the pointed-to buffer which is now freed.
*/
- smp_wmb();
+ synchronize_rcu();
- preempt_disable();
- /* Do the work on each cpu */
- smp_call_function_many(tracing_buffer_mask,
- enable_trace_buffered_event, NULL, 1);
- preempt_enable();
+ /* For each CPU, relinquish the buffer */
+ on_each_cpu_mask(tracing_buffer_mask, enable_trace_buffered_event, NULL,
+ true);
}
static struct trace_buffer *temp_buffer;
@@ -3403,7 +3394,7 @@ void trace_printk_init_buffers(void)
pr_warn("**********************************************************\n");
/* Expand the buffers to set size */
- tracing_update_buffers();
+ tracing_update_buffers(&global_trace);
buffers_allocated = 1;
@@ -3795,7 +3786,7 @@ static bool trace_safe_str(struct trace_iterator *iter, const char *str,
/* OK if part of the temp seq buffer */
if ((addr >= (unsigned long)iter->tmp_seq.buffer) &&
- (addr < (unsigned long)iter->tmp_seq.buffer + PAGE_SIZE))
+ (addr < (unsigned long)iter->tmp_seq.buffer + TRACE_SEQ_BUFFER_SIZE))
return true;
/* Core rodata can not be freed */
@@ -3827,15 +3818,6 @@ static bool trace_safe_str(struct trace_iterator *iter, const char *str,
return false;
}
-static const char *show_buffer(struct trace_seq *s)
-{
- struct seq_buf *seq = &s->seq;
-
- seq_buf_terminate(seq);
-
- return seq->buffer;
-}
-
static DEFINE_STATIC_KEY_FALSE(trace_no_verify);
static int test_can_verify_check(const char *fmt, ...)
@@ -3975,7 +3957,7 @@ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt,
*/
if (WARN_ONCE(!trace_safe_str(iter, str, star, len),
"fmt: '%s' current_buffer: '%s'",
- fmt, show_buffer(&iter->seq))) {
+ fmt, seq_buf_str(&iter->seq.seq))) {
int ret;
/* Try to safely read the string */
@@ -4773,7 +4755,11 @@ static int s_show(struct seq_file *m, void *v)
iter->leftover = ret;
} else {
- print_trace_line(iter);
+ ret = print_trace_line(iter);
+ if (ret == TRACE_TYPE_PARTIAL_LINE) {
+ iter->seq.full = 0;
+ trace_seq_puts(&iter->seq, "[LINE TOO BIG]\n");
+ }
ret = trace_print_seq(m, &iter->seq);
/*
* If we overflow the seq_file buffer, then it will
@@ -4973,6 +4959,54 @@ int tracing_open_generic_tr(struct inode *inode, struct file *filp)
return 0;
}
+/*
+ * The private pointer of the inode is the trace_event_file.
+ * Update the tr ref count associated with it.
+ */
+int tracing_open_file_tr(struct inode *inode, struct file *filp)
+{
+ struct trace_event_file *file = inode->i_private;
+ int ret;
+
+ ret = tracing_check_open_get_tr(file->tr);
+ if (ret)
+ return ret;
+
+ mutex_lock(&event_mutex);
+
+ /* Fail if the file is marked for removal */
+ if (file->flags & EVENT_FILE_FL_FREED) {
+ trace_array_put(file->tr);
+ ret = -ENODEV;
+ } else {
+ event_file_get(file);
+ }
+
+ mutex_unlock(&event_mutex);
+ if (ret)
+ return ret;
+
+ filp->private_data = inode->i_private;
+
+ return 0;
+}
+
+int tracing_release_file_tr(struct inode *inode, struct file *filp)
+{
+ struct trace_event_file *file = inode->i_private;
+
+ trace_array_put(file->tr);
+ event_file_put(file);
+
+ return 0;
+}
+
+int tracing_single_release_file_tr(struct inode *inode, struct file *filp)
+{
+ tracing_release_file_tr(inode, filp);
+ return single_release(inode, filp);
+}
+
static int tracing_mark_open(struct inode *inode, struct file *filp)
{
stream_open(inode, filp);
@@ -5017,7 +5051,7 @@ static int tracing_release(struct inode *inode, struct file *file)
return 0;
}
-static int tracing_release_generic_tr(struct inode *inode, struct file *file)
+int tracing_release_generic_tr(struct inode *inode, struct file *file)
{
struct trace_array *tr = inode->i_private;
@@ -6033,26 +6067,14 @@ tracing_saved_cmdlines_size_read(struct file *filp, char __user *ubuf,
return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
}
-static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s)
-{
- kfree(s->saved_cmdlines);
- kfree(s->map_cmdline_to_pid);
- kfree(s);
-}
-
static int tracing_resize_saved_cmdlines(unsigned int val)
{
struct saved_cmdlines_buffer *s, *savedcmd_temp;
- s = kmalloc(sizeof(*s), GFP_KERNEL);
+ s = allocate_cmdlines_buffer(val);
if (!s)
return -ENOMEM;
- if (allocate_cmdlines_buffer(val, s) < 0) {
- kfree(s);
- return -ENOMEM;
- }
-
preempt_disable();
arch_spin_lock(&trace_cmdline_lock);
savedcmd_temp = savedcmd;
@@ -6347,19 +6369,21 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr,
* we use the size that was given, and we can forget about
* expanding it later.
*/
- ring_buffer_expanded = true;
+ trace_set_ring_buffer_expanded(tr);
/* May be called before buffers are initialized */
if (!tr->array_buffer.buffer)
return 0;
+ /* Do not allow tracing while resizing ring buffer */
+ tracing_stop_tr(tr);
+
ret = ring_buffer_resize(tr->array_buffer.buffer, size, cpu);
if (ret < 0)
- return ret;
+ goto out_start;
#ifdef CONFIG_TRACER_MAX_TRACE
- if (!(tr->flags & TRACE_ARRAY_FL_GLOBAL) ||
- !tr->current_trace->use_max_tr)
+ if (!tr->allocated_snapshot)
goto out;
ret = ring_buffer_resize(tr->max_buffer.buffer, size, cpu);
@@ -6384,7 +6408,7 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr,
WARN_ON(1);
tracing_disabled = 1;
}
- return ret;
+ goto out_start;
}
update_buffer_entries(&tr->max_buffer, cpu);
@@ -6393,7 +6417,8 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr,
#endif /* CONFIG_TRACER_MAX_TRACE */
update_buffer_entries(&tr->array_buffer, cpu);
-
+ out_start:
+ tracing_start_tr(tr);
return ret;
}
@@ -6425,6 +6450,7 @@ out:
/**
* tracing_update_buffers - used by tracing facility to expand ring buffers
+ * @tr: The tracing instance
*
* To save on memory when the tracing is never used on a system with it
* configured in, the ring buffers are set to a minimum size. But once
@@ -6433,13 +6459,13 @@ out:
*
* This function is to be called when a tracer is about to be used.
*/
-int tracing_update_buffers(void)
+int tracing_update_buffers(struct trace_array *tr)
{
int ret = 0;
mutex_lock(&trace_types_lock);
- if (!ring_buffer_expanded)
- ret = __tracing_resize_ring_buffer(&global_trace, trace_buf_size,
+ if (!tr->ring_buffer_expanded)
+ ret = __tracing_resize_ring_buffer(tr, trace_buf_size,
RING_BUFFER_ALL_CPUS);
mutex_unlock(&trace_types_lock);
@@ -6493,7 +6519,7 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf)
mutex_lock(&trace_types_lock);
- if (!ring_buffer_expanded) {
+ if (!tr->ring_buffer_expanded) {
ret = __tracing_resize_ring_buffer(tr, trace_buf_size,
RING_BUFFER_ALL_CPUS);
if (ret < 0)
@@ -6691,14 +6717,18 @@ static ssize_t
tracing_max_lat_read(struct file *filp, char __user *ubuf,
size_t cnt, loff_t *ppos)
{
- return tracing_nsecs_read(filp->private_data, ubuf, cnt, ppos);
+ struct trace_array *tr = filp->private_data;
+
+ return tracing_nsecs_read(&tr->max_latency, ubuf, cnt, ppos);
}
static ssize_t
tracing_max_lat_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
- return tracing_nsecs_write(filp->private_data, ubuf, cnt, ppos);
+ struct trace_array *tr = filp->private_data;
+
+ return tracing_nsecs_write(&tr->max_latency, ubuf, cnt, ppos);
}
#endif
@@ -6923,8 +6953,8 @@ waitagain:
goto out;
}
- if (cnt >= PAGE_SIZE)
- cnt = PAGE_SIZE - 1;
+ if (cnt >= TRACE_SEQ_BUFFER_SIZE)
+ cnt = TRACE_SEQ_BUFFER_SIZE - 1;
/* reset all but tr, trace, and overruns */
trace_iterator_reset(iter);
@@ -6975,7 +7005,7 @@ waitagain:
/* Now copy what we have to the user */
sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
- if (iter->seq.seq.readpos >= trace_seq_used(&iter->seq))
+ if (iter->seq.readpos >= trace_seq_used(&iter->seq))
trace_seq_init(&iter->seq);
/*
@@ -7161,7 +7191,7 @@ tracing_entries_read(struct file *filp, char __user *ubuf,
}
if (buf_size_same) {
- if (!ring_buffer_expanded)
+ if (!tr->ring_buffer_expanded)
r = sprintf(buf, "%lu (expanded: %lu)\n",
size >> 10,
trace_buf_size >> 10);
@@ -7218,10 +7248,10 @@ tracing_total_entries_read(struct file *filp, char __user *ubuf,
mutex_lock(&trace_types_lock);
for_each_tracing_cpu(cpu) {
size += per_cpu_ptr(tr->array_buffer.data, cpu)->entries >> 10;
- if (!ring_buffer_expanded)
+ if (!tr->ring_buffer_expanded)
expanded_size += trace_buf_size >> 10;
}
- if (ring_buffer_expanded)
+ if (tr->ring_buffer_expanded)
r = sprintf(buf, "%lu\n", size);
else
r = sprintf(buf, "%lu (expanded: %lu)\n", size, expanded_size);
@@ -7269,8 +7299,9 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
enum event_trigger_type tt = ETT_NONE;
struct trace_buffer *buffer;
struct print_entry *entry;
+ int meta_size;
ssize_t written;
- int size;
+ size_t size;
int len;
/* Used in tracing_mark_raw_write() as well */
@@ -7283,23 +7314,44 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
if (!(tr->trace_flags & TRACE_ITER_MARKERS))
return -EINVAL;
- if (cnt > TRACE_BUF_SIZE)
- cnt = TRACE_BUF_SIZE;
-
- BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE);
+ if ((ssize_t)cnt < 0)
+ return -EINVAL;
- size = sizeof(*entry) + cnt + 2; /* add '\0' and possible '\n' */
+ meta_size = sizeof(*entry) + 2; /* add '\0' and possible '\n' */
+ again:
+ size = cnt + meta_size;
/* If less than "<faulted>", then make sure we can still add that */
if (cnt < FAULTED_SIZE)
size += FAULTED_SIZE - cnt;
+ if (size > TRACE_SEQ_BUFFER_SIZE) {
+ cnt -= size - TRACE_SEQ_BUFFER_SIZE;
+ goto again;
+ }
+
buffer = tr->array_buffer.buffer;
event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
tracing_gen_ctx());
- if (unlikely(!event))
+ if (unlikely(!event)) {
+ /*
+ * If the size was greater than what was allowed, then
+ * make it smaller and try again.
+ */
+ if (size > ring_buffer_max_event_size(buffer)) {
+ /* A write shorter than FAULTED_SIZE should never exceed the max event size */
+ if (WARN_ON_ONCE(cnt < FAULTED_SIZE))
+ return -EBADF;
+ cnt = ring_buffer_max_event_size(buffer) - meta_size;
+ /* The above should only happen once */
+ if (WARN_ON_ONCE(cnt + meta_size == size))
+ return -EBADF;
+ goto again;
+ }
+
/* Ring buffer disabled, return as if not open for write */
return -EBADF;
+ }
entry = ring_buffer_event_data(event);
entry->ip = _THIS_IP_;
@@ -7334,9 +7386,6 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
return written;
}
-/* Limit it for now to 3K (including tag) */
-#define RAW_DATA_MAX_SIZE (1024*3)
-
static ssize_t
tracing_mark_raw_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *fpos)
@@ -7358,19 +7407,18 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf,
return -EINVAL;
/* The marker must at least have a tag id */
- if (cnt < sizeof(unsigned int) || cnt > RAW_DATA_MAX_SIZE)
+ if (cnt < sizeof(unsigned int))
return -EINVAL;
- if (cnt > TRACE_BUF_SIZE)
- cnt = TRACE_BUF_SIZE;
-
- BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE);
-
size = sizeof(*entry) + cnt;
if (cnt < FAULT_SIZE_ID)
size += FAULT_SIZE_ID - cnt;
buffer = tr->array_buffer.buffer;
+
+ if (size > ring_buffer_max_event_size(buffer))
+ return -EINVAL;
+
event = __trace_buffer_lock_reserve(buffer, TRACE_RAW_DATA, size,
tracing_gen_ctx());
if (!event)
@@ -7555,6 +7603,7 @@ struct ftrace_buffer_info {
struct trace_iterator iter;
void *spare;
unsigned int spare_cpu;
+ unsigned int spare_size;
unsigned int read;
};
@@ -7615,7 +7664,7 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
unsigned long val;
int ret;
- ret = tracing_update_buffers();
+ ret = tracing_update_buffers(tr);
if (ret < 0)
return ret;
@@ -7752,18 +7801,20 @@ static const struct file_operations tracing_thresh_fops = {
#ifdef CONFIG_TRACER_MAX_TRACE
static const struct file_operations tracing_max_lat_fops = {
- .open = tracing_open_generic,
+ .open = tracing_open_generic_tr,
.read = tracing_max_lat_read,
.write = tracing_max_lat_write,
.llseek = generic_file_llseek,
+ .release = tracing_release_generic_tr,
};
#endif
static const struct file_operations set_tracer_fops = {
- .open = tracing_open_generic,
+ .open = tracing_open_generic_tr,
.read = tracing_set_trace_read,
.write = tracing_set_trace_write,
.llseek = generic_file_llseek,
+ .release = tracing_release_generic_tr,
};
static const struct file_operations tracing_pipe_fops = {
@@ -8257,6 +8308,8 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
{
struct ftrace_buffer_info *info = filp->private_data;
struct trace_iterator *iter = &info->iter;
+ void *trace_data;
+ int page_size;
ssize_t ret = 0;
ssize_t size;
@@ -8268,6 +8321,17 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
return -EBUSY;
#endif
+ page_size = ring_buffer_subbuf_size_get(iter->array_buffer->buffer);
+
+ /* Make sure the spare matches the current sub buffer size */
+ if (info->spare) {
+ if (page_size != info->spare_size) {
+ ring_buffer_free_read_page(iter->array_buffer->buffer,
+ info->spare_cpu, info->spare);
+ info->spare = NULL;
+ }
+ }
+
if (!info->spare) {
info->spare = ring_buffer_alloc_read_page(iter->array_buffer->buffer,
iter->cpu_file);
@@ -8276,19 +8340,20 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
info->spare = NULL;
} else {
info->spare_cpu = iter->cpu_file;
+ info->spare_size = page_size;
}
}
if (!info->spare)
return ret;
/* Do we have previous read data to read? */
- if (info->read < PAGE_SIZE)
+ if (info->read < page_size)
goto read;
again:
trace_access_lock(iter->cpu_file);
ret = ring_buffer_read_page(iter->array_buffer->buffer,
- &info->spare,
+ info->spare,
count,
iter->cpu_file, 0);
trace_access_unlock(iter->cpu_file);
@@ -8309,11 +8374,11 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
info->read = 0;
read:
- size = PAGE_SIZE - info->read;
+ size = page_size - info->read;
if (size > count)
size = count;
-
- ret = copy_to_user(ubuf, info->spare + info->read, size);
+ trace_data = ring_buffer_read_page_data(info->spare);
+ ret = copy_to_user(ubuf, trace_data + info->read, size);
if (ret == size)
return -EFAULT;
@@ -8424,6 +8489,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
.spd_release = buffer_spd_release,
};
struct buffer_ref *ref;
+ int page_size;
int entries, i;
ssize_t ret = 0;
@@ -8432,13 +8498,14 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
return -EBUSY;
#endif
- if (*ppos & (PAGE_SIZE - 1))
+ page_size = ring_buffer_subbuf_size_get(iter->array_buffer->buffer);
+ if (*ppos & (page_size - 1))
return -EINVAL;
- if (len & (PAGE_SIZE - 1)) {
- if (len < PAGE_SIZE)
+ if (len & (page_size - 1)) {
+ if (len < page_size)
return -EINVAL;
- len &= PAGE_MASK;
+ len &= (~(page_size - 1));
}
if (splice_grow_spd(pipe, &spd))
@@ -8448,7 +8515,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
trace_access_lock(iter->cpu_file);
entries = ring_buffer_entries_cpu(iter->array_buffer->buffer, iter->cpu_file);
- for (i = 0; i < spd.nr_pages_max && len && entries; i++, len -= PAGE_SIZE) {
+ for (i = 0; i < spd.nr_pages_max && len && entries; i++, len -= page_size) {
struct page *page;
int r;
@@ -8469,7 +8536,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
}
ref->cpu = iter->cpu_file;
- r = ring_buffer_read_page(ref->buffer, &ref->page,
+ r = ring_buffer_read_page(ref->buffer, ref->page,
len, iter->cpu_file, 1);
if (r < 0) {
ring_buffer_free_read_page(ref->buffer, ref->cpu,
@@ -8478,14 +8545,14 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
break;
}
- page = virt_to_page(ref->page);
+ page = virt_to_page(ring_buffer_read_page_data(ref->page));
spd.pages[i] = page;
- spd.partial[i].len = PAGE_SIZE;
+ spd.partial[i].len = page_size;
spd.partial[i].offset = 0;
spd.partial[i].private = (unsigned long)ref;
spd.nr_pages++;
- *ppos += PAGE_SIZE;
+ *ppos += page_size;
entries = ring_buffer_entries_cpu(iter->array_buffer->buffer, iter->cpu_file);
}
@@ -8506,7 +8573,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
wait_index = READ_ONCE(iter->wait_index);
- ret = wait_on_pipe(iter, iter->tr->buffer_percent);
+ ret = wait_on_pipe(iter, iter->snapshot ? 0 : iter->tr->buffer_percent);
if (ret)
goto out;
@@ -8956,12 +9023,33 @@ trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt,
return cnt;
}
+/*
+ * Open handler for a trace option file. Calls tracing_check_open_get_tr()
+ * on the option's trace array (paired with the trace_array_put() done in
+ * tracing_release_options()) and, only if that succeeds, hands the option
+ * stored in inode->i_private to the file as its private data.
+ *
+ * Returns 0 on success or the error from tracing_check_open_get_tr().
+ */
+static int tracing_open_options(struct inode *inode, struct file *filp)
+{
+	struct trace_option_dentry *opt = inode->i_private;
+	int err = tracing_check_open_get_tr(opt->tr);
+
+	if (!err)
+		filp->private_data = opt;
+
+	return err;
+}
+
+/*
+ * Release handler for a trace option file: puts the trace array that the
+ * option stored in file->private_data belongs to. Always returns 0.
+ */
+static int tracing_release_options(struct inode *inode, struct file *file)
+{
+	struct trace_option_dentry *opt = file->private_data;
+
+	trace_array_put(opt->tr);
+
+	return 0;
+}
static const struct file_operations trace_options_fops = {
- .open = tracing_open_generic,
+ .open = tracing_open_options,
.read = trace_options_read,
.write = trace_options_write,
.llseek = generic_file_llseek,
+ .release = tracing_release_options,
};
/*
@@ -9308,6 +9396,103 @@ static const struct file_operations buffer_percent_fops = {
.llseek = default_llseek,
};
+/*
+ * Read handler for the "buffer_subbuf_size_kb" file: reports the size of a
+ * ring buffer sub-buffer in kilobytes, derived from the buffer's current
+ * sub-buffer order (PAGE_SIZE << order bytes).
+ *
+ * Returns whatever simple_read_from_buffer() returns for the formatted text.
+ */
+static ssize_t
+buffer_subbuf_size_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+	struct trace_array *tr = filp->private_data;
+	size_t size;
+	char buf[64];
+	int order;
+	int r;
+
+	order = ring_buffer_subbuf_order_get(tr->array_buffer.buffer);
+	size = (PAGE_SIZE << order) / 1024;
+
+	/* size is a size_t, so it takes the unsigned %zu conversion */
+	r = sprintf(buf, "%zu\n", size);
+
+	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
+/*
+ * Write handler for the "buffer_subbuf_size_kb" file: sets the ring buffer
+ * sub-buffer size from a decimal value given in kilobytes.
+ *
+ * The value is rounded up to a power-of-two number of system pages (the
+ * "order") and must describe between 1 and 128 pages. Tracing is stopped
+ * for the duration of the change and restarted before returning. When a
+ * snapshot buffer is allocated (CONFIG_TRACER_MAX_TRACE) it is given the
+ * same order; if that fails, the main buffer is put back to its old order.
+ *
+ * Returns @cnt on success (also when the order is unchanged), -EINVAL for
+ * an out-of-range value, or the error from parsing or from changing the
+ * order. *ppos is advanced only when the order was actually changed.
+ */
+static ssize_t
+buffer_subbuf_size_write(struct file *filp, const char __user *ubuf,
+			 size_t cnt, loff_t *ppos)
+{
+	struct trace_array *tr = filp->private_data;
+	unsigned long val;
+	int old_order;
+	int order;
+	int pages;
+	int ret;
+
+	ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
+	if (ret)
+		return ret;
+
+	/*
+	 * Limit between 1 and 128 system pages. The check is done on the KB
+	 * value, before it is scaled, so that neither "val * 1024" nor the
+	 * conversion of the page count to an int can wrap and let an absurdly
+	 * large request through as a small one.
+	 */
+	if (!val || val > (PAGE_SIZE << 7) / 1024)
+		return -EINVAL;
+
+	val *= 1024; /* value passed in is in KB */
+
+	/* With 1..128 pages the resulting order is always within 0..7 */
+	pages = DIV_ROUND_UP(val, PAGE_SIZE);
+	order = fls(pages - 1);
+
+	/* Do not allow tracing while changing the order of the ring buffer */
+	tracing_stop_tr(tr);
+
+	old_order = ring_buffer_subbuf_order_get(tr->array_buffer.buffer);
+	if (old_order == order)
+		goto out;
+
+	ret = ring_buffer_subbuf_order_set(tr->array_buffer.buffer, order);
+	if (ret)
+		goto out;
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+
+	if (!tr->allocated_snapshot)
+		goto out_max;
+
+	ret = ring_buffer_subbuf_order_set(tr->max_buffer.buffer, order);
+	if (ret) {
+		/* Put back the old order */
+		int err = ring_buffer_subbuf_order_set(tr->array_buffer.buffer,
+						       old_order);
+
+		if (WARN_ON_ONCE(err)) {
+			/*
+			 * AARGH! We are left with different orders!
+			 * The max buffer is our "snapshot" buffer.
+			 * When a tracer needs a snapshot (one of the
+			 * latency tracers), it swaps the max buffer
+			 * with the saved snapshot. We succeeded to
+			 * update the order of the main buffer, but failed to
+			 * update the order of the max buffer. But when we tried
+			 * to reset the main buffer to the original size, we
+			 * failed there too. This is very unlikely to
+			 * happen, but if it does, warn and kill all
+			 * tracing.
+			 */
+			tracing_disabled = 1;
+		}
+		goto out;
+	}
+ out_max:
+#endif
+	(*ppos)++;
+ out:
+	/* On any failure the error replaces the byte count as return value */
+	if (ret)
+		cnt = ret;
+	tracing_start_tr(tr);
+	return cnt;
+}
+
+/* File operations behind the "buffer_subbuf_size_kb" file of a trace array */
+static const struct file_operations buffer_subbuf_size_fops = {
+	.open		= tracing_open_generic_tr,
+	.release	= tracing_release_generic_tr,
+	.read		= buffer_subbuf_size_read,
+	.write		= buffer_subbuf_size_write,
+	.llseek		= default_llseek,
+};
+
static struct dentry *trace_instance_dir;
static void
@@ -9458,7 +9643,8 @@ static int trace_array_create_dir(struct trace_array *tr)
return ret;
}
-static struct trace_array *trace_array_create(const char *name)
+static struct trace_array *
+trace_array_create_systems(const char *name, const char *systems)
{
struct trace_array *tr;
int ret;
@@ -9478,6 +9664,12 @@ static struct trace_array *trace_array_create(const char *name)
if (!zalloc_cpumask_var(&tr->pipe_cpumask, GFP_KERNEL))
goto out_free_tr;
+ if (systems) {
+ tr->system_names = kstrdup_const(systems, GFP_KERNEL);
+ if (!tr->system_names)
+ goto out_free_tr;
+ }
+
tr->trace_flags = global_trace.trace_flags & ~ZEROED_TRACE_FLAGS;
cpumask_copy(tr->tracing_cpumask, cpu_all_mask);
@@ -9496,6 +9688,9 @@ static struct trace_array *trace_array_create(const char *name)
if (allocate_trace_buffers(tr, trace_buf_size) < 0)
goto out_free_tr;
+ /* The ring buffer is expanded by default */
+ trace_set_ring_buffer_expanded(tr);
+
if (ftrace_allocate_ftrace_ops(tr) < 0)
goto out_free_tr;
@@ -9521,12 +9716,18 @@ static struct trace_array *trace_array_create(const char *name)
free_trace_buffers(tr);
free_cpumask_var(tr->pipe_cpumask);
free_cpumask_var(tr->tracing_cpumask);
+ kfree_const(tr->system_names);
kfree(tr->name);
kfree(tr);
return ERR_PTR(ret);
}
+/*
+ * Create a trace array called @name without a systems list: passes NULL as
+ * the systems argument of trace_array_create_systems() and returns its result.
+ */
+static struct trace_array *trace_array_create(const char *name)
+{
+ return trace_array_create_systems(name, NULL);
+}
+
static int instance_mkdir(const char *name)
{
struct trace_array *tr;
@@ -9552,6 +9753,7 @@ out_unlock:
/**
* trace_array_get_by_name - Create/Lookup a trace array, given its name.
* @name: The name of the trace array to be looked up/created.
+ * @systems: A list of systems to create event directories for (NULL for all)
*
* Returns pointer to trace array with given name.
* NULL, if it cannot be created.
@@ -9565,7 +9767,7 @@ out_unlock:
* trace_array_put() is called, user space can not delete it.
*
*/
-struct trace_array *trace_array_get_by_name(const char *name)
+struct trace_array *trace_array_get_by_name(const char *name, const char *systems)
{
struct trace_array *tr;
@@ -9577,7 +9779,7 @@ struct trace_array *trace_array_get_by_name(const char *name)
goto out_unlock;
}
- tr = trace_array_create(name);
+ tr = trace_array_create_systems(name, systems);
if (IS_ERR(tr))
tr = NULL;
@@ -9624,6 +9826,7 @@ static int __remove_instance(struct trace_array *tr)
free_cpumask_var(tr->pipe_cpumask);
free_cpumask_var(tr->tracing_cpumask);
+ kfree_const(tr->system_names);
kfree(tr->name);
kfree(tr);
@@ -9705,7 +9908,6 @@ static __init void create_trace_instances(struct dentry *d_tracer)
static void
init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
{
- struct trace_event_file *file;
int cpu;
trace_create_file("available_tracers", TRACE_MODE_READ, d_tracer,
@@ -9738,11 +9940,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
trace_create_file("trace_marker", 0220, d_tracer,
tr, &tracing_mark_fops);
- file = __find_event_file(tr, "ftrace", "print");
- if (file && file->dir)
- trace_create_file("trigger", TRACE_MODE_WRITE, file->dir,
- file, &event_trigger_fops);
- tr->trace_marker_file = file;
+ tr->trace_marker_file = __find_event_file(tr, "ftrace", "print");
trace_create_file("trace_marker_raw", 0220, d_tracer,
tr, &tracing_mark_raw_fops);
@@ -9761,6 +9959,9 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
trace_create_file("buffer_percent", TRACE_MODE_WRITE, d_tracer,
tr, &buffer_percent_fops);
+ trace_create_file("buffer_subbuf_size_kb", TRACE_MODE_WRITE, d_tracer,
+ tr, &buffer_subbuf_size_fops);
+
create_trace_options_dir(tr);
#ifdef CONFIG_TRACER_MAX_TRACE
@@ -10347,7 +10548,7 @@ __init static void enable_instances(void)
if (IS_ENABLED(CONFIG_TRACER_MAX_TRACE))
do_allocate_snapshot(tok);
- tr = trace_array_get_by_name(tok);
+ tr = trace_array_get_by_name(tok, NULL);
if (!tr) {
pr_warn("Failed to create instance buffer %s\n", curr_str);
continue;
@@ -10390,7 +10591,7 @@ __init static int tracer_alloc_buffers(void)
trace_printk_init_buffers();
/* To save memory, keep the ring buffer size to its minimum */
- if (ring_buffer_expanded)
+ if (global_trace.ring_buffer_expanded)
ring_buf_size = trace_buf_size;
else
ring_buf_size = 1;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 5669dd1f90d9..00f873910c5d 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -377,11 +377,12 @@ struct trace_array {
unsigned char trace_flags_index[TRACE_FLAGS_MAX_SIZE];
unsigned int flags;
raw_spinlock_t start_lock;
+ const char *system_names;
struct list_head err_log;
struct dentry *dir;
struct dentry *options;
struct dentry *percpu_dir;
- struct dentry *event_dir;
+ struct eventfs_inode *event_dir;
struct trace_options *topts;
struct list_head systems;
struct list_head events;
@@ -410,6 +411,11 @@ struct trace_array {
struct cond_snapshot *cond_snapshot;
#endif
struct trace_func_repeats __percpu *last_func_repeats;
+ /*
+ * On boot up, the ring buffer is set to the minimum size, so that
+ * we do not waste memory on systems that are not using tracing.
+ */
+ bool ring_buffer_expanded;
};
enum {
@@ -610,6 +616,10 @@ void tracing_reset_all_online_cpus(void);
void tracing_reset_all_online_cpus_unlocked(void);
int tracing_open_generic(struct inode *inode, struct file *filp);
int tracing_open_generic_tr(struct inode *inode, struct file *filp);
+int tracing_release_generic_tr(struct inode *inode, struct file *file);
+int tracing_open_file_tr(struct inode *inode, struct file *filp);
+int tracing_release_file_tr(struct inode *inode, struct file *filp);
+int tracing_single_release_file_tr(struct inode *inode, struct file *filp);
bool tracing_is_disabled(void);
bool tracer_tracing_is_on(struct trace_array *tr);
void tracer_tracing_on(struct trace_array *tr);
@@ -759,7 +769,7 @@ extern int DYN_FTRACE_TEST_NAME(void);
#define DYN_FTRACE_TEST_NAME2 trace_selftest_dynamic_test_func2
extern int DYN_FTRACE_TEST_NAME2(void);
-extern bool ring_buffer_expanded;
+extern void trace_set_ring_buffer_expanded(struct trace_array *tr);
extern bool tracing_selftest_disabled;
#ifdef CONFIG_FTRACE_STARTUP_TEST
@@ -1303,7 +1313,7 @@ static inline void trace_branch_disable(void)
#endif /* CONFIG_BRANCH_TRACER */
/* set ring buffers to default size if not already done so */
-int tracing_update_buffers(void);
+int tracing_update_buffers(struct trace_array *tr);
union trace_synth_field {
u8 as_u8;
@@ -1342,7 +1352,7 @@ struct trace_subsystem_dir {
struct list_head list;
struct event_subsystem *subsystem;
struct trace_array *tr;
- struct eventfs_file *ef;
+ struct eventfs_inode *ei;
int ref_count;
int nr_events;
};
@@ -1662,6 +1672,9 @@ extern void event_trigger_unregister(struct event_command *cmd_ops,
char *glob,
struct event_trigger_data *trigger_data);
+extern void event_file_get(struct trace_event_file *file);
+extern void event_file_put(struct trace_event_file *file);
+
/**
* struct event_trigger_ops - callbacks for trace event triggers
*
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index 7ccc7a8e155b..dbe29b4c6a7a 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -633,7 +633,7 @@ trace_boot_init_instances(struct xbc_node *node)
if (!p || *p == '\0')
continue;
- tr = trace_array_get_by_name(p);
+ tr = trace_array_get_by_name(p, NULL);
if (!tr) {
pr_err("Failed to get trace instance %s\n", p);
continue;
diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c
index 72714cbf475c..03c851f57969 100644
--- a/kernel/trace/trace_eprobe.c
+++ b/kernel/trace/trace_eprobe.c
@@ -788,12 +788,9 @@ find_and_get_event(const char *system, const char *event_name)
name = trace_event_name(tp_event);
if (!name || strcmp(event_name, name))
continue;
- if (!trace_event_try_get_ref(tp_event)) {
+ if (!trace_event_try_get_ref(tp_event))
return NULL;
- break;
- }
return tp_event;
- break;
}
return NULL;
}
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index ed367d713be0..7c364b87352e 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -984,32 +984,41 @@ static void remove_subsystem(struct trace_subsystem_dir *dir)
return;
if (!--dir->nr_events) {
- eventfs_remove(dir->ef);
+ eventfs_remove_dir(dir->ei);
list_del(&dir->list);
__put_system_dir(dir);
}
}
-static void remove_event_file_dir(struct trace_event_file *file)
+void event_file_get(struct trace_event_file *file)
{
- struct dentry *dir = file->dir;
- struct dentry *child;
+ atomic_inc(&file->ref);
+}
- if (dir) {
- spin_lock(&dir->d_lock); /* probably unneeded */
- list_for_each_entry(child, &dir->d_subdirs, d_child) {
- if (d_really_is_positive(child)) /* probably unneeded */
- d_inode(child)->i_private = NULL;
- }
- spin_unlock(&dir->d_lock);
+void event_file_put(struct trace_event_file *file)
+{
+ if (WARN_ON_ONCE(!atomic_read(&file->ref))) {
+ if (file->flags & EVENT_FILE_FL_FREED)
+ kmem_cache_free(file_cachep, file);
+ return;
+ }
- tracefs_remove(dir);
+ if (atomic_dec_and_test(&file->ref)) {
+ /* Count should only go to zero when it is freed */
+ if (WARN_ON_ONCE(!(file->flags & EVENT_FILE_FL_FREED)))
+ return;
+ kmem_cache_free(file_cachep, file);
}
- eventfs_remove(file->ef);
+}
+
+static void remove_event_file_dir(struct trace_event_file *file)
+{
+ eventfs_remove_dir(file->ei);
list_del(&file->list);
remove_subsystem(file->system);
free_event_filter(file->filter);
- kmem_cache_free(file_cachep, file);
+ file->flags |= EVENT_FILE_FL_FREED;
+ event_file_put(file);
}
/*
@@ -1179,7 +1188,7 @@ ftrace_event_write(struct file *file, const char __user *ubuf,
if (!cnt)
return 0;
- ret = tracing_update_buffers();
+ ret = tracing_update_buffers(tr);
if (ret < 0)
return ret;
@@ -1382,7 +1391,7 @@ event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
flags = file->flags;
mutex_unlock(&event_mutex);
- if (!file)
+ if (!file || flags & EVENT_FILE_FL_FREED)
return -ENODEV;
if (flags & EVENT_FILE_FL_ENABLED &&
@@ -1410,18 +1419,20 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
if (ret)
return ret;
- ret = tracing_update_buffers();
- if (ret < 0)
- return ret;
-
switch (val) {
case 0:
case 1:
ret = -ENODEV;
mutex_lock(&event_mutex);
file = event_file_data(filp);
- if (likely(file))
+ if (likely(file && !(file->flags & EVENT_FILE_FL_FREED))) {
+ ret = tracing_update_buffers(file->tr);
+ if (ret < 0) {
+ mutex_unlock(&event_mutex);
+ return ret;
+ }
ret = ftrace_event_enable_disable(file, val);
+ }
mutex_unlock(&event_mutex);
break;
@@ -1495,7 +1506,7 @@ system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
if (ret)
return ret;
- ret = tracing_update_buffers();
+ ret = tracing_update_buffers(dir->tr);
if (ret < 0)
return ret;
@@ -1694,7 +1705,7 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
mutex_lock(&event_mutex);
file = event_file_data(filp);
- if (file)
+ if (file && !(file->flags & EVENT_FILE_FL_FREED))
print_event_filter(file, s);
mutex_unlock(&event_mutex);
@@ -1882,9 +1893,33 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
}
static ssize_t
-show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
+show_header_page_file(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+ struct trace_array *tr = filp->private_data;
+ struct trace_seq *s;
+ int r;
+
+ if (*ppos)
+ return 0;
+
+ s = kmalloc(sizeof(*s), GFP_KERNEL);
+ if (!s)
+ return -ENOMEM;
+
+ trace_seq_init(s);
+
+ ring_buffer_print_page_header(tr->array_buffer.buffer, s);
+ r = simple_read_from_buffer(ubuf, cnt, ppos,
+ s->buffer, trace_seq_used(s));
+
+ kfree(s);
+
+ return r;
+}
+
+static ssize_t
+show_header_event_file(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
{
- int (*func)(struct trace_seq *s) = filp->private_data;
struct trace_seq *s;
int r;
@@ -1897,7 +1932,7 @@ show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
trace_seq_init(s);
- func(s);
+ ring_buffer_print_entry_header(s);
r = simple_read_from_buffer(ubuf, cnt, ppos,
s->buffer, trace_seq_used(s));
@@ -1969,7 +2004,7 @@ event_pid_write(struct file *filp, const char __user *ubuf,
if (!cnt)
return 0;
- ret = tracing_update_buffers();
+ ret = tracing_update_buffers(tr);
if (ret < 0)
return ret;
@@ -2103,9 +2138,10 @@ static const struct file_operations ftrace_set_event_notrace_pid_fops = {
};
static const struct file_operations ftrace_enable_fops = {
- .open = tracing_open_generic,
+ .open = tracing_open_file_tr,
.read = event_enable_read,
.write = event_enable_write,
+ .release = tracing_release_file_tr,
.llseek = default_llseek,
};
@@ -2122,9 +2158,10 @@ static const struct file_operations ftrace_event_id_fops = {
};
static const struct file_operations ftrace_event_filter_fops = {
- .open = tracing_open_generic,
+ .open = tracing_open_file_tr,
.read = event_filter_read,
.write = event_filter_write,
+ .release = tracing_release_file_tr,
.llseek = default_llseek,
};
@@ -2152,10 +2189,18 @@ static const struct file_operations ftrace_tr_enable_fops = {
.release = subsystem_release,
};
-static const struct file_operations ftrace_show_header_fops = {
- .open = tracing_open_generic,
- .read = show_header,
+static const struct file_operations ftrace_show_header_page_fops = {
+ .open = tracing_open_generic_tr,
+ .read = show_header_page_file,
.llseek = default_llseek,
+ .release = tracing_release_generic_tr,
+};
+
+static const struct file_operations ftrace_show_header_event_fops = {
+ .open = tracing_open_generic_tr,
+ .read = show_header_event_file,
+ .llseek = default_llseek,
+ .release = tracing_release_generic_tr,
};
static int
@@ -2291,13 +2336,40 @@ create_new_subsystem(const char *name)
return NULL;
}
-static struct eventfs_file *
+/*
+ * Callback used by the entries of a system directory. Only the names
+ * "filter" and "enable" are recognized, and both get TRACE_MODE_WRITE.
+ *
+ * Returns 1 with *mode and *fops filled in for a recognized name, or 0
+ * without touching them otherwise. *data is never modified.
+ */
+static int system_callback(const char *name, umode_t *mode, void **data,
+			   const struct file_operations **fops)
+{
+	const struct file_operations *ops;
+
+	if (!strcmp(name, "filter"))
+		ops = &ftrace_subsystem_filter_fops;
+	else if (!strcmp(name, "enable"))
+		ops = &ftrace_system_enable_fops;
+	else
+		return 0;
+
+	*fops = ops;
+	*mode = TRACE_MODE_WRITE;
+	return 1;
+}
+
+static struct eventfs_inode *
event_subsystem_dir(struct trace_array *tr, const char *name,
- struct trace_event_file *file, struct dentry *parent)
+ struct trace_event_file *file, struct eventfs_inode *parent)
{
struct event_subsystem *system, *iter;
struct trace_subsystem_dir *dir;
- int res;
+ struct eventfs_inode *ei;
+ int nr_entries;
+ static struct eventfs_entry system_entries[] = {
+ {
+ .name = "filter",
+ .callback = system_callback,
+ },
+ {
+ .name = "enable",
+ .callback = system_callback,
+ }
+ };
/* First see if we did not already create this dir */
list_for_each_entry(dir, &tr->systems, list) {
@@ -2305,7 +2377,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
if (strcmp(system->name, name) == 0) {
dir->nr_events++;
file->system = dir;
- return dir->ef;
+ return dir->ei;
}
}
@@ -2329,38 +2401,29 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
} else
__get_system(system);
- dir->ef = eventfs_add_subsystem_dir(name, parent);
- if (IS_ERR(dir->ef)) {
+ /* The ftrace system has only directories, no files */
+ if (strcmp(name, "ftrace") == 0)
+ nr_entries = 0;
+ else
+ nr_entries = ARRAY_SIZE(system_entries);
+
+ ei = eventfs_create_dir(name, parent, system_entries, nr_entries, dir);
+ if (IS_ERR(ei)) {
pr_warn("Failed to create system directory %s\n", name);
__put_system(system);
goto out_free;
}
+ dir->ei = ei;
dir->tr = tr;
dir->ref_count = 1;
dir->nr_events = 1;
dir->subsystem = system;
file->system = dir;
- /* the ftrace system is special, do not create enable or filter files */
- if (strcmp(name, "ftrace") != 0) {
-
- res = eventfs_add_file("filter", TRACE_MODE_WRITE,
- dir->ef, dir,
- &ftrace_subsystem_filter_fops);
- if (res) {
- kfree(system->filter);
- system->filter = NULL;
- pr_warn("Could not create tracefs '%s/filter' entry\n", name);
- }
-
- eventfs_add_file("enable", TRACE_MODE_WRITE, dir->ef, dir,
- &ftrace_system_enable_fops);
- }
-
list_add(&dir->list, &tr->systems);
- return dir->ef;
+ return dir->ei;
out_free:
kfree(dir);
@@ -2409,14 +2472,134 @@ event_define_fields(struct trace_event_call *call)
return ret;
}
+/*
+ * event_callback - callback used by the entries of an event directory
+ * @name: name of the entry the callback is invoked for
+ * @mode: set to the permission mode of the file when it is provided
+ * @data: on entry points to the trace_event_file of the directory; replaced
+ *        for "format" (the event call) and "id" (the event type number),
+ *        left untouched for every other file
+ * @fops: set to the file operations of the file when it is provided
+ *
+ * Returns 1 when @name is provided for this event, with @mode and @fops
+ * filled in, and 0 when it is not (presumably telling eventfs not to
+ * create that file - confirm against the eventfs callback contract).
+ */
+static int event_callback(const char *name, umode_t *mode, void **data,
+ const struct file_operations **fops)
+{
+ struct trace_event_file *file = *data;
+ struct trace_event_call *call = file->event_call;
+
+ /* "format" is provided for every event; its data is the event call */
+ if (strcmp(name, "format") == 0) {
+ *mode = TRACE_MODE_READ;
+ *fops = &ftrace_event_format_fops;
+ *data = call;
+ return 1;
+ }
+
+ /*
+ * Only event directories that can be enabled should have
+ * triggers or filters, with the exception of the "print"
+ * event that can have a "trigger" file.
+ */
+ if (!(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) {
+ /* "enable" additionally needs the event class to have a reg callback */
+ if (call->class->reg && strcmp(name, "enable") == 0) {
+ *mode = TRACE_MODE_WRITE;
+ *fops = &ftrace_enable_fops;
+ return 1;
+ }
+
+ if (strcmp(name, "filter") == 0) {
+ *mode = TRACE_MODE_WRITE;
+ *fops = &ftrace_event_filter_fops;
+ return 1;
+ }
+ }
+
+ if (!(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE) ||
+ strcmp(trace_event_name(call), "print") == 0) {
+ if (strcmp(name, "trigger") == 0) {
+ *mode = TRACE_MODE_WRITE;
+ *fops = &event_trigger_fops;
+ return 1;
+ }
+ }
+
+#ifdef CONFIG_PERF_EVENTS
+ /* "id" carries the event type number itself as its data */
+ if (call->event.type && call->class->reg &&
+ strcmp(name, "id") == 0) {
+ *mode = TRACE_MODE_READ;
+ *data = (void *)(long)call->event.type;
+ *fops = &ftrace_event_id_fops;
+ return 1;
+ }
+#endif
+
+#ifdef CONFIG_HIST_TRIGGERS
+ /* "hist" is provided unconditionally when this option is configured */
+ if (strcmp(name, "hist") == 0) {
+ *mode = TRACE_MODE_READ;
+ *fops = &event_hist_fops;
+ return 1;
+ }
+#endif
+#ifdef CONFIG_HIST_TRIGGERS_DEBUG
+ if (strcmp(name, "hist_debug") == 0) {
+ *mode = TRACE_MODE_READ;
+ *fops = &event_hist_debug_fops;
+ return 1;
+ }
+#endif
+#ifdef CONFIG_TRACE_EVENT_INJECT
+ /* "inject" is write-only (0200) and has the same conditions as "id" */
+ if (call->event.type && call->class->reg &&
+ strcmp(name, "inject") == 0) {
+ *mode = 0200;
+ *fops = &event_inject_fops;
+ return 1;
+ }
+#endif
+ /* Any other name, or a file this event does not qualify for */
+ return 0;
+}
+
static int
-event_create_dir(struct dentry *parent, struct trace_event_file *file)
+event_create_dir(struct eventfs_inode *parent, struct trace_event_file *file)
{
struct trace_event_call *call = file->event_call;
- struct eventfs_file *ef_subsystem = NULL;
struct trace_array *tr = file->tr;
+ struct eventfs_inode *e_events;
+ struct eventfs_inode *ei;
const char *name;
+ int nr_entries;
int ret;
+ static struct eventfs_entry event_entries[] = {
+ {
+ .name = "enable",
+ .callback = event_callback,
+ },
+ {
+ .name = "filter",
+ .callback = event_callback,
+ },
+ {
+ .name = "trigger",
+ .callback = event_callback,
+ },
+ {
+ .name = "format",
+ .callback = event_callback,
+ },
+#ifdef CONFIG_PERF_EVENTS
+ {
+ .name = "id",
+ .callback = event_callback,
+ },
+#endif
+#ifdef CONFIG_HIST_TRIGGERS
+ {
+ .name = "hist",
+ .callback = event_callback,
+ },
+#endif
+#ifdef CONFIG_HIST_TRIGGERS_DEBUG
+ {
+ .name = "hist_debug",
+ .callback = event_callback,
+ },
+#endif
+#ifdef CONFIG_TRACE_EVENT_INJECT
+ {
+ .name = "inject",
+ .callback = event_callback,
+ },
+#endif
+ };
/*
* If the trace point header did not define TRACE_SYSTEM
@@ -2426,27 +2609,20 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file)
if (WARN_ON_ONCE(strcmp(call->class->system, TRACE_SYSTEM) == 0))
return -ENODEV;
- ef_subsystem = event_subsystem_dir(tr, call->class->system, file, parent);
- if (!ef_subsystem)
+ e_events = event_subsystem_dir(tr, call->class->system, file, parent);
+ if (!e_events)
return -ENOMEM;
+ nr_entries = ARRAY_SIZE(event_entries);
+
name = trace_event_name(call);
- file->ef = eventfs_add_dir(name, ef_subsystem);
- if (IS_ERR(file->ef)) {
+ ei = eventfs_create_dir(name, e_events, event_entries, nr_entries, file);
+ if (IS_ERR(ei)) {
pr_warn("Could not create tracefs '%s' directory\n", name);
return -1;
}
- if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE))
- eventfs_add_file("enable", TRACE_MODE_WRITE, file->ef, file,
- &ftrace_enable_fops);
-
-#ifdef CONFIG_PERF_EVENTS
- if (call->event.type && call->class->reg)
- eventfs_add_file("id", TRACE_MODE_READ, file->ef,
- (void *)(long)call->event.type,
- &ftrace_event_id_fops);
-#endif
+ file->ei = ei;
ret = event_define_fields(call);
if (ret < 0) {
@@ -2454,35 +2630,6 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file)
return ret;
}
- /*
- * Only event directories that can be enabled should have
- * triggers or filters.
- */
- if (!(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) {
- eventfs_add_file("filter", TRACE_MODE_WRITE, file->ef,
- file, &ftrace_event_filter_fops);
-
- eventfs_add_file("trigger", TRACE_MODE_WRITE, file->ef,
- file, &event_trigger_fops);
- }
-
-#ifdef CONFIG_HIST_TRIGGERS
- eventfs_add_file("hist", TRACE_MODE_READ, file->ef, file,
- &event_hist_fops);
-#endif
-#ifdef CONFIG_HIST_TRIGGERS_DEBUG
- eventfs_add_file("hist_debug", TRACE_MODE_READ, file->ef, file,
- &event_hist_debug_fops);
-#endif
- eventfs_add_file("format", TRACE_MODE_READ, file->ef, call,
- &ftrace_event_format_fops);
-
-#ifdef CONFIG_TRACE_EVENT_INJECT
- if (call->event.type && call->class->reg)
- eventfs_add_file("inject", 0200, file->ef, file,
- &event_inject_fops);
-#endif
-
return 0;
}
@@ -2776,10 +2923,32 @@ void trace_event_eval_update(struct trace_eval_map **map, int len)
update_event_fields(call, map[i]);
}
}
+ cond_resched();
}
up_write(&trace_event_sem);
}
+static bool event_in_systems(struct trace_event_call *call,
+ const char *systems)
+{
+ const char *system;
+ const char *p;
+
+ if (!systems)
+ return true;
+
+ system = call->class->system;
+ p = strstr(systems, system);
+ if (!p)
+ return false;
+
+ if (p != systems && !isspace(*(p - 1)) && *(p - 1) != ',')
+ return false;
+
+ p += strlen(system);
+ return !*p || isspace(*p) || *p == ',';
+}
+
static struct trace_event_file *
trace_create_new_event(struct trace_event_call *call,
struct trace_array *tr)
@@ -2789,9 +2958,12 @@ trace_create_new_event(struct trace_event_call *call,
struct trace_event_file *file;
unsigned int first;
+ if (!event_in_systems(call, tr->system_names))
+ return NULL;
+
file = kmem_cache_alloc(file_cachep, GFP_TRACE);
if (!file)
- return NULL;
+ return ERR_PTR(-ENOMEM);
pid_list = rcu_dereference_protected(tr->filtered_pids,
lockdep_is_held(&event_mutex));
@@ -2808,6 +2980,7 @@ trace_create_new_event(struct trace_event_call *call,
atomic_set(&file->tm_ref, 0);
INIT_LIST_HEAD(&file->triggers);
list_add(&file->list, &tr->events);
+ event_file_get(file);
return file;
}
@@ -2829,7 +3002,7 @@ static __init int setup_trace_triggers(char *str)
int i;
strscpy(bootup_trigger_buf, str, COMMAND_LINE_SIZE);
- ring_buffer_expanded = true;
+ trace_set_ring_buffer_expanded(NULL);
disable_tracing_selftest("running event triggers");
buf = bootup_trigger_buf;
@@ -2855,8 +3028,17 @@ __trace_add_new_event(struct trace_event_call *call, struct trace_array *tr)
struct trace_event_file *file;
file = trace_create_new_event(call, tr);
+ /*
+ * trace_create_new_event() returns ERR_PTR(-ENOMEM) if failed
+ * allocation, or NULL if the event is not part of the tr->system_names.
+ * When the event is not part of the tr->system_names, return zero, not
+ * an error.
+ */
if (!file)
- return -ENOMEM;
+ return 0;
+
+ if (IS_ERR(file))
+ return PTR_ERR(file);
if (eventdir_initialized)
return event_create_dir(tr->event_dir, file);
@@ -2895,8 +3077,17 @@ __trace_early_add_new_event(struct trace_event_call *call,
int ret;
file = trace_create_new_event(call, tr);
+ /*
+ * trace_create_new_event() returns ERR_PTR(-ENOMEM) if failed
+ * allocation, or NULL if the event is not part of the tr->system_names.
+ * When the event is not part of the tr->system_names, return zero, not
+ * an error.
+ */
if (!file)
- return -ENOMEM;
+ return 0;
+
+ if (IS_ERR(file))
+ return PTR_ERR(file);
ret = event_define_fields(call);
if (ret)
@@ -3619,37 +3810,71 @@ static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata;
static __init int setup_trace_event(char *str)
{
strscpy(bootup_event_buf, str, COMMAND_LINE_SIZE);
- ring_buffer_expanded = true;
+ trace_set_ring_buffer_expanded(NULL);
disable_tracing_selftest("running event tracing");
return 1;
}
__setup("trace_event=", setup_trace_event);
+static int events_callback(const char *name, umode_t *mode, void **data,
+ const struct file_operations **fops)
+{
+ if (strcmp(name, "enable") == 0) {
+ *mode = TRACE_MODE_WRITE;
+ *fops = &ftrace_tr_enable_fops;
+ return 1;
+ }
+
+ if (strcmp(name, "header_page") == 0) {
+ *mode = TRACE_MODE_READ;
+ *fops = &ftrace_show_header_page_fops;
+
+ } else if (strcmp(name, "header_event") == 0) {
+ *mode = TRACE_MODE_READ;
+ *fops = &ftrace_show_header_event_fops;
+ } else
+ return 0;
+
+ return 1;
+}
+
/* Expects to have event_mutex held when called */
static int
create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
{
- struct dentry *d_events;
+ struct eventfs_inode *e_events;
struct dentry *entry;
- int error = 0;
+ int nr_entries;
+ static struct eventfs_entry events_entries[] = {
+ {
+ .name = "enable",
+ .callback = events_callback,
+ },
+ {
+ .name = "header_page",
+ .callback = events_callback,
+ },
+ {
+ .name = "header_event",
+ .callback = events_callback,
+ },
+ };
entry = trace_create_file("set_event", TRACE_MODE_WRITE, parent,
tr, &ftrace_set_event_fops);
if (!entry)
return -ENOMEM;
- d_events = eventfs_create_events_dir("events", parent);
- if (IS_ERR(d_events)) {
+ nr_entries = ARRAY_SIZE(events_entries);
+
+ e_events = eventfs_create_events_dir("events", parent, events_entries,
+ nr_entries, tr);
+ if (IS_ERR(e_events)) {
pr_warn("Could not create tracefs 'events' directory\n");
return -ENOMEM;
}
- error = eventfs_add_events_file("enable", TRACE_MODE_WRITE, d_events,
- tr, &ftrace_tr_enable_fops);
- if (error)
- return -ENOMEM;
-
/* There are not as crucial, just warn if they are not created */
trace_create_file("set_event_pid", TRACE_MODE_WRITE, parent,
@@ -3659,16 +3884,7 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
TRACE_MODE_WRITE, parent, tr,
&ftrace_set_event_notrace_pid_fops);
- /* ring buffer internal formats */
- eventfs_add_events_file("header_page", TRACE_MODE_READ, d_events,
- ring_buffer_print_page_header,
- &ftrace_show_header_fops);
-
- eventfs_add_events_file("header_event", TRACE_MODE_READ, d_events,
- ring_buffer_print_entry_header,
- &ftrace_show_header_fops);
-
- tr->event_dir = d_events;
+ tr->event_dir = e_events;
return 0;
}
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 33264e510d16..0c611b281a5b 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -2349,6 +2349,9 @@ int apply_event_filter(struct trace_event_file *file, char *filter_string)
struct event_filter *filter = NULL;
int err;
+ if (file->flags & EVENT_FILE_FL_FREED)
+ return -ENODEV;
+
if (!strcmp(strstrip(filter_string), "0")) {
filter_disable(file);
filter = event_filter(file);
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index d06938ae0717..6ece1308d36a 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -774,23 +774,16 @@ static void last_cmd_set(struct trace_event_file *file, char *str)
{
const char *system = NULL, *name = NULL;
struct trace_event_call *call;
- int len;
if (!str)
return;
- /* sizeof() contains the nul byte */
- len = sizeof(HIST_PREFIX) + strlen(str);
kfree(last_cmd);
- last_cmd = kzalloc(len, GFP_KERNEL);
+
+ last_cmd = kasprintf(GFP_KERNEL, HIST_PREFIX "%s", str);
if (!last_cmd)
return;
- strcpy(last_cmd, HIST_PREFIX);
- /* Again, sizeof() contains the nul byte */
- len -= sizeof(HIST_PREFIX);
- strncat(last_cmd, str, len);
-
if (file) {
call = file->event_call;
system = call->class->system;
@@ -4812,36 +4805,35 @@ static int parse_actions(struct hist_trigger_data *hist_data)
int len;
for (i = 0; i < hist_data->attrs->n_actions; i++) {
+ enum handler_id hid = 0;
+ char *action_str;
+
str = hist_data->attrs->action_str[i];
- if ((len = str_has_prefix(str, "onmatch("))) {
- char *action_str = str + len;
+ if ((len = str_has_prefix(str, "onmatch(")))
+ hid = HANDLER_ONMATCH;
+ else if ((len = str_has_prefix(str, "onmax(")))
+ hid = HANDLER_ONMAX;
+ else if ((len = str_has_prefix(str, "onchange(")))
+ hid = HANDLER_ONCHANGE;
- data = onmatch_parse(tr, action_str);
- if (IS_ERR(data)) {
- ret = PTR_ERR(data);
- break;
- }
- } else if ((len = str_has_prefix(str, "onmax("))) {
- char *action_str = str + len;
+ action_str = str + len;
- data = track_data_parse(hist_data, action_str,
- HANDLER_ONMAX);
- if (IS_ERR(data)) {
- ret = PTR_ERR(data);
- break;
- }
- } else if ((len = str_has_prefix(str, "onchange("))) {
- char *action_str = str + len;
+ switch (hid) {
+ case HANDLER_ONMATCH:
+ data = onmatch_parse(tr, action_str);
+ break;
+ case HANDLER_ONMAX:
+ case HANDLER_ONCHANGE:
+ data = track_data_parse(hist_data, action_str, hid);
+ break;
+ default:
+ data = ERR_PTR(-EINVAL);
+ break;
+ }
- data = track_data_parse(hist_data, action_str,
- HANDLER_ONCHANGE);
- if (IS_ERR(data)) {
- ret = PTR_ERR(data);
- break;
- }
- } else {
- ret = -EINVAL;
+ if (IS_ERR(data)) {
+ ret = PTR_ERR(data);
break;
}
@@ -5630,10 +5622,12 @@ static int event_hist_open(struct inode *inode, struct file *file)
{
int ret;
- ret = security_locked_down(LOCKDOWN_TRACEFS);
+ ret = tracing_open_file_tr(inode, file);
if (ret)
return ret;
+ /* Clear private_data to avoid warning in single_open() */
+ file->private_data = NULL;
return single_open(file, hist_show, file);
}
@@ -5641,7 +5635,7 @@ const struct file_operations event_hist_fops = {
.open = event_hist_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = single_release,
+ .release = tracing_single_release_file_tr,
};
#ifdef CONFIG_HIST_TRIGGERS_DEBUG
@@ -5907,10 +5901,12 @@ static int event_hist_debug_open(struct inode *inode, struct file *file)
{
int ret;
- ret = security_locked_down(LOCKDOWN_TRACEFS);
+ ret = tracing_open_file_tr(inode, file);
if (ret)
return ret;
+ /* Clear private_data to avoid warning in single_open() */
+ file->private_data = NULL;
return single_open(file, hist_debug_show, file);
}
@@ -5918,7 +5914,7 @@ const struct file_operations event_hist_debug_fops = {
.open = event_hist_debug_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = single_release,
+ .release = tracing_single_release_file_tr,
};
#endif
diff --git a/kernel/trace/trace_events_inject.c b/kernel/trace/trace_events_inject.c
index abe805d471eb..8650562bdaa9 100644
--- a/kernel/trace/trace_events_inject.c
+++ b/kernel/trace/trace_events_inject.c
@@ -328,7 +328,8 @@ event_inject_read(struct file *file, char __user *buf, size_t size,
}
const struct file_operations event_inject_fops = {
- .open = tracing_open_generic,
+ .open = tracing_open_file_tr,
.read = event_inject_read,
.write = event_inject_write,
+ .release = tracing_release_file_tr,
};
diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c
index 9897d0bfcab7..e7af286af4f1 100644
--- a/kernel/trace/trace_events_synth.c
+++ b/kernel/trace/trace_events_synth.c
@@ -337,7 +337,7 @@ static void print_synth_event_num_val(struct trace_seq *s,
break;
default:
- trace_seq_printf(s, print_fmt, name, val, space);
+ trace_seq_printf(s, print_fmt, name, val->as_u64, space);
break;
}
}
@@ -452,7 +452,7 @@ static unsigned int trace_string(struct synth_trace_event *entry,
#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
if ((unsigned long)str_val < TASK_SIZE)
- ret = strncpy_from_user_nofault(str_field, str_val, STR_VAR_LEN_MAX);
+ ret = strncpy_from_user_nofault(str_field, (const void __user *)str_val, STR_VAR_LEN_MAX);
else
#endif
ret = strncpy_from_kernel_nofault(str_field, str_val, STR_VAR_LEN_MAX);
@@ -1137,7 +1137,7 @@ EXPORT_SYMBOL_GPL(synth_event_add_fields);
* @cmd: A pointer to the dynevent_cmd struct representing the new event
* @name: The name of the synthetic event
* @mod: The module creating the event, NULL if not created from a module
- * @args: Variable number of arg (pairs), one pair for each field
+ * @...: Variable number of arg (pairs), one pair for each field
*
* NOTE: Users normally won't want to call this function directly, but
* rather use the synth_event_gen_cmd_start() wrapper, which
@@ -1695,7 +1695,7 @@ __synth_event_trace_end(struct synth_event_trace_state *trace_state)
* synth_event_trace - Trace a synthetic event
* @file: The trace_event_file representing the synthetic event
* @n_vals: The number of values in vals
- * @args: Variable number of args containing the event values
+ * @...: Variable number of args containing the event values
*
* Trace a synthetic event using the values passed in the variable
* argument list.
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index 46439e3bcec4..b33c3861fbbb 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -1470,8 +1470,10 @@ register_snapshot_trigger(char *glob,
struct event_trigger_data *data,
struct trace_event_file *file)
{
- if (tracing_alloc_snapshot_instance(file->tr) != 0)
- return 0;
+ int ret = tracing_alloc_snapshot_instance(file->tr);
+
+ if (ret < 0)
+ return ret;
return register_trigger(glob, data, file);
}
diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index 6f046650e527..e76f5e1efdf2 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -50,18 +50,6 @@
#define EVENT_STATUS_OTHER BIT(7)
/*
- * User register flags are not allowed yet, keep them here until we are
- * ready to expose them out to the user ABI.
- */
-enum user_reg_flag {
- /* Event will not delete upon last reference closing */
- USER_EVENT_REG_PERSIST = 1U << 0,
-
- /* This value or above is currently non-ABI */
- USER_EVENT_REG_MAX = 1U << 1,
-};
-
-/*
* Stores the system name, tables, and locks for a group of events. This
* allows isolation for events by various means.
*/
@@ -127,8 +115,13 @@ struct user_event_enabler {
/* Bit 7 is for freeing status of enablement */
#define ENABLE_VAL_FREEING_BIT 7
-/* Only duplicate the bit value */
-#define ENABLE_VAL_DUP_MASK ENABLE_VAL_BIT_MASK
+/* Bit 8 is for marking 32-bit on 64-bit */
+#define ENABLE_VAL_32_ON_64_BIT 8
+
+#define ENABLE_VAL_COMPAT_MASK (1 << ENABLE_VAL_32_ON_64_BIT)
+
+/* Only duplicate the bit and compat values */
+#define ENABLE_VAL_DUP_MASK (ENABLE_VAL_BIT_MASK | ENABLE_VAL_COMPAT_MASK)
#define ENABLE_BITOPS(e) (&(e)->values)
@@ -174,6 +167,30 @@ struct user_event_validator {
int flags;
};
+static inline void align_addr_bit(unsigned long *addr, int *bit,
+ unsigned long *flags)
+{
+ if (IS_ALIGNED(*addr, sizeof(long))) {
+#ifdef __BIG_ENDIAN
+ /* 32 bit on BE 64 bit requires a 32 bit offset when aligned. */
+ if (test_bit(ENABLE_VAL_32_ON_64_BIT, flags))
+ *bit += 32;
+#endif
+ return;
+ }
+
+ *addr = ALIGN_DOWN(*addr, sizeof(long));
+
+ /*
+ * We only support 32 and 64 bit values. The only time we need
+ * to align is a 32 bit value on a 64 bit kernel, which on LE
+ * is always 32 bits, and on BE requires no change when unaligned.
+ */
+#ifdef __LITTLE_ENDIAN
+ *bit += 32;
+#endif
+}
+
typedef void (*user_event_func_t) (struct user_event *user, struct iov_iter *i,
void *tpdata, bool *faulted);
@@ -191,6 +208,17 @@ static u32 user_event_key(char *name)
return jhash(name, strlen(name), 0);
}
+static bool user_event_capable(u16 reg_flags)
+{
+ /* Persistent events require CAP_PERFMON / CAP_SYS_ADMIN */
+ if (reg_flags & USER_EVENT_REG_PERSIST) {
+ if (!perfmon_capable())
+ return false;
+ }
+
+ return true;
+}
+
static struct user_event *user_event_get(struct user_event *user)
{
refcount_inc(&user->refcnt);
@@ -482,6 +510,7 @@ static int user_event_enabler_write(struct user_event_mm *mm,
unsigned long *ptr;
struct page *page;
void *kaddr;
+ int bit = ENABLE_BIT(enabler);
int ret;
lockdep_assert_held(&event_mutex);
@@ -497,6 +526,8 @@ static int user_event_enabler_write(struct user_event_mm *mm,
test_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(enabler))))
return -EBUSY;
+ align_addr_bit(&uaddr, &bit, ENABLE_BITOPS(enabler));
+
ret = pin_user_pages_remote(mm->mm, uaddr, 1, FOLL_WRITE | FOLL_NOFAULT,
&page, NULL);
@@ -515,9 +546,9 @@ static int user_event_enabler_write(struct user_event_mm *mm,
/* Update bit atomically, user tracers must be atomic as well */
if (enabler->event && enabler->event->status)
- set_bit(ENABLE_BIT(enabler), ptr);
+ set_bit(bit, ptr);
else
- clear_bit(ENABLE_BIT(enabler), ptr);
+ clear_bit(bit, ptr);
kunmap_local(kaddr);
unpin_user_pages_dirty_lock(&page, 1, true);
@@ -849,6 +880,12 @@ static struct user_event_enabler
enabler->event = user;
enabler->addr = uaddr;
enabler->values = reg->enable_bit;
+
+#if BITS_PER_LONG >= 64
+ if (reg->enable_size == 4)
+ set_bit(ENABLE_VAL_32_ON_64_BIT, ENABLE_BITOPS(enabler));
+#endif
+
retry:
/* Prevents state changes from racing with new enablers */
mutex_lock(&event_mutex);
@@ -1773,6 +1810,9 @@ static int user_event_free(struct dyn_event *ev)
if (!user_event_last_ref(user))
return -EBUSY;
+ if (!user_event_capable(user->reg_flags))
+ return -EPERM;
+
return destroy_user_event(user);
}
@@ -1888,10 +1928,13 @@ static int user_event_parse(struct user_event_group *group, char *name,
int argc = 0;
char **argv;
- /* User register flags are not ready yet */
- if (reg_flags != 0 || flags != NULL)
+ /* Currently don't support any text based flags */
+ if (flags != NULL)
return -EINVAL;
+ if (!user_event_capable(reg_flags))
+ return -EPERM;
+
/* Prevent dyn_event from racing */
mutex_lock(&event_mutex);
user = find_user_event(group, name, &key);
@@ -2024,6 +2067,9 @@ static int delete_user_event(struct user_event_group *group, char *name)
if (!user_event_last_ref(user))
return -EBUSY;
+ if (!user_event_capable(user->reg_flags))
+ return -EPERM;
+
return destroy_user_event(user);
}
@@ -2131,14 +2177,12 @@ static int user_events_open(struct inode *node, struct file *file)
static ssize_t user_events_write(struct file *file, const char __user *ubuf,
size_t count, loff_t *ppos)
{
- struct iovec iov;
struct iov_iter i;
if (unlikely(*ppos != 0))
return -EFAULT;
- if (unlikely(import_single_range(ITER_SOURCE, (char __user *)ubuf,
- count, &iov, &i)))
+ if (unlikely(import_ubuf(ITER_SOURCE, (char __user *)ubuf, count, &i)))
return -EFAULT;
return user_events_write_core(file, &i);
@@ -2377,7 +2421,8 @@ static long user_unreg_get(struct user_unreg __user *ureg,
}
static int user_event_mm_clear_bit(struct user_event_mm *user_mm,
- unsigned long uaddr, unsigned char bit)
+ unsigned long uaddr, unsigned char bit,
+ unsigned long flags)
{
struct user_event_enabler enabler;
int result;
@@ -2385,7 +2430,7 @@ static int user_event_mm_clear_bit(struct user_event_mm *user_mm,
memset(&enabler, 0, sizeof(enabler));
enabler.addr = uaddr;
- enabler.values = bit;
+ enabler.values = bit | flags;
retry:
/* Prevents state changes from racing with new enablers */
mutex_lock(&event_mutex);
@@ -2415,6 +2460,7 @@ static long user_events_ioctl_unreg(unsigned long uarg)
struct user_event_mm *mm = current->user_event_mm;
struct user_event_enabler *enabler, *next;
struct user_unreg reg;
+ unsigned long flags;
long ret;
ret = user_unreg_get(ureg, &reg);
@@ -2425,6 +2471,7 @@ static long user_events_ioctl_unreg(unsigned long uarg)
if (!mm)
return -ENOENT;
+ flags = 0;
ret = -ENOENT;
/*
@@ -2441,6 +2488,9 @@ static long user_events_ioctl_unreg(unsigned long uarg)
ENABLE_BIT(enabler) == reg.disable_bit) {
set_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(enabler));
+ /* We must keep compat flags for the clear */
+ flags |= enabler->values & ENABLE_VAL_COMPAT_MASK;
+
if (!test_bit(ENABLE_VAL_FAULTING_BIT, ENABLE_BITOPS(enabler)))
user_event_enabler_destroy(enabler, true);
@@ -2454,7 +2504,7 @@ static long user_events_ioctl_unreg(unsigned long uarg)
/* Ensure bit is now cleared for user, regardless of event status */
if (!ret)
ret = user_event_mm_clear_bit(mm, reg.disable_addr,
- reg.disable_bit);
+ reg.disable_bit, flags);
return ret;
}
diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c
index 8bfe23af9c73..7d2ddbcfa377 100644
--- a/kernel/trace/trace_fprobe.c
+++ b/kernel/trace/trace_fprobe.c
@@ -927,11 +927,12 @@ static int parse_symbol_and_return(int argc, const char *argv[],
for (i = 2; i < argc; i++) {
tmp = strstr(argv[i], "$retval");
if (tmp && !isalnum(tmp[7]) && tmp[7] != '_') {
+ if (is_tracepoint) {
+ trace_probe_log_set_index(i);
+ trace_probe_log_err(tmp - argv[i], RETVAL_ON_PROBE);
+ return -EINVAL;
+ }
*is_return = true;
- /*
- * NOTE: Don't check is_tracepoint here, because it will
- * be checked when the argument is parsed.
- */
break;
}
}
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 3d7a180a8427..c4c6e0e0068b 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -487,8 +487,8 @@ static int __register_trace_kprobe(struct trace_kprobe *tk)
return -EINVAL;
if (within_notrace_func(tk)) {
- pr_warn("Could not probe notrace function %s\n",
- trace_kprobe_symbol(tk));
+ pr_warn("Could not probe notrace function %ps\n",
+ (void *)trace_kprobe_address(tk));
return -EINVAL;
}
@@ -705,6 +705,41 @@ static struct notifier_block trace_kprobe_module_nb = {
.priority = 1 /* Invoked after kprobe module callback */
};
+static int count_symbols(void *data, unsigned long unused)
+{
+ unsigned int *count = data;
+
+ (*count)++;
+
+ return 0;
+}
+
+struct sym_count_ctx {
+ unsigned int count;
+ const char *name;
+};
+
+static int count_mod_symbols(void *data, const char *name, unsigned long unused)
+{
+ struct sym_count_ctx *ctx = data;
+
+ if (strcmp(name, ctx->name) == 0)
+ ctx->count++;
+
+ return 0;
+}
+
+static unsigned int number_of_same_symbols(char *func_name)
+{
+ struct sym_count_ctx ctx = { .count = 0, .name = func_name };
+
+ kallsyms_on_each_match_symbol(count_symbols, func_name, &ctx.count);
+
+ module_kallsyms_on_each_symbol(NULL, count_mod_symbols, &ctx);
+
+ return ctx.count;
+}
+
static int __trace_kprobe_create(int argc, const char *argv[])
{
/*
@@ -836,6 +871,31 @@ static int __trace_kprobe_create(int argc, const char *argv[])
}
}
+ if (symbol && !strchr(symbol, ':')) {
+ unsigned int count;
+
+ count = number_of_same_symbols(symbol);
+ if (count > 1) {
+ /*
+ * Users should use ADDR to remove the ambiguity of
+ * using KSYM only.
+ */
+ trace_probe_log_err(0, NON_UNIQ_SYMBOL);
+ ret = -EADDRNOTAVAIL;
+
+ goto error;
+ } else if (count == 0) {
+ /*
+ * We can return ENOENT earlier than when register the
+ * kprobe.
+ */
+ trace_probe_log_err(0, BAD_PROBE_ADDR);
+ ret = -ENOENT;
+
+ goto error;
+ }
+ }
+
trace_probe_log_set_index(0);
if (event) {
ret = traceprobe_parse_event_name(&event, &group, gbuf,
@@ -960,10 +1020,10 @@ EXPORT_SYMBOL_GPL(kprobe_event_cmd_init);
/**
* __kprobe_event_gen_cmd_start - Generate a kprobe event command from arg list
* @cmd: A pointer to the dynevent_cmd struct representing the new event
+ * @kretprobe: Is this a return probe?
* @name: The name of the kprobe event
* @loc: The location of the kprobe event
- * @kretprobe: Is this a return probe?
- * @args: Variable number of arg (pairs), one pair for each field
+ * @...: Variable number of arg (pairs), one pair for each field
*
* NOTE: Users normally won't want to call this function directly, but
* rather use the kprobe_event_gen_cmd_start() wrapper, which automatically
@@ -1036,7 +1096,7 @@ EXPORT_SYMBOL_GPL(__kprobe_event_gen_cmd_start);
/**
* __kprobe_event_add_fields - Add probe fields to a kprobe command from arg list
* @cmd: A pointer to the dynevent_cmd struct representing the new event
- * @args: Variable number of arg (pairs), one pair for each field
+ * @...: Variable number of arg (pairs), one pair for each field
*
* NOTE: Users normally won't want to call this function directly, but
* rather use the kprobe_event_add_fields() wrapper, which
@@ -1189,6 +1249,12 @@ static const struct file_operations kprobe_events_ops = {
.write = probes_write,
};
+static unsigned long trace_kprobe_missed(struct trace_kprobe *tk)
+{
+ return trace_kprobe_is_return(tk) ?
+ tk->rp.kp.nmissed + tk->rp.nmissed : tk->rp.kp.nmissed;
+}
+
/* Probes profiling interfaces */
static int probes_profile_seq_show(struct seq_file *m, void *v)
{
@@ -1200,8 +1266,7 @@ static int probes_profile_seq_show(struct seq_file *m, void *v)
return 0;
tk = to_trace_kprobe(ev);
- nmissed = trace_kprobe_is_return(tk) ?
- tk->rp.kp.nmissed + tk->rp.nmissed : tk->rp.kp.nmissed;
+ nmissed = trace_kprobe_missed(tk);
seq_printf(m, " %-44s %15lu %15lu\n",
trace_probe_name(&tk->tp),
trace_kprobe_nhit(tk),
@@ -1547,7 +1612,8 @@ NOKPROBE_SYMBOL(kretprobe_perf_func);
int bpf_get_kprobe_info(const struct perf_event *event, u32 *fd_type,
const char **symbol, u64 *probe_offset,
- u64 *probe_addr, bool perf_type_tracepoint)
+ u64 *probe_addr, unsigned long *missed,
+ bool perf_type_tracepoint)
{
const char *pevent = trace_event_name(event->tp_event);
const char *group = event->tp_event->class->system;
@@ -1566,6 +1632,8 @@ int bpf_get_kprobe_info(const struct perf_event *event, u32 *fd_type,
*probe_addr = kallsyms_show_value(current_cred()) ?
(unsigned long)tk->rp.kp.addr : 0;
*symbol = tk->symbol;
+ if (missed)
+ *missed = trace_kprobe_missed(tk);
return 0;
}
#endif /* CONFIG_PERF_EVENTS */
@@ -1695,6 +1763,7 @@ static int unregister_kprobe_event(struct trace_kprobe *tk)
}
#ifdef CONFIG_PERF_EVENTS
+
/* create a trace_kprobe, but don't add it to global lists */
struct trace_event_call *
create_local_trace_kprobe(char *func, void *addr, unsigned long offs,
@@ -1705,6 +1774,24 @@ create_local_trace_kprobe(char *func, void *addr, unsigned long offs,
int ret;
char *event;
+ if (func) {
+ unsigned int count;
+
+ count = number_of_same_symbols(func);
+ if (count > 1)
+ /*
+ * Users should use addr to remove the ambiguity of
+ * using func only.
+ */
+ return ERR_PTR(-EADDRNOTAVAIL);
+ else if (count == 0)
+ /*
+ * We can return ENOENT earlier than when register the
+ * kprobe.
+ */
+ return ERR_PTR(-ENOENT);
+ }
+
/*
* local trace_kprobes are not added to dyn_event, so they are never
* searched in find_trace_kprobe(). Therefore, there is no concern of
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index bd0d01d00fb9..a8e28f9b9271 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -2444,6 +2444,9 @@ static int timerlat_fd_open(struct inode *inode, struct file *file)
tlat = this_cpu_tmr_var();
tlat->count = 0;
+ hrtimer_init(&tlat->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
+ tlat->timer.function = timerlat_irq;
+
migrate_enable();
return 0;
};
@@ -2526,9 +2529,6 @@ timerlat_fd_read(struct file *file, char __user *ubuf, size_t count,
tlat->tracing_thread = false;
tlat->kthread = current;
- hrtimer_init(&tlat->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
- tlat->timer.function = timerlat_irq;
-
/* Annotate now to drift new period */
tlat->abs_period = hrtimer_cb_get_time(&tlat->timer);
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index db575094c498..3e7fa44dc2b2 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -404,7 +404,7 @@ static int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
vmstart = vma->vm_start;
}
if (file) {
- ret = trace_seq_path(s, &file->f_path);
+ ret = trace_seq_path(s, file_user_path(file));
if (ret)
trace_seq_printf(s, "[+0x%lx]",
ip - vmstart);
@@ -1587,11 +1587,12 @@ static enum print_line_t trace_print_print(struct trace_iterator *iter,
{
struct print_entry *field;
struct trace_seq *s = &iter->seq;
+ int max = iter->ent_size - offsetof(struct print_entry, buf);
trace_assign_type(field, iter->ent);
seq_print_ip_sym(s, field->ip, flags);
- trace_seq_printf(s, ": %s", field->buf);
+ trace_seq_printf(s, ": %.*s", max, field->buf);
return trace_handle_return(s);
}
@@ -1600,10 +1601,11 @@ static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags,
struct trace_event *event)
{
struct print_entry *field;
+ int max = iter->ent_size - offsetof(struct print_entry, buf);
trace_assign_type(field, iter->ent);
- trace_seq_printf(&iter->seq, "# %lx %s", field->ip, field->buf);
+ trace_seq_printf(&iter->seq, "# %lx %.*s", field->ip, max, field->buf);
return trace_handle_return(&iter->seq);
}
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 4dc74d73fc1d..34289f9c6707 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -1159,9 +1159,12 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size,
if (!(ctx->flags & TPARG_FL_TEVENT) &&
(strcmp(arg, "$comm") == 0 || strcmp(arg, "$COMM") == 0 ||
strncmp(arg, "\\\"", 2) == 0)) {
- /* The type of $comm must be "string", and not an array. */
- if (parg->count || (t && strcmp(t, "string")))
+ /* The type of $comm must be "string", and not an array type. */
+ if (parg->count || (t && strcmp(t, "string"))) {
+ trace_probe_log_err(ctx->offset + (t ? (t - arg) : 0),
+ NEED_STRING_TYPE);
goto out;
+ }
parg->type = find_fetch_type("string", ctx->flags);
} else
parg->type = find_fetch_type(t, ctx->flags);
@@ -1169,18 +1172,6 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size,
trace_probe_log_err(ctx->offset + (t ? (t - arg) : 0), BAD_TYPE);
goto out;
}
- parg->offset = *size;
- *size += parg->type->size * (parg->count ?: 1);
-
- ret = -ENOMEM;
- if (parg->count) {
- len = strlen(parg->type->fmttype) + 6;
- parg->fmt = kmalloc(len, GFP_KERNEL);
- if (!parg->fmt)
- goto out;
- snprintf(parg->fmt, len, "%s[%d]", parg->type->fmttype,
- parg->count);
- }
code = tmp = kcalloc(FETCH_INSN_MAX, sizeof(*code), GFP_KERNEL);
if (!code)
@@ -1204,6 +1195,19 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size,
goto fail;
}
}
+ parg->offset = *size;
+ *size += parg->type->size * (parg->count ?: 1);
+
+ if (parg->count) {
+ len = strlen(parg->type->fmttype) + 6;
+ parg->fmt = kmalloc(len, GFP_KERNEL);
+ if (!parg->fmt) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ snprintf(parg->fmt, len, "%s[%d]", parg->type->fmttype,
+ parg->count);
+ }
ret = -EINVAL;
/* Store operation */
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 02b432ae7513..c1877d018269 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -450,6 +450,7 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call,
C(BAD_MAXACT, "Invalid maxactive number"), \
C(MAXACT_TOO_BIG, "Maxactive is too big"), \
C(BAD_PROBE_ADDR, "Invalid probed address or symbol"), \
+ C(NON_UNIQ_SYMBOL, "The symbol is not unique"), \
C(BAD_RETPROBE, "Retprobe address must be an function entry"), \
C(NO_TRACEPOINT, "Tracepoint is not found"), \
C(BAD_ADDR_SUFFIX, "Invalid probed address suffix"), \
@@ -514,7 +515,8 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call,
C(BAD_HYPHEN, "Failed to parse single hyphen. Forgot '>'?"), \
C(NO_BTF_FIELD, "This field is not found."), \
C(BAD_BTF_TID, "Failed to get BTF type info."),\
- C(BAD_TYPE4STR, "This type does not fit for string."),
+ C(BAD_TYPE4STR, "This type does not fit for string."),\
+ C(NEED_STRING_TYPE, "$comm and immediate-string only accepts string type"),
#undef C
#define C(a, b) TP_ERR_##a
diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c
index bac06ee3b98b..c158d65a8a88 100644
--- a/kernel/trace/trace_seq.c
+++ b/kernel/trace/trace_seq.c
@@ -13,9 +13,6 @@
* trace_seq_init() more than once to reset the trace_seq to start
* from scratch.
*
- * The buffer size is currently PAGE_SIZE, although it may become dynamic
- * in the future.
- *
* A write to the buffer will either succeed or fail. That is, unlike
* sprintf() there will not be a partial write (well it may write into
* the buffer but it wont update the pointers). This allows users to
@@ -370,8 +367,12 @@ EXPORT_SYMBOL_GPL(trace_seq_path);
*/
int trace_seq_to_user(struct trace_seq *s, char __user *ubuf, int cnt)
{
+ int ret;
__trace_seq_init(s);
- return seq_buf_to_user(&s->seq, ubuf, cnt);
+ ret = seq_buf_to_user(&s->seq, ubuf, s->readpos, cnt);
+ if (ret > 0)
+ s->readpos += ret;
+ return ret;
}
EXPORT_SYMBOL_GPL(trace_seq_to_user);
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index de753403cdaf..9c581d6da843 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -556,7 +556,7 @@ static int perf_call_bpf_enter(struct trace_event_call *call, struct pt_regs *re
{
struct syscall_tp_t {
struct trace_entry ent;
- unsigned long syscall_nr;
+ int syscall_nr;
unsigned long args[SYSCALL_DEFINE_MAXARGS];
} __aligned(8) param;
int i;
@@ -661,7 +661,7 @@ static int perf_call_bpf_exit(struct trace_event_call *call, struct pt_regs *reg
{
struct syscall_tp_t {
struct trace_entry ent;
- unsigned long syscall_nr;
+ int syscall_nr;
unsigned long ret;
} __aligned(8) param;
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 99c051de412a..a84b85d8aac1 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -151,7 +151,7 @@ fetch_store_string(unsigned long addr, void *dest, void *base)
return -ENOMEM;
if (addr == FETCH_TOKEN_COMM)
- ret = strlcpy(dst, current->comm, maxlen);
+ ret = strscpy(dst, current->comm, maxlen);
else
ret = strncpy_from_user(dst, src, maxlen);
if (ret >= 0) {
diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c
index c774e560f2f9..a4dcf0f24352 100644
--- a/kernel/trace/tracing_map.c
+++ b/kernel/trace/tracing_map.c
@@ -574,7 +574,12 @@ __tracing_map_insert(struct tracing_map *map, void *key, bool lookup_only)
}
memcpy(elt->key, key, map->key_size);
- entry->val = elt;
+ /*
+ * Ensure the initialization is visible and
+ * publish the elt.
+ */
+ smp_wmb();
+ WRITE_ONCE(entry->val, elt);
atomic64_inc(&map->hits);
return entry->val;
diff --git a/kernel/up.c b/kernel/up.c
index a38b8b095251..df50828cc2f0 100644
--- a/kernel/up.c
+++ b/kernel/up.c
@@ -25,7 +25,7 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
}
EXPORT_SYMBOL(smp_call_function_single);
-int smp_call_function_single_async(int cpu, struct __call_single_data *csd)
+int smp_call_function_single_async(int cpu, call_single_data_t *csd)
{
unsigned long flags;
diff --git a/kernel/user.c b/kernel/user.c
index d667debeafd6..03cedc366dc9 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -18,8 +18,18 @@
#include <linux/interrupt.h>
#include <linux/export.h>
#include <linux/user_namespace.h>
+#include <linux/binfmts.h>
#include <linux/proc_ns.h>
+#if IS_ENABLED(CONFIG_BINFMT_MISC)
+struct binfmt_misc init_binfmt_misc = {
+ .entries = LIST_HEAD_INIT(init_binfmt_misc.entries),
+ .enabled = true,
+ .entries_lock = __RW_LOCK_UNLOCKED(init_binfmt_misc.entries_lock),
+};
+EXPORT_SYMBOL_GPL(init_binfmt_misc);
+#endif
+
/*
* userns count is 1 for root user, 1 for init_uts_ns,
* and 1 for... ?
@@ -67,6 +77,9 @@ struct user_namespace init_user_ns = {
.keyring_name_list = LIST_HEAD_INIT(init_user_ns.keyring_name_list),
.keyring_sem = __RWSEM_INITIALIZER(init_user_ns.keyring_sem),
#endif
+#if IS_ENABLED(CONFIG_BINFMT_MISC)
+ .binfmt_misc = &init_binfmt_misc,
+#endif
};
EXPORT_SYMBOL_GPL(init_user_ns);
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 1d8e47bed3f1..ce4d99df5f0e 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -22,7 +22,7 @@
#include <linux/bsearch.h>
#include <linux/sort.h>
-static struct kmem_cache *user_ns_cachep __read_mostly;
+static struct kmem_cache *user_ns_cachep __ro_after_init;
static DEFINE_MUTEX(userns_state_mutex);
static bool new_idmap_permitted(const struct file *file,
@@ -213,6 +213,9 @@ static void free_user_ns(struct work_struct *work)
kfree(ns->projid_map.forward);
kfree(ns->projid_map.reverse);
}
+#if IS_ENABLED(CONFIG_BINFMT_MISC)
+ kfree(ns->binfmt_misc);
+#endif
retire_userns_sysctls(ns);
key_free_user_ns(ns);
ns_free_inum(&ns->ns);
@@ -228,7 +231,7 @@ void __put_user_ns(struct user_namespace *ns)
}
EXPORT_SYMBOL(__put_user_ns);
-/**
+/*
* struct idmap_key - holds the information necessary to find an idmapping in a
* sorted idmap array. It is passed to cmp_map_id() as first argument.
*/
@@ -238,7 +241,7 @@ struct idmap_key {
u32 count; /* == 0 unless used with map_id_range_down() */
};
-/**
+/*
* cmp_map_id - Function to be passed to bsearch() to find the requested
* idmapping. Expects struct idmap_key to be passed via @k.
*/
@@ -268,7 +271,7 @@ static int cmp_map_id(const void *k, const void *e)
return 1;
}
-/**
+/*
* map_id_range_down_max - Find idmap via binary search in ordered idmap array.
* Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS.
*/
@@ -285,7 +288,7 @@ map_id_range_down_max(unsigned extents, struct uid_gid_map *map, u32 id, u32 cou
sizeof(struct uid_gid_extent), cmp_map_id);
}
-/**
+/*
* map_id_range_down_base - Find idmap via binary search in static extent array.
* Can only be called if number of mappings is equal or less than
* UID_GID_MAP_MAX_BASE_EXTENTS.
@@ -329,12 +332,12 @@ static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count)
return id;
}
-static u32 map_id_down(struct uid_gid_map *map, u32 id)
+u32 map_id_down(struct uid_gid_map *map, u32 id)
{
return map_id_range_down(map, id, 1);
}
-/**
+/*
* map_id_up_base - Find idmap via binary search in static extent array.
* Can only be called if number of mappings is equal or less than
* UID_GID_MAP_MAX_BASE_EXTENTS.
@@ -355,7 +358,7 @@ map_id_up_base(unsigned extents, struct uid_gid_map *map, u32 id)
return NULL;
}
-/**
+/*
* map_id_up_max - Find idmap via binary search in ordered idmap array.
* Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS.
*/
@@ -372,7 +375,7 @@ map_id_up_max(unsigned extents, struct uid_gid_map *map, u32 id)
sizeof(struct uid_gid_extent), cmp_map_id);
}
-static u32 map_id_up(struct uid_gid_map *map, u32 id)
+u32 map_id_up(struct uid_gid_map *map, u32 id)
{
struct uid_gid_extent *extent;
unsigned extents = map->nr_extents;
@@ -767,7 +770,7 @@ static bool mappings_overlap(struct uid_gid_map *new_map,
return false;
}
-/**
+/*
* insert_extent - Safely insert a new idmap extent into struct uid_gid_map.
* Takes care to allocate a 4K block of memory if the number of mappings exceeds
* UID_GID_MAP_MAX_BASE_EXTENTS.
@@ -836,7 +839,7 @@ static int cmp_extents_reverse(const void *a, const void *b)
return 0;
}
-/**
+/*
* sort_idmaps - Sorts an array of idmap entries.
* Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS.
*/
diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c
index d0b6b390ee42..03b90d7d2175 100644
--- a/kernel/watch_queue.c
+++ b/kernel/watch_queue.c
@@ -270,7 +270,7 @@ long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes)
goto error;
ret = -ENOMEM;
- pages = kcalloc(sizeof(struct page *), nr_pages, GFP_KERNEL);
+ pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
if (!pages)
goto error;
@@ -331,7 +331,7 @@ long watch_queue_set_filter(struct pipe_inode_info *pipe,
filter.__reserved != 0)
return -EINVAL;
- tf = memdup_user(_filter->filters, filter.nr_filters * sizeof(*tf));
+ tf = memdup_array_user(_filter->filters, filter.nr_filters, sizeof(*tf));
if (IS_ERR(tf))
return PTR_ERR(tf);
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index d145305d95fe..81a8862295d6 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -91,7 +91,7 @@ static DEFINE_PER_CPU(atomic_t, hrtimer_interrupts);
static DEFINE_PER_CPU(int, hrtimer_interrupts_saved);
static DEFINE_PER_CPU(bool, watchdog_hardlockup_warned);
static DEFINE_PER_CPU(bool, watchdog_hardlockup_touched);
-static unsigned long watchdog_hardlockup_all_cpu_dumped;
+static unsigned long hard_lockup_nmi_warn;
notrace void arch_touch_nmi_watchdog(void)
{
@@ -151,12 +151,32 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs)
*/
if (is_hardlockup(cpu)) {
unsigned int this_cpu = smp_processor_id();
+ unsigned long flags;
/* Only print hardlockups once. */
if (per_cpu(watchdog_hardlockup_warned, cpu))
return;
+ /*
+ * Prevent multiple hard-lockup reports if one cpu is already
+ * engaged in dumping all cpu back traces.
+ */
+ if (sysctl_hardlockup_all_cpu_backtrace) {
+ if (test_and_set_bit_lock(0, &hard_lockup_nmi_warn))
+ return;
+ }
+
+ /*
+ * NOTE: we call printk_cpu_sync_get_irqsave() after printing
+ * the lockup message. While it would be nice to serialize
+ * that printout, we really want to make sure that if some
+ * other CPU somehow locked up while holding the lock associated
+ * with printk_cpu_sync_get_irqsave() that we can still at least
+ * get the message about the lockup out.
+ */
pr_emerg("Watchdog detected hard LOCKUP on cpu %d\n", cpu);
+ printk_cpu_sync_get_irqsave(flags);
+
print_modules();
print_irqtrace_events(current);
if (cpu == this_cpu) {
@@ -164,17 +184,17 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs)
show_regs(regs);
else
dump_stack();
+ printk_cpu_sync_put_irqrestore(flags);
} else {
+ printk_cpu_sync_put_irqrestore(flags);
trigger_single_cpu_backtrace(cpu);
}
- /*
- * Perform multi-CPU dump only once to avoid multiple
- * hardlockups generating interleaving traces
- */
- if (sysctl_hardlockup_all_cpu_backtrace &&
- !test_and_set_bit(0, &watchdog_hardlockup_all_cpu_dumped))
+ if (sysctl_hardlockup_all_cpu_backtrace) {
trigger_allbutcpu_cpu_backtrace(cpu);
+ if (!hardlockup_panic)
+ clear_bit_unlock(0, &hard_lockup_nmi_warn);
+ }
if (hardlockup_panic)
nmi_panic(regs, "Hard LOCKUP");
@@ -283,6 +303,13 @@ static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer);
static DEFINE_PER_CPU(bool, softlockup_touch_sync);
static unsigned long soft_lockup_nmi_warn;
+static int __init softlockup_panic_setup(char *str)
+{
+ softlockup_panic = simple_strtoul(str, NULL, 0);
+ return 1;
+}
+__setup("softlockup_panic=", softlockup_panic_setup);
+
static int __init nowatchdog_setup(char *str)
{
watchdog_user_enabled = 0;
@@ -441,6 +468,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
struct pt_regs *regs = get_irq_regs();
int duration;
int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace;
+ unsigned long flags;
if (!watchdog_enabled)
return HRTIMER_NORESTART;
@@ -507,6 +535,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
/* Start period for the next softlockup warning. */
update_report_ts();
+ printk_cpu_sync_get_irqsave(flags);
pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
smp_processor_id(), duration,
current->comm, task_pid_nr(current));
@@ -516,10 +545,12 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
show_regs(regs);
else
dump_stack();
+ printk_cpu_sync_put_irqrestore(flags);
if (softlockup_all_cpu_backtrace) {
trigger_allbutcpu_cpu_backtrace(smp_processor_id());
- clear_bit_unlock(0, &soft_lockup_nmi_warn);
+ if (!softlockup_panic)
+ clear_bit_unlock(0, &soft_lockup_nmi_warn);
}
add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index c85825e17df8..76e60faed892 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -381,6 +381,12 @@ static bool workqueue_freezing; /* PL: have wqs started freezing? */
/* PL&A: allowable cpus for unbound wqs and work items */
static cpumask_var_t wq_unbound_cpumask;
+/* PL: user requested unbound cpumask via sysfs */
+static cpumask_var_t wq_requested_unbound_cpumask;
+
+/* PL: isolated cpumask to be excluded from unbound cpumask */
+static cpumask_var_t wq_isolated_cpumask;
+
/* for further constrain wq_unbound_cpumask by cmdline parameter*/
static struct cpumask wq_cmdline_cpumask __initdata;
@@ -418,21 +424,21 @@ static struct workqueue_attrs *ordered_wq_attrs[NR_STD_WORKER_POOLS];
* process context while holding a pool lock. Bounce to a dedicated kthread
* worker to avoid A-A deadlocks.
*/
-static struct kthread_worker *pwq_release_worker;
+static struct kthread_worker *pwq_release_worker __ro_after_init;
-struct workqueue_struct *system_wq __read_mostly;
+struct workqueue_struct *system_wq __ro_after_init;
EXPORT_SYMBOL(system_wq);
-struct workqueue_struct *system_highpri_wq __read_mostly;
+struct workqueue_struct *system_highpri_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_highpri_wq);
-struct workqueue_struct *system_long_wq __read_mostly;
+struct workqueue_struct *system_long_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_long_wq);
-struct workqueue_struct *system_unbound_wq __read_mostly;
+struct workqueue_struct *system_unbound_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_unbound_wq);
-struct workqueue_struct *system_freezable_wq __read_mostly;
+struct workqueue_struct *system_freezable_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_freezable_wq);
-struct workqueue_struct *system_power_efficient_wq __read_mostly;
+struct workqueue_struct *system_power_efficient_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_power_efficient_wq);
-struct workqueue_struct *system_freezable_power_efficient_wq __read_mostly;
+struct workqueue_struct *system_freezable_power_efficient_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);
static int worker_thread(void *__worker);
@@ -1684,9 +1690,6 @@ static int wq_select_unbound_cpu(int cpu)
pr_warn_once("workqueue: round-robin CPU selection forced, expect performance impact\n");
}
- if (cpumask_empty(wq_unbound_cpumask))
- return cpu;
-
new_cpu = __this_cpu_read(wq_rr_cpu_last);
new_cpu = cpumask_next_and(new_cpu, wq_unbound_cpumask, cpu_online_mask);
if (unlikely(new_cpu >= nr_cpu_ids)) {
@@ -2166,7 +2169,7 @@ static struct worker *create_worker(struct worker_pool *pool)
{
struct worker *worker;
int id;
- char id_buf[16];
+ char id_buf[23];
/* ID is needed to determine kthread name */
id = ida_alloc(&pool->worker_ida, GFP_KERNEL);
@@ -4411,19 +4414,6 @@ static void apply_wqattrs_commit(struct apply_wqattrs_ctx *ctx)
mutex_unlock(&ctx->wq->mutex);
}
-static void apply_wqattrs_lock(void)
-{
- /* CPUs should stay stable across pwq creations and installations */
- cpus_read_lock();
- mutex_lock(&wq_pool_mutex);
-}
-
-static void apply_wqattrs_unlock(void)
-{
- mutex_unlock(&wq_pool_mutex);
- cpus_read_unlock();
-}
-
static int apply_workqueue_attrs_locked(struct workqueue_struct *wq,
const struct workqueue_attrs *attrs)
{
@@ -4600,12 +4590,22 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq)
}
cpus_read_unlock();
+ /* for unbound pwq, flush the pwq_release_worker ensures that the
+ * pwq_release_workfn() completes before calling kfree(wq).
+ */
+ if (ret)
+ kthread_flush_worker(pwq_release_worker);
+
return ret;
enomem:
if (wq->cpu_pwq) {
- for_each_possible_cpu(cpu)
- kfree(*per_cpu_ptr(wq->cpu_pwq, cpu));
+ for_each_possible_cpu(cpu) {
+ struct pool_workqueue *pwq = *per_cpu_ptr(wq->cpu_pwq, cpu);
+
+ if (pwq)
+ kmem_cache_free(pwq_cache, pwq);
+ }
free_percpu(wq->cpu_pwq);
wq->cpu_pwq = NULL;
}
@@ -5612,50 +5612,54 @@ static void work_for_cpu_fn(struct work_struct *work)
}
/**
- * work_on_cpu - run a function in thread context on a particular cpu
+ * work_on_cpu_key - run a function in thread context on a particular cpu
* @cpu: the cpu to run on
* @fn: the function to run
* @arg: the function arg
+ * @key: The lock class key for lock debugging purposes
*
* It is up to the caller to ensure that the cpu doesn't go offline.
* The caller must not hold any locks which would prevent @fn from completing.
*
* Return: The value @fn returns.
*/
-long work_on_cpu(int cpu, long (*fn)(void *), void *arg)
+long work_on_cpu_key(int cpu, long (*fn)(void *),
+ void *arg, struct lock_class_key *key)
{
struct work_for_cpu wfc = { .fn = fn, .arg = arg };
- INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn);
+ INIT_WORK_ONSTACK_KEY(&wfc.work, work_for_cpu_fn, key);
schedule_work_on(cpu, &wfc.work);
flush_work(&wfc.work);
destroy_work_on_stack(&wfc.work);
return wfc.ret;
}
-EXPORT_SYMBOL_GPL(work_on_cpu);
+EXPORT_SYMBOL_GPL(work_on_cpu_key);
/**
- * work_on_cpu_safe - run a function in thread context on a particular cpu
+ * work_on_cpu_safe_key - run a function in thread context on a particular cpu
* @cpu: the cpu to run on
* @fn: the function to run
* @arg: the function argument
+ * @key: The lock class key for lock debugging purposes
*
* Disables CPU hotplug and calls work_on_cpu(). The caller must not hold
* any locks which would prevent @fn from completing.
*
* Return: The value @fn returns.
*/
-long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg)
+long work_on_cpu_safe_key(int cpu, long (*fn)(void *),
+ void *arg, struct lock_class_key *key)
{
long ret = -ENODEV;
cpus_read_lock();
if (cpu_online(cpu))
- ret = work_on_cpu(cpu, fn, arg);
+ ret = work_on_cpu_key(cpu, fn, arg, key);
cpus_read_unlock();
return ret;
}
-EXPORT_SYMBOL_GPL(work_on_cpu_safe);
+EXPORT_SYMBOL_GPL(work_on_cpu_safe_key);
#endif /* CONFIG_SMP */
#ifdef CONFIG_FREEZER
@@ -5782,9 +5786,13 @@ static int workqueue_apply_unbound_cpumask(const cpumask_var_t unbound_cpumask)
list_for_each_entry(wq, &workqueues, list) {
if (!(wq->flags & WQ_UNBOUND))
continue;
+
/* creating multiple pwqs breaks ordering guarantee */
- if (wq->flags & __WQ_ORDERED)
- continue;
+ if (!list_empty(&wq->pwqs)) {
+ if (wq->flags & __WQ_ORDERED_EXPLICIT)
+ continue;
+ wq->flags &= ~__WQ_ORDERED;
+ }
ctx = apply_wqattrs_prepare(wq, wq->unbound_attrs, unbound_cpumask);
if (IS_ERR(ctx)) {
@@ -5810,39 +5818,40 @@ static int workqueue_apply_unbound_cpumask(const cpumask_var_t unbound_cpumask)
}
/**
- * workqueue_set_unbound_cpumask - Set the low-level unbound cpumask
- * @cpumask: the cpumask to set
+ * workqueue_unbound_exclude_cpumask - Exclude given CPUs from unbound cpumask
+ * @exclude_cpumask: the cpumask to be excluded from wq_unbound_cpumask
*
- * The low-level workqueues cpumask is a global cpumask that limits
- * the affinity of all unbound workqueues. This function check the @cpumask
- * and apply it to all unbound workqueues and updates all pwqs of them.
- *
- * Return: 0 - Success
- * -EINVAL - Invalid @cpumask
- * -ENOMEM - Failed to allocate memory for attrs or pwqs.
+ * This function can be called from cpuset code to provide a set of isolated
+ * CPUs that should be excluded from wq_unbound_cpumask. The caller must hold
+ * either cpus_read_lock or cpus_write_lock.
*/
-int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)
+int workqueue_unbound_exclude_cpumask(cpumask_var_t exclude_cpumask)
{
- int ret = -EINVAL;
+ cpumask_var_t cpumask;
+ int ret = 0;
+
+ if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL))
+ return -ENOMEM;
+
+ lockdep_assert_cpus_held();
+ mutex_lock(&wq_pool_mutex);
+
+ /* Save the current isolated cpumask & export it via sysfs */
+ cpumask_copy(wq_isolated_cpumask, exclude_cpumask);
/*
- * Not excluding isolated cpus on purpose.
- * If the user wishes to include them, we allow that.
+ * If the operation fails, it will fall back to
+ * wq_requested_unbound_cpumask which is initially set to
+ * (HK_TYPE_WQ ∩ HK_TYPE_DOMAIN) house keeping mask and rewritten
+ * by any subsequent write to workqueue/cpumask sysfs file.
*/
- cpumask_and(cpumask, cpumask, cpu_possible_mask);
- if (!cpumask_empty(cpumask)) {
- apply_wqattrs_lock();
- if (cpumask_equal(cpumask, wq_unbound_cpumask)) {
- ret = 0;
- goto out_unlock;
- }
-
+ if (!cpumask_andnot(cpumask, wq_requested_unbound_cpumask, exclude_cpumask))
+ cpumask_copy(cpumask, wq_requested_unbound_cpumask);
+ if (!cpumask_equal(cpumask, wq_unbound_cpumask))
ret = workqueue_apply_unbound_cpumask(cpumask);
-out_unlock:
- apply_wqattrs_unlock();
- }
-
+ mutex_unlock(&wq_pool_mutex);
+ free_cpumask_var(cpumask);
return ret;
}
@@ -5964,6 +5973,19 @@ static struct attribute *wq_sysfs_attrs[] = {
};
ATTRIBUTE_GROUPS(wq_sysfs);
+static void apply_wqattrs_lock(void)
+{
+ /* CPUs should stay stable across pwq creations and installations */
+ cpus_read_lock();
+ mutex_lock(&wq_pool_mutex);
+}
+
+static void apply_wqattrs_unlock(void)
+{
+ mutex_unlock(&wq_pool_mutex);
+ cpus_read_unlock();
+}
+
static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
@@ -6140,19 +6162,74 @@ static struct bus_type wq_subsys = {
.dev_groups = wq_sysfs_groups,
};
-static ssize_t wq_unbound_cpumask_show(struct device *dev,
- struct device_attribute *attr, char *buf)
+/**
+ * workqueue_set_unbound_cpumask - Set the low-level unbound cpumask
+ * @cpumask: the cpumask to set
+ *
+ * The low-level workqueues cpumask is a global cpumask that limits
+ * the affinity of all unbound workqueues. This function check the @cpumask
+ * and apply it to all unbound workqueues and updates all pwqs of them.
+ *
+ * Return: 0 - Success
+ * -EINVAL - Invalid @cpumask
+ * -ENOMEM - Failed to allocate memory for attrs or pwqs.
+ */
+static int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)
+{
+ int ret = -EINVAL;
+
+ /*
+ * Not excluding isolated cpus on purpose.
+ * If the user wishes to include them, we allow that.
+ */
+ cpumask_and(cpumask, cpumask, cpu_possible_mask);
+ if (!cpumask_empty(cpumask)) {
+ apply_wqattrs_lock();
+ cpumask_copy(wq_requested_unbound_cpumask, cpumask);
+ if (cpumask_equal(cpumask, wq_unbound_cpumask)) {
+ ret = 0;
+ goto out_unlock;
+ }
+
+ ret = workqueue_apply_unbound_cpumask(cpumask);
+
+out_unlock:
+ apply_wqattrs_unlock();
+ }
+
+ return ret;
+}
+
+static ssize_t __wq_cpumask_show(struct device *dev,
+ struct device_attribute *attr, char *buf, cpumask_var_t mask)
{
int written;
mutex_lock(&wq_pool_mutex);
- written = scnprintf(buf, PAGE_SIZE, "%*pb\n",
- cpumask_pr_args(wq_unbound_cpumask));
+ written = scnprintf(buf, PAGE_SIZE, "%*pb\n", cpumask_pr_args(mask));
mutex_unlock(&wq_pool_mutex);
return written;
}
+static ssize_t wq_unbound_cpumask_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ return __wq_cpumask_show(dev, attr, buf, wq_unbound_cpumask);
+}
+
+static ssize_t wq_requested_cpumask_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ return __wq_cpumask_show(dev, attr, buf, wq_requested_unbound_cpumask);
+}
+
+static ssize_t wq_isolated_cpumask_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ return __wq_cpumask_show(dev, attr, buf, wq_isolated_cpumask);
+}
+
static ssize_t wq_unbound_cpumask_store(struct device *dev,
struct device_attribute *attr, const char *buf, size_t count)
{
@@ -6170,9 +6247,13 @@ static ssize_t wq_unbound_cpumask_store(struct device *dev,
return ret ? ret : count;
}
-static struct device_attribute wq_sysfs_cpumask_attr =
+static struct device_attribute wq_sysfs_cpumask_attrs[] = {
__ATTR(cpumask, 0644, wq_unbound_cpumask_show,
- wq_unbound_cpumask_store);
+ wq_unbound_cpumask_store),
+ __ATTR(cpumask_requested, 0444, wq_requested_cpumask_show, NULL),
+ __ATTR(cpumask_isolated, 0444, wq_isolated_cpumask_show, NULL),
+ __ATTR_NULL,
+};
static int __init wq_sysfs_init(void)
{
@@ -6185,7 +6266,13 @@ static int __init wq_sysfs_init(void)
dev_root = bus_get_dev_root(&wq_subsys);
if (dev_root) {
- err = device_create_file(dev_root, &wq_sysfs_cpumask_attr);
+ struct device_attribute *attr;
+
+ for (attr = wq_sysfs_cpumask_attrs; attr->attr.name; attr++) {
+ err = device_create_file(dev_root, attr);
+ if (err)
+ break;
+ }
put_device(dev_root);
}
return err;
@@ -6497,6 +6584,17 @@ static inline void wq_watchdog_init(void) { }
#endif /* CONFIG_WQ_WATCHDOG */
+static void __init restrict_unbound_cpumask(const char *name, const struct cpumask *mask)
+{
+ if (!cpumask_intersects(wq_unbound_cpumask, mask)) {
+ pr_warn("workqueue: Restricting unbound_cpumask (%*pb) with %s (%*pb) leaves no CPU, ignoring\n",
+ cpumask_pr_args(wq_unbound_cpumask), name, cpumask_pr_args(mask));
+ return;
+ }
+
+ cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, mask);
+}
+
/**
* workqueue_init_early - early init for workqueue subsystem
*
@@ -6516,11 +6614,16 @@ void __init workqueue_init_early(void)
BUILD_BUG_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL));
- cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(HK_TYPE_WQ));
- cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, housekeeping_cpumask(HK_TYPE_DOMAIN));
+ BUG_ON(!alloc_cpumask_var(&wq_requested_unbound_cpumask, GFP_KERNEL));
+ BUG_ON(!zalloc_cpumask_var(&wq_isolated_cpumask, GFP_KERNEL));
+ cpumask_copy(wq_unbound_cpumask, cpu_possible_mask);
+ restrict_unbound_cpumask("HK_TYPE_WQ", housekeeping_cpumask(HK_TYPE_WQ));
+ restrict_unbound_cpumask("HK_TYPE_DOMAIN", housekeeping_cpumask(HK_TYPE_DOMAIN));
if (!cpumask_empty(&wq_cmdline_cpumask))
- cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, &wq_cmdline_cpumask);
+ restrict_unbound_cpumask("workqueue.unbound_cpus", &wq_cmdline_cpumask);
+
+ cpumask_copy(wq_requested_unbound_cpumask, wq_unbound_cpumask);
pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
@@ -6535,9 +6638,6 @@ void __init workqueue_init_early(void)
BUG_ON(!zalloc_cpumask_var_node(&pt->pod_cpus[0], GFP_KERNEL, NUMA_NO_NODE));
- wq_update_pod_attrs_buf = alloc_workqueue_attrs();
- BUG_ON(!wq_update_pod_attrs_buf);
-
pt->nr_pods = 1;
cpumask_copy(pt->pod_cpus[0], cpu_possible_mask);
pt->pod_node[0] = NUMA_NO_NODE;
@@ -6605,13 +6705,13 @@ static void __init wq_cpu_intensive_thresh_init(void)
unsigned long thresh;
unsigned long bogo;
+ pwq_release_worker = kthread_create_worker(0, "pool_workqueue_release");
+ BUG_ON(IS_ERR(pwq_release_worker));
+
/* if the user set it to a specific value, keep it */
if (wq_cpu_intensive_thresh_us != ULONG_MAX)
return;
- pwq_release_worker = kthread_create_worker(0, "pool_workqueue_release");
- BUG_ON(IS_ERR(pwq_release_worker));
-
/*
* The default of 10ms is derived from the fact that most modern (as of
* 2023) processors can do a lot in 10ms and that it's just below what