summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Kconfig.preempt22
-rw-r--r--kernel/Makefile3
-rw-r--r--kernel/acct.c3
-rw-r--r--kernel/audit.c47
-rw-r--r--kernel/audit.h9
-rw-r--r--kernel/audit_fsnotify.c3
-rw-r--r--kernel/audit_tree.c25
-rw-r--r--kernel/audit_watch.c6
-rw-r--r--kernel/auditfilter.c22
-rw-r--r--kernel/auditsc.c532
-rw-r--r--kernel/bpf/Kconfig7
-rw-r--r--kernel/bpf/Makefile6
-rw-r--r--kernel/bpf/arraymap.c8
-rw-r--r--kernel/bpf/bloom_filter.c210
-rw-r--r--kernel/bpf/bpf_inode_storage.c6
-rw-r--r--kernel/bpf/bpf_iter.c35
-rw-r--r--kernel/bpf/bpf_local_storage.c50
-rw-r--r--kernel/bpf/bpf_struct_ops.c35
-rw-r--r--kernel/bpf/bpf_struct_ops_types.h3
-rw-r--r--kernel/bpf/bpf_task_storage.c10
-rw-r--r--kernel/bpf/btf.c717
-rw-r--r--kernel/bpf/cgroup.c58
-rw-r--r--kernel/bpf/core.c48
-rw-r--r--kernel/bpf/cpumap.c12
-rw-r--r--kernel/bpf/devmap.c36
-rw-r--r--kernel/bpf/disasm.c2
-rw-r--r--kernel/bpf/disasm.h2
-rw-r--r--kernel/bpf/hashtab.c13
-rw-r--r--kernel/bpf/helpers.c44
-rw-r--r--kernel/bpf/local_storage.c3
-rw-r--r--kernel/bpf/lpm_trie.c2
-rw-r--r--kernel/bpf/map_iter.c4
-rw-r--r--kernel/bpf/mmap_unlock_work.h65
-rw-r--r--kernel/bpf/net_namespace.c1
-rw-r--r--kernel/bpf/preload/.gitignore4
-rw-r--r--kernel/bpf/preload/Makefile26
-rw-r--r--kernel/bpf/preload/iterators/Makefile38
-rw-r--r--kernel/bpf/reuseport_array.c6
-rw-r--r--kernel/bpf/ringbuf.c2
-rw-r--r--kernel/bpf/stackmap.c81
-rw-r--r--kernel/bpf/syscall.c150
-rw-r--r--kernel/bpf/task_iter.c82
-rw-r--r--kernel/bpf/trampoline.c23
-rw-r--r--kernel/bpf/verifier.c1417
-rw-r--r--kernel/cgroup/cgroup-internal.h19
-rw-r--r--kernel/cgroup/cgroup-v1.c50
-rw-r--r--kernel/cgroup/cgroup.c282
-rw-r--r--kernel/cgroup/cpuset.c95
-rw-r--r--kernel/cgroup/misc.c31
-rw-r--r--kernel/cgroup/rstat.c53
-rw-r--r--kernel/compat.c21
-rw-r--r--kernel/cpu.c7
-rw-r--r--kernel/crash_core.c11
-rw-r--r--kernel/cred.c14
-rw-r--r--kernel/debug/debug_core.c5
-rw-r--r--kernel/debug/gdbstub.c5
-rw-r--r--kernel/debug/kdb/kdb_bp.c72
-rw-r--r--kernel/debug/kdb/kdb_bt.c16
-rw-r--r--kernel/debug/kdb/kdb_debugger.c1
-rw-r--r--kernel/debug/kdb/kdb_main.c657
-rw-r--r--kernel/debug/kdb/kdb_private.h22
-rw-r--r--kernel/debug/kdb/kdb_support.c442
-rw-r--r--kernel/dma/coherent.c5
-rw-r--r--kernel/dma/debug.c39
-rw-r--r--kernel/dma/debug.h24
-rw-r--r--kernel/dma/direct.c240
-rw-r--r--kernel/dma/mapping.c31
-rw-r--r--kernel/dma/pool.c4
-rw-r--r--kernel/dma/swiotlb.c19
-rw-r--r--kernel/entry/common.c8
-rw-r--r--kernel/entry/kvm.c4
-rw-r--r--kernel/entry/syscall_user_dispatch.c12
-rw-r--r--kernel/events/Makefile5
-rw-r--r--kernel/events/core.c128
-rw-r--r--kernel/events/internal.h7
-rw-r--r--kernel/events/uprobes.c3
-rw-r--r--kernel/exit.c79
-rw-r--r--kernel/extable.c35
-rw-r--r--kernel/fork.c144
-rw-r--r--kernel/futex.c4240
-rw-r--r--kernel/futex/Makefile3
-rw-r--r--kernel/futex/core.c1141
-rw-r--r--kernel/futex/futex.h293
-rw-r--r--kernel/futex/pi.c1233
-rw-r--r--kernel/futex/requeue.c897
-rw-r--r--kernel/futex/syscalls.c376
-rw-r--r--kernel/futex/waitwake.c708
-rw-r--r--kernel/irq/Kconfig10
-rw-r--r--kernel/irq/chip.c6
-rw-r--r--kernel/irq/generic-chip.c5
-rw-r--r--kernel/irq/handle.c29
-rw-r--r--kernel/irq/internals.h2
-rw-r--r--kernel/irq/irqdesc.c81
-rw-r--r--kernel/irq/irqdomain.c10
-rw-r--r--kernel/irq/manage.c14
-rw-r--r--kernel/irq/msi.c794
-rw-r--r--kernel/irq/spurious.c8
-rw-r--r--kernel/irq_work.c130
-rw-r--r--kernel/kallsyms.c46
-rw-r--r--kernel/kcov.c36
-rw-r--r--kernel/kcsan/Makefile3
-rw-r--r--kernel/kcsan/core.c420
-rw-r--r--kernel/kcsan/kcsan.h8
-rw-r--r--kernel/kcsan/kcsan_test.c474
-rw-r--r--kernel/kcsan/report.c112
-rw-r--r--kernel/kcsan/selftest.c215
-rw-r--r--kernel/kexec.c103
-rw-r--r--kernel/kexec_file.c5
-rw-r--r--kernel/kprobes.c512
-rw-r--r--kernel/kthread.c19
-rw-r--r--kernel/livepatch/patch.c12
-rw-r--r--kernel/livepatch/transition.c95
-rw-r--r--kernel/locking/lockdep.c26
-rw-r--r--kernel/locking/locktorture.c18
-rw-r--r--kernel/locking/mutex.c70
-rw-r--r--kernel/locking/rtmutex.c31
-rw-r--r--kernel/locking/rtmutex_api.c30
-rw-r--r--kernel/locking/rwbase_rt.c72
-rw-r--r--kernel/locking/rwsem.c265
-rw-r--r--kernel/locking/spinlock.c3
-rw-r--r--kernel/locking/spinlock_rt.c23
-rw-r--r--kernel/locking/test-ww_mutex.c87
-rw-r--r--kernel/locking/ww_rt_mutex.c25
-rw-r--r--kernel/module.c81
-rw-r--r--kernel/notifier.c15
-rw-r--r--kernel/params.c4
-rw-r--r--kernel/pid.c36
-rw-r--r--kernel/power/energy_model.c86
-rw-r--r--kernel/power/hibernate.c18
-rw-r--r--kernel/power/power.h15
-rw-r--r--kernel/power/process.c2
-rw-r--r--kernel/power/suspend.c18
-rw-r--r--kernel/power/swap.c37
-rw-r--r--kernel/power/user.c2
-rw-r--r--kernel/printk/index.c5
-rw-r--r--kernel/printk/printk.c118
-rw-r--r--kernel/profile.c21
-rw-r--r--kernel/rcu/Kconfig20
-rw-r--r--kernel/rcu/rcu_segcblist.c10
-rw-r--r--kernel/rcu/rcu_segcblist.h12
-rw-r--r--kernel/rcu/rcuscale.c24
-rw-r--r--kernel/rcu/rcutorture.c327
-rw-r--r--kernel/rcu/refscale.c56
-rw-r--r--kernel/rcu/srcutiny.c2
-rw-r--r--kernel/rcu/tasks.h589
-rw-r--r--kernel/rcu/tree.c167
-rw-r--r--kernel/rcu/tree.h31
-rw-r--r--kernel/rcu/tree_exp.h15
-rw-r--r--kernel/rcu/tree_nocb.h162
-rw-r--r--kernel/rcu/tree_plugin.h261
-rw-r--r--kernel/rcu/tree_stall.h35
-rw-r--r--kernel/rcu/update.c12
-rw-r--r--kernel/reboot.c2
-rw-r--r--kernel/resource.c54
-rw-r--r--kernel/rseq.c14
-rw-r--r--kernel/scftorture.c59
-rw-r--r--kernel/sched/Makefile11
-rw-r--r--kernel/sched/autogroup.c2
-rw-r--r--kernel/sched/core.c617
-rw-r--r--kernel/sched/core_sched.c79
-rw-r--r--kernel/sched/cpuacct.c107
-rw-r--r--kernel/sched/cpufreq_schedutil.c2
-rw-r--r--kernel/sched/cputime.c16
-rw-r--r--kernel/sched/deadline.c99
-rw-r--r--kernel/sched/debug.c113
-rw-r--r--kernel/sched/fair.c593
-rw-r--r--kernel/sched/features.h5
-rw-r--r--kernel/sched/idle.c4
-rw-r--r--kernel/sched/psi.c47
-rw-r--r--kernel/sched/rt.c165
-rw-r--r--kernel/sched/sched.h116
-rw-r--r--kernel/sched/stats.c104
-rw-r--r--kernel/sched/stats.h54
-rw-r--r--kernel/sched/stop_task.c4
-rw-r--r--kernel/sched/topology.c35
-rw-r--r--kernel/sched/wait.c7
-rw-r--r--kernel/scs.c1
-rw-r--r--kernel/signal.c184
-rw-r--r--kernel/smp.c12
-rw-r--r--kernel/softirq.c3
-rw-r--r--kernel/stacktrace.c30
-rw-r--r--kernel/sys.c103
-rw-r--r--kernel/sys_ni.c9
-rw-r--r--kernel/sysctl.c4
-rw-r--r--kernel/test_kprobes.c313
-rw-r--r--kernel/time/clocksource.c52
-rw-r--r--kernel/time/posix-cpu-timers.c22
-rw-r--r--kernel/time/tick-sched.c7
-rw-r--r--kernel/time/timekeeping.c3
-rw-r--r--kernel/time/timer.c16
-rw-r--r--kernel/torture.c4
-rw-r--r--kernel/trace/Kconfig6
-rw-r--r--kernel/trace/Makefile2
-rw-r--r--kernel/trace/blktrace.c31
-rw-r--r--kernel/trace/bpf_trace.c193
-rw-r--r--kernel/trace/fgraph.c6
-rw-r--r--kernel/trace/ftrace.c381
-rw-r--r--kernel/trace/pid_list.c495
-rw-r--r--kernel/trace/pid_list.h88
-rw-r--r--kernel/trace/ring_buffer.c29
-rw-r--r--kernel/trace/trace.c231
-rw-r--r--kernel/trace/trace.h61
-rw-r--r--kernel/trace/trace_boot.c315
-rw-r--r--kernel/trace/trace_dynevent.c40
-rw-r--r--kernel/trace/trace_dynevent.h4
-rw-r--r--kernel/trace/trace_eprobe.c959
-rw-r--r--kernel/trace/trace_event_perf.c15
-rw-r--r--kernel/trace/trace_events.c83
-rw-r--r--kernel/trace/trace_events_hist.c719
-rw-r--r--kernel/trace/trace_events_synth.c36
-rw-r--r--kernel/trace/trace_events_trigger.c20
-rw-r--r--kernel/trace/trace_functions.c5
-rw-r--r--kernel/trace/trace_functions_graph.c4
-rw-r--r--kernel/trace/trace_hwlat.c44
-rw-r--r--kernel/trace/trace_kdb.c12
-rw-r--r--kernel/trace/trace_kprobe.c60
-rw-r--r--kernel/trace/trace_osnoise.c676
-rw-r--r--kernel/trace/trace_output.c28
-rw-r--r--kernel/trace/trace_printk.c2
-rw-r--r--kernel/trace/trace_probe.c109
-rw-r--r--kernel/trace/trace_probe.h16
-rw-r--r--kernel/trace/trace_probe_tmpl.h6
-rw-r--r--kernel/trace/trace_recursion_record.c4
-rw-r--r--kernel/trace/trace_selftest.c92
-rw-r--r--kernel/trace/trace_stack.c6
-rw-r--r--kernel/trace/trace_stat.c6
-rw-r--r--kernel/trace/trace_synth.h2
-rw-r--r--kernel/trace/trace_uprobe.c46
-rw-r--r--kernel/trace/tracing_map.c43
-rw-r--r--kernel/tracepoint.c4
-rw-r--r--kernel/tsacct.c2
-rw-r--r--kernel/ucount.c76
-rw-r--r--kernel/user.c25
-rw-r--r--kernel/workqueue.c276
234 files changed, 18857 insertions, 10812 deletions
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 5876e30c5740..ce77f0265660 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -1,11 +1,23 @@
# SPDX-License-Identifier: GPL-2.0-only
+config PREEMPT_NONE_BUILD
+ bool
+
+config PREEMPT_VOLUNTARY_BUILD
+ bool
+
+config PREEMPT_BUILD
+ bool
+ select PREEMPTION
+ select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
+
choice
prompt "Preemption Model"
default PREEMPT_NONE
config PREEMPT_NONE
bool "No Forced Preemption (Server)"
+ select PREEMPT_NONE_BUILD if !PREEMPT_DYNAMIC
help
This is the traditional Linux preemption model, geared towards
throughput. It will still provide good latencies most of the
@@ -20,6 +32,7 @@ config PREEMPT_NONE
config PREEMPT_VOLUNTARY
bool "Voluntary Kernel Preemption (Desktop)"
depends on !ARCH_NO_PREEMPT
+ select PREEMPT_VOLUNTARY_BUILD if !PREEMPT_DYNAMIC
help
This option reduces the latency of the kernel by adding more
"explicit preemption points" to the kernel code. These new
@@ -38,9 +51,7 @@ config PREEMPT_VOLUNTARY
config PREEMPT
bool "Preemptible Kernel (Low-Latency Desktop)"
depends on !ARCH_NO_PREEMPT
- select PREEMPTION
- select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
- select PREEMPT_DYNAMIC if HAVE_PREEMPT_DYNAMIC
+ select PREEMPT_BUILD
help
This option reduces the latency of the kernel by making
all kernel code (that is not executing in a critical section)
@@ -83,7 +94,10 @@ config PREEMPTION
select PREEMPT_COUNT
config PREEMPT_DYNAMIC
- bool
+ bool "Preemption behaviour defined on boot"
+ depends on HAVE_PREEMPT_DYNAMIC && !PREEMPT_RT
+ select PREEMPT_BUILD
+ default y
help
This option allows to define the preemption model on the kernel
command line parameter and thus override the default preemption
diff --git a/kernel/Makefile b/kernel/Makefile
index 4df609be42d0..186c49582f45 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -59,7 +59,7 @@ obj-$(CONFIG_FREEZER) += freezer.o
obj-$(CONFIG_PROFILING) += profile.o
obj-$(CONFIG_STACKTRACE) += stacktrace.o
obj-y += time/
-obj-$(CONFIG_FUTEX) += futex.o
+obj-$(CONFIG_FUTEX) += futex/
obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
obj-$(CONFIG_SMP) += smp.o
ifneq ($(CONFIG_SMP),y)
@@ -85,7 +85,6 @@ obj-$(CONFIG_PID_NS) += pid_namespace.o
obj-$(CONFIG_IKCONFIG) += configs.o
obj-$(CONFIG_IKHEADERS) += kheaders.o
obj-$(CONFIG_SMP) += stop_machine.o
-obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
obj-$(CONFIG_AUDITSYSCALL) += auditsc.o audit_watch.o audit_fsnotify.o audit_tree.o
obj-$(CONFIG_GCOV_KERNEL) += gcov/
diff --git a/kernel/acct.c b/kernel/acct.c
index a64102be2bb0..3df53cf1dcd5 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -60,7 +60,6 @@
#include <linux/sched/cputime.h>
#include <asm/div64.h>
-#include <linux/blkdev.h> /* sector_div */
#include <linux/pid_namespace.h>
#include <linux/fs_pin.h>
@@ -478,7 +477,7 @@ static void do_acct_process(struct bsd_acct_struct *acct)
/*
* Accounting records are not subject to resource limits.
*/
- flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
+ flim = rlimit(RLIMIT_FSIZE);
current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
/* Perform file operations on behalf of whoever enabled accounting */
orig_cred = override_creds(file->f_cred);
diff --git a/kernel/audit.c b/kernel/audit.c
index 121d37e700a6..e4bbe2c70c26 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -718,7 +718,7 @@ static int kauditd_send_queue(struct sock *sk, u32 portid,
{
int rc = 0;
struct sk_buff *skb;
- static unsigned int failed = 0;
+ unsigned int failed = 0;
/* NOTE: kauditd_thread takes care of all our locking, we just use
* the netlink info passed to us (e.g. sk and portid) */
@@ -735,32 +735,30 @@ static int kauditd_send_queue(struct sock *sk, u32 portid,
continue;
}
+retry:
/* grab an extra skb reference in case of error */
skb_get(skb);
rc = netlink_unicast(sk, skb, portid, 0);
if (rc < 0) {
- /* fatal failure for our queue flush attempt? */
+ /* send failed - try a few times unless fatal error */
if (++failed >= retry_limit ||
rc == -ECONNREFUSED || rc == -EPERM) {
- /* yes - error processing for the queue */
sk = NULL;
if (err_hook)
(*err_hook)(skb);
- if (!skb_hook)
- goto out;
- /* keep processing with the skb_hook */
+ if (rc == -EAGAIN)
+ rc = 0;
+ /* continue to drain the queue */
continue;
} else
- /* no - requeue to preserve ordering */
- skb_queue_head(queue, skb);
+ goto retry;
} else {
- /* it worked - drop the extra reference and continue */
+ /* skb sent - drop the extra reference and continue */
consume_skb(skb);
failed = 0;
}
}
-out:
return (rc >= 0 ? 0 : rc);
}
@@ -1446,7 +1444,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
if (err)
return err;
}
- sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL);
+ sig_data = kmalloc(struct_size(sig_data, ctx, len), GFP_KERNEL);
if (!sig_data) {
if (audit_sig_sid)
security_release_secctx(ctx, len);
@@ -1459,7 +1457,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
security_release_secctx(ctx, len);
}
audit_send_reply(skb, seq, AUDIT_SIGNAL_INFO, 0, 0,
- sig_data, sizeof(*sig_data) + len);
+ sig_data, struct_size(sig_data, ctx, len));
kfree(sig_data);
break;
case AUDIT_TTY_GET: {
@@ -1542,6 +1540,20 @@ static void audit_receive(struct sk_buff *skb)
nlh = nlmsg_next(nlh, &len);
}
audit_ctl_unlock();
+
+ /* can't block with the ctrl lock, so penalize the sender now */
+ if (audit_backlog_limit &&
+ (skb_queue_len(&audit_queue) > audit_backlog_limit)) {
+ DECLARE_WAITQUEUE(wait, current);
+
+ /* wake kauditd to try and flush the queue */
+ wake_up_interruptible(&kauditd_wait);
+
+ add_wait_queue_exclusive(&audit_backlog_wait, &wait);
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_timeout(audit_backlog_wait_time);
+ remove_wait_queue(&audit_backlog_wait, &wait);
+ }
}
/* Log information about who is connecting to the audit multicast socket */
@@ -1609,7 +1621,8 @@ static int __net_init audit_net_init(struct net *net)
audit_panic("cannot initialize netlink socket in namespace");
return -ENOMEM;
}
- aunet->sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
+ /* limit the timeout in case auditd is blocked/stopped */
+ aunet->sk->sk_sndtimeo = HZ / 10;
return 0;
}
@@ -1825,7 +1838,9 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
* task_tgid_vnr() since auditd_pid is set in audit_receive_msg()
* using a PID anchored in the caller's namespace
* 2. generator holding the audit_cmd_mutex - we don't want to block
- * while holding the mutex */
+ * while holding the mutex, although we do penalize the sender
+ * later in audit_receive() when it is safe to block
+ */
if (!(auditd_test_task(current) || audit_ctl_owner_current())) {
long stime = audit_backlog_wait_time;
@@ -2132,7 +2147,7 @@ int audit_log_task_context(struct audit_buffer *ab)
int error;
u32 sid;
- security_task_getsecid_subj(current, &sid);
+ security_current_getsecid_subj(&sid);
if (!sid)
return 0;
@@ -2353,7 +2368,7 @@ int audit_signal_info(int sig, struct task_struct *t)
audit_sig_uid = auid;
else
audit_sig_uid = uid;
- security_task_getsecid_subj(current, &audit_sig_sid);
+ security_current_getsecid_subj(&audit_sig_sid);
}
return audit_signal_info_syscall(t);
diff --git a/kernel/audit.h b/kernel/audit.h
index d6a2c899a8db..c4498090a5bd 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -14,6 +14,7 @@
#include <linux/skbuff.h>
#include <uapi/linux/mqueue.h>
#include <linux/tty.h>
+#include <uapi/linux/openat2.h> // struct open_how
/* AUDIT_NAMES is the number of slots we reserve in the audit_context
* for saving names from getname(). If we get more names we will allocate
@@ -100,10 +101,15 @@ struct audit_proctitle {
/* The per-task audit context. */
struct audit_context {
int dummy; /* must be the first element */
- int in_syscall; /* 1 if task is in a syscall */
+ enum {
+ AUDIT_CTX_UNUSED, /* audit_context is currently unused */
+ AUDIT_CTX_SYSCALL, /* in use by syscall */
+ AUDIT_CTX_URING, /* in use by io_uring */
+ } context;
enum audit_state state, current_state;
unsigned int serial; /* serial number for record */
int major; /* syscall number */
+ int uring_op; /* uring operation */
struct timespec64 ctime; /* time of syscall entry */
unsigned long argv[4]; /* syscall arguments */
long return_code;/* syscall return code */
@@ -188,6 +194,7 @@ struct audit_context {
int fd;
int flags;
} mmap;
+ struct open_how openat2;
struct {
int argc;
} execve;
diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c
index 60739d5e3373..02348b48447c 100644
--- a/kernel/audit_fsnotify.c
+++ b/kernel/audit_fsnotify.c
@@ -160,8 +160,7 @@ static int audit_mark_handle_event(struct fsnotify_mark *inode_mark, u32 mask,
audit_mark = container_of(inode_mark, struct audit_fsnotify_mark, mark);
- if (WARN_ON_ONCE(inode_mark->group != audit_fsnotify_group) ||
- WARN_ON_ONCE(!inode))
+ if (WARN_ON_ONCE(inode_mark->group != audit_fsnotify_group))
return 0;
if (mask & (FS_CREATE|FS_MOVED_TO|FS_DELETE|FS_MOVED_FROM)) {
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 2cd7b5694422..e7315d487163 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -30,7 +30,7 @@ struct audit_chunk {
int count;
atomic_long_t refs;
struct rcu_head head;
- struct node {
+ struct audit_node {
struct list_head list;
struct audit_tree *owner;
unsigned index; /* index; upper bit indicates 'will prune' */
@@ -94,7 +94,7 @@ static struct audit_tree *alloc_tree(const char *s)
{
struct audit_tree *tree;
- tree = kmalloc(sizeof(struct audit_tree) + strlen(s) + 1, GFP_KERNEL);
+ tree = kmalloc(struct_size(tree, pathname, strlen(s) + 1), GFP_KERNEL);
if (tree) {
refcount_set(&tree->count, 1);
tree->goner = 0;
@@ -269,7 +269,7 @@ bool audit_tree_match(struct audit_chunk *chunk, struct audit_tree *tree)
/* tagging and untagging inodes with trees */
-static struct audit_chunk *find_chunk(struct node *p)
+static struct audit_chunk *find_chunk(struct audit_node *p)
{
int index = p->index & ~(1U<<31);
p -= index;
@@ -322,7 +322,7 @@ static void replace_chunk(struct audit_chunk *new, struct audit_chunk *old)
list_replace_rcu(&old->hash, &new->hash);
}
-static void remove_chunk_node(struct audit_chunk *chunk, struct node *p)
+static void remove_chunk_node(struct audit_chunk *chunk, struct audit_node *p)
{
struct audit_tree *owner = p->owner;
@@ -459,7 +459,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
{
struct fsnotify_mark *mark;
struct audit_chunk *chunk, *old;
- struct node *p;
+ struct audit_node *p;
int n;
mutex_lock(&audit_tree_group->mark_mutex);
@@ -570,11 +570,11 @@ static void prune_tree_chunks(struct audit_tree *victim, bool tagged)
{
spin_lock(&hash_lock);
while (!list_empty(&victim->chunks)) {
- struct node *p;
+ struct audit_node *p;
struct audit_chunk *chunk;
struct fsnotify_mark *mark;
- p = list_first_entry(&victim->chunks, struct node, list);
+ p = list_first_entry(&victim->chunks, struct audit_node, list);
/* have we run out of marked? */
if (tagged && !(p->index & (1U<<31)))
break;
@@ -616,7 +616,7 @@ static void trim_marked(struct audit_tree *tree)
}
/* reorder */
for (p = tree->chunks.next; p != &tree->chunks; p = q) {
- struct node *node = list_entry(p, struct node, list);
+ struct audit_node *node = list_entry(p, struct audit_node, list);
q = p->next;
if (node->index & (1U<<31)) {
list_del_init(p);
@@ -684,7 +684,7 @@ void audit_trim_trees(void)
struct audit_tree *tree;
struct path path;
struct vfsmount *root_mnt;
- struct node *node;
+ struct audit_node *node;
int err;
tree = container_of(cursor.next, struct audit_tree, list);
@@ -726,7 +726,8 @@ int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op)
{
if (pathname[0] != '/' ||
- rule->listnr != AUDIT_FILTER_EXIT ||
+ (rule->listnr != AUDIT_FILTER_EXIT &&
+ rule->listnr != AUDIT_FILTER_URING_EXIT) ||
op != Audit_equal ||
rule->inode_f || rule->watch || rule->tree)
return -EINVAL;
@@ -839,7 +840,7 @@ int audit_add_tree_rule(struct audit_krule *rule)
drop_collected_mounts(mnt);
if (!err) {
- struct node *node;
+ struct audit_node *node;
spin_lock(&hash_lock);
list_for_each_entry(node, &tree->chunks, list)
node->index &= ~(1U<<31);
@@ -938,7 +939,7 @@ int audit_tag_tree(char *old, char *new)
mutex_unlock(&audit_filter_mutex);
if (!failed) {
- struct node *node;
+ struct audit_node *node;
spin_lock(&hash_lock);
list_for_each_entry(node, &tree->chunks, list)
node->index &= ~(1U<<31);
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 2acf7ca49154..713b256be944 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -183,7 +183,8 @@ int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op)
return -EOPNOTSUPP;
if (path[0] != '/' || path[len-1] == '/' ||
- krule->listnr != AUDIT_FILTER_EXIT ||
+ (krule->listnr != AUDIT_FILTER_EXIT &&
+ krule->listnr != AUDIT_FILTER_URING_EXIT) ||
op != Audit_equal ||
krule->inode_f || krule->watch || krule->tree)
return -EINVAL;
@@ -472,8 +473,7 @@ static int audit_watch_handle_event(struct fsnotify_mark *inode_mark, u32 mask,
parent = container_of(inode_mark, struct audit_parent, mark);
- if (WARN_ON_ONCE(inode_mark->group != audit_watch_group) ||
- WARN_ON_ONCE(!inode))
+ if (WARN_ON_ONCE(inode_mark->group != audit_watch_group))
return 0;
if (mask & (FS_CREATE|FS_MOVED_TO) && inode)
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index db2c6b59dfc3..42d99896e7a6 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -44,7 +44,8 @@ struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
LIST_HEAD_INIT(audit_filter_list[4]),
LIST_HEAD_INIT(audit_filter_list[5]),
LIST_HEAD_INIT(audit_filter_list[6]),
-#if AUDIT_NR_FILTERS != 7
+ LIST_HEAD_INIT(audit_filter_list[7]),
+#if AUDIT_NR_FILTERS != 8
#error Fix audit_filter_list initialiser
#endif
};
@@ -56,6 +57,7 @@ static struct list_head audit_rules_list[AUDIT_NR_FILTERS] = {
LIST_HEAD_INIT(audit_rules_list[4]),
LIST_HEAD_INIT(audit_rules_list[5]),
LIST_HEAD_INIT(audit_rules_list[6]),
+ LIST_HEAD_INIT(audit_rules_list[7]),
};
DEFINE_MUTEX(audit_filter_mutex);
@@ -151,7 +153,8 @@ char *audit_unpack_string(void **bufp, size_t *remain, size_t len)
static inline int audit_to_inode(struct audit_krule *krule,
struct audit_field *f)
{
- if (krule->listnr != AUDIT_FILTER_EXIT ||
+ if ((krule->listnr != AUDIT_FILTER_EXIT &&
+ krule->listnr != AUDIT_FILTER_URING_EXIT) ||
krule->inode_f || krule->watch || krule->tree ||
(f->op != Audit_equal && f->op != Audit_not_equal))
return -EINVAL;
@@ -248,6 +251,7 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule_data *
pr_err("AUDIT_FILTER_ENTRY is deprecated\n");
goto exit_err;
case AUDIT_FILTER_EXIT:
+ case AUDIT_FILTER_URING_EXIT:
case AUDIT_FILTER_TASK:
#endif
case AUDIT_FILTER_USER:
@@ -332,6 +336,10 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
if (entry->rule.listnr != AUDIT_FILTER_FS)
return -EINVAL;
break;
+ case AUDIT_PERM:
+ if (entry->rule.listnr == AUDIT_FILTER_URING_EXIT)
+ return -EINVAL;
+ break;
}
switch (entry->rule.listnr) {
@@ -629,7 +637,7 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
void *bufp;
int i;
- data = kmalloc(sizeof(*data) + krule->buflen, GFP_KERNEL);
+ data = kmalloc(struct_size(data, buf, krule->buflen), GFP_KERNEL);
if (unlikely(!data))
return NULL;
memset(data, 0, sizeof(*data));
@@ -980,7 +988,8 @@ static inline int audit_add_rule(struct audit_entry *entry)
}
entry->rule.prio = ~0ULL;
- if (entry->rule.listnr == AUDIT_FILTER_EXIT) {
+ if (entry->rule.listnr == AUDIT_FILTER_EXIT ||
+ entry->rule.listnr == AUDIT_FILTER_URING_EXIT) {
if (entry->rule.flags & AUDIT_FILTER_PREPEND)
entry->rule.prio = ++prio_high;
else
@@ -1083,7 +1092,7 @@ static void audit_list_rules(int seq, struct sk_buff_head *q)
break;
skb = audit_make_reply(seq, AUDIT_LIST_RULES, 0, 1,
data,
- sizeof(*data) + data->buflen);
+ struct_size(data, buf, data->buflen));
if (skb)
skb_queue_tail(q, skb);
kfree(data);
@@ -1359,8 +1368,7 @@ int audit_filter(int msgtype, unsigned int listtype)
case AUDIT_SUBJ_SEN:
case AUDIT_SUBJ_CLR:
if (f->lsm_rule) {
- security_task_getsecid_subj(current,
- &sid);
+ security_current_getsecid_subj(&sid);
result = security_audit_rule_match(sid,
f->type, f->op, f->lsm_rule);
}
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 8dd73a64f921..fce5d43a933f 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* auditsc.c -- System-call auditing support
* Handles all system-call specific auditing features.
*
@@ -6,20 +7,6 @@
* Copyright (C) 2005, 2006 IBM Corporation
* All Rights Reserved.
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
* Written by Rickard E. (Rik) Faith <faith@redhat.com>
*
* Many of the ideas implemented here are from Stephen C. Tweedie,
@@ -76,6 +63,7 @@
#include <linux/fsnotify_backend.h>
#include <uapi/linux/limits.h>
#include <uapi/linux/netfilter/nf_tables.h>
+#include <uapi/linux/openat2.h> // struct open_how
#include "audit.h"
@@ -166,7 +154,7 @@ static int audit_match_perm(struct audit_context *ctx, int mask)
n = ctx->major;
switch (audit_classify_syscall(ctx->arch, n)) {
- case 0: /* native */
+ case AUDITSC_NATIVE:
if ((mask & AUDIT_PERM_WRITE) &&
audit_match_class(AUDIT_CLASS_WRITE, n))
return 1;
@@ -177,7 +165,7 @@ static int audit_match_perm(struct audit_context *ctx, int mask)
audit_match_class(AUDIT_CLASS_CHATTR, n))
return 1;
return 0;
- case 1: /* 32bit on biarch */
+ case AUDITSC_COMPAT: /* 32bit on biarch */
if ((mask & AUDIT_PERM_WRITE) &&
audit_match_class(AUDIT_CLASS_WRITE_32, n))
return 1;
@@ -188,14 +176,16 @@ static int audit_match_perm(struct audit_context *ctx, int mask)
audit_match_class(AUDIT_CLASS_CHATTR_32, n))
return 1;
return 0;
- case 2: /* open */
+ case AUDITSC_OPEN:
return mask & ACC_MODE(ctx->argv[1]);
- case 3: /* openat */
+ case AUDITSC_OPENAT:
return mask & ACC_MODE(ctx->argv[2]);
- case 4: /* socketcall */
+ case AUDITSC_SOCKETCALL:
return ((mask & AUDIT_PERM_WRITE) && ctx->argv[0] == SYS_BIND);
- case 5: /* execve */
+ case AUDITSC_EXECVE:
return mask & AUDIT_PERM_EXEC;
+ case AUDITSC_OPENAT2:
+ return mask & ACC_MODE((u32)((struct open_how *)ctx->argv[2])->flags);
default:
return 0;
}
@@ -480,6 +470,9 @@ static int audit_filter_rules(struct task_struct *tsk,
u32 sid;
unsigned int sessionid;
+ if (ctx && rule->prio <= ctx->prio)
+ return 0;
+
cred = rcu_dereference_check(tsk->cred, tsk == current || task_creation);
for (i = 0; i < rule->field_count; i++) {
@@ -657,7 +650,7 @@ static int audit_filter_rules(struct task_struct *tsk,
result = audit_comparator(audit_loginuid_set(tsk), f->op, f->val);
break;
case AUDIT_SADDR_FAM:
- if (ctx->sockaddr)
+ if (ctx && ctx->sockaddr)
result = audit_comparator(ctx->sockaddr->ss_family,
f->op, f->val);
break;
@@ -673,7 +666,16 @@ static int audit_filter_rules(struct task_struct *tsk,
logged upon error */
if (f->lsm_rule) {
if (need_sid) {
- security_task_getsecid_subj(tsk, &sid);
+ /* @tsk should always be equal to
+ * @current with the exception of
+ * fork()/copy_process() in which case
+ * the new @tsk creds are still a dup
+ * of @current's creds so we can still
+ * use security_current_getsecid_subj()
+ * here even though it always refs
+ * @current's creds
+ */
+ security_current_getsecid_subj(&sid);
need_sid = 0;
}
result = security_audit_rule_match(sid, f->type,
@@ -747,8 +749,6 @@ static int audit_filter_rules(struct task_struct *tsk,
}
if (ctx) {
- if (rule->prio <= ctx->prio)
- return 0;
if (rule->filterkey) {
kfree(ctx->filterkey);
ctx->filterkey = kstrdup(rule->filterkey, GFP_ATOMIC);
@@ -805,6 +805,34 @@ static int audit_in_mask(const struct audit_krule *rule, unsigned long val)
return rule->mask[word] & bit;
}
+/**
+ * audit_filter_uring - apply filters to an io_uring operation
+ * @tsk: associated task
+ * @ctx: audit context
+ */
+static void audit_filter_uring(struct task_struct *tsk,
+ struct audit_context *ctx)
+{
+ struct audit_entry *e;
+ enum audit_state state;
+
+ if (auditd_test_task(tsk))
+ return;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_URING_EXIT],
+ list) {
+ if (audit_in_mask(&e->rule, ctx->uring_op) &&
+ audit_filter_rules(tsk, &e->rule, ctx, NULL, &state,
+ false)) {
+ rcu_read_unlock();
+ ctx->current_state = state;
+ return;
+ }
+ }
+ rcu_read_unlock();
+}
+
/* At syscall exit time, this filter is called if the audit_state is
* not low enough that auditing cannot take place, but is also not
* high enough that we already know we have to write an audit record
@@ -915,10 +943,81 @@ static inline void audit_free_aux(struct audit_context *context)
context->aux = aux->next;
kfree(aux);
}
+ context->aux = NULL;
while ((aux = context->aux_pids)) {
context->aux_pids = aux->next;
kfree(aux);
}
+ context->aux_pids = NULL;
+}
+
+/**
+ * audit_reset_context - reset a audit_context structure
+ * @ctx: the audit_context to reset
+ *
+ * All fields in the audit_context will be reset to an initial state, all
+ * references held by fields will be dropped, and private memory will be
+ * released. When this function returns the audit_context will be suitable
+ * for reuse, so long as the passed context is not NULL or a dummy context.
+ */
+static void audit_reset_context(struct audit_context *ctx)
+{
+ if (!ctx)
+ return;
+
+ /* if ctx is non-null, reset the "ctx->state" regardless */
+ ctx->context = AUDIT_CTX_UNUSED;
+ if (ctx->dummy)
+ return;
+
+ /*
+ * NOTE: It shouldn't matter in what order we release the fields, so
+ * release them in the order in which they appear in the struct;
+ * this gives us some hope of quickly making sure we are
+ * resetting the audit_context properly.
+ *
+ * Other things worth mentioning:
+ * - we don't reset "dummy"
+ * - we don't reset "state", we do reset "current_state"
+ * - we preserve "filterkey" if "state" is AUDIT_STATE_RECORD
+ * - much of this is likely overkill, but play it safe for now
+ * - we really need to work on improving the audit_context struct
+ */
+
+ ctx->current_state = ctx->state;
+ ctx->serial = 0;
+ ctx->major = 0;
+ ctx->uring_op = 0;
+ ctx->ctime = (struct timespec64){ .tv_sec = 0, .tv_nsec = 0 };
+ memset(ctx->argv, 0, sizeof(ctx->argv));
+ ctx->return_code = 0;
+ ctx->prio = (ctx->state == AUDIT_STATE_RECORD ? ~0ULL : 0);
+ ctx->return_valid = AUDITSC_INVALID;
+ audit_free_names(ctx);
+ if (ctx->state != AUDIT_STATE_RECORD) {
+ kfree(ctx->filterkey);
+ ctx->filterkey = NULL;
+ }
+ audit_free_aux(ctx);
+ kfree(ctx->sockaddr);
+ ctx->sockaddr = NULL;
+ ctx->sockaddr_len = 0;
+ ctx->pid = ctx->ppid = 0;
+ ctx->uid = ctx->euid = ctx->suid = ctx->fsuid = KUIDT_INIT(0);
+ ctx->gid = ctx->egid = ctx->sgid = ctx->fsgid = KGIDT_INIT(0);
+ ctx->personality = 0;
+ ctx->arch = 0;
+ ctx->target_pid = 0;
+ ctx->target_auid = ctx->target_uid = KUIDT_INIT(0);
+ ctx->target_sessionid = 0;
+ ctx->target_sid = 0;
+ ctx->target_comm[0] = '\0';
+ unroll_tree_refs(ctx, NULL, 0);
+ WARN_ON(!list_empty(&ctx->killed_trees));
+ ctx->type = 0;
+ audit_free_module(ctx);
+ ctx->fds[0] = -1;
+ audit_proctitle_free(ctx);
}
static inline struct audit_context *audit_alloc_context(enum audit_state state)
@@ -928,6 +1027,7 @@ static inline struct audit_context *audit_alloc_context(enum audit_state state)
context = kzalloc(sizeof(*context), GFP_KERNEL);
if (!context)
return NULL;
+ context->context = AUDIT_CTX_UNUSED;
context->state = state;
context->prio = state == AUDIT_STATE_RECORD ? ~0ULL : 0;
INIT_LIST_HEAD(&context->killed_trees);
@@ -953,7 +1053,7 @@ int audit_alloc(struct task_struct *tsk)
char *key = NULL;
if (likely(!audit_ever_enabled))
- return 0; /* Return if not auditing. */
+ return 0;
state = audit_filter_task(tsk, &key);
if (state == AUDIT_STATE_DISABLED) {
@@ -973,16 +1073,37 @@ int audit_alloc(struct task_struct *tsk)
return 0;
}
+/**
+ * audit_alloc_kernel - allocate an audit_context for a kernel task
+ * @tsk: the kernel task
+ *
+ * Similar to the audit_alloc() function, but intended for kernel private
+ * threads. Returns zero on success, negative values on failure.
+ */
+int audit_alloc_kernel(struct task_struct *tsk)
+{
+ /*
+ * At the moment we are just going to call into audit_alloc() to
+ * simplify the code, but there two things to keep in mind with this
+ * approach:
+ *
+ * 1. Filtering internal kernel tasks is a bit laughable in almost all
+ * cases, but there is at least one case where there is a benefit:
+ * the '-a task,never' case allows the admin to effectively disable
+ * task auditing at runtime.
+ *
+ * 2. The {set,clear}_task_syscall_work() ops likely have zero effect
+ * on these internal kernel tasks, but they probably don't hurt either.
+ */
+ return audit_alloc(tsk);
+}
+
static inline void audit_free_context(struct audit_context *context)
{
- audit_free_module(context);
- audit_free_names(context);
- unroll_tree_refs(context, NULL, 0);
+ /* resetting is extra work, but it is likely just noise */
+ audit_reset_context(context);
free_tree_refs(context);
- audit_free_aux(context);
kfree(context->filterkey);
- kfree(context->sockaddr);
- audit_proctitle_free(context);
kfree(context);
}
@@ -1316,6 +1437,12 @@ static void show_special(struct audit_context *context, int *call_panic)
audit_log_format(ab, "fd=%d flags=0x%x", context->mmap.fd,
context->mmap.flags);
break;
+ case AUDIT_OPENAT2:
+ audit_log_format(ab, "oflag=0%llo mode=0%llo resolve=0x%llx",
+ context->openat2.flags,
+ context->openat2.mode,
+ context->openat2.resolve);
+ break;
case AUDIT_EXECVE:
audit_log_execve_info(context, &ab);
break;
@@ -1479,6 +1606,44 @@ out:
audit_log_end(ab);
}
+/**
+ * audit_log_uring - generate a AUDIT_URINGOP record
+ * @ctx: the audit context
+ */
+static void audit_log_uring(struct audit_context *ctx)
+{
+ struct audit_buffer *ab;
+ const struct cred *cred;
+
+ ab = audit_log_start(ctx, GFP_ATOMIC, AUDIT_URINGOP);
+ if (!ab)
+ return;
+ cred = current_cred();
+ audit_log_format(ab, "uring_op=%d", ctx->uring_op);
+ if (ctx->return_valid != AUDITSC_INVALID)
+ audit_log_format(ab, " success=%s exit=%ld",
+ (ctx->return_valid == AUDITSC_SUCCESS ?
+ "yes" : "no"),
+ ctx->return_code);
+ audit_log_format(ab,
+ " items=%d"
+ " ppid=%d pid=%d uid=%u gid=%u euid=%u suid=%u"
+ " fsuid=%u egid=%u sgid=%u fsgid=%u",
+ ctx->name_count,
+ task_ppid_nr(current), task_tgid_nr(current),
+ from_kuid(&init_user_ns, cred->uid),
+ from_kgid(&init_user_ns, cred->gid),
+ from_kuid(&init_user_ns, cred->euid),
+ from_kuid(&init_user_ns, cred->suid),
+ from_kuid(&init_user_ns, cred->fsuid),
+ from_kgid(&init_user_ns, cred->egid),
+ from_kgid(&init_user_ns, cred->sgid),
+ from_kgid(&init_user_ns, cred->fsgid));
+ audit_log_task_context(ab);
+ audit_log_key(ab, ctx->filterkey);
+ audit_log_end(ab);
+}
+
static void audit_log_exit(void)
{
int i, call_panic = 0;
@@ -1489,29 +1654,38 @@ static void audit_log_exit(void)
context->personality = current->personality;
- ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL);
- if (!ab)
- return; /* audit_panic has been called */
- audit_log_format(ab, "arch=%x syscall=%d",
- context->arch, context->major);
- if (context->personality != PER_LINUX)
- audit_log_format(ab, " per=%lx", context->personality);
- if (context->return_valid != AUDITSC_INVALID)
- audit_log_format(ab, " success=%s exit=%ld",
- (context->return_valid==AUDITSC_SUCCESS)?"yes":"no",
- context->return_code);
-
- audit_log_format(ab,
- " a0=%lx a1=%lx a2=%lx a3=%lx items=%d",
- context->argv[0],
- context->argv[1],
- context->argv[2],
- context->argv[3],
- context->name_count);
-
- audit_log_task_info(ab);
- audit_log_key(ab, context->filterkey);
- audit_log_end(ab);
+ switch (context->context) {
+ case AUDIT_CTX_SYSCALL:
+ ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL);
+ if (!ab)
+ return;
+ audit_log_format(ab, "arch=%x syscall=%d",
+ context->arch, context->major);
+ if (context->personality != PER_LINUX)
+ audit_log_format(ab, " per=%lx", context->personality);
+ if (context->return_valid != AUDITSC_INVALID)
+ audit_log_format(ab, " success=%s exit=%ld",
+ (context->return_valid == AUDITSC_SUCCESS ?
+ "yes" : "no"),
+ context->return_code);
+ audit_log_format(ab,
+ " a0=%lx a1=%lx a2=%lx a3=%lx items=%d",
+ context->argv[0],
+ context->argv[1],
+ context->argv[2],
+ context->argv[3],
+ context->name_count);
+ audit_log_task_info(ab);
+ audit_log_key(ab, context->filterkey);
+ audit_log_end(ab);
+ break;
+ case AUDIT_CTX_URING:
+ audit_log_uring(context);
+ break;
+ default:
+ BUG();
+ break;
+ }
for (aux = context->aux; aux; aux = aux->next) {
@@ -1602,21 +1776,22 @@ static void audit_log_exit(void)
audit_log_name(context, n, NULL, i++, &call_panic);
}
- audit_log_proctitle();
+ if (context->context == AUDIT_CTX_SYSCALL)
+ audit_log_proctitle();
/* Send end of event record to help user space know we are finished */
ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE);
if (ab)
audit_log_end(ab);
if (call_panic)
- audit_panic("error converting sid to string");
+ audit_panic("error in audit_log_exit()");
}
/**
* __audit_free - free a per-task audit context
* @tsk: task whose audit context block to free
*
- * Called from copy_process and do_exit
+ * Called from copy_process, do_exit, and the io_uring code
*/
void __audit_free(struct task_struct *tsk)
{
@@ -1625,6 +1800,7 @@ void __audit_free(struct task_struct *tsk)
if (!context)
return;
+ /* this may generate CONFIG_CHANGE records */
if (!list_empty(&context->killed_trees))
audit_kill_trees(context);
@@ -1633,14 +1809,21 @@ void __audit_free(struct task_struct *tsk)
* random task_struct that doesn't doesn't have any meaningful data we
* need to log via audit_log_exit().
*/
- if (tsk == current && !context->dummy && context->in_syscall) {
+ if (tsk == current && !context->dummy) {
context->return_valid = AUDITSC_INVALID;
context->return_code = 0;
-
- audit_filter_syscall(tsk, context);
- audit_filter_inodes(tsk, context);
- if (context->current_state == AUDIT_STATE_RECORD)
- audit_log_exit();
+ if (context->context == AUDIT_CTX_SYSCALL) {
+ audit_filter_syscall(tsk, context);
+ audit_filter_inodes(tsk, context);
+ if (context->current_state == AUDIT_STATE_RECORD)
+ audit_log_exit();
+ } else if (context->context == AUDIT_CTX_URING) {
+ /* TODO: verify this case is real and valid */
+ audit_filter_uring(tsk, context);
+ audit_filter_inodes(tsk, context);
+ if (context->current_state == AUDIT_STATE_RECORD)
+ audit_log_uring(context);
+ }
}
audit_set_context(tsk, NULL);
@@ -1648,6 +1831,131 @@ void __audit_free(struct task_struct *tsk)
}
/**
+ * audit_return_fixup - fixup the return codes in the audit_context
+ * @ctx: the audit_context
+ * @success: true/false value to indicate if the operation succeeded or not
+ * @code: operation return code
+ *
+ * We need to fixup the return code in the audit logs if the actual return
+ * codes are later going to be fixed by the arch specific signal handlers.
+ */
+static void audit_return_fixup(struct audit_context *ctx,
+ int success, long code)
+{
+ /*
+ * This is actually a test for:
+ * (rc == ERESTARTSYS ) || (rc == ERESTARTNOINTR) ||
+ * (rc == ERESTARTNOHAND) || (rc == ERESTART_RESTARTBLOCK)
+ *
+ * but is faster than a bunch of ||
+ */
+ if (unlikely(code <= -ERESTARTSYS) &&
+ (code >= -ERESTART_RESTARTBLOCK) &&
+ (code != -ENOIOCTLCMD))
+ ctx->return_code = -EINTR;
+ else
+ ctx->return_code = code;
+ ctx->return_valid = (success ? AUDITSC_SUCCESS : AUDITSC_FAILURE);
+}
+
+/**
+ * __audit_uring_entry - prepare the kernel task's audit context for io_uring
+ * @op: the io_uring opcode
+ *
+ * This is similar to audit_syscall_entry() but is intended for use by io_uring
+ * operations. This function should only ever be called from
+ * audit_uring_entry() as we rely on the audit context checking present in that
+ * function.
+ */
+void __audit_uring_entry(u8 op)
+{
+ struct audit_context *ctx = audit_context();
+
+ if (ctx->state == AUDIT_STATE_DISABLED)
+ return;
+
+ /*
+ * NOTE: It's possible that we can be called from the process' context
+ * before it returns to userspace, and before audit_syscall_exit()
+ * is called. In this case there is not much to do, just record
+ * the io_uring details and return.
+ */
+ ctx->uring_op = op;
+ if (ctx->context == AUDIT_CTX_SYSCALL)
+ return;
+
+ ctx->dummy = !audit_n_rules;
+ if (!ctx->dummy && ctx->state == AUDIT_STATE_BUILD)
+ ctx->prio = 0;
+
+ ctx->context = AUDIT_CTX_URING;
+ ctx->current_state = ctx->state;
+ ktime_get_coarse_real_ts64(&ctx->ctime);
+}
+
+/**
+ * __audit_uring_exit - wrap up the kernel task's audit context after io_uring
+ * @success: true/false value to indicate if the operation succeeded or not
+ * @code: operation return code
+ *
+ * This is similar to audit_syscall_exit() but is intended for use by io_uring
+ * operations. This function should only ever be called from
+ * audit_uring_exit() as we rely on the audit context checking present in that
+ * function.
+ */
+void __audit_uring_exit(int success, long code)
+{
+ struct audit_context *ctx = audit_context();
+
+ if (ctx->context == AUDIT_CTX_SYSCALL) {
+ /*
+ * NOTE: See the note in __audit_uring_entry() about the case
+ * where we may be called from process context before we
+ * return to userspace via audit_syscall_exit(). In this
+ * case we simply emit a URINGOP record and bail, the
+ * normal syscall exit handling will take care of
+ * everything else.
+ * It is also worth mentioning that when we are called,
+ * the current process creds may differ from the creds
+ * used during the normal syscall processing; keep that
+ * in mind if/when we move the record generation code.
+ */
+
+ /*
+ * We need to filter on the syscall info here to decide if we
+ * should emit a URINGOP record. I know it seems odd but this
+ * solves the problem where users have a filter to block *all*
+ * syscall records in the "exit" filter; we want to preserve
+ * the behavior here.
+ */
+ audit_filter_syscall(current, ctx);
+ if (ctx->current_state != AUDIT_STATE_RECORD)
+ audit_filter_uring(current, ctx);
+ audit_filter_inodes(current, ctx);
+ if (ctx->current_state != AUDIT_STATE_RECORD)
+ return;
+
+ audit_log_uring(ctx);
+ return;
+ }
+
+ /* this may generate CONFIG_CHANGE records */
+ if (!list_empty(&ctx->killed_trees))
+ audit_kill_trees(ctx);
+
+ /* run through both filters to ensure we set the filterkey properly */
+ audit_filter_uring(current, ctx);
+ audit_filter_inodes(current, ctx);
+ if (ctx->current_state != AUDIT_STATE_RECORD)
+ goto out;
+ audit_return_fixup(ctx, success, code);
+ audit_log_exit();
+
+out:
+ audit_reset_context(ctx);
+}
+
+/**
* __audit_syscall_entry - fill in an audit record at syscall entry
* @major: major syscall type (function)
* @a1: additional syscall register 1
@@ -1672,7 +1980,12 @@ void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2,
if (!audit_enabled || !context)
return;
- BUG_ON(context->in_syscall || context->name_count);
+ WARN_ON(context->context != AUDIT_CTX_UNUSED);
+ WARN_ON(context->name_count);
+ if (context->context != AUDIT_CTX_UNUSED || context->name_count) {
+ audit_panic("unrecoverable error in audit_syscall_entry()");
+ return;
+ }
state = context->state;
if (state == AUDIT_STATE_DISABLED)
@@ -1691,10 +2004,8 @@ void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2,
context->argv[1] = a2;
context->argv[2] = a3;
context->argv[3] = a4;
- context->serial = 0;
- context->in_syscall = 1;
+ context->context = AUDIT_CTX_SYSCALL;
context->current_state = state;
- context->ppid = 0;
ktime_get_coarse_real_ts64(&context->ctime);
}
@@ -1711,63 +2022,27 @@ void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2,
*/
void __audit_syscall_exit(int success, long return_code)
{
- struct audit_context *context;
+ struct audit_context *context = audit_context();
- context = audit_context();
- if (!context)
- return;
+ if (!context || context->dummy ||
+ context->context != AUDIT_CTX_SYSCALL)
+ goto out;
+ /* this may generate CONFIG_CHANGE records */
if (!list_empty(&context->killed_trees))
audit_kill_trees(context);
- if (!context->dummy && context->in_syscall) {
- if (success)
- context->return_valid = AUDITSC_SUCCESS;
- else
- context->return_valid = AUDITSC_FAILURE;
+ /* run through both filters to ensure we set the filterkey properly */
+ audit_filter_syscall(current, context);
+ audit_filter_inodes(current, context);
+ if (context->current_state < AUDIT_STATE_RECORD)
+ goto out;
- /*
- * we need to fix up the return code in the audit logs if the
- * actual return codes are later going to be fixed up by the
- * arch specific signal handlers
- *
- * This is actually a test for:
- * (rc == ERESTARTSYS ) || (rc == ERESTARTNOINTR) ||
- * (rc == ERESTARTNOHAND) || (rc == ERESTART_RESTARTBLOCK)
- *
- * but is faster than a bunch of ||
- */
- if (unlikely(return_code <= -ERESTARTSYS) &&
- (return_code >= -ERESTART_RESTARTBLOCK) &&
- (return_code != -ENOIOCTLCMD))
- context->return_code = -EINTR;
- else
- context->return_code = return_code;
-
- audit_filter_syscall(current, context);
- audit_filter_inodes(current, context);
- if (context->current_state == AUDIT_STATE_RECORD)
- audit_log_exit();
- }
-
- context->in_syscall = 0;
- context->prio = context->state == AUDIT_STATE_RECORD ? ~0ULL : 0;
-
- audit_free_module(context);
- audit_free_names(context);
- unroll_tree_refs(context, NULL, 0);
- audit_free_aux(context);
- context->aux = NULL;
- context->aux_pids = NULL;
- context->target_pid = 0;
- context->target_sid = 0;
- context->sockaddr_len = 0;
- context->type = 0;
- context->fds[0] = -1;
- if (context->state != AUDIT_STATE_RECORD) {
- kfree(context->filterkey);
- context->filterkey = NULL;
- }
+ audit_return_fixup(context, success, return_code);
+ audit_log_exit();
+
+out:
+ audit_reset_context(context);
}
static inline void handle_one(const struct inode *inode)
@@ -1919,7 +2194,7 @@ void __audit_getname(struct filename *name)
struct audit_context *context = audit_context();
struct audit_names *n;
- if (!context->in_syscall)
+ if (context->context == AUDIT_CTX_UNUSED)
return;
n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN);
@@ -1991,7 +2266,7 @@ void __audit_inode(struct filename *name, const struct dentry *dentry,
struct list_head *list = &audit_filter_list[AUDIT_FILTER_FS];
int i;
- if (!context->in_syscall)
+ if (context->context == AUDIT_CTX_UNUSED)
return;
rcu_read_lock();
@@ -2109,7 +2384,7 @@ void __audit_inode_child(struct inode *parent,
struct list_head *list = &audit_filter_list[AUDIT_FILTER_FS];
int i;
- if (!context->in_syscall)
+ if (context->context == AUDIT_CTX_UNUSED)
return;
rcu_read_lock();
@@ -2208,7 +2483,7 @@ EXPORT_SYMBOL_GPL(__audit_inode_child);
int auditsc_get_stamp(struct audit_context *ctx,
struct timespec64 *t, unsigned int *serial)
{
- if (!ctx->in_syscall)
+ if (ctx->context == AUDIT_CTX_UNUSED)
return 0;
if (!ctx->serial)
ctx->serial = audit_serial();
@@ -2546,6 +2821,16 @@ void __audit_mmap_fd(int fd, int flags)
context->type = AUDIT_MMAP;
}
+void __audit_openat2_how(struct open_how *how)
+{
+ struct audit_context *context = audit_context();
+
+ context->openat2.flags = how->flags;
+ context->openat2.mode = how->mode;
+ context->openat2.resolve = how->resolve;
+ context->type = AUDIT_OPENAT2;
+}
+
void __audit_log_kern_module(char *name)
{
struct audit_context *context = audit_context();
@@ -2706,8 +2991,7 @@ void audit_seccomp_actions_logged(const char *names, const char *old_names,
struct list_head *audit_killed_trees(void)
{
struct audit_context *ctx = audit_context();
-
- if (likely(!ctx || !ctx->in_syscall))
+ if (likely(!ctx || ctx->context == AUDIT_CTX_UNUSED))
return NULL;
return &ctx->killed_trees;
}
diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig
index a82d6de86522..d24d518ddd63 100644
--- a/kernel/bpf/Kconfig
+++ b/kernel/bpf/Kconfig
@@ -64,6 +64,7 @@ config BPF_JIT_DEFAULT_ON
config BPF_UNPRIV_DEFAULT_OFF
bool "Disable unprivileged BPF by default"
+ default y
depends on BPF_SYSCALL
help
Disables unprivileged BPF by default by setting the corresponding
@@ -72,6 +73,12 @@ config BPF_UNPRIV_DEFAULT_OFF
disable it by setting it to 1 (from which no other transition to
0 is possible anymore).
+ Unprivileged BPF could be used to exploit certain potential
+ speculative execution side-channel vulnerabilities on unmitigated
+ affected hardware.
+
+ If you are unsure how to answer this question, answer Y.
+
source "kernel/bpf/preload/Kconfig"
config BPF_LSM
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 7f33098ca63f..c1a9be6a4b9f 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -7,7 +7,7 @@ endif
CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy)
obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o
-obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
+obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o
obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o
obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o
obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o
@@ -36,3 +36,7 @@ obj-$(CONFIG_BPF_SYSCALL) += bpf_struct_ops.o
obj-${CONFIG_BPF_LSM} += bpf_lsm.o
endif
obj-$(CONFIG_BPF_PRELOAD) += preload/
+
+obj-$(CONFIG_BPF_SYSCALL) += relo_core.o
+$(obj)/relo_core.o: $(srctree)/tools/lib/bpf/relo_core.c FORCE
+ $(call if_changed_rule,cc_o_c)
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index cebd4fb06d19..c7a5be3bf8be 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -645,7 +645,7 @@ static const struct bpf_iter_seq_info iter_seq_info = {
.seq_priv_size = sizeof(struct bpf_iter_seq_array_map_info),
};
-static int bpf_for_each_array_elem(struct bpf_map *map, void *callback_fn,
+static int bpf_for_each_array_elem(struct bpf_map *map, bpf_callback_t callback_fn,
void *callback_ctx, u64 flags)
{
u32 i, key, num_elems = 0;
@@ -668,9 +668,8 @@ static int bpf_for_each_array_elem(struct bpf_map *map, void *callback_fn,
val = array->value + array->elem_size * i;
num_elems++;
key = i;
- ret = BPF_CAST_CALL(callback_fn)((u64)(long)map,
- (u64)(long)&key, (u64)(long)val,
- (u64)(long)callback_ctx, 0);
+ ret = callback_fn((u64)(long)map, (u64)(long)&key,
+ (u64)(long)val, (u64)(long)callback_ctx, 0);
/* return value: 0 - continue, 1 - stop and return */
if (ret)
break;
@@ -1072,6 +1071,7 @@ static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr)
INIT_WORK(&aux->work, prog_array_map_clear_deferred);
INIT_LIST_HEAD(&aux->poke_progs);
mutex_init(&aux->poke_mutex);
+ spin_lock_init(&aux->owner.lock);
map = array_map_alloc(attr);
if (IS_ERR(map)) {
diff --git a/kernel/bpf/bloom_filter.c b/kernel/bpf/bloom_filter.c
new file mode 100644
index 000000000000..b141a1346f72
--- /dev/null
+++ b/kernel/bpf/bloom_filter.c
@@ -0,0 +1,210 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021 Facebook */
+
+#include <linux/bitmap.h>
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/err.h>
+#include <linux/jhash.h>
+#include <linux/random.h>
+
+#define BLOOM_CREATE_FLAG_MASK \
+ (BPF_F_NUMA_NODE | BPF_F_ZERO_SEED | BPF_F_ACCESS_MASK)
+
+struct bpf_bloom_filter {
+ struct bpf_map map;
+ u32 bitset_mask;
+ u32 hash_seed;
+ /* If the size of the values in the bloom filter is u32 aligned,
+ * then it is more performant to use jhash2 as the underlying hash
+ * function, else we use jhash. This tracks the number of u32s
+ * in an u32-aligned value size. If the value size is not u32 aligned,
+ * this will be 0.
+ */
+ u32 aligned_u32_count;
+ u32 nr_hash_funcs;
+ unsigned long bitset[];
+};
+
+static u32 hash(struct bpf_bloom_filter *bloom, void *value,
+ u32 value_size, u32 index)
+{
+ u32 h;
+
+ if (bloom->aligned_u32_count)
+ h = jhash2(value, bloom->aligned_u32_count,
+ bloom->hash_seed + index);
+ else
+ h = jhash(value, value_size, bloom->hash_seed + index);
+
+ return h & bloom->bitset_mask;
+}
+
+static int bloom_map_peek_elem(struct bpf_map *map, void *value)
+{
+ struct bpf_bloom_filter *bloom =
+ container_of(map, struct bpf_bloom_filter, map);
+ u32 i, h;
+
+ for (i = 0; i < bloom->nr_hash_funcs; i++) {
+ h = hash(bloom, value, map->value_size, i);
+ if (!test_bit(h, bloom->bitset))
+ return -ENOENT;
+ }
+
+ return 0;
+}
+
+static int bloom_map_push_elem(struct bpf_map *map, void *value, u64 flags)
+{
+ struct bpf_bloom_filter *bloom =
+ container_of(map, struct bpf_bloom_filter, map);
+ u32 i, h;
+
+ if (flags != BPF_ANY)
+ return -EINVAL;
+
+ for (i = 0; i < bloom->nr_hash_funcs; i++) {
+ h = hash(bloom, value, map->value_size, i);
+ set_bit(h, bloom->bitset);
+ }
+
+ return 0;
+}
+
+static int bloom_map_pop_elem(struct bpf_map *map, void *value)
+{
+ return -EOPNOTSUPP;
+}
+
+static int bloom_map_delete_elem(struct bpf_map *map, void *value)
+{
+ return -EOPNOTSUPP;
+}
+
+static int bloom_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+ return -EOPNOTSUPP;
+}
+
+static struct bpf_map *bloom_map_alloc(union bpf_attr *attr)
+{
+ u32 bitset_bytes, bitset_mask, nr_hash_funcs, nr_bits;
+ int numa_node = bpf_map_attr_numa_node(attr);
+ struct bpf_bloom_filter *bloom;
+
+ if (!bpf_capable())
+ return ERR_PTR(-EPERM);
+
+ if (attr->key_size != 0 || attr->value_size == 0 ||
+ attr->max_entries == 0 ||
+ attr->map_flags & ~BLOOM_CREATE_FLAG_MASK ||
+ !bpf_map_flags_access_ok(attr->map_flags) ||
+ /* The lower 4 bits of map_extra (0xF) specify the number
+ * of hash functions
+ */
+ (attr->map_extra & ~0xF))
+ return ERR_PTR(-EINVAL);
+
+ nr_hash_funcs = attr->map_extra;
+ if (nr_hash_funcs == 0)
+ /* Default to using 5 hash functions if unspecified */
+ nr_hash_funcs = 5;
+
+ /* For the bloom filter, the optimal bit array size that minimizes the
+ * false positive probability is n * k / ln(2) where n is the number of
+ * expected entries in the bloom filter and k is the number of hash
+ * functions. We use 7 / 5 to approximate 1 / ln(2).
+ *
+ * We round this up to the nearest power of two to enable more efficient
+ * hashing using bitmasks. The bitmask will be the bit array size - 1.
+ *
+ * If this overflows a u32, the bit array size will have 2^32 (4
+ * GB) bits.
+ */
+ if (check_mul_overflow(attr->max_entries, nr_hash_funcs, &nr_bits) ||
+ check_mul_overflow(nr_bits / 5, (u32)7, &nr_bits) ||
+ nr_bits > (1UL << 31)) {
+ /* The bit array size is 2^32 bits but to avoid overflowing the
+ * u32, we use U32_MAX, which will round up to the equivalent
+ * number of bytes
+ */
+ bitset_bytes = BITS_TO_BYTES(U32_MAX);
+ bitset_mask = U32_MAX;
+ } else {
+ if (nr_bits <= BITS_PER_LONG)
+ nr_bits = BITS_PER_LONG;
+ else
+ nr_bits = roundup_pow_of_two(nr_bits);
+ bitset_bytes = BITS_TO_BYTES(nr_bits);
+ bitset_mask = nr_bits - 1;
+ }
+
+ bitset_bytes = roundup(bitset_bytes, sizeof(unsigned long));
+ bloom = bpf_map_area_alloc(sizeof(*bloom) + bitset_bytes, numa_node);
+
+ if (!bloom)
+ return ERR_PTR(-ENOMEM);
+
+ bpf_map_init_from_attr(&bloom->map, attr);
+
+ bloom->nr_hash_funcs = nr_hash_funcs;
+ bloom->bitset_mask = bitset_mask;
+
+ /* Check whether the value size is u32-aligned */
+ if ((attr->value_size & (sizeof(u32) - 1)) == 0)
+ bloom->aligned_u32_count =
+ attr->value_size / sizeof(u32);
+
+ if (!(attr->map_flags & BPF_F_ZERO_SEED))
+ bloom->hash_seed = get_random_int();
+
+ return &bloom->map;
+}
+
+static void bloom_map_free(struct bpf_map *map)
+{
+ struct bpf_bloom_filter *bloom =
+ container_of(map, struct bpf_bloom_filter, map);
+
+ bpf_map_area_free(bloom);
+}
+
+static void *bloom_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ /* The eBPF program should use map_peek_elem instead */
+ return ERR_PTR(-EINVAL);
+}
+
+static int bloom_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 flags)
+{
+ /* The eBPF program should use map_push_elem instead */
+ return -EINVAL;
+}
+
+static int bloom_map_check_btf(const struct bpf_map *map,
+ const struct btf *btf,
+ const struct btf_type *key_type,
+ const struct btf_type *value_type)
+{
+ /* Bloom filter maps are keyless */
+ return btf_type_is_void(key_type) ? 0 : -EINVAL;
+}
+
+static int bpf_bloom_map_btf_id;
+const struct bpf_map_ops bloom_filter_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
+ .map_alloc = bloom_map_alloc,
+ .map_free = bloom_map_free,
+ .map_get_next_key = bloom_map_get_next_key,
+ .map_push_elem = bloom_map_push_elem,
+ .map_peek_elem = bloom_map_peek_elem,
+ .map_pop_elem = bloom_map_pop_elem,
+ .map_lookup_elem = bloom_map_lookup_elem,
+ .map_update_elem = bloom_map_update_elem,
+ .map_delete_elem = bloom_map_delete_elem,
+ .map_check_btf = bloom_map_check_btf,
+ .map_btf_name = "bpf_bloom_filter",
+ .map_btf_id = &bpf_bloom_map_btf_id,
+};
diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
index 96ceed0e0fb5..e29d9e3d853e 100644
--- a/kernel/bpf/bpf_inode_storage.c
+++ b/kernel/bpf/bpf_inode_storage.c
@@ -17,6 +17,7 @@
#include <linux/bpf_lsm.h>
#include <linux/btf_ids.h>
#include <linux/fdtable.h>
+#include <linux/rcupdate_trace.h>
DEFINE_BPF_STORAGE_CACHE(inode_cache);
@@ -44,7 +45,8 @@ static struct bpf_local_storage_data *inode_storage_lookup(struct inode *inode,
if (!bsb)
return NULL;
- inode_storage = rcu_dereference(bsb->storage);
+ inode_storage =
+ rcu_dereference_check(bsb->storage, bpf_rcu_lock_held());
if (!inode_storage)
return NULL;
@@ -172,6 +174,7 @@ BPF_CALL_4(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode,
{
struct bpf_local_storage_data *sdata;
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE))
return (unsigned long)NULL;
@@ -204,6 +207,7 @@ BPF_CALL_4(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode,
BPF_CALL_2(bpf_inode_storage_delete,
struct bpf_map *, map, struct inode *, inode)
{
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
if (!inode)
return -EINVAL;
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index b2ee45064e06..b7aef5b3416d 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -714,3 +714,38 @@ const struct bpf_func_proto bpf_for_each_map_elem_proto = {
.arg3_type = ARG_PTR_TO_STACK_OR_NULL,
.arg4_type = ARG_ANYTHING,
};
+
+/* maximum number of loops */
+#define MAX_LOOPS BIT(23)
+
+BPF_CALL_4(bpf_loop, u32, nr_loops, void *, callback_fn, void *, callback_ctx,
+ u64, flags)
+{
+ bpf_callback_t callback = (bpf_callback_t)callback_fn;
+ u64 ret;
+ u32 i;
+
+ if (flags)
+ return -EINVAL;
+ if (nr_loops > MAX_LOOPS)
+ return -E2BIG;
+
+ for (i = 0; i < nr_loops; i++) {
+ ret = callback((u64)i, (u64)(long)callback_ctx, 0, 0, 0);
+ /* return value: 0 - continue, 1 - stop and return */
+ if (ret)
+ return i + 1;
+ }
+
+ return i;
+}
+
+const struct bpf_func_proto bpf_loop_proto = {
+ .func = bpf_loop,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_ANYTHING,
+ .arg2_type = ARG_PTR_TO_FUNC,
+ .arg3_type = ARG_PTR_TO_STACK_OR_NULL,
+ .arg4_type = ARG_ANYTHING,
+};
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index b305270b7a4b..71de2a89869c 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -11,6 +11,9 @@
#include <net/sock.h>
#include <uapi/linux/sock_diag.h>
#include <uapi/linux/btf.h>
+#include <linux/rcupdate.h>
+#include <linux/rcupdate_trace.h>
+#include <linux/rcupdate_wait.h>
#define BPF_LOCAL_STORAGE_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_CLONE)
@@ -81,6 +84,22 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
return NULL;
}
+void bpf_local_storage_free_rcu(struct rcu_head *rcu)
+{
+ struct bpf_local_storage *local_storage;
+
+ local_storage = container_of(rcu, struct bpf_local_storage, rcu);
+ kfree_rcu(local_storage, rcu);
+}
+
+static void bpf_selem_free_rcu(struct rcu_head *rcu)
+{
+ struct bpf_local_storage_elem *selem;
+
+ selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
+ kfree_rcu(selem, rcu);
+}
+
/* local_storage->lock must be held and selem->local_storage == local_storage.
* The caller must ensure selem->smap is still valid to be
* dereferenced for its smap->elem_size and smap->cache_idx.
@@ -93,7 +112,7 @@ bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
bool free_local_storage;
void *owner;
- smap = rcu_dereference(SDATA(selem)->smap);
+ smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
owner = local_storage->owner;
/* All uncharging on the owner must be done first.
@@ -118,12 +137,12 @@ bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
*
* Although the unlock will be done under
* rcu_read_lock(), it is more intutivie to
- * read if kfree_rcu(local_storage, rcu) is done
+ * read if the freeing of the storage is done
* after the raw_spin_unlock_bh(&local_storage->lock).
*
* Hence, a "bool free_local_storage" is returned
- * to the caller which then calls the kfree_rcu()
- * after unlock.
+ * to the caller which then calls then frees the storage after
+ * all the RCU grace periods have expired.
*/
}
hlist_del_init_rcu(&selem->snode);
@@ -131,8 +150,7 @@ bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
SDATA(selem))
RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL);
- kfree_rcu(selem, rcu);
-
+ call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_rcu);
return free_local_storage;
}
@@ -146,7 +164,8 @@ static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem)
/* selem has already been unlinked from sk */
return;
- local_storage = rcu_dereference(selem->local_storage);
+ local_storage = rcu_dereference_check(selem->local_storage,
+ bpf_rcu_lock_held());
raw_spin_lock_irqsave(&local_storage->lock, flags);
if (likely(selem_linked_to_storage(selem)))
free_local_storage = bpf_selem_unlink_storage_nolock(
@@ -154,7 +173,8 @@ static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem)
raw_spin_unlock_irqrestore(&local_storage->lock, flags);
if (free_local_storage)
- kfree_rcu(local_storage, rcu);
+ call_rcu_tasks_trace(&local_storage->rcu,
+ bpf_local_storage_free_rcu);
}
void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage,
@@ -174,7 +194,7 @@ void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem)
/* selem has already be unlinked from smap */
return;
- smap = rcu_dereference(SDATA(selem)->smap);
+ smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
b = select_bucket(smap, selem);
raw_spin_lock_irqsave(&b->lock, flags);
if (likely(selem_linked_to_map(selem)))
@@ -213,12 +233,14 @@ bpf_local_storage_lookup(struct bpf_local_storage *local_storage,
struct bpf_local_storage_elem *selem;
/* Fast path (cache hit) */
- sdata = rcu_dereference(local_storage->cache[smap->cache_idx]);
+ sdata = rcu_dereference_check(local_storage->cache[smap->cache_idx],
+ bpf_rcu_lock_held());
if (sdata && rcu_access_pointer(sdata->smap) == smap)
return sdata;
/* Slow path (cache miss) */
- hlist_for_each_entry_rcu(selem, &local_storage->list, snode)
+ hlist_for_each_entry_rcu(selem, &local_storage->list, snode,
+ rcu_read_lock_trace_held())
if (rcu_access_pointer(SDATA(selem)->smap) == smap)
break;
@@ -306,7 +328,8 @@ int bpf_local_storage_alloc(void *owner,
* bucket->list, first_selem can be freed immediately
* (instead of kfree_rcu) because
* bpf_local_storage_map_free() does a
- * synchronize_rcu() before walking the bucket->list.
+ * synchronize_rcu_mult (waiting for both sleepable and
+ * normal programs) before walking the bucket->list.
* Hence, no one is accessing selem from the
* bucket->list under rcu_read_lock().
*/
@@ -342,7 +365,8 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
!map_value_has_spin_lock(&smap->map)))
return ERR_PTR(-EINVAL);
- local_storage = rcu_dereference(*owner_storage(smap, owner));
+ local_storage = rcu_dereference_check(*owner_storage(smap, owner),
+ bpf_rcu_lock_held());
if (!local_storage || hlist_empty(&local_storage->list)) {
/* Very first elem for the owner */
err = check_flags(NULL, map_flags);
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index d6731c32864e..21069dbe9138 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -93,6 +93,9 @@ const struct bpf_verifier_ops bpf_struct_ops_verifier_ops = {
};
const struct bpf_prog_ops bpf_struct_ops_prog_ops = {
+#ifdef CONFIG_NET
+ .test_run = bpf_struct_ops_test_run,
+#endif
};
static const struct btf_type *module_type;
@@ -162,7 +165,7 @@ void bpf_struct_ops_init(struct btf *btf, struct bpf_verifier_log *log)
break;
}
- if (btf_member_bitfield_size(t, member)) {
+ if (__btf_member_bitfield_size(t, member)) {
pr_warn("bit field member %s in struct %s is not supported\n",
mname, st_ops->name);
break;
@@ -293,7 +296,7 @@ static int check_zero_holes(const struct btf_type *t, void *data)
const struct btf_type *mtype;
for_each_member(i, t, member) {
- moff = btf_member_bit_offset(t, member) / 8;
+ moff = __btf_member_bit_offset(t, member) / 8;
if (moff > prev_mend &&
memchr_inv(data + prev_mend, 0, moff - prev_mend))
return -EINVAL;
@@ -312,6 +315,20 @@ static int check_zero_holes(const struct btf_type *t, void *data)
return 0;
}
+int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_progs *tprogs,
+ struct bpf_prog *prog,
+ const struct btf_func_model *model,
+ void *image, void *image_end)
+{
+ u32 flags;
+
+ tprogs[BPF_TRAMP_FENTRY].progs[0] = prog;
+ tprogs[BPF_TRAMP_FENTRY].nr_progs = 1;
+ flags = model->ret_size > 0 ? BPF_TRAMP_F_RET_FENTRY_RET : 0;
+ return arch_prepare_bpf_trampoline(NULL, image, image_end,
+ model, flags, tprogs, NULL);
+}
+
static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
void *value, u64 flags)
{
@@ -323,7 +340,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
struct bpf_tramp_progs *tprogs = NULL;
void *udata, *kdata;
int prog_fd, err = 0;
- void *image;
+ void *image, *image_end;
u32 i;
if (flags)
@@ -363,13 +380,14 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
udata = &uvalue->data;
kdata = &kvalue->data;
image = st_map->image;
+ image_end = st_map->image + PAGE_SIZE;
for_each_member(i, t, member) {
const struct btf_type *mtype, *ptype;
struct bpf_prog *prog;
u32 moff;
- moff = btf_member_bit_offset(t, member) / 8;
+ moff = __btf_member_bit_offset(t, member) / 8;
ptype = btf_type_resolve_ptr(btf_vmlinux, member->type, NULL);
if (ptype == module_type) {
if (*(void **)(udata + moff))
@@ -429,12 +447,9 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
goto reset_unlock;
}
- tprogs[BPF_TRAMP_FENTRY].progs[0] = prog;
- tprogs[BPF_TRAMP_FENTRY].nr_progs = 1;
- err = arch_prepare_bpf_trampoline(NULL, image,
- st_map->image + PAGE_SIZE,
- &st_ops->func_models[i], 0,
- tprogs, NULL);
+ err = bpf_struct_ops_prepare_trampoline(tprogs, prog,
+ &st_ops->func_models[i],
+ image, image_end);
if (err < 0)
goto reset_unlock;
diff --git a/kernel/bpf/bpf_struct_ops_types.h b/kernel/bpf/bpf_struct_ops_types.h
index 066d83ea1c99..5678a9ddf817 100644
--- a/kernel/bpf/bpf_struct_ops_types.h
+++ b/kernel/bpf/bpf_struct_ops_types.h
@@ -2,6 +2,9 @@
/* internal file - do not include directly */
#ifdef CONFIG_BPF_JIT
+#ifdef CONFIG_NET
+BPF_STRUCT_OPS_TYPE(bpf_dummy_ops)
+#endif
#ifdef CONFIG_INET
#include <net/tcp.h>
BPF_STRUCT_OPS_TYPE(tcp_congestion_ops)
diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
index ebfa8bc90892..5da7bed0f5f6 100644
--- a/kernel/bpf/bpf_task_storage.c
+++ b/kernel/bpf/bpf_task_storage.c
@@ -17,6 +17,7 @@
#include <uapi/linux/btf.h>
#include <linux/btf_ids.h>
#include <linux/fdtable.h>
+#include <linux/rcupdate_trace.h>
DEFINE_BPF_STORAGE_CACHE(task_cache);
@@ -59,7 +60,8 @@ task_storage_lookup(struct task_struct *task, struct bpf_map *map,
struct bpf_local_storage *task_storage;
struct bpf_local_storage_map *smap;
- task_storage = rcu_dereference(task->bpf_storage);
+ task_storage =
+ rcu_dereference_check(task->bpf_storage, bpf_rcu_lock_held());
if (!task_storage)
return NULL;
@@ -229,6 +231,7 @@ BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
{
struct bpf_local_storage_data *sdata;
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE))
return (unsigned long)NULL;
@@ -260,6 +263,7 @@ BPF_CALL_2(bpf_task_storage_delete, struct bpf_map *, map, struct task_struct *,
{
int ret;
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
if (!task)
return -EINVAL;
@@ -323,7 +327,7 @@ const struct bpf_func_proto bpf_task_storage_get_proto = {
.ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
.arg1_type = ARG_CONST_MAP_PTR,
.arg2_type = ARG_PTR_TO_BTF_ID,
- .arg2_btf_id = &btf_task_struct_ids[0],
+ .arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
.arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL,
.arg4_type = ARG_ANYTHING,
};
@@ -334,5 +338,5 @@ const struct bpf_func_proto bpf_task_storage_delete_proto = {
.ret_type = RET_INTEGER,
.arg1_type = ARG_CONST_MAP_PTR,
.arg2_type = ARG_PTR_TO_BTF_ID,
- .arg2_btf_id = &btf_task_struct_ids[0],
+ .arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
};
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index dfe61df4f974..33bb8ae4a804 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -25,6 +25,7 @@
#include <linux/kobject.h>
#include <linux/sysfs.h>
#include <net/sock.h>
+#include "../tools/lib/bpf/relo_core.h"
/* BTF (BPF Type Format) is the meta data format which describes
* the data types of BPF program/map. Hence, it basically focus
@@ -281,6 +282,8 @@ static const char * const btf_kind_str[NR_BTF_KINDS] = {
[BTF_KIND_VAR] = "VAR",
[BTF_KIND_DATASEC] = "DATASEC",
[BTF_KIND_FLOAT] = "FLOAT",
+ [BTF_KIND_DECL_TAG] = "DECL_TAG",
+ [BTF_KIND_TYPE_TAG] = "TYPE_TAG",
};
const char *btf_type_str(const struct btf_type *t)
@@ -417,6 +420,7 @@ static bool btf_type_is_modifier(const struct btf_type *t)
case BTF_KIND_VOLATILE:
case BTF_KIND_CONST:
case BTF_KIND_RESTRICT:
+ case BTF_KIND_TYPE_TAG:
return true;
}
@@ -459,6 +463,17 @@ static bool btf_type_is_datasec(const struct btf_type *t)
return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC;
}
+static bool btf_type_is_decl_tag(const struct btf_type *t)
+{
+ return BTF_INFO_KIND(t->info) == BTF_KIND_DECL_TAG;
+}
+
+static bool btf_type_is_decl_tag_target(const struct btf_type *t)
+{
+ return btf_type_is_func(t) || btf_type_is_struct(t) ||
+ btf_type_is_var(t) || btf_type_is_typedef(t);
+}
+
u32 btf_nr_types(const struct btf *btf)
{
u32 total = 0;
@@ -537,6 +552,7 @@ const struct btf_type *btf_type_resolve_func_ptr(const struct btf *btf,
static bool btf_type_is_resolve_source_only(const struct btf_type *t)
{
return btf_type_is_var(t) ||
+ btf_type_is_decl_tag(t) ||
btf_type_is_datasec(t);
}
@@ -563,6 +579,7 @@ static bool btf_type_needs_resolve(const struct btf_type *t)
btf_type_is_struct(t) ||
btf_type_is_array(t) ||
btf_type_is_var(t) ||
+ btf_type_is_decl_tag(t) ||
btf_type_is_datasec(t);
}
@@ -616,6 +633,11 @@ static const struct btf_var *btf_type_var(const struct btf_type *t)
return (const struct btf_var *)(t + 1);
}
+static const struct btf_decl_tag *btf_type_decl_tag(const struct btf_type *t)
+{
+ return (const struct btf_decl_tag *)(t + 1);
+}
+
static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t)
{
return kind_ops[BTF_INFO_KIND(t->info)];
@@ -815,7 +837,7 @@ static const char *btf_show_name(struct btf_show *show)
const char *ptr_suffix = &ptr_suffixes[strlen(ptr_suffixes)];
const char *name = NULL, *prefix = "", *parens = "";
const struct btf_member *m = show->state.member;
- const struct btf_type *t = show->state.type;
+ const struct btf_type *t;
const struct btf_array *array;
u32 id = show->state.type_id;
const char *member = NULL;
@@ -1718,6 +1740,7 @@ __btf_resolve_size(const struct btf *btf, const struct btf_type *type,
case BTF_KIND_VOLATILE:
case BTF_KIND_CONST:
case BTF_KIND_RESTRICT:
+ case BTF_KIND_TYPE_TAG:
id = type->type;
type = btf_type_by_id(btf, type->type);
break;
@@ -2326,6 +2349,8 @@ static int btf_ref_type_check_meta(struct btf_verifier_env *env,
const struct btf_type *t,
u32 meta_left)
{
+ const char *value;
+
if (btf_type_vlen(t)) {
btf_verifier_log_type(env, t, "vlen != 0");
return -EINVAL;
@@ -2341,7 +2366,7 @@ static int btf_ref_type_check_meta(struct btf_verifier_env *env,
return -EINVAL;
}
- /* typedef type must have a valid name, and other ref types,
+ /* typedef/type_tag type must have a valid name, and other ref types,
* volatile, const, restrict, should have a null name.
*/
if (BTF_INFO_KIND(t->info) == BTF_KIND_TYPEDEF) {
@@ -2350,6 +2375,12 @@ static int btf_ref_type_check_meta(struct btf_verifier_env *env,
btf_verifier_log_type(env, t, "Invalid name");
return -EINVAL;
}
+ } else if (BTF_INFO_KIND(t->info) == BTF_KIND_TYPE_TAG) {
+ value = btf_name_by_offset(env->btf, t->name_off);
+ if (!value || !value[0]) {
+ btf_verifier_log_type(env, t, "Invalid name");
+ return -EINVAL;
+ }
} else {
if (t->name_off) {
btf_verifier_log_type(env, t, "Invalid name");
@@ -2939,7 +2970,7 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env,
return -EINVAL;
}
- offset = btf_member_bit_offset(t, member);
+ offset = __btf_member_bit_offset(t, member);
if (is_union && offset) {
btf_verifier_log_member(env, t, member,
"Invalid member bits_offset");
@@ -3064,7 +3095,7 @@ static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t
if (off != -ENOENT)
/* only one such field is allowed */
return -E2BIG;
- off = btf_member_bit_offset(t, member);
+ off = __btf_member_bit_offset(t, member);
if (off % 8)
/* valid C code cannot generate such BTF */
return -EINVAL;
@@ -3154,8 +3185,8 @@ static void __btf_struct_show(const struct btf *btf, const struct btf_type *t,
btf_show_start_member(show, member);
- member_offset = btf_member_bit_offset(t, member);
- bitfield_size = btf_member_bitfield_size(t, member);
+ member_offset = __btf_member_bit_offset(t, member);
+ bitfield_size = __btf_member_bitfield_size(t, member);
bytes_offset = BITS_ROUNDDOWN_BYTES(member_offset);
bits8_offset = BITS_PER_BYTE_MASKED(member_offset);
if (bitfield_size) {
@@ -3801,6 +3832,110 @@ static const struct btf_kind_operations float_ops = {
.show = btf_df_show,
};
+static s32 btf_decl_tag_check_meta(struct btf_verifier_env *env,
+ const struct btf_type *t,
+ u32 meta_left)
+{
+ const struct btf_decl_tag *tag;
+ u32 meta_needed = sizeof(*tag);
+ s32 component_idx;
+ const char *value;
+
+ if (meta_left < meta_needed) {
+ btf_verifier_log_basic(env, t,
+ "meta_left:%u meta_needed:%u",
+ meta_left, meta_needed);
+ return -EINVAL;
+ }
+
+ value = btf_name_by_offset(env->btf, t->name_off);
+ if (!value || !value[0]) {
+ btf_verifier_log_type(env, t, "Invalid value");
+ return -EINVAL;
+ }
+
+ if (btf_type_vlen(t)) {
+ btf_verifier_log_type(env, t, "vlen != 0");
+ return -EINVAL;
+ }
+
+ if (btf_type_kflag(t)) {
+ btf_verifier_log_type(env, t, "Invalid btf_info kind_flag");
+ return -EINVAL;
+ }
+
+ component_idx = btf_type_decl_tag(t)->component_idx;
+ if (component_idx < -1) {
+ btf_verifier_log_type(env, t, "Invalid component_idx");
+ return -EINVAL;
+ }
+
+ btf_verifier_log_type(env, t, NULL);
+
+ return meta_needed;
+}
+
+static int btf_decl_tag_resolve(struct btf_verifier_env *env,
+ const struct resolve_vertex *v)
+{
+ const struct btf_type *next_type;
+ const struct btf_type *t = v->t;
+ u32 next_type_id = t->type;
+ struct btf *btf = env->btf;
+ s32 component_idx;
+ u32 vlen;
+
+ next_type = btf_type_by_id(btf, next_type_id);
+ if (!next_type || !btf_type_is_decl_tag_target(next_type)) {
+ btf_verifier_log_type(env, v->t, "Invalid type_id");
+ return -EINVAL;
+ }
+
+ if (!env_type_is_resolve_sink(env, next_type) &&
+ !env_type_is_resolved(env, next_type_id))
+ return env_stack_push(env, next_type, next_type_id);
+
+ component_idx = btf_type_decl_tag(t)->component_idx;
+ if (component_idx != -1) {
+ if (btf_type_is_var(next_type) || btf_type_is_typedef(next_type)) {
+ btf_verifier_log_type(env, v->t, "Invalid component_idx");
+ return -EINVAL;
+ }
+
+ if (btf_type_is_struct(next_type)) {
+ vlen = btf_type_vlen(next_type);
+ } else {
+ /* next_type should be a function */
+ next_type = btf_type_by_id(btf, next_type->type);
+ vlen = btf_type_vlen(next_type);
+ }
+
+ if ((u32)component_idx >= vlen) {
+ btf_verifier_log_type(env, v->t, "Invalid component_idx");
+ return -EINVAL;
+ }
+ }
+
+ env_stack_pop_resolved(env, next_type_id, 0);
+
+ return 0;
+}
+
+static void btf_decl_tag_log(struct btf_verifier_env *env, const struct btf_type *t)
+{
+ btf_verifier_log(env, "type=%u component_idx=%d", t->type,
+ btf_type_decl_tag(t)->component_idx);
+}
+
+static const struct btf_kind_operations decl_tag_ops = {
+ .check_meta = btf_decl_tag_check_meta,
+ .resolve = btf_decl_tag_resolve,
+ .check_member = btf_df_check_member,
+ .check_kflag_member = btf_df_check_kflag_member,
+ .log_details = btf_decl_tag_log,
+ .show = btf_df_show,
+};
+
static int btf_func_proto_check(struct btf_verifier_env *env,
const struct btf_type *t)
{
@@ -3935,6 +4070,8 @@ static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS] = {
[BTF_KIND_VAR] = &var_ops,
[BTF_KIND_DATASEC] = &datasec_ops,
[BTF_KIND_FLOAT] = &float_ops,
+ [BTF_KIND_DECL_TAG] = &decl_tag_ops,
+ [BTF_KIND_TYPE_TAG] = &modifier_ops,
};
static s32 btf_check_meta(struct btf_verifier_env *env,
@@ -4019,6 +4156,10 @@ static bool btf_resolve_valid(struct btf_verifier_env *env,
return !btf_resolved_type_id(btf, type_id) &&
!btf_resolved_type_size(btf, type_id);
+ if (btf_type_is_decl_tag(t))
+ return btf_resolved_type_id(btf, type_id) &&
+ !btf_resolved_type_size(btf, type_id);
+
if (btf_type_is_modifier(t) || btf_type_is_ptr(t) ||
btf_type_is_var(t)) {
t = btf_type_id_resolve(btf, &type_id);
@@ -4332,8 +4473,7 @@ static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size,
log->len_total = log_size;
/* log attributes have to be sane */
- if (log->len_total < 128 || log->len_total > UINT_MAX >> 8 ||
- !log->level || !log->ubuf) {
+ if (!bpf_verifier_log_attr_valid(log)) {
err = -EINVAL;
goto errout;
}
@@ -4686,7 +4826,7 @@ struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog)
return prog->aux->attach_btf;
}
-static bool is_string_ptr(struct btf *btf, const struct btf_type *t)
+static bool is_int_ptr(struct btf *btf, const struct btf_type *t)
{
/* t comes in already as a pointer */
t = btf_type_by_id(btf, t->type);
@@ -4695,8 +4835,7 @@ static bool is_string_ptr(struct btf *btf, const struct btf_type *t)
if (BTF_INFO_KIND(t->info) == BTF_KIND_CONST)
t = btf_type_by_id(btf, t->type);
- /* char, signed char, unsigned char */
- return btf_type_is_int(t) && t->size == 1;
+ return btf_type_is_int(t);
}
bool btf_ctx_access(int off, int size, enum bpf_access_type type,
@@ -4801,10 +4940,12 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
/* check for PTR_TO_RDONLY_BUF_OR_NULL or PTR_TO_RDWR_BUF_OR_NULL */
for (i = 0; i < prog->aux->ctx_arg_info_size; i++) {
const struct bpf_ctx_arg_aux *ctx_arg_info = &prog->aux->ctx_arg_info[i];
+ u32 type, flag;
- if (ctx_arg_info->offset == off &&
- (ctx_arg_info->reg_type == PTR_TO_RDONLY_BUF_OR_NULL ||
- ctx_arg_info->reg_type == PTR_TO_RDWR_BUF_OR_NULL)) {
+ type = base_type(ctx_arg_info->reg_type);
+ flag = type_flag(ctx_arg_info->reg_type);
+ if (ctx_arg_info->offset == off && type == PTR_TO_BUF &&
+ (flag & PTR_MAYBE_NULL)) {
info->reg_type = ctx_arg_info->reg_type;
return true;
}
@@ -4817,7 +4958,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
*/
return true;
- if (is_string_ptr(btf, t))
+ if (is_int_ptr(btf, t))
return true;
/* this is a pointer to another type */
@@ -4920,7 +5061,7 @@ again:
if (array_elem->nelems != 0)
goto error;
- moff = btf_member_bit_offset(t, member) / 8;
+ moff = __btf_member_bit_offset(t, member) / 8;
if (off < moff)
goto error;
@@ -4943,14 +5084,14 @@ error:
for_each_member(i, t, member) {
/* offset of the field in bytes */
- moff = btf_member_bit_offset(t, member) / 8;
+ moff = __btf_member_bit_offset(t, member) / 8;
if (off + size <= moff)
/* won't find anything, field is already too far */
break;
- if (btf_member_bitfield_size(t, member)) {
- u32 end_bit = btf_member_bit_offset(t, member) +
- btf_member_bitfield_size(t, member);
+ if (__btf_member_bitfield_size(t, member)) {
+ u32 end_bit = __btf_member_bit_offset(t, member) +
+ __btf_member_bitfield_size(t, member);
/* off <= moff instead of off == moff because clang
* does not generate a BTF member for anonymous
@@ -5435,12 +5576,53 @@ static u32 *reg2btf_ids[__BPF_REG_TYPE_MAX] = {
#endif
};
+/* Returns true if struct is composed of scalars, 4 levels of nesting allowed */
+static bool __btf_type_is_scalar_struct(struct bpf_verifier_log *log,
+ const struct btf *btf,
+ const struct btf_type *t, int rec)
+{
+ const struct btf_type *member_type;
+ const struct btf_member *member;
+ u32 i;
+
+ if (!btf_type_is_struct(t))
+ return false;
+
+ for_each_member(i, t, member) {
+ const struct btf_array *array;
+
+ member_type = btf_type_skip_modifiers(btf, member->type, NULL);
+ if (btf_type_is_struct(member_type)) {
+ if (rec >= 3) {
+ bpf_log(log, "max struct nesting depth exceeded\n");
+ return false;
+ }
+ if (!__btf_type_is_scalar_struct(log, btf, member_type, rec + 1))
+ return false;
+ continue;
+ }
+ if (btf_type_is_array(member_type)) {
+ array = btf_type_array(member_type);
+ if (!array->nelems)
+ return false;
+ member_type = btf_type_skip_modifiers(btf, array->type, NULL);
+ if (!btf_type_is_scalar(member_type))
+ return false;
+ continue;
+ }
+ if (!btf_type_is_scalar(member_type))
+ return false;
+ }
+ return true;
+}
+
static int btf_check_func_arg_match(struct bpf_verifier_env *env,
const struct btf *btf, u32 func_id,
struct bpf_reg_state *regs,
bool ptr_to_mem_ok)
{
struct bpf_verifier_log *log = &env->log;
+ bool is_kfunc = btf_is_kernel(btf);
const char *func_name, *ref_tname;
const struct btf_type *t, *ref_t;
const struct btf_param *args;
@@ -5493,7 +5675,20 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id);
ref_tname = btf_name_by_offset(btf, ref_t->name_off);
- if (btf_is_kernel(btf)) {
+ if (btf_get_prog_ctx_type(log, btf, t,
+ env->prog->type, i)) {
+ /* If function expects ctx type in BTF check that caller
+ * is passing PTR_TO_CTX.
+ */
+ if (reg->type != PTR_TO_CTX) {
+ bpf_log(log,
+ "arg#%d expected pointer to ctx, but got %s\n",
+ i, btf_type_str(t));
+ return -EINVAL;
+ }
+ if (check_ctx_reg(env, reg, regno))
+ return -EINVAL;
+ } else if (is_kfunc && (reg->type == PTR_TO_BTF_ID || reg2btf_ids[reg->type])) {
const struct btf_type *reg_ref_t;
const struct btf *reg_btf;
const char *reg_ref_tname;
@@ -5509,14 +5704,9 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
if (reg->type == PTR_TO_BTF_ID) {
reg_btf = reg->btf;
reg_ref_id = reg->btf_id;
- } else if (reg2btf_ids[reg->type]) {
+ } else {
reg_btf = btf_vmlinux;
reg_ref_id = *reg2btf_ids[reg->type];
- } else {
- bpf_log(log, "kernel function %s args#%d expected pointer to %s %s but R%d is not a pointer to btf_id\n",
- func_name, i,
- btf_type_str(ref_t), ref_tname, regno);
- return -EINVAL;
}
reg_ref_t = btf_type_skip_modifiers(reg_btf, reg_ref_id,
@@ -5532,23 +5722,24 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
reg_ref_tname);
return -EINVAL;
}
- } else if (btf_get_prog_ctx_type(log, btf, t,
- env->prog->type, i)) {
- /* If function expects ctx type in BTF check that caller
- * is passing PTR_TO_CTX.
- */
- if (reg->type != PTR_TO_CTX) {
- bpf_log(log,
- "arg#%d expected pointer to ctx, but got %s\n",
- i, btf_type_str(t));
- return -EINVAL;
- }
- if (check_ctx_reg(env, reg, regno))
- return -EINVAL;
} else if (ptr_to_mem_ok) {
const struct btf_type *resolve_ret;
u32 type_size;
+ if (is_kfunc) {
+ /* Permit pointer to mem, but only when argument
+ * type is pointer to scalar, or struct composed
+ * (recursively) of scalars.
+ */
+ if (!btf_type_is_scalar(ref_t) &&
+ !__btf_type_is_scalar_struct(log, btf, ref_t, 0)) {
+ bpf_log(log,
+ "arg#%d pointer type %s %s must point to scalar or struct with scalar\n",
+ i, btf_type_str(ref_t), ref_tname);
+ return -EINVAL;
+ }
+ }
+
resolve_ret = btf_resolve_size(btf, ref_t, &type_size);
if (IS_ERR(resolve_ret)) {
bpf_log(log,
@@ -5561,6 +5752,8 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
if (check_mem_reg(env, reg, regno, type_size))
return -EINVAL;
} else {
+ bpf_log(log, "reg type unsupported for arg#%d %sfunction %s#%d\n", i,
+ is_kfunc ? "kernel " : "", func_name, func_id);
return -EINVAL;
}
}
@@ -5610,7 +5803,7 @@ int btf_check_kfunc_arg_match(struct bpf_verifier_env *env,
const struct btf *btf, u32 func_id,
struct bpf_reg_state *regs)
{
- return btf_check_func_arg_match(env, btf, func_id, regs, false);
+ return btf_check_func_arg_match(env, btf, func_id, regs, true);
}
/* Convert BTF of a function into bpf_reg_state if possible
@@ -5718,7 +5911,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
return -EINVAL;
}
- reg->type = PTR_TO_MEM_OR_NULL;
+ reg->type = PTR_TO_MEM | PTR_MAYBE_NULL;
reg->id = ++env->id_gen;
continue;
@@ -6029,6 +6222,8 @@ btf_module_read(struct file *file, struct kobject *kobj,
return len;
}
+static void purge_cand_cache(struct btf *btf);
+
static int btf_module_notify(struct notifier_block *nb, unsigned long op,
void *module)
{
@@ -6063,6 +6258,7 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op,
goto out;
}
+ purge_cand_cache(NULL);
mutex_lock(&btf_module_mutex);
btf_mod->module = module;
btf_mod->btf = btf;
@@ -6105,6 +6301,7 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op,
list_del(&btf_mod->list);
if (btf_mod->sysfs_attr)
sysfs_remove_bin_file(btf_kobj, btf_mod->sysfs_attr);
+ purge_cand_cache(btf_mod->btf);
btf_put(btf_mod->btf);
kfree(btf_mod->sysfs_attr);
kfree(btf_mod);
@@ -6208,10 +6405,442 @@ const struct bpf_func_proto bpf_btf_find_by_name_kind_proto = {
.func = bpf_btf_find_by_name_kind,
.gpl_only = false,
.ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_MEM,
+ .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg2_type = ARG_CONST_SIZE,
.arg3_type = ARG_ANYTHING,
.arg4_type = ARG_ANYTHING,
};
-BTF_ID_LIST_GLOBAL_SINGLE(btf_task_struct_ids, struct, task_struct)
+BTF_ID_LIST_GLOBAL(btf_tracing_ids, MAX_BTF_TRACING_TYPE)
+#define BTF_TRACING_TYPE(name, type) BTF_ID(struct, type)
+BTF_TRACING_TYPE_xxx
+#undef BTF_TRACING_TYPE
+
+/* BTF ID set registration API for modules */
+
+#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
+
+void register_kfunc_btf_id_set(struct kfunc_btf_id_list *l,
+ struct kfunc_btf_id_set *s)
+{
+ mutex_lock(&l->mutex);
+ list_add(&s->list, &l->list);
+ mutex_unlock(&l->mutex);
+}
+EXPORT_SYMBOL_GPL(register_kfunc_btf_id_set);
+
+void unregister_kfunc_btf_id_set(struct kfunc_btf_id_list *l,
+ struct kfunc_btf_id_set *s)
+{
+ mutex_lock(&l->mutex);
+ list_del_init(&s->list);
+ mutex_unlock(&l->mutex);
+}
+EXPORT_SYMBOL_GPL(unregister_kfunc_btf_id_set);
+
+bool bpf_check_mod_kfunc_call(struct kfunc_btf_id_list *klist, u32 kfunc_id,
+ struct module *owner)
+{
+ struct kfunc_btf_id_set *s;
+
+ mutex_lock(&klist->mutex);
+ list_for_each_entry(s, &klist->list, list) {
+ if (s->owner == owner && btf_id_set_contains(s->set, kfunc_id)) {
+ mutex_unlock(&klist->mutex);
+ return true;
+ }
+ }
+ mutex_unlock(&klist->mutex);
+ return false;
+}
+
+#define DEFINE_KFUNC_BTF_ID_LIST(name) \
+ struct kfunc_btf_id_list name = { LIST_HEAD_INIT(name.list), \
+ __MUTEX_INITIALIZER(name.mutex) }; \
+ EXPORT_SYMBOL_GPL(name)
+
+DEFINE_KFUNC_BTF_ID_LIST(bpf_tcp_ca_kfunc_list);
+DEFINE_KFUNC_BTF_ID_LIST(prog_test_kfunc_list);
+
+#endif
+
+int bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id,
+ const struct btf *targ_btf, __u32 targ_id)
+{
+ return -EOPNOTSUPP;
+}
+
+static bool bpf_core_is_flavor_sep(const char *s)
+{
+ /* check X___Y name pattern, where X and Y are not underscores */
+ return s[0] != '_' && /* X */
+ s[1] == '_' && s[2] == '_' && s[3] == '_' && /* ___ */
+ s[4] != '_'; /* Y */
+}
+
+size_t bpf_core_essential_name_len(const char *name)
+{
+ size_t n = strlen(name);
+ int i;
+
+ for (i = n - 5; i >= 0; i--) {
+ if (bpf_core_is_flavor_sep(name + i))
+ return i + 1;
+ }
+ return n;
+}
+
+struct bpf_cand_cache {
+ const char *name;
+ u32 name_len;
+ u16 kind;
+ u16 cnt;
+ struct {
+ const struct btf *btf;
+ u32 id;
+ } cands[];
+};
+
+static void bpf_free_cands(struct bpf_cand_cache *cands)
+{
+ if (!cands->cnt)
+ /* empty candidate array was allocated on stack */
+ return;
+ kfree(cands);
+}
+
+static void bpf_free_cands_from_cache(struct bpf_cand_cache *cands)
+{
+ kfree(cands->name);
+ kfree(cands);
+}
+
+#define VMLINUX_CAND_CACHE_SIZE 31
+static struct bpf_cand_cache *vmlinux_cand_cache[VMLINUX_CAND_CACHE_SIZE];
+
+#define MODULE_CAND_CACHE_SIZE 31
+static struct bpf_cand_cache *module_cand_cache[MODULE_CAND_CACHE_SIZE];
+
+static DEFINE_MUTEX(cand_cache_mutex);
+
+static void __print_cand_cache(struct bpf_verifier_log *log,
+ struct bpf_cand_cache **cache,
+ int cache_size)
+{
+ struct bpf_cand_cache *cc;
+ int i, j;
+
+ for (i = 0; i < cache_size; i++) {
+ cc = cache[i];
+ if (!cc)
+ continue;
+ bpf_log(log, "[%d]%s(", i, cc->name);
+ for (j = 0; j < cc->cnt; j++) {
+ bpf_log(log, "%d", cc->cands[j].id);
+ if (j < cc->cnt - 1)
+ bpf_log(log, " ");
+ }
+ bpf_log(log, "), ");
+ }
+}
+
+static void print_cand_cache(struct bpf_verifier_log *log)
+{
+ mutex_lock(&cand_cache_mutex);
+ bpf_log(log, "vmlinux_cand_cache:");
+ __print_cand_cache(log, vmlinux_cand_cache, VMLINUX_CAND_CACHE_SIZE);
+ bpf_log(log, "\nmodule_cand_cache:");
+ __print_cand_cache(log, module_cand_cache, MODULE_CAND_CACHE_SIZE);
+ bpf_log(log, "\n");
+ mutex_unlock(&cand_cache_mutex);
+}
+
+static u32 hash_cands(struct bpf_cand_cache *cands)
+{
+ return jhash(cands->name, cands->name_len, 0);
+}
+
+static struct bpf_cand_cache *check_cand_cache(struct bpf_cand_cache *cands,
+ struct bpf_cand_cache **cache,
+ int cache_size)
+{
+ struct bpf_cand_cache *cc = cache[hash_cands(cands) % cache_size];
+
+ if (cc && cc->name_len == cands->name_len &&
+ !strncmp(cc->name, cands->name, cands->name_len))
+ return cc;
+ return NULL;
+}
+
+static size_t sizeof_cands(int cnt)
+{
+ return offsetof(struct bpf_cand_cache, cands[cnt]);
+}
+
+static struct bpf_cand_cache *populate_cand_cache(struct bpf_cand_cache *cands,
+ struct bpf_cand_cache **cache,
+ int cache_size)
+{
+ struct bpf_cand_cache **cc = &cache[hash_cands(cands) % cache_size], *new_cands;
+
+ if (*cc) {
+ bpf_free_cands_from_cache(*cc);
+ *cc = NULL;
+ }
+ new_cands = kmemdup(cands, sizeof_cands(cands->cnt), GFP_KERNEL);
+ if (!new_cands) {
+ bpf_free_cands(cands);
+ return ERR_PTR(-ENOMEM);
+ }
+ /* strdup the name, since it will stay in cache.
+ * the cands->name points to strings in prog's BTF and the prog can be unloaded.
+ */
+ new_cands->name = kmemdup_nul(cands->name, cands->name_len, GFP_KERNEL);
+ bpf_free_cands(cands);
+ if (!new_cands->name) {
+ kfree(new_cands);
+ return ERR_PTR(-ENOMEM);
+ }
+ *cc = new_cands;
+ return new_cands;
+}
+
+#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
+static void __purge_cand_cache(struct btf *btf, struct bpf_cand_cache **cache,
+ int cache_size)
+{
+ struct bpf_cand_cache *cc;
+ int i, j;
+
+ for (i = 0; i < cache_size; i++) {
+ cc = cache[i];
+ if (!cc)
+ continue;
+ if (!btf) {
+ /* when new module is loaded purge all of module_cand_cache,
+ * since new module might have candidates with the name
+ * that matches cached cands.
+ */
+ bpf_free_cands_from_cache(cc);
+ cache[i] = NULL;
+ continue;
+ }
+ /* when module is unloaded purge cache entries
+ * that match module's btf
+ */
+ for (j = 0; j < cc->cnt; j++)
+ if (cc->cands[j].btf == btf) {
+ bpf_free_cands_from_cache(cc);
+ cache[i] = NULL;
+ break;
+ }
+ }
+
+}
+
+static void purge_cand_cache(struct btf *btf)
+{
+ mutex_lock(&cand_cache_mutex);
+ __purge_cand_cache(btf, module_cand_cache, MODULE_CAND_CACHE_SIZE);
+ mutex_unlock(&cand_cache_mutex);
+}
+#endif
+
+static struct bpf_cand_cache *
+bpf_core_add_cands(struct bpf_cand_cache *cands, const struct btf *targ_btf,
+ int targ_start_id)
+{
+ struct bpf_cand_cache *new_cands;
+ const struct btf_type *t;
+ const char *targ_name;
+ size_t targ_essent_len;
+ int n, i;
+
+ n = btf_nr_types(targ_btf);
+ for (i = targ_start_id; i < n; i++) {
+ t = btf_type_by_id(targ_btf, i);
+ if (btf_kind(t) != cands->kind)
+ continue;
+
+ targ_name = btf_name_by_offset(targ_btf, t->name_off);
+ if (!targ_name)
+ continue;
+
+ /* the resched point is before strncmp to make sure that search
+ * for non-existing name will have a chance to schedule().
+ */
+ cond_resched();
+
+ if (strncmp(cands->name, targ_name, cands->name_len) != 0)
+ continue;
+
+ targ_essent_len = bpf_core_essential_name_len(targ_name);
+ if (targ_essent_len != cands->name_len)
+ continue;
+
+ /* most of the time there is only one candidate for a given kind+name pair */
+ new_cands = kmalloc(sizeof_cands(cands->cnt + 1), GFP_KERNEL);
+ if (!new_cands) {
+ bpf_free_cands(cands);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ memcpy(new_cands, cands, sizeof_cands(cands->cnt));
+ bpf_free_cands(cands);
+ cands = new_cands;
+ cands->cands[cands->cnt].btf = targ_btf;
+ cands->cands[cands->cnt].id = i;
+ cands->cnt++;
+ }
+ return cands;
+}
+
+static struct bpf_cand_cache *
+bpf_core_find_cands(struct bpf_core_ctx *ctx, u32 local_type_id)
+{
+ struct bpf_cand_cache *cands, *cc, local_cand = {};
+ const struct btf *local_btf = ctx->btf;
+ const struct btf_type *local_type;
+ const struct btf *main_btf;
+ size_t local_essent_len;
+ struct btf *mod_btf;
+ const char *name;
+ int id;
+
+ main_btf = bpf_get_btf_vmlinux();
+ if (IS_ERR(main_btf))
+ return ERR_CAST(main_btf);
+
+ local_type = btf_type_by_id(local_btf, local_type_id);
+ if (!local_type)
+ return ERR_PTR(-EINVAL);
+
+ name = btf_name_by_offset(local_btf, local_type->name_off);
+ if (str_is_empty(name))
+ return ERR_PTR(-EINVAL);
+ local_essent_len = bpf_core_essential_name_len(name);
+
+ cands = &local_cand;
+ cands->name = name;
+ cands->kind = btf_kind(local_type);
+ cands->name_len = local_essent_len;
+
+ cc = check_cand_cache(cands, vmlinux_cand_cache, VMLINUX_CAND_CACHE_SIZE);
+ /* cands is a pointer to stack here */
+ if (cc) {
+ if (cc->cnt)
+ return cc;
+ goto check_modules;
+ }
+
+ /* Attempt to find target candidates in vmlinux BTF first */
+ cands = bpf_core_add_cands(cands, main_btf, 1);
+ if (IS_ERR(cands))
+ return ERR_CAST(cands);
+
+ /* cands is a pointer to kmalloced memory here if cands->cnt > 0 */
+
+ /* populate cache even when cands->cnt == 0 */
+ cc = populate_cand_cache(cands, vmlinux_cand_cache, VMLINUX_CAND_CACHE_SIZE);
+ if (IS_ERR(cc))
+ return ERR_CAST(cc);
+
+ /* if vmlinux BTF has any candidate, don't go for module BTFs */
+ if (cc->cnt)
+ return cc;
+
+check_modules:
+ /* cands is a pointer to stack here and cands->cnt == 0 */
+ cc = check_cand_cache(cands, module_cand_cache, MODULE_CAND_CACHE_SIZE);
+ if (cc)
+ /* if cache has it return it even if cc->cnt == 0 */
+ return cc;
+
+ /* If candidate is not found in vmlinux's BTF then search in module's BTFs */
+ spin_lock_bh(&btf_idr_lock);
+ idr_for_each_entry(&btf_idr, mod_btf, id) {
+ if (!btf_is_module(mod_btf))
+ continue;
+ /* linear search could be slow hence unlock/lock
+ * the IDR to avoiding holding it for too long
+ */
+ btf_get(mod_btf);
+ spin_unlock_bh(&btf_idr_lock);
+ cands = bpf_core_add_cands(cands, mod_btf, btf_nr_types(main_btf));
+ if (IS_ERR(cands)) {
+ btf_put(mod_btf);
+ return ERR_CAST(cands);
+ }
+ spin_lock_bh(&btf_idr_lock);
+ btf_put(mod_btf);
+ }
+ spin_unlock_bh(&btf_idr_lock);
+ /* cands is a pointer to kmalloced memory here if cands->cnt > 0
+ * or pointer to stack if cands->cnd == 0.
+ * Copy it into the cache even when cands->cnt == 0 and
+ * return the result.
+ */
+ return populate_cand_cache(cands, module_cand_cache, MODULE_CAND_CACHE_SIZE);
+}
+
+int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo,
+ int relo_idx, void *insn)
+{
+ bool need_cands = relo->kind != BPF_CORE_TYPE_ID_LOCAL;
+ struct bpf_core_cand_list cands = {};
+ struct bpf_core_spec *specs;
+ int err;
+
+ /* ~4k of temp memory necessary to convert LLVM spec like "0:1:0:5"
+ * into arrays of btf_ids of struct fields and array indices.
+ */
+ specs = kcalloc(3, sizeof(*specs), GFP_KERNEL);
+ if (!specs)
+ return -ENOMEM;
+
+ if (need_cands) {
+ struct bpf_cand_cache *cc;
+ int i;
+
+ mutex_lock(&cand_cache_mutex);
+ cc = bpf_core_find_cands(ctx, relo->type_id);
+ if (IS_ERR(cc)) {
+ bpf_log(ctx->log, "target candidate search failed for %d\n",
+ relo->type_id);
+ err = PTR_ERR(cc);
+ goto out;
+ }
+ if (cc->cnt) {
+ cands.cands = kcalloc(cc->cnt, sizeof(*cands.cands), GFP_KERNEL);
+ if (!cands.cands) {
+ err = -ENOMEM;
+ goto out;
+ }
+ }
+ for (i = 0; i < cc->cnt; i++) {
+ bpf_log(ctx->log,
+ "CO-RE relocating %s %s: found target candidate [%d]\n",
+ btf_kind_str[cc->kind], cc->name, cc->cands[i].id);
+ cands.cands[i].btf = cc->cands[i].btf;
+ cands.cands[i].id = cc->cands[i].id;
+ }
+ cands.len = cc->cnt;
+ /* cand_cache_mutex needs to span the cache lookup and
+ * copy of btf pointer into bpf_core_cand_list,
+ * since module can be unloaded while bpf_core_apply_relo_insn
+ * is working with module's btf.
+ */
+ }
+
+ err = bpf_core_apply_relo_insn((void *)ctx->log, insn, relo->insn_off / 8,
+ relo, relo_idx, ctx->btf, &cands, specs);
+out:
+ kfree(specs);
+ if (need_cands) {
+ kfree(cands.cands);
+ mutex_unlock(&cand_cache_mutex);
+ if (ctx->log->level & BPF_LOG_LEVEL2)
+ print_cand_cache(ctx->log);
+ }
+ return err;
+}
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 03145d45e3d5..514b4681a90a 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -430,10 +430,10 @@ static struct bpf_prog_list *find_attach_entry(struct list_head *progs,
* Exactly one of @prog or @link can be non-null.
* Must be called with cgroup_mutex held.
*/
-int __cgroup_bpf_attach(struct cgroup *cgrp,
- struct bpf_prog *prog, struct bpf_prog *replace_prog,
- struct bpf_cgroup_link *link,
- enum bpf_attach_type type, u32 flags)
+static int __cgroup_bpf_attach(struct cgroup *cgrp,
+ struct bpf_prog *prog, struct bpf_prog *replace_prog,
+ struct bpf_cgroup_link *link,
+ enum bpf_attach_type type, u32 flags)
{
u32 saved_flags = (flags & (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI));
struct bpf_prog *old_prog = NULL;
@@ -523,6 +523,20 @@ cleanup:
return err;
}
+static int cgroup_bpf_attach(struct cgroup *cgrp,
+ struct bpf_prog *prog, struct bpf_prog *replace_prog,
+ struct bpf_cgroup_link *link,
+ enum bpf_attach_type type,
+ u32 flags)
+{
+ int ret;
+
+ mutex_lock(&cgroup_mutex);
+ ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags);
+ mutex_unlock(&cgroup_mutex);
+ return ret;
+}
+
/* Swap updated BPF program for given link in effective program arrays across
* all descendant cgroups. This function is guaranteed to succeed.
*/
@@ -672,14 +686,14 @@ static struct bpf_prog_list *find_detach_entry(struct list_head *progs,
* propagate the change to descendants
* @cgrp: The cgroup which descendants to traverse
* @prog: A program to detach or NULL
- * @prog: A link to detach or NULL
+ * @link: A link to detach or NULL
* @type: Type of detach operation
*
* At most one of @prog or @link can be non-NULL.
* Must be called with cgroup_mutex held.
*/
-int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
- struct bpf_cgroup_link *link, enum bpf_attach_type type)
+static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
+ struct bpf_cgroup_link *link, enum bpf_attach_type type)
{
enum cgroup_bpf_attach_type atype;
struct bpf_prog *old_prog;
@@ -730,9 +744,20 @@ cleanup:
return err;
}
+static int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
+ enum bpf_attach_type type)
+{
+ int ret;
+
+ mutex_lock(&cgroup_mutex);
+ ret = __cgroup_bpf_detach(cgrp, prog, NULL, type);
+ mutex_unlock(&cgroup_mutex);
+ return ret;
+}
+
/* Must be called with cgroup_mutex held to avoid races. */
-int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
- union bpf_attr __user *uattr)
+static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
{
__u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
enum bpf_attach_type type = attr->query.attach_type;
@@ -789,6 +814,17 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
return ret;
}
+static int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ int ret;
+
+ mutex_lock(&cgroup_mutex);
+ ret = __cgroup_bpf_query(cgrp, attr, uattr);
+ mutex_unlock(&cgroup_mutex);
+ return ret;
+}
+
int cgroup_bpf_prog_attach(const union bpf_attr *attr,
enum bpf_prog_type ptype, struct bpf_prog *prog)
{
@@ -1753,7 +1789,7 @@ static const struct bpf_func_proto bpf_sysctl_set_new_value_proto = {
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_PTR_TO_MEM,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg3_type = ARG_CONST_SIZE,
};
@@ -1773,6 +1809,8 @@ sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_sysctl_get_new_value_proto;
case BPF_FUNC_sysctl_set_new_value:
return &bpf_sysctl_set_new_value_proto;
+ case BPF_FUNC_ktime_get_coarse_ns:
+ return &bpf_ktime_get_coarse_ns_proto;
default:
return cgroup_base_func_proto(func_id, prog);
}
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 9f4636d021b1..de3e5bc6781f 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -32,6 +32,7 @@
#include <linux/perf_event.h>
#include <linux/extable.h>
#include <linux/log2.h>
+#include <linux/bpf_verifier.h>
#include <asm/barrier.h>
#include <asm/unaligned.h>
@@ -389,6 +390,13 @@ static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, s32 end_old,
i = end_new;
insn = prog->insnsi + end_old;
}
+ if (bpf_pseudo_func(insn)) {
+ ret = bpf_adj_delta_to_imm(insn, pos, end_old,
+ end_new, i, probe_pass);
+ if (ret)
+ return ret;
+ continue;
+ }
code = insn->code;
if ((BPF_CLASS(code) != BPF_JMP &&
BPF_CLASS(code) != BPF_JMP32) ||
@@ -524,6 +532,7 @@ int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON);
int bpf_jit_kallsyms __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON);
int bpf_jit_harden __read_mostly;
long bpf_jit_limit __read_mostly;
+long bpf_jit_limit_max __read_mostly;
static void
bpf_prog_ksym_set_addr(struct bpf_prog *prog)
@@ -817,7 +826,8 @@ u64 __weak bpf_jit_alloc_exec_limit(void)
static int __init bpf_jit_charge_init(void)
{
/* Only used as heuristic here to derive limit. */
- bpf_jit_limit = min_t(u64, round_up(bpf_jit_alloc_exec_limit() >> 2,
+ bpf_jit_limit_max = bpf_jit_alloc_exec_limit();
+ bpf_jit_limit = min_t(u64, round_up(bpf_jit_limit_max >> 2,
PAGE_SIZE), LONG_MAX);
return 0;
}
@@ -827,7 +837,7 @@ int bpf_jit_charge_modmem(u32 pages)
{
if (atomic_long_add_return(pages, &bpf_jit_current) >
(bpf_jit_limit >> PAGE_SHIFT)) {
- if (!capable(CAP_SYS_ADMIN)) {
+ if (!bpf_capable()) {
atomic_long_sub(pages, &bpf_jit_current);
return -EPERM;
}
@@ -1564,7 +1574,8 @@ select_insn:
if (unlikely(index >= array->map.max_entries))
goto out;
- if (unlikely(tail_call_cnt > MAX_TAIL_CALL_CNT))
+
+ if (unlikely(tail_call_cnt >= MAX_TAIL_CALL_CNT))
goto out;
tail_call_cnt++;
@@ -1821,20 +1832,26 @@ static unsigned int __bpf_prog_ret0_warn(const void *ctx,
bool bpf_prog_array_compatible(struct bpf_array *array,
const struct bpf_prog *fp)
{
+ bool ret;
+
if (fp->kprobe_override)
return false;
- if (!array->aux->type) {
+ spin_lock(&array->aux->owner.lock);
+
+ if (!array->aux->owner.type) {
/* There's no owner yet where we could check for
* compatibility.
*/
- array->aux->type = fp->type;
- array->aux->jited = fp->jited;
- return true;
+ array->aux->owner.type = fp->type;
+ array->aux->owner.jited = fp->jited;
+ ret = true;
+ } else {
+ ret = array->aux->owner.type == fp->type &&
+ array->aux->owner.jited == fp->jited;
}
-
- return array->aux->type == fp->type &&
- array->aux->jited == fp->jited;
+ spin_unlock(&array->aux->owner.lock);
+ return ret;
}
static int bpf_check_tail_call(const struct bpf_prog *fp)
@@ -1875,7 +1892,7 @@ static void bpf_prog_select_func(struct bpf_prog *fp)
/**
* bpf_prog_select_runtime - select exec runtime for BPF program
- * @fp: bpf_prog populated with internal BPF program
+ * @fp: bpf_prog populated with BPF program
* @err: pointer to error variable
*
* Try to JIT eBPF program, if JIT is not available, use interpreter.
@@ -2255,6 +2272,9 @@ static void bpf_prog_free_deferred(struct work_struct *work)
int i;
aux = container_of(work, struct bpf_prog_aux, work);
+#ifdef CONFIG_BPF_SYSCALL
+ bpf_free_kfunc_btf_tab(aux->kfunc_btf_tab);
+#endif
bpf_free_used_maps(aux);
bpf_free_used_btfs(aux);
if (bpf_prog_is_dev_bound(aux))
@@ -2281,7 +2301,6 @@ static void bpf_prog_free_deferred(struct work_struct *work)
}
}
-/* Free internal BPF program */
void bpf_prog_free(struct bpf_prog *fp)
{
struct bpf_prog_aux *aux = fp->aux;
@@ -2357,6 +2376,11 @@ const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
return NULL;
}
+const struct bpf_func_proto * __weak bpf_get_trace_vprintk_proto(void)
+{
+ return NULL;
+}
+
u64 __weak
bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy)
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 585b2b77ccc4..b3e6b9422238 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -195,7 +195,7 @@ static void cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu,
}
return;
default:
- bpf_warn_invalid_xdp_action(act);
+ bpf_warn_invalid_xdp_action(NULL, rcpu->prog, act);
fallthrough;
case XDP_ABORTED:
trace_xdp_exception(skb->dev, rcpu->prog, act);
@@ -254,7 +254,7 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
}
break;
default:
- bpf_warn_invalid_xdp_action(act);
+ bpf_warn_invalid_xdp_action(NULL, rcpu->prog, act);
fallthrough;
case XDP_DROP:
xdp_return_frame(xdpf);
@@ -746,15 +746,9 @@ static void bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
list_add(&bq->flush_node, flush_list);
}
-int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
+int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf,
struct net_device *dev_rx)
{
- struct xdp_frame *xdpf;
-
- xdpf = xdp_convert_buff_to_frame(xdp);
- if (unlikely(!xdpf))
- return -EOVERFLOW;
-
/* Info needed when constructing SKB on remote CPU */
xdpf->dev_rx = dev_rx;
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index f02d04540c0c..fe019dbdb3f0 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -348,7 +348,7 @@ static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog,
frames[nframes++] = xdpf;
break;
default:
- bpf_warn_invalid_xdp_action(act);
+ bpf_warn_invalid_xdp_action(NULL, xdp_prog, act);
fallthrough;
case XDP_ABORTED:
trace_xdp_exception(dev, xdp_prog, act);
@@ -467,24 +467,19 @@ static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
bq->q[bq->count++] = xdpf;
}
-static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
+static inline int __xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
struct net_device *dev_rx,
struct bpf_prog *xdp_prog)
{
- struct xdp_frame *xdpf;
int err;
if (!dev->netdev_ops->ndo_xdp_xmit)
return -EOPNOTSUPP;
- err = xdp_ok_fwd_dev(dev, xdp->data_end - xdp->data);
+ err = xdp_ok_fwd_dev(dev, xdpf->len);
if (unlikely(err))
return err;
- xdpf = xdp_convert_buff_to_frame(xdp);
- if (unlikely(!xdpf))
- return -EOVERFLOW;
-
bq_enqueue(dev, xdpf, dev_rx, xdp_prog);
return 0;
}
@@ -507,7 +502,7 @@ static u32 dev_map_bpf_prog_run_skb(struct sk_buff *skb, struct bpf_dtab_netdev
__skb_push(skb, skb->mac_len);
break;
default:
- bpf_warn_invalid_xdp_action(act);
+ bpf_warn_invalid_xdp_action(NULL, dst->xdp_prog, act);
fallthrough;
case XDP_ABORTED:
trace_xdp_exception(dst->dev, dst->xdp_prog, act);
@@ -520,27 +515,27 @@ static u32 dev_map_bpf_prog_run_skb(struct sk_buff *skb, struct bpf_dtab_netdev
return act;
}
-int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
+int dev_xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
struct net_device *dev_rx)
{
- return __xdp_enqueue(dev, xdp, dev_rx, NULL);
+ return __xdp_enqueue(dev, xdpf, dev_rx, NULL);
}
-int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
+int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_frame *xdpf,
struct net_device *dev_rx)
{
struct net_device *dev = dst->dev;
- return __xdp_enqueue(dev, xdp, dev_rx, dst->xdp_prog);
+ return __xdp_enqueue(dev, xdpf, dev_rx, dst->xdp_prog);
}
-static bool is_valid_dst(struct bpf_dtab_netdev *obj, struct xdp_buff *xdp)
+static bool is_valid_dst(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf)
{
if (!obj ||
!obj->dev->netdev_ops->ndo_xdp_xmit)
return false;
- if (xdp_ok_fwd_dev(obj->dev, xdp->data_end - xdp->data))
+ if (xdp_ok_fwd_dev(obj->dev, xdpf->len))
return false;
return true;
@@ -586,14 +581,13 @@ static int get_upper_ifindexes(struct net_device *dev, int *indexes)
return n;
}
-int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx,
+int dev_map_enqueue_multi(struct xdp_frame *xdpf, struct net_device *dev_rx,
struct bpf_map *map, bool exclude_ingress)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
struct bpf_dtab_netdev *dst, *last_dst = NULL;
int excluded_devices[1+MAX_NEST_DEV];
struct hlist_head *head;
- struct xdp_frame *xdpf;
int num_excluded = 0;
unsigned int i;
int err;
@@ -603,15 +597,11 @@ int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx,
excluded_devices[num_excluded++] = dev_rx->ifindex;
}
- xdpf = xdp_convert_buff_to_frame(xdp);
- if (unlikely(!xdpf))
- return -EOVERFLOW;
-
if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
for (i = 0; i < map->max_entries; i++) {
dst = rcu_dereference_check(dtab->netdev_map[i],
rcu_read_lock_bh_held());
- if (!is_valid_dst(dst, xdp))
+ if (!is_valid_dst(dst, xdpf))
continue;
if (is_ifindex_excluded(excluded_devices, num_excluded, dst->dev->ifindex))
@@ -634,7 +624,7 @@ int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx,
head = dev_map_index_hash(dtab, i);
hlist_for_each_entry_rcu(dst, head, index_hlist,
lockdep_is_held(&dtab->index_lock)) {
- if (!is_valid_dst(dst, xdp))
+ if (!is_valid_dst(dst, xdpf))
continue;
if (is_ifindex_excluded(excluded_devices, num_excluded,
diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c
index ca3cd9aaa6ce..7b4afb7d96db 100644
--- a/kernel/bpf/disasm.c
+++ b/kernel/bpf/disasm.c
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0-only
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
* Copyright (c) 2016 Facebook
*/
diff --git a/kernel/bpf/disasm.h b/kernel/bpf/disasm.h
index e546b18d27da..a4b040793f44 100644
--- a/kernel/bpf/disasm.h
+++ b/kernel/bpf/disasm.h
@@ -1,4 +1,4 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
* Copyright (c) 2016 Facebook
*/
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 32471ba02708..d29af9988f37 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -668,7 +668,7 @@ static int htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
(void *(*)(struct bpf_map *map, void *key))NULL));
- *insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem));
+ *insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);
*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1);
*insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
offsetof(struct htab_elem, key) +
@@ -709,7 +709,7 @@ static int htab_lru_map_gen_lookup(struct bpf_map *map,
BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
(void *(*)(struct bpf_map *map, void *key))NULL));
- *insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem));
+ *insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);
*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 4);
*insn++ = BPF_LDX_MEM(BPF_B, ref_reg, ret,
offsetof(struct htab_elem, lru_node) +
@@ -2049,7 +2049,7 @@ static const struct bpf_iter_seq_info iter_seq_info = {
.seq_priv_size = sizeof(struct bpf_iter_seq_hash_map_info),
};
-static int bpf_for_each_hash_elem(struct bpf_map *map, void *callback_fn,
+static int bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_fn,
void *callback_ctx, u64 flags)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
@@ -2089,9 +2089,8 @@ static int bpf_for_each_hash_elem(struct bpf_map *map, void *callback_fn,
val = elem->key + roundup_key_size;
}
num_elems++;
- ret = BPF_CAST_CALL(callback_fn)((u64)(long)map,
- (u64)(long)key, (u64)(long)val,
- (u64)(long)callback_ctx, 0);
+ ret = callback_fn((u64)(long)map, (u64)(long)key,
+ (u64)(long)val, (u64)(long)callback_ctx, 0);
/* return value: 0 - continue, 1 - stop and return */
if (ret) {
rcu_read_unlock();
@@ -2397,7 +2396,7 @@ static int htab_of_map_gen_lookup(struct bpf_map *map,
BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
(void *(*)(struct bpf_map *map, void *key))NULL));
- *insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem));
+ *insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);
*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 2);
*insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
offsetof(struct htab_elem, key) +
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 9aabf84afd4b..01cfdf40c838 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2,6 +2,7 @@
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
*/
#include <linux/bpf.h>
+#include <linux/bpf-cgroup.h>
#include <linux/rcupdate.h>
#include <linux/random.h>
#include <linux/smp.h>
@@ -530,7 +531,7 @@ const struct bpf_func_proto bpf_strtol_proto = {
.func = bpf_strtol,
.gpl_only = false,
.ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_MEM,
+ .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg2_type = ARG_CONST_SIZE,
.arg3_type = ARG_ANYTHING,
.arg4_type = ARG_PTR_TO_LONG,
@@ -558,13 +559,27 @@ const struct bpf_func_proto bpf_strtoul_proto = {
.func = bpf_strtoul,
.gpl_only = false,
.ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_MEM,
+ .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg2_type = ARG_CONST_SIZE,
.arg3_type = ARG_ANYTHING,
.arg4_type = ARG_PTR_TO_LONG,
};
#endif
+BPF_CALL_3(bpf_strncmp, const char *, s1, u32, s1_sz, const char *, s2)
+{
+ return strncmp(s1, s2, s1_sz);
+}
+
+const struct bpf_func_proto bpf_strncmp_proto = {
+ .func = bpf_strncmp,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_MEM,
+ .arg2_type = ARG_CONST_SIZE,
+ .arg3_type = ARG_PTR_TO_CONST_STR,
+};
+
BPF_CALL_4(bpf_get_ns_current_pid_tgid, u64, dev, u64, ino,
struct bpf_pidns_info *, nsdata, u32, size)
{
@@ -630,7 +645,7 @@ const struct bpf_func_proto bpf_event_output_data_proto = {
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_CONST_MAP_PTR,
.arg3_type = ARG_ANYTHING,
- .arg4_type = ARG_PTR_TO_MEM,
+ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
};
@@ -667,7 +682,7 @@ BPF_CALL_2(bpf_per_cpu_ptr, const void *, ptr, u32, cpu)
const struct bpf_func_proto bpf_per_cpu_ptr_proto = {
.func = bpf_per_cpu_ptr,
.gpl_only = false,
- .ret_type = RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL,
+ .ret_type = RET_PTR_TO_MEM_OR_BTF_ID | PTR_MAYBE_NULL | MEM_RDONLY,
.arg1_type = ARG_PTR_TO_PERCPU_BTF_ID,
.arg2_type = ARG_ANYTHING,
};
@@ -680,7 +695,7 @@ BPF_CALL_1(bpf_this_cpu_ptr, const void *, percpu_ptr)
const struct bpf_func_proto bpf_this_cpu_ptr_proto = {
.func = bpf_this_cpu_ptr,
.gpl_only = false,
- .ret_type = RET_PTR_TO_MEM_OR_BTF_ID,
+ .ret_type = RET_PTR_TO_MEM_OR_BTF_ID | MEM_RDONLY,
.arg1_type = ARG_PTR_TO_PERCPU_BTF_ID,
};
@@ -979,15 +994,13 @@ out:
return err;
}
-#define MAX_SNPRINTF_VARARGS 12
-
BPF_CALL_5(bpf_snprintf, char *, str, u32, str_size, char *, fmt,
const void *, data, u32, data_len)
{
int err, num_args;
u32 *bin_args;
- if (data_len % 8 || data_len > MAX_SNPRINTF_VARARGS * 8 ||
+ if (data_len % 8 || data_len > MAX_BPRINTF_VARARGS * 8 ||
(data_len && !data))
return -EINVAL;
num_args = data_len / 8;
@@ -1013,7 +1026,7 @@ const struct bpf_func_proto bpf_snprintf_proto = {
.arg1_type = ARG_PTR_TO_MEM_OR_NULL,
.arg2_type = ARG_CONST_SIZE_OR_ZERO,
.arg3_type = ARG_PTR_TO_CONST_STR,
- .arg4_type = ARG_PTR_TO_MEM_OR_NULL,
+ .arg4_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
};
@@ -1058,7 +1071,7 @@ static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer)
struct bpf_hrtimer *t = container_of(hrtimer, struct bpf_hrtimer, timer);
struct bpf_map *map = t->map;
void *value = t->value;
- void *callback_fn;
+ bpf_callback_t callback_fn;
void *key;
u32 idx;
@@ -1083,8 +1096,7 @@ static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer)
key = value - round_up(map->key_size, 8);
}
- BPF_CAST_CALL(callback_fn)((u64)(long)map, (u64)(long)key,
- (u64)(long)value, 0, 0);
+ callback_fn((u64)(long)map, (u64)(long)key, (u64)(long)value, 0, 0);
/* The verifier checked that return value is zero. */
this_cpu_write(hrtimer_running, NULL);
@@ -1367,8 +1379,6 @@ bpf_base_func_proto(enum bpf_func_id func_id)
return &bpf_ktime_get_ns_proto;
case BPF_FUNC_ktime_get_boot_ns:
return &bpf_ktime_get_boot_ns_proto;
- case BPF_FUNC_ktime_get_coarse_ns:
- return &bpf_ktime_get_coarse_ns_proto;
case BPF_FUNC_ringbuf_output:
return &bpf_ringbuf_output_proto;
case BPF_FUNC_ringbuf_reserve:
@@ -1381,6 +1391,10 @@ bpf_base_func_proto(enum bpf_func_id func_id)
return &bpf_ringbuf_query_proto;
case BPF_FUNC_for_each_map_elem:
return &bpf_for_each_map_elem_proto;
+ case BPF_FUNC_loop:
+ return &bpf_loop_proto;
+ case BPF_FUNC_strncmp:
+ return &bpf_strncmp_proto;
default:
break;
}
@@ -1437,6 +1451,8 @@ bpf_base_func_proto(enum bpf_func_id func_id)
return &bpf_snprintf_proto;
case BPF_FUNC_task_pt_regs:
return &bpf_task_pt_regs_proto;
+ case BPF_FUNC_trace_vprintk:
+ return bpf_get_trace_vprintk_proto();
default:
return NULL;
}
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index 035e9e3a7132..23f7f9d08a62 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -163,8 +163,7 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *key,
return 0;
}
- new = bpf_map_kmalloc_node(map, sizeof(struct bpf_storage_buffer) +
- map->value_size,
+ new = bpf_map_kmalloc_node(map, struct_size(new, data, map->value_size),
__GFP_ZERO | GFP_ATOMIC | __GFP_NOWARN,
map->numa_node);
if (!new)
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 423549d2c52e..5763cc7ac4f1 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -412,7 +412,7 @@ static int trie_update_elem(struct bpf_map *map,
rcu_assign_pointer(im_node->child[1], node);
}
- /* Finally, assign the intermediate node to the determined spot */
+ /* Finally, assign the intermediate node to the determined slot */
rcu_assign_pointer(*slot, im_node);
out:
diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c
index 6a9542af4212..b0fa190b0979 100644
--- a/kernel/bpf/map_iter.c
+++ b/kernel/bpf/map_iter.c
@@ -174,9 +174,9 @@ static const struct bpf_iter_reg bpf_map_elem_reg_info = {
.ctx_arg_info_size = 2,
.ctx_arg_info = {
{ offsetof(struct bpf_iter__bpf_map_elem, key),
- PTR_TO_RDONLY_BUF_OR_NULL },
+ PTR_TO_BUF | PTR_MAYBE_NULL | MEM_RDONLY },
{ offsetof(struct bpf_iter__bpf_map_elem, value),
- PTR_TO_RDWR_BUF_OR_NULL },
+ PTR_TO_BUF | PTR_MAYBE_NULL },
},
};
diff --git a/kernel/bpf/mmap_unlock_work.h b/kernel/bpf/mmap_unlock_work.h
new file mode 100644
index 000000000000..5d18d7d85bef
--- /dev/null
+++ b/kernel/bpf/mmap_unlock_work.h
@@ -0,0 +1,65 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright (c) 2021 Facebook
+ */
+
+#ifndef __MMAP_UNLOCK_WORK_H__
+#define __MMAP_UNLOCK_WORK_H__
+#include <linux/irq_work.h>
+
+/* irq_work to run mmap_read_unlock() in irq_work */
+struct mmap_unlock_irq_work {
+ struct irq_work irq_work;
+ struct mm_struct *mm;
+};
+
+DECLARE_PER_CPU(struct mmap_unlock_irq_work, mmap_unlock_work);
+
+/*
+ * We cannot do mmap_read_unlock() when the irq is disabled, because of
+ * risk to deadlock with rq_lock. To look up vma when the irqs are
+ * disabled, we need to run mmap_read_unlock() in irq_work. We use a
+ * percpu variable to do the irq_work. If the irq_work is already used
+ * by another lookup, we fall over.
+ */
+static inline bool bpf_mmap_unlock_get_irq_work(struct mmap_unlock_irq_work **work_ptr)
+{
+ struct mmap_unlock_irq_work *work = NULL;
+ bool irq_work_busy = false;
+
+ if (irqs_disabled()) {
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
+ work = this_cpu_ptr(&mmap_unlock_work);
+ if (irq_work_is_busy(&work->irq_work)) {
+ /* cannot queue more up_read, fallback */
+ irq_work_busy = true;
+ }
+ } else {
+ /*
+ * PREEMPT_RT does not allow to trylock mmap sem in
+ * interrupt disabled context. Force the fallback code.
+ */
+ irq_work_busy = true;
+ }
+ }
+
+ *work_ptr = work;
+ return irq_work_busy;
+}
+
+static inline void bpf_mmap_unlock_mm(struct mmap_unlock_irq_work *work, struct mm_struct *mm)
+{
+ if (!work) {
+ mmap_read_unlock(mm);
+ } else {
+ work->mm = mm;
+
+ /* The lock will be released once we're out of interrupt
+ * context. Tell lockdep that we've released it now so
+ * it doesn't complain that we forgot to release it.
+ */
+ rwsem_release(&mm->mmap_lock.dep_map, _RET_IP_);
+ irq_work_queue(&work->irq_work);
+ }
+}
+
+#endif /* __MMAP_UNLOCK_WORK_H__ */
diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c
index 542f275bf252..868cc2c43899 100644
--- a/kernel/bpf/net_namespace.c
+++ b/kernel/bpf/net_namespace.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/bpf.h>
+#include <linux/bpf-netns.h>
#include <linux/filter.h>
#include <net/net_namespace.h>
diff --git a/kernel/bpf/preload/.gitignore b/kernel/bpf/preload/.gitignore
index 856a4c5ad0dd..9452322902a5 100644
--- a/kernel/bpf/preload/.gitignore
+++ b/kernel/bpf/preload/.gitignore
@@ -1,4 +1,2 @@
-/FEATURE-DUMP.libbpf
-/bpf_helper_defs.h
-/feature
+/libbpf
/bpf_preload_umd
diff --git a/kernel/bpf/preload/Makefile b/kernel/bpf/preload/Makefile
index 1951332dd15f..1400ac58178e 100644
--- a/kernel/bpf/preload/Makefile
+++ b/kernel/bpf/preload/Makefile
@@ -1,21 +1,35 @@
# SPDX-License-Identifier: GPL-2.0
LIBBPF_SRCS = $(srctree)/tools/lib/bpf/
-LIBBPF_A = $(obj)/libbpf.a
-LIBBPF_OUT = $(abspath $(obj))
+LIBBPF_OUT = $(abspath $(obj))/libbpf
+LIBBPF_A = $(LIBBPF_OUT)/libbpf.a
+LIBBPF_DESTDIR = $(LIBBPF_OUT)
+LIBBPF_INCLUDE = $(LIBBPF_DESTDIR)/include
# Although not in use by libbpf's Makefile, set $(O) so that the "dummy" test
# in tools/scripts/Makefile.include always succeeds when building the kernel
# with $(O) pointing to a relative path, as in "make O=build bindeb-pkg".
-$(LIBBPF_A):
- $(Q)$(MAKE) -C $(LIBBPF_SRCS) O=$(LIBBPF_OUT)/ OUTPUT=$(LIBBPF_OUT)/ $(LIBBPF_OUT)/libbpf.a
+$(LIBBPF_A): | $(LIBBPF_OUT)
+ $(Q)$(MAKE) -C $(LIBBPF_SRCS) O=$(LIBBPF_OUT)/ OUTPUT=$(LIBBPF_OUT)/ \
+ DESTDIR=$(LIBBPF_DESTDIR) prefix= \
+ $(LIBBPF_OUT)/libbpf.a install_headers
+
+libbpf_hdrs: $(LIBBPF_A)
+
+.PHONY: libbpf_hdrs
+
+$(LIBBPF_OUT):
+ $(call msg,MKDIR,$@)
+ $(Q)mkdir -p $@
userccflags += -I $(srctree)/tools/include/ -I $(srctree)/tools/include/uapi \
- -I $(srctree)/tools/lib/ -Wno-unused-result
+ -I $(LIBBPF_INCLUDE) -Wno-unused-result
userprogs := bpf_preload_umd
-clean-files := $(userprogs) bpf_helper_defs.h FEATURE-DUMP.libbpf staticobjs/ feature/
+clean-files := libbpf/
+
+$(obj)/iterators/iterators.o: | libbpf_hdrs
bpf_preload_umd-objs := iterators/iterators.o
bpf_preload_umd-userldlibs := $(LIBBPF_A) -lelf -lz
diff --git a/kernel/bpf/preload/iterators/Makefile b/kernel/bpf/preload/iterators/Makefile
index 28fa8c1440f4..b8bd60511227 100644
--- a/kernel/bpf/preload/iterators/Makefile
+++ b/kernel/bpf/preload/iterators/Makefile
@@ -1,18 +1,26 @@
# SPDX-License-Identifier: GPL-2.0
OUTPUT := .output
+abs_out := $(abspath $(OUTPUT))
+
CLANG ?= clang
LLC ?= llc
LLVM_STRIP ?= llvm-strip
+
+TOOLS_PATH := $(abspath ../../../../tools)
+BPFTOOL_SRC := $(TOOLS_PATH)/bpf/bpftool
+BPFTOOL_OUTPUT := $(abs_out)/bpftool
DEFAULT_BPFTOOL := $(OUTPUT)/sbin/bpftool
BPFTOOL ?= $(DEFAULT_BPFTOOL)
-LIBBPF_SRC := $(abspath ../../../../tools/lib/bpf)
-BPFOBJ := $(OUTPUT)/libbpf.a
-BPF_INCLUDE := $(OUTPUT)
-INCLUDES := -I$(OUTPUT) -I$(BPF_INCLUDE) -I$(abspath ../../../../tools/lib) \
- -I$(abspath ../../../../tools/include/uapi)
+
+LIBBPF_SRC := $(TOOLS_PATH)/lib/bpf
+LIBBPF_OUTPUT := $(abs_out)/libbpf
+LIBBPF_DESTDIR := $(LIBBPF_OUTPUT)
+LIBBPF_INCLUDE := $(LIBBPF_DESTDIR)/include
+BPFOBJ := $(LIBBPF_OUTPUT)/libbpf.a
+
+INCLUDES := -I$(OUTPUT) -I$(LIBBPF_INCLUDE) -I$(TOOLS_PATH)/include/uapi
CFLAGS := -g -Wall
-abs_out := $(abspath $(OUTPUT))
ifeq ($(V),1)
Q =
msg =
@@ -44,14 +52,18 @@ $(OUTPUT)/iterators.bpf.o: iterators.bpf.c $(BPFOBJ) | $(OUTPUT)
-c $(filter %.c,$^) -o $@ && \
$(LLVM_STRIP) -g $@
-$(OUTPUT):
+$(OUTPUT) $(LIBBPF_OUTPUT) $(BPFTOOL_OUTPUT):
$(call msg,MKDIR,$@)
- $(Q)mkdir -p $(OUTPUT)
+ $(Q)mkdir -p $@
-$(BPFOBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(OUTPUT)
+$(BPFOBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(LIBBPF_OUTPUT)
$(Q)$(MAKE) $(submake_extras) -C $(LIBBPF_SRC) \
- OUTPUT=$(abspath $(dir $@))/ $(abspath $@)
+ OUTPUT=$(abspath $(dir $@))/ prefix= \
+ DESTDIR=$(LIBBPF_DESTDIR) $(abspath $@) install_headers
-$(DEFAULT_BPFTOOL):
- $(Q)$(MAKE) $(submake_extras) -C ../../../../tools/bpf/bpftool \
- prefix= OUTPUT=$(abs_out)/ DESTDIR=$(abs_out) install
+$(DEFAULT_BPFTOOL): $(BPFOBJ) | $(BPFTOOL_OUTPUT)
+ $(Q)$(MAKE) $(submake_extras) -C $(BPFTOOL_SRC) \
+ OUTPUT=$(BPFTOOL_OUTPUT)/ \
+ LIBBPF_OUTPUT=$(LIBBPF_OUTPUT)/ \
+ LIBBPF_DESTDIR=$(LIBBPF_DESTDIR)/ \
+ prefix= DESTDIR=$(abs_out)/ install-bin
diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c
index 93a55391791a..556a769b5b80 100644
--- a/kernel/bpf/reuseport_array.c
+++ b/kernel/bpf/reuseport_array.c
@@ -152,16 +152,12 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr)
{
int numa_node = bpf_map_attr_numa_node(attr);
struct reuseport_array *array;
- u64 array_size;
if (!bpf_capable())
return ERR_PTR(-EPERM);
- array_size = sizeof(*array);
- array_size += (u64)attr->max_entries * sizeof(struct sock *);
-
/* allocate all map elements and zero-initialize them */
- array = bpf_map_area_alloc(array_size, numa_node);
+ array = bpf_map_area_alloc(struct_size(array, ptrs, attr->max_entries), numa_node);
if (!array)
return ERR_PTR(-ENOMEM);
diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c
index 9e0c10c6892a..638d7fd7b375 100644
--- a/kernel/bpf/ringbuf.c
+++ b/kernel/bpf/ringbuf.c
@@ -444,7 +444,7 @@ const struct bpf_func_proto bpf_ringbuf_output_proto = {
.func = bpf_ringbuf_output,
.ret_type = RET_INTEGER,
.arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_PTR_TO_MEM,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg3_type = ARG_CONST_SIZE_OR_ZERO,
.arg4_type = ARG_ANYTHING,
};
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index e8eefdf8cf3e..49e567209c6b 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -7,10 +7,10 @@
#include <linux/kernel.h>
#include <linux/stacktrace.h>
#include <linux/perf_event.h>
-#include <linux/irq_work.h>
#include <linux/btf_ids.h>
#include <linux/buildid.h>
#include "percpu_freelist.h"
+#include "mmap_unlock_work.h"
#define STACK_CREATE_FLAG_MASK \
(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY | \
@@ -31,25 +31,6 @@ struct bpf_stack_map {
struct stack_map_bucket *buckets[];
};
-/* irq_work to run up_read() for build_id lookup in nmi context */
-struct stack_map_irq_work {
- struct irq_work irq_work;
- struct mm_struct *mm;
-};
-
-static void do_up_read(struct irq_work *entry)
-{
- struct stack_map_irq_work *work;
-
- if (WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_RT)))
- return;
-
- work = container_of(entry, struct stack_map_irq_work, irq_work);
- mmap_read_unlock_non_owner(work->mm);
-}
-
-static DEFINE_PER_CPU(struct stack_map_irq_work, up_read_work);
-
static inline bool stack_map_use_build_id(struct bpf_map *map)
{
return (map->map_flags & BPF_F_STACK_BUILD_ID);
@@ -63,7 +44,8 @@ static inline int stack_map_data_size(struct bpf_map *map)
static int prealloc_elems_and_freelist(struct bpf_stack_map *smap)
{
- u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size;
+ u64 elem_size = sizeof(struct stack_map_bucket) +
+ (u64)smap->map.value_size;
int err;
smap->elems = bpf_map_area_alloc(elem_size * smap->map.max_entries,
@@ -148,38 +130,16 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
u64 *ips, u32 trace_nr, bool user)
{
int i;
+ struct mmap_unlock_irq_work *work = NULL;
+ bool irq_work_busy = bpf_mmap_unlock_get_irq_work(&work);
struct vm_area_struct *vma;
- bool irq_work_busy = false;
- struct stack_map_irq_work *work = NULL;
-
- if (irqs_disabled()) {
- if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
- work = this_cpu_ptr(&up_read_work);
- if (irq_work_is_busy(&work->irq_work)) {
- /* cannot queue more up_read, fallback */
- irq_work_busy = true;
- }
- } else {
- /*
- * PREEMPT_RT does not allow to trylock mmap sem in
- * interrupt disabled context. Force the fallback code.
- */
- irq_work_busy = true;
- }
- }
- /*
- * We cannot do up_read() when the irq is disabled, because of
- * risk to deadlock with rq_lock. To do build_id lookup when the
- * irqs are disabled, we need to run up_read() in irq_work. We use
- * a percpu variable to do the irq_work. If the irq_work is
- * already used by another lookup, we fall back to report ips.
- *
- * Same fallback is used for kernel stack (!user) on a stackmap
- * with build_id.
+ /* If the irq_work is in use, fall back to report ips. Same
+ * fallback is used for kernel stack (!user) on a stackmap with
+ * build_id.
*/
if (!user || !current || !current->mm || irq_work_busy ||
- !mmap_read_trylock_non_owner(current->mm)) {
+ !mmap_read_trylock(current->mm)) {
/* cannot access current->mm, fall back to ips */
for (i = 0; i < trace_nr; i++) {
id_offs[i].status = BPF_STACK_BUILD_ID_IP;
@@ -202,13 +162,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
- vma->vm_start;
id_offs[i].status = BPF_STACK_BUILD_ID_VALID;
}
-
- if (!work) {
- mmap_read_unlock_non_owner(current->mm);
- } else {
- work->mm = current->mm;
- irq_work_queue(&work->irq_work);
- }
+ bpf_mmap_unlock_mm(work, current->mm);
}
static struct perf_callchain_entry *
@@ -535,7 +489,7 @@ const struct bpf_func_proto bpf_get_task_stack_proto = {
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_BTF_ID,
- .arg1_btf_id = &btf_task_struct_ids[0],
+ .arg1_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
.arg2_type = ARG_PTR_TO_UNINIT_MEM,
.arg3_type = ARG_CONST_SIZE_OR_ZERO,
.arg4_type = ARG_ANYTHING,
@@ -712,16 +666,3 @@ const struct bpf_map_ops stack_trace_map_ops = {
.map_btf_name = "bpf_stack_map",
.map_btf_id = &stack_trace_map_btf_id,
};
-
-static int __init stack_map_init(void)
-{
- int cpu;
- struct stack_map_irq_work *work;
-
- for_each_possible_cpu(cpu) {
- work = per_cpu_ptr(&up_read_work, cpu);
- init_irq_work(&work->irq_work, do_up_read);
- }
- return 0;
-}
-subsys_initcall(stack_map_init);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 4e50c0bfdb7d..fa4505f9b611 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2,6 +2,7 @@
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
*/
#include <linux/bpf.h>
+#include <linux/bpf-cgroup.h>
#include <linux/bpf_trace.h>
#include <linux/bpf_lirc.h>
#include <linux/bpf_verifier.h>
@@ -132,6 +133,21 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
return map;
}
+static void bpf_map_write_active_inc(struct bpf_map *map)
+{
+ atomic64_inc(&map->writecnt);
+}
+
+static void bpf_map_write_active_dec(struct bpf_map *map)
+{
+ atomic64_dec(&map->writecnt);
+}
+
+bool bpf_map_write_active(const struct bpf_map *map)
+{
+ return atomic64_read(&map->writecnt) != 0;
+}
+
static u32 bpf_map_value_size(const struct bpf_map *map)
{
if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
@@ -199,7 +215,8 @@ static int bpf_map_update_value(struct bpf_map *map, struct fd f, void *key,
err = bpf_fd_reuseport_array_update_elem(map, key, value,
flags);
} else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
- map->map_type == BPF_MAP_TYPE_STACK) {
+ map->map_type == BPF_MAP_TYPE_STACK ||
+ map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) {
err = map->ops->map_push_elem(map, value, flags);
} else {
rcu_read_lock();
@@ -238,7 +255,8 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value,
} else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
err = bpf_fd_reuseport_array_lookup_elem(map, key, value);
} else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
- map->map_type == BPF_MAP_TYPE_STACK) {
+ map->map_type == BPF_MAP_TYPE_STACK ||
+ map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) {
err = map->ops->map_peek_elem(map, value);
} else if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
/* struct_ops map requires directly updating "value" */
@@ -348,6 +366,7 @@ void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr)
map->max_entries = attr->max_entries;
map->map_flags = bpf_map_flags_retain_permanent(attr->map_flags);
map->numa_node = bpf_map_attr_numa_node(attr);
+ map->map_extra = attr->map_extra;
}
static int bpf_map_alloc_id(struct bpf_map *map)
@@ -543,8 +562,10 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) {
array = container_of(map, struct bpf_array, map);
- type = array->aux->type;
- jited = array->aux->jited;
+ spin_lock(&array->aux->owner.lock);
+ type = array->aux->owner.type;
+ jited = array->aux->owner.jited;
+ spin_unlock(&array->aux->owner.lock);
}
seq_printf(m,
@@ -553,6 +574,7 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
"value_size:\t%u\n"
"max_entries:\t%u\n"
"map_flags:\t%#x\n"
+ "map_extra:\t%#llx\n"
"memlock:\t%lu\n"
"map_id:\t%u\n"
"frozen:\t%u\n",
@@ -561,6 +583,7 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
map->value_size,
map->max_entries,
map->map_flags,
+ (unsigned long long)map->map_extra,
bpf_map_memory_footprint(map),
map->id,
READ_ONCE(map->frozen));
@@ -594,11 +617,8 @@ static void bpf_map_mmap_open(struct vm_area_struct *vma)
{
struct bpf_map *map = vma->vm_file->private_data;
- if (vma->vm_flags & VM_MAYWRITE) {
- mutex_lock(&map->freeze_mutex);
- map->writecnt++;
- mutex_unlock(&map->freeze_mutex);
- }
+ if (vma->vm_flags & VM_MAYWRITE)
+ bpf_map_write_active_inc(map);
}
/* called for all unmapped memory region (including initial) */
@@ -606,11 +626,8 @@ static void bpf_map_mmap_close(struct vm_area_struct *vma)
{
struct bpf_map *map = vma->vm_file->private_data;
- if (vma->vm_flags & VM_MAYWRITE) {
- mutex_lock(&map->freeze_mutex);
- map->writecnt--;
- mutex_unlock(&map->freeze_mutex);
- }
+ if (vma->vm_flags & VM_MAYWRITE)
+ bpf_map_write_active_dec(map);
}
static const struct vm_operations_struct bpf_map_default_vmops = {
@@ -661,7 +678,7 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma)
goto out;
if (vma->vm_flags & VM_MAYWRITE)
- map->writecnt++;
+ bpf_map_write_active_inc(map);
out:
mutex_unlock(&map->freeze_mutex);
return err;
@@ -810,7 +827,7 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
return ret;
}
-#define BPF_MAP_CREATE_LAST_FIELD btf_vmlinux_value_type_id
+#define BPF_MAP_CREATE_LAST_FIELD map_extra
/* called via syscall */
static int map_create(union bpf_attr *attr)
{
@@ -831,6 +848,10 @@ static int map_create(union bpf_attr *attr)
return -EINVAL;
}
+ if (attr->map_type != BPF_MAP_TYPE_BLOOM_FILTER &&
+ attr->map_extra != 0)
+ return -EINVAL;
+
f_flags = bpf_get_file_flag(attr->map_flags);
if (f_flags < 0)
return f_flags;
@@ -1080,6 +1101,14 @@ static int map_lookup_elem(union bpf_attr *attr)
if (!value)
goto free_key;
+ if (map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) {
+ if (copy_from_user(value, uvalue, value_size))
+ err = -EFAULT;
+ else
+ err = bpf_map_copy_value(map, key, value, attr->flags);
+ goto free_value;
+ }
+
err = bpf_map_copy_value(map, key, value, attr->flags);
if (err)
goto free_value;
@@ -1120,6 +1149,7 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr)
map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
+ bpf_map_write_active_inc(map);
if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
err = -EPERM;
goto err_put;
@@ -1155,6 +1185,7 @@ free_value:
free_key:
kvfree(key);
err_put:
+ bpf_map_write_active_dec(map);
fdput(f);
return err;
}
@@ -1177,6 +1208,7 @@ static int map_delete_elem(union bpf_attr *attr)
map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
+ bpf_map_write_active_inc(map);
if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
err = -EPERM;
goto err_put;
@@ -1207,6 +1239,7 @@ static int map_delete_elem(union bpf_attr *attr)
out:
kvfree(key);
err_put:
+ bpf_map_write_active_dec(map);
fdput(f);
return err;
}
@@ -1337,12 +1370,11 @@ int generic_map_update_batch(struct bpf_map *map,
void __user *values = u64_to_user_ptr(attr->batch.values);
void __user *keys = u64_to_user_ptr(attr->batch.keys);
u32 value_size, cp, max_count;
- int ufd = attr->map_fd;
+ int ufd = attr->batch.map_fd;
void *key, *value;
struct fd f;
int err = 0;
- f = fdget(ufd);
if (attr->batch.elem_flags & ~BPF_F_LOCK)
return -EINVAL;
@@ -1367,6 +1399,7 @@ int generic_map_update_batch(struct bpf_map *map,
return -ENOMEM;
}
+ f = fdget(ufd); /* bpf_map_do_batch() guarantees ufd is valid */
for (cp = 0; cp < max_count; cp++) {
err = -EFAULT;
if (copy_from_user(key, keys + cp * map->key_size,
@@ -1386,6 +1419,7 @@ int generic_map_update_batch(struct bpf_map *map,
kvfree(value);
kvfree(key);
+ fdput(f);
return err;
}
@@ -1513,6 +1547,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
+ bpf_map_write_active_inc(map);
if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ) ||
!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
err = -EPERM;
@@ -1577,6 +1612,7 @@ free_value:
free_key:
kvfree(key);
err_put:
+ bpf_map_write_active_dec(map);
fdput(f);
return err;
}
@@ -1604,8 +1640,7 @@ static int map_freeze(const union bpf_attr *attr)
}
mutex_lock(&map->freeze_mutex);
-
- if (map->writecnt) {
+ if (bpf_map_write_active(map)) {
err = -EBUSY;
goto err_put;
}
@@ -1804,8 +1839,14 @@ static int bpf_prog_release(struct inode *inode, struct file *filp)
return 0;
}
+struct bpf_prog_kstats {
+ u64 nsecs;
+ u64 cnt;
+ u64 misses;
+};
+
static void bpf_prog_get_stats(const struct bpf_prog *prog,
- struct bpf_prog_stats *stats)
+ struct bpf_prog_kstats *stats)
{
u64 nsecs = 0, cnt = 0, misses = 0;
int cpu;
@@ -1818,9 +1859,9 @@ static void bpf_prog_get_stats(const struct bpf_prog *prog,
st = per_cpu_ptr(prog->stats, cpu);
do {
start = u64_stats_fetch_begin_irq(&st->syncp);
- tnsecs = st->nsecs;
- tcnt = st->cnt;
- tmisses = st->misses;
+ tnsecs = u64_stats_read(&st->nsecs);
+ tcnt = u64_stats_read(&st->cnt);
+ tmisses = u64_stats_read(&st->misses);
} while (u64_stats_fetch_retry_irq(&st->syncp, start));
nsecs += tnsecs;
cnt += tcnt;
@@ -1836,7 +1877,7 @@ static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp)
{
const struct bpf_prog *prog = filp->private_data;
char prog_tag[sizeof(prog->tag) * 2 + 1] = { };
- struct bpf_prog_stats stats;
+ struct bpf_prog_kstats stats;
bpf_prog_get_stats(prog, &stats);
bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
@@ -1848,7 +1889,8 @@ static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp)
"prog_id:\t%u\n"
"run_time_ns:\t%llu\n"
"run_cnt:\t%llu\n"
- "recursion_misses:\t%llu\n",
+ "recursion_misses:\t%llu\n"
+ "verified_insns:\t%u\n",
prog->type,
prog->jited,
prog_tag,
@@ -1856,7 +1898,8 @@ static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp)
prog->aux->id,
stats.nsecs,
stats.cnt,
- stats.misses);
+ stats.misses,
+ prog->aux->verified_insns);
}
#endif
@@ -2156,7 +2199,7 @@ static bool is_perfmon_prog_type(enum bpf_prog_type prog_type)
}
/* last field in 'union bpf_attr' used by this command */
-#define BPF_PROG_LOAD_LAST_FIELD fd_array
+#define BPF_PROG_LOAD_LAST_FIELD core_relo_rec_size
static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr)
{
@@ -3575,7 +3618,7 @@ static int bpf_prog_get_info_by_fd(struct file *file,
struct bpf_prog_info __user *uinfo = u64_to_user_ptr(attr->info.info);
struct bpf_prog_info info;
u32 info_len = attr->info.info_len;
- struct bpf_prog_stats stats;
+ struct bpf_prog_kstats stats;
char __user *uinsns;
u32 ulen;
int err;
@@ -3625,6 +3668,8 @@ static int bpf_prog_get_info_by_fd(struct file *file,
info.run_cnt = stats.cnt;
info.recursion_misses = stats.misses;
+ info.verified_insns = prog->aux->verified_insns;
+
if (!bpf_capable()) {
info.jited_prog_len = 0;
info.xlated_prog_len = 0;
@@ -3871,6 +3916,7 @@ static int bpf_map_get_info_by_fd(struct file *file,
info.value_size = map->value_size;
info.max_entries = map->max_entries;
info.map_flags = map->map_flags;
+ info.map_extra = map->map_extra;
memcpy(info.name, map->name, sizeof(map->name));
if (map->btf) {
@@ -4140,6 +4186,9 @@ static int bpf_map_do_batch(const union bpf_attr *attr,
union bpf_attr __user *uattr,
int cmd)
{
+ bool has_read = cmd == BPF_MAP_LOOKUP_BATCH ||
+ cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH;
+ bool has_write = cmd != BPF_MAP_LOOKUP_BATCH;
struct bpf_map *map;
int err, ufd;
struct fd f;
@@ -4152,16 +4201,13 @@ static int bpf_map_do_batch(const union bpf_attr *attr,
map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
-
- if ((cmd == BPF_MAP_LOOKUP_BATCH ||
- cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH) &&
- !(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
+ if (has_write)
+ bpf_map_write_active_inc(map);
+ if (has_read && !(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
err = -EPERM;
goto err_put;
}
-
- if (cmd != BPF_MAP_LOOKUP_BATCH &&
- !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
+ if (has_write && !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
err = -EPERM;
goto err_put;
}
@@ -4174,8 +4220,9 @@ static int bpf_map_do_batch(const union bpf_attr *attr,
BPF_DO_BATCH(map->ops->map_update_batch);
else
BPF_DO_BATCH(map->ops->map_delete_batch);
-
err_put:
+ if (has_write)
+ bpf_map_write_active_dec(map);
fdput(f);
return err;
}
@@ -4726,7 +4773,7 @@ static const struct bpf_func_proto bpf_sys_bpf_proto = {
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_ANYTHING,
- .arg2_type = ARG_PTR_TO_MEM,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg3_type = ARG_CONST_SIZE,
};
@@ -4753,6 +4800,31 @@ static const struct bpf_func_proto bpf_sys_close_proto = {
.arg1_type = ARG_ANYTHING,
};
+BPF_CALL_4(bpf_kallsyms_lookup_name, const char *, name, int, name_sz, int, flags, u64 *, res)
+{
+ if (flags)
+ return -EINVAL;
+
+ if (name_sz <= 1 || name[name_sz - 1])
+ return -EINVAL;
+
+ if (!bpf_dump_raw_ok(current_cred()))
+ return -EPERM;
+
+ *res = kallsyms_lookup_name(name);
+ return *res ? 0 : -ENOENT;
+}
+
+const struct bpf_func_proto bpf_kallsyms_lookup_name_proto = {
+ .func = bpf_kallsyms_lookup_name,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_MEM,
+ .arg2_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_LONG,
+};
+
static const struct bpf_func_proto *
syscall_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
@@ -4763,6 +4835,8 @@ syscall_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_btf_find_by_name_kind_proto;
case BPF_FUNC_sys_close:
return &bpf_sys_close_proto;
+ case BPF_FUNC_kallsyms_lookup_name:
+ return &bpf_kallsyms_lookup_name_proto;
default:
return tracing_prog_func_proto(func_id, prog);
}
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index b48750bfba5a..d94696198ef8 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -8,6 +8,7 @@
#include <linux/fdtable.h>
#include <linux/filter.h>
#include <linux/btf_ids.h>
+#include "mmap_unlock_work.h"
struct bpf_iter_seq_task_common {
struct pid_namespace *ns;
@@ -524,10 +525,6 @@ static const struct seq_operations task_vma_seq_ops = {
.show = task_vma_seq_show,
};
-BTF_ID_LIST(btf_task_file_ids)
-BTF_ID(struct, file)
-BTF_ID(struct, vm_area_struct)
-
static const struct bpf_iter_seq_info task_seq_info = {
.seq_ops = &task_seq_ops,
.init_seq_private = init_seq_pidns,
@@ -586,23 +583,88 @@ static struct bpf_iter_reg task_vma_reg_info = {
.seq_info = &task_vma_seq_info,
};
+BPF_CALL_5(bpf_find_vma, struct task_struct *, task, u64, start,
+ bpf_callback_t, callback_fn, void *, callback_ctx, u64, flags)
+{
+ struct mmap_unlock_irq_work *work = NULL;
+ struct vm_area_struct *vma;
+ bool irq_work_busy = false;
+ struct mm_struct *mm;
+ int ret = -ENOENT;
+
+ if (flags)
+ return -EINVAL;
+
+ if (!task)
+ return -ENOENT;
+
+ mm = task->mm;
+ if (!mm)
+ return -ENOENT;
+
+ irq_work_busy = bpf_mmap_unlock_get_irq_work(&work);
+
+ if (irq_work_busy || !mmap_read_trylock(mm))
+ return -EBUSY;
+
+ vma = find_vma(mm, start);
+
+ if (vma && vma->vm_start <= start && vma->vm_end > start) {
+ callback_fn((u64)(long)task, (u64)(long)vma,
+ (u64)(long)callback_ctx, 0, 0);
+ ret = 0;
+ }
+ bpf_mmap_unlock_mm(work, mm);
+ return ret;
+}
+
+const struct bpf_func_proto bpf_find_vma_proto = {
+ .func = bpf_find_vma,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID,
+ .arg1_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_PTR_TO_FUNC,
+ .arg4_type = ARG_PTR_TO_STACK_OR_NULL,
+ .arg5_type = ARG_ANYTHING,
+};
+
+DEFINE_PER_CPU(struct mmap_unlock_irq_work, mmap_unlock_work);
+
+static void do_mmap_read_unlock(struct irq_work *entry)
+{
+ struct mmap_unlock_irq_work *work;
+
+ if (WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_RT)))
+ return;
+
+ work = container_of(entry, struct mmap_unlock_irq_work, irq_work);
+ mmap_read_unlock_non_owner(work->mm);
+}
+
static int __init task_iter_init(void)
{
- int ret;
+ struct mmap_unlock_irq_work *work;
+ int ret, cpu;
+
+ for_each_possible_cpu(cpu) {
+ work = per_cpu_ptr(&mmap_unlock_work, cpu);
+ init_irq_work(&work->irq_work, do_mmap_read_unlock);
+ }
- task_reg_info.ctx_arg_info[0].btf_id = btf_task_struct_ids[0];
+ task_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK];
ret = bpf_iter_reg_target(&task_reg_info);
if (ret)
return ret;
- task_file_reg_info.ctx_arg_info[0].btf_id = btf_task_struct_ids[0];
- task_file_reg_info.ctx_arg_info[1].btf_id = btf_task_file_ids[0];
+ task_file_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK];
+ task_file_reg_info.ctx_arg_info[1].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_FILE];
ret = bpf_iter_reg_target(&task_file_reg_info);
if (ret)
return ret;
- task_vma_reg_info.ctx_arg_info[0].btf_id = btf_task_struct_ids[0];
- task_vma_reg_info.ctx_arg_info[1].btf_id = btf_task_file_ids[1];
+ task_vma_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK];
+ task_vma_reg_info.ctx_arg_info[1].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_VMA];
return bpf_iter_reg_target(&task_vma_reg_info);
}
late_initcall(task_iter_init);
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index fe1e857324e6..4b6974a195c1 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -10,6 +10,7 @@
#include <linux/rcupdate_trace.h>
#include <linux/rcupdate_wait.h>
#include <linux/module.h>
+#include <linux/static_call.h>
/* dummy _ops. The verifier will operate on target program's ops. */
const struct bpf_verifier_ops bpf_extension_verifier_ops = {
@@ -26,6 +27,14 @@ static struct hlist_head trampoline_table[TRAMPOLINE_TABLE_SIZE];
/* serializes access to trampoline_table */
static DEFINE_MUTEX(trampoline_mutex);
+bool bpf_prog_has_trampoline(const struct bpf_prog *prog)
+{
+ enum bpf_attach_type eatype = prog->expected_attach_type;
+
+ return eatype == BPF_TRACE_FENTRY || eatype == BPF_TRACE_FEXIT ||
+ eatype == BPF_MODIFY_RETURN;
+}
+
void *bpf_jit_alloc_exec_page(void)
{
void *image;
@@ -526,7 +535,7 @@ out:
}
#define NO_START_TIME 1
-static u64 notrace bpf_prog_start_time(void)
+static __always_inline u64 notrace bpf_prog_start_time(void)
{
u64 start = NO_START_TIME;
@@ -544,7 +553,7 @@ static void notrace inc_misses_counter(struct bpf_prog *prog)
stats = this_cpu_ptr(prog->stats);
u64_stats_update_begin(&stats->syncp);
- stats->misses++;
+ u64_stats_inc(&stats->misses);
u64_stats_update_end(&stats->syncp);
}
@@ -585,11 +594,13 @@ static void notrace update_prog_stats(struct bpf_prog *prog,
* Hence check that 'start' is valid.
*/
start > NO_START_TIME) {
+ unsigned long flags;
+
stats = this_cpu_ptr(prog->stats);
- u64_stats_update_begin(&stats->syncp);
- stats->cnt++;
- stats->nsecs += sched_clock() - start;
- u64_stats_update_end(&stats->syncp);
+ flags = u64_stats_update_begin_irqsave(&stats->syncp);
+ u64_stats_inc(&stats->cnt);
+ u64_stats_add(&stats->nsecs, sched_clock() - start);
+ u64_stats_update_end_irqrestore(&stats->syncp, flags);
}
}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 047ac4b4703b..bfb45381fb3f 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4,6 +4,7 @@
* Copyright (c) 2018 Covalent IO, Inc. http://covalent.io
*/
#include <uapi/linux/btf.h>
+#include <linux/bpf-cgroup.h>
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/slab.h>
@@ -240,12 +241,6 @@ static bool bpf_pseudo_kfunc_call(const struct bpf_insn *insn)
insn->src_reg == BPF_PSEUDO_KFUNC_CALL;
}
-static bool bpf_pseudo_func(const struct bpf_insn *insn)
-{
- return insn->code == (BPF_LD | BPF_IMM | BPF_DW) &&
- insn->src_reg == BPF_PSEUDO_FUNC;
-}
-
struct bpf_call_arg_meta {
struct bpf_map *map_ptr;
bool raw_mode;
@@ -299,13 +294,15 @@ void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt,
WARN_ONCE(n >= BPF_VERIFIER_TMP_LOG_SIZE - 1,
"verifier log line truncated - local buffer too short\n");
- n = min(log->len_total - log->len_used - 1, n);
- log->kbuf[n] = '\0';
-
if (log->level == BPF_LOG_KERNEL) {
- pr_err("BPF:%s\n", log->kbuf);
+ bool newline = n > 0 && log->kbuf[n - 1] == '\n';
+
+ pr_err("BPF: %s%s", log->kbuf, newline ? "" : "\n");
return;
}
+
+ n = min(log->len_total - log->len_used - 1, n);
+ log->kbuf[n] = '\0';
if (!copy_to_user(log->ubuf + log->len_used, log->kbuf, n + 1))
log->len_used += n;
else
@@ -445,18 +442,6 @@ static bool reg_type_not_null(enum bpf_reg_type type)
type == PTR_TO_SOCK_COMMON;
}
-static bool reg_type_may_be_null(enum bpf_reg_type type)
-{
- return type == PTR_TO_MAP_VALUE_OR_NULL ||
- type == PTR_TO_SOCKET_OR_NULL ||
- type == PTR_TO_SOCK_COMMON_OR_NULL ||
- type == PTR_TO_TCP_SOCK_OR_NULL ||
- type == PTR_TO_BTF_ID_OR_NULL ||
- type == PTR_TO_MEM_OR_NULL ||
- type == PTR_TO_RDONLY_BUF_OR_NULL ||
- type == PTR_TO_RDWR_BUF_OR_NULL;
-}
-
static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg)
{
return reg->type == PTR_TO_MAP_VALUE &&
@@ -465,12 +450,14 @@ static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg)
static bool reg_type_may_be_refcounted_or_null(enum bpf_reg_type type)
{
- return type == PTR_TO_SOCKET ||
- type == PTR_TO_SOCKET_OR_NULL ||
- type == PTR_TO_TCP_SOCK ||
- type == PTR_TO_TCP_SOCK_OR_NULL ||
- type == PTR_TO_MEM ||
- type == PTR_TO_MEM_OR_NULL;
+ return base_type(type) == PTR_TO_SOCKET ||
+ base_type(type) == PTR_TO_TCP_SOCK ||
+ base_type(type) == PTR_TO_MEM;
+}
+
+static bool type_is_rdonly_mem(u32 type)
+{
+ return type & MEM_RDONLY;
}
static bool arg_type_may_be_refcounted(enum bpf_arg_type type)
@@ -478,14 +465,9 @@ static bool arg_type_may_be_refcounted(enum bpf_arg_type type)
return type == ARG_PTR_TO_SOCK_COMMON;
}
-static bool arg_type_may_be_null(enum bpf_arg_type type)
+static bool type_may_be_null(u32 type)
{
- return type == ARG_PTR_TO_MAP_VALUE_OR_NULL ||
- type == ARG_PTR_TO_MEM_OR_NULL ||
- type == ARG_PTR_TO_CTX_OR_NULL ||
- type == ARG_PTR_TO_SOCKET_OR_NULL ||
- type == ARG_PTR_TO_ALLOC_MEM_OR_NULL ||
- type == ARG_PTR_TO_STACK_OR_NULL;
+ return type & PTR_MAYBE_NULL;
}
/* Determine whether the function releases some resources allocated by another
@@ -545,39 +527,54 @@ static bool is_cmpxchg_insn(const struct bpf_insn *insn)
insn->imm == BPF_CMPXCHG;
}
-/* string representation of 'enum bpf_reg_type' */
-static const char * const reg_type_str[] = {
- [NOT_INIT] = "?",
- [SCALAR_VALUE] = "inv",
- [PTR_TO_CTX] = "ctx",
- [CONST_PTR_TO_MAP] = "map_ptr",
- [PTR_TO_MAP_VALUE] = "map_value",
- [PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null",
- [PTR_TO_STACK] = "fp",
- [PTR_TO_PACKET] = "pkt",
- [PTR_TO_PACKET_META] = "pkt_meta",
- [PTR_TO_PACKET_END] = "pkt_end",
- [PTR_TO_FLOW_KEYS] = "flow_keys",
- [PTR_TO_SOCKET] = "sock",
- [PTR_TO_SOCKET_OR_NULL] = "sock_or_null",
- [PTR_TO_SOCK_COMMON] = "sock_common",
- [PTR_TO_SOCK_COMMON_OR_NULL] = "sock_common_or_null",
- [PTR_TO_TCP_SOCK] = "tcp_sock",
- [PTR_TO_TCP_SOCK_OR_NULL] = "tcp_sock_or_null",
- [PTR_TO_TP_BUFFER] = "tp_buffer",
- [PTR_TO_XDP_SOCK] = "xdp_sock",
- [PTR_TO_BTF_ID] = "ptr_",
- [PTR_TO_BTF_ID_OR_NULL] = "ptr_or_null_",
- [PTR_TO_PERCPU_BTF_ID] = "percpu_ptr_",
- [PTR_TO_MEM] = "mem",
- [PTR_TO_MEM_OR_NULL] = "mem_or_null",
- [PTR_TO_RDONLY_BUF] = "rdonly_buf",
- [PTR_TO_RDONLY_BUF_OR_NULL] = "rdonly_buf_or_null",
- [PTR_TO_RDWR_BUF] = "rdwr_buf",
- [PTR_TO_RDWR_BUF_OR_NULL] = "rdwr_buf_or_null",
- [PTR_TO_FUNC] = "func",
- [PTR_TO_MAP_KEY] = "map_key",
-};
+/* string representation of 'enum bpf_reg_type'
+ *
+ * Note that reg_type_str() can not appear more than once in a single verbose()
+ * statement.
+ */
+static const char *reg_type_str(struct bpf_verifier_env *env,
+ enum bpf_reg_type type)
+{
+ char postfix[16] = {0}, prefix[16] = {0};
+ static const char * const str[] = {
+ [NOT_INIT] = "?",
+ [SCALAR_VALUE] = "inv",
+ [PTR_TO_CTX] = "ctx",
+ [CONST_PTR_TO_MAP] = "map_ptr",
+ [PTR_TO_MAP_VALUE] = "map_value",
+ [PTR_TO_STACK] = "fp",
+ [PTR_TO_PACKET] = "pkt",
+ [PTR_TO_PACKET_META] = "pkt_meta",
+ [PTR_TO_PACKET_END] = "pkt_end",
+ [PTR_TO_FLOW_KEYS] = "flow_keys",
+ [PTR_TO_SOCKET] = "sock",
+ [PTR_TO_SOCK_COMMON] = "sock_common",
+ [PTR_TO_TCP_SOCK] = "tcp_sock",
+ [PTR_TO_TP_BUFFER] = "tp_buffer",
+ [PTR_TO_XDP_SOCK] = "xdp_sock",
+ [PTR_TO_BTF_ID] = "ptr_",
+ [PTR_TO_PERCPU_BTF_ID] = "percpu_ptr_",
+ [PTR_TO_MEM] = "mem",
+ [PTR_TO_BUF] = "buf",
+ [PTR_TO_FUNC] = "func",
+ [PTR_TO_MAP_KEY] = "map_key",
+ };
+
+ if (type & PTR_MAYBE_NULL) {
+ if (base_type(type) == PTR_TO_BTF_ID ||
+ base_type(type) == PTR_TO_PERCPU_BTF_ID)
+ strncpy(postfix, "or_null_", 16);
+ else
+ strncpy(postfix, "_or_null", 16);
+ }
+
+ if (type & MEM_RDONLY)
+ strncpy(prefix, "rdonly_", 16);
+
+ snprintf(env->type_str_buf, TYPE_STR_BUF_LEN, "%s%s%s",
+ prefix, str[base_type(type)], postfix);
+ return env->type_str_buf;
+}
static char slot_type_char[] = {
[STACK_INVALID] = '?',
@@ -612,8 +609,61 @@ static const char *kernel_type_name(const struct btf* btf, u32 id)
return btf_name_by_offset(btf, btf_type_by_id(btf, id)->name_off);
}
+static void mark_reg_scratched(struct bpf_verifier_env *env, u32 regno)
+{
+ env->scratched_regs |= 1U << regno;
+}
+
+static void mark_stack_slot_scratched(struct bpf_verifier_env *env, u32 spi)
+{
+ env->scratched_stack_slots |= 1UL << spi;
+}
+
+static bool reg_scratched(const struct bpf_verifier_env *env, u32 regno)
+{
+ return (env->scratched_regs >> regno) & 1;
+}
+
+static bool stack_slot_scratched(const struct bpf_verifier_env *env, u64 regno)
+{
+ return (env->scratched_stack_slots >> regno) & 1;
+}
+
+static bool verifier_state_scratched(const struct bpf_verifier_env *env)
+{
+ return env->scratched_regs || env->scratched_stack_slots;
+}
+
+static void mark_verifier_state_clean(struct bpf_verifier_env *env)
+{
+ env->scratched_regs = 0U;
+ env->scratched_stack_slots = 0UL;
+}
+
+/* Used for printing the entire verifier state. */
+static void mark_verifier_state_scratched(struct bpf_verifier_env *env)
+{
+ env->scratched_regs = ~0U;
+ env->scratched_stack_slots = ~0UL;
+}
+
+/* The reg state of a pointer or a bounded scalar was saved when
+ * it was spilled to the stack.
+ */
+static bool is_spilled_reg(const struct bpf_stack_state *stack)
+{
+ return stack->slot_type[BPF_REG_SIZE - 1] == STACK_SPILL;
+}
+
+static void scrub_spilled_slot(u8 *stype)
+{
+ if (*stype != STACK_INVALID)
+ *stype = STACK_MISC;
+}
+
static void print_verifier_state(struct bpf_verifier_env *env,
- const struct bpf_func_state *state)
+ const struct bpf_func_state *state,
+ bool print_all)
{
const struct bpf_reg_state *reg;
enum bpf_reg_type t;
@@ -626,9 +676,11 @@ static void print_verifier_state(struct bpf_verifier_env *env,
t = reg->type;
if (t == NOT_INIT)
continue;
+ if (!print_all && !reg_scratched(env, i))
+ continue;
verbose(env, " R%d", i);
print_liveness(env, reg->live);
- verbose(env, "=%s", reg_type_str[t]);
+ verbose(env, "=%s", reg_type_str(env, t));
if (t == SCALAR_VALUE && reg->precise)
verbose(env, "P");
if ((t == SCALAR_VALUE || t == PTR_TO_STACK) &&
@@ -636,9 +688,8 @@ static void print_verifier_state(struct bpf_verifier_env *env,
/* reg->off should be 0 for SCALAR_VALUE */
verbose(env, "%lld", reg->var_off.value + reg->off);
} else {
- if (t == PTR_TO_BTF_ID ||
- t == PTR_TO_BTF_ID_OR_NULL ||
- t == PTR_TO_PERCPU_BTF_ID)
+ if (base_type(t) == PTR_TO_BTF_ID ||
+ base_type(t) == PTR_TO_PERCPU_BTF_ID)
verbose(env, "%s", kernel_type_name(reg->btf, reg->btf_id));
verbose(env, "(id=%d", reg->id);
if (reg_type_may_be_refcounted_or_null(t))
@@ -647,10 +698,9 @@ static void print_verifier_state(struct bpf_verifier_env *env,
verbose(env, ",off=%d", reg->off);
if (type_is_pkt_pointer(t))
verbose(env, ",r=%d", reg->range);
- else if (t == CONST_PTR_TO_MAP ||
- t == PTR_TO_MAP_KEY ||
- t == PTR_TO_MAP_VALUE ||
- t == PTR_TO_MAP_VALUE_OR_NULL)
+ else if (base_type(t) == CONST_PTR_TO_MAP ||
+ base_type(t) == PTR_TO_MAP_KEY ||
+ base_type(t) == PTR_TO_MAP_VALUE)
verbose(env, ",ks=%d,vs=%d",
reg->map_ptr->key_size,
reg->map_ptr->value_size);
@@ -715,12 +765,14 @@ static void print_verifier_state(struct bpf_verifier_env *env,
types_buf[BPF_REG_SIZE] = 0;
if (!valid)
continue;
+ if (!print_all && !stack_slot_scratched(env, i))
+ continue;
verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
print_liveness(env, state->stack[i].spilled_ptr.live);
- if (state->stack[i].slot_type[0] == STACK_SPILL) {
+ if (is_spilled_reg(&state->stack[i])) {
reg = &state->stack[i].spilled_ptr;
t = reg->type;
- verbose(env, "=%s", reg_type_str[t]);
+ verbose(env, "=%s", reg_type_str(env, t));
if (t == SCALAR_VALUE && reg->precise)
verbose(env, "P");
if (t == SCALAR_VALUE && tnum_is_const(reg->var_off))
@@ -740,6 +792,26 @@ static void print_verifier_state(struct bpf_verifier_env *env,
if (state->in_async_callback_fn)
verbose(env, " async_cb");
verbose(env, "\n");
+ mark_verifier_state_clean(env);
+}
+
+static inline u32 vlog_alignment(u32 pos)
+{
+ return round_up(max(pos + BPF_LOG_MIN_ALIGNMENT / 2, BPF_LOG_ALIGNMENT),
+ BPF_LOG_MIN_ALIGNMENT) - pos - 1;
+}
+
+static void print_insn_state(struct bpf_verifier_env *env,
+ const struct bpf_func_state *state)
+{
+ if (env->prev_log_len && env->prev_log_len == env->log.len_used) {
+ /* remove new line character */
+ bpf_vlog_reset(&env->log, env->prev_log_len - 1);
+ verbose(env, "%*c;", vlog_alignment(env->prev_insn_print_len), ' ');
+ } else {
+ verbose(env, "%d:", env->insn_idx);
+ }
+ print_verifier_state(env, state, false);
}
/* copy array src of length n * size bytes to dst. dst is reallocated if it's too
@@ -1133,8 +1205,7 @@ static void mark_reg_known_zero(struct bpf_verifier_env *env,
static void mark_ptr_not_null_reg(struct bpf_reg_state *reg)
{
- switch (reg->type) {
- case PTR_TO_MAP_VALUE_OR_NULL: {
+ if (base_type(reg->type) == PTR_TO_MAP_VALUE) {
const struct bpf_map *map = reg->map_ptr;
if (map->inner_map_meta) {
@@ -1143,7 +1214,8 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg)
/* transfer reg's id which is unique for every map_lookup_elem
* as UID of the inner map.
*/
- reg->map_uid = reg->id;
+ if (map_value_has_timer(map->inner_map_meta))
+ reg->map_uid = reg->id;
} else if (map->map_type == BPF_MAP_TYPE_XSKMAP) {
reg->type = PTR_TO_XDP_SOCK;
} else if (map->map_type == BPF_MAP_TYPE_SOCKMAP ||
@@ -1152,32 +1224,10 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg)
} else {
reg->type = PTR_TO_MAP_VALUE;
}
- break;
- }
- case PTR_TO_SOCKET_OR_NULL:
- reg->type = PTR_TO_SOCKET;
- break;
- case PTR_TO_SOCK_COMMON_OR_NULL:
- reg->type = PTR_TO_SOCK_COMMON;
- break;
- case PTR_TO_TCP_SOCK_OR_NULL:
- reg->type = PTR_TO_TCP_SOCK;
- break;
- case PTR_TO_BTF_ID_OR_NULL:
- reg->type = PTR_TO_BTF_ID;
- break;
- case PTR_TO_MEM_OR_NULL:
- reg->type = PTR_TO_MEM;
- break;
- case PTR_TO_RDONLY_BUF_OR_NULL:
- reg->type = PTR_TO_RDONLY_BUF;
- break;
- case PTR_TO_RDWR_BUF_OR_NULL:
- reg->type = PTR_TO_RDWR_BUF;
- break;
- default:
- WARN_ONCE(1, "unknown nullable register type");
+ return;
}
+
+ reg->type &= ~PTR_MAYBE_NULL;
}
static bool reg_is_pkt_pointer(const struct bpf_reg_state *reg)
@@ -1357,22 +1407,28 @@ static void __reg_bound_offset(struct bpf_reg_state *reg)
reg->var_off = tnum_or(tnum_clear_subreg(var64_off), var32_off);
}
+static bool __reg32_bound_s64(s32 a)
+{
+ return a >= 0 && a <= S32_MAX;
+}
+
static void __reg_assign_32_into_64(struct bpf_reg_state *reg)
{
reg->umin_value = reg->u32_min_value;
reg->umax_value = reg->u32_max_value;
- /* Attempt to pull 32-bit signed bounds into 64-bit bounds
- * but must be positive otherwise set to worse case bounds
- * and refine later from tnum.
+
+ /* Attempt to pull 32-bit signed bounds into 64-bit bounds but must
+ * be positive otherwise set to worse case bounds and refine later
+ * from tnum.
*/
- if (reg->s32_min_value >= 0 && reg->s32_max_value >= 0)
- reg->smax_value = reg->s32_max_value;
- else
- reg->smax_value = U32_MAX;
- if (reg->s32_min_value >= 0)
+ if (__reg32_bound_s64(reg->s32_min_value) &&
+ __reg32_bound_s64(reg->s32_max_value)) {
reg->smin_value = reg->s32_min_value;
- else
+ reg->smax_value = reg->s32_max_value;
+ } else {
reg->smin_value = 0;
+ reg->smax_value = U32_MAX;
+ }
}
static void __reg_combine_32_into_64(struct bpf_reg_state *reg)
@@ -1406,12 +1462,12 @@ static void __reg_combine_32_into_64(struct bpf_reg_state *reg)
static bool __reg64_bound_s32(s64 a)
{
- return a > S32_MIN && a < S32_MAX;
+ return a >= S32_MIN && a <= S32_MAX;
}
static bool __reg64_bound_u32(u64 a)
{
- return a > U32_MIN && a < U32_MAX;
+ return a >= U32_MIN && a <= U32_MAX;
}
static void __reg_combine_64_into_32(struct bpf_reg_state *reg)
@@ -1529,6 +1585,7 @@ static void init_func_state(struct bpf_verifier_env *env,
state->frameno = frameno;
state->subprogno = subprogno;
init_reg_state(env, state);
+ mark_verifier_state_scratched(env);
}
/* Similar to push_stack(), but for async callbacks */
@@ -1626,52 +1683,168 @@ static int add_subprog(struct bpf_verifier_env *env, int off)
return env->subprog_cnt - 1;
}
+#define MAX_KFUNC_DESCS 256
+#define MAX_KFUNC_BTFS 256
+
struct bpf_kfunc_desc {
struct btf_func_model func_model;
u32 func_id;
s32 imm;
+ u16 offset;
+};
+
+struct bpf_kfunc_btf {
+ struct btf *btf;
+ struct module *module;
+ u16 offset;
};
-#define MAX_KFUNC_DESCS 256
struct bpf_kfunc_desc_tab {
struct bpf_kfunc_desc descs[MAX_KFUNC_DESCS];
u32 nr_descs;
};
-static int kfunc_desc_cmp_by_id(const void *a, const void *b)
+struct bpf_kfunc_btf_tab {
+ struct bpf_kfunc_btf descs[MAX_KFUNC_BTFS];
+ u32 nr_descs;
+};
+
+static int kfunc_desc_cmp_by_id_off(const void *a, const void *b)
{
const struct bpf_kfunc_desc *d0 = a;
const struct bpf_kfunc_desc *d1 = b;
/* func_id is not greater than BTF_MAX_TYPE */
- return d0->func_id - d1->func_id;
+ return d0->func_id - d1->func_id ?: d0->offset - d1->offset;
+}
+
+static int kfunc_btf_cmp_by_off(const void *a, const void *b)
+{
+ const struct bpf_kfunc_btf *d0 = a;
+ const struct bpf_kfunc_btf *d1 = b;
+
+ return d0->offset - d1->offset;
}
static const struct bpf_kfunc_desc *
-find_kfunc_desc(const struct bpf_prog *prog, u32 func_id)
+find_kfunc_desc(const struct bpf_prog *prog, u32 func_id, u16 offset)
{
struct bpf_kfunc_desc desc = {
.func_id = func_id,
+ .offset = offset,
};
struct bpf_kfunc_desc_tab *tab;
tab = prog->aux->kfunc_tab;
return bsearch(&desc, tab->descs, tab->nr_descs,
- sizeof(tab->descs[0]), kfunc_desc_cmp_by_id);
+ sizeof(tab->descs[0]), kfunc_desc_cmp_by_id_off);
}
-static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id)
+static struct btf *__find_kfunc_desc_btf(struct bpf_verifier_env *env,
+ s16 offset, struct module **btf_modp)
+{
+ struct bpf_kfunc_btf kf_btf = { .offset = offset };
+ struct bpf_kfunc_btf_tab *tab;
+ struct bpf_kfunc_btf *b;
+ struct module *mod;
+ struct btf *btf;
+ int btf_fd;
+
+ tab = env->prog->aux->kfunc_btf_tab;
+ b = bsearch(&kf_btf, tab->descs, tab->nr_descs,
+ sizeof(tab->descs[0]), kfunc_btf_cmp_by_off);
+ if (!b) {
+ if (tab->nr_descs == MAX_KFUNC_BTFS) {
+ verbose(env, "too many different module BTFs\n");
+ return ERR_PTR(-E2BIG);
+ }
+
+ if (bpfptr_is_null(env->fd_array)) {
+ verbose(env, "kfunc offset > 0 without fd_array is invalid\n");
+ return ERR_PTR(-EPROTO);
+ }
+
+ if (copy_from_bpfptr_offset(&btf_fd, env->fd_array,
+ offset * sizeof(btf_fd),
+ sizeof(btf_fd)))
+ return ERR_PTR(-EFAULT);
+
+ btf = btf_get_by_fd(btf_fd);
+ if (IS_ERR(btf)) {
+ verbose(env, "invalid module BTF fd specified\n");
+ return btf;
+ }
+
+ if (!btf_is_module(btf)) {
+ verbose(env, "BTF fd for kfunc is not a module BTF\n");
+ btf_put(btf);
+ return ERR_PTR(-EINVAL);
+ }
+
+ mod = btf_try_get_module(btf);
+ if (!mod) {
+ btf_put(btf);
+ return ERR_PTR(-ENXIO);
+ }
+
+ b = &tab->descs[tab->nr_descs++];
+ b->btf = btf;
+ b->module = mod;
+ b->offset = offset;
+
+ sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]),
+ kfunc_btf_cmp_by_off, NULL);
+ }
+ if (btf_modp)
+ *btf_modp = b->module;
+ return b->btf;
+}
+
+void bpf_free_kfunc_btf_tab(struct bpf_kfunc_btf_tab *tab)
+{
+ if (!tab)
+ return;
+
+ while (tab->nr_descs--) {
+ module_put(tab->descs[tab->nr_descs].module);
+ btf_put(tab->descs[tab->nr_descs].btf);
+ }
+ kfree(tab);
+}
+
+static struct btf *find_kfunc_desc_btf(struct bpf_verifier_env *env,
+ u32 func_id, s16 offset,
+ struct module **btf_modp)
+{
+ if (offset) {
+ if (offset < 0) {
+ /* In the future, this can be allowed to increase limit
+ * of fd index into fd_array, interpreted as u16.
+ */
+ verbose(env, "negative offset disallowed for kernel module function call\n");
+ return ERR_PTR(-EINVAL);
+ }
+
+ return __find_kfunc_desc_btf(env, offset, btf_modp);
+ }
+ return btf_vmlinux ?: ERR_PTR(-ENOENT);
+}
+
+static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset)
{
const struct btf_type *func, *func_proto;
+ struct bpf_kfunc_btf_tab *btf_tab;
struct bpf_kfunc_desc_tab *tab;
struct bpf_prog_aux *prog_aux;
struct bpf_kfunc_desc *desc;
const char *func_name;
+ struct btf *desc_btf;
unsigned long addr;
int err;
prog_aux = env->prog->aux;
tab = prog_aux->kfunc_tab;
+ btf_tab = prog_aux->kfunc_btf_tab;
if (!tab) {
if (!btf_vmlinux) {
verbose(env, "calling kernel function is not supported without CONFIG_DEBUG_INFO_BTF\n");
@@ -1699,7 +1872,29 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id)
prog_aux->kfunc_tab = tab;
}
- if (find_kfunc_desc(env->prog, func_id))
+ /* func_id == 0 is always invalid, but instead of returning an error, be
+ * conservative and wait until the code elimination pass before returning
+ * error, so that invalid calls that get pruned out can be in BPF programs
+ * loaded from userspace. It is also required that offset be untouched
+ * for such calls.
+ */
+ if (!func_id && !offset)
+ return 0;
+
+ if (!btf_tab && offset) {
+ btf_tab = kzalloc(sizeof(*btf_tab), GFP_KERNEL);
+ if (!btf_tab)
+ return -ENOMEM;
+ prog_aux->kfunc_btf_tab = btf_tab;
+ }
+
+ desc_btf = find_kfunc_desc_btf(env, func_id, offset, NULL);
+ if (IS_ERR(desc_btf)) {
+ verbose(env, "failed to find BTF for kernel function\n");
+ return PTR_ERR(desc_btf);
+ }
+
+ if (find_kfunc_desc(env->prog, func_id, offset))
return 0;
if (tab->nr_descs == MAX_KFUNC_DESCS) {
@@ -1707,20 +1902,20 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id)
return -E2BIG;
}
- func = btf_type_by_id(btf_vmlinux, func_id);
+ func = btf_type_by_id(desc_btf, func_id);
if (!func || !btf_type_is_func(func)) {
verbose(env, "kernel btf_id %u is not a function\n",
func_id);
return -EINVAL;
}
- func_proto = btf_type_by_id(btf_vmlinux, func->type);
+ func_proto = btf_type_by_id(desc_btf, func->type);
if (!func_proto || !btf_type_is_func_proto(func_proto)) {
verbose(env, "kernel function btf_id %u does not have a valid func_proto\n",
func_id);
return -EINVAL;
}
- func_name = btf_name_by_offset(btf_vmlinux, func->name_off);
+ func_name = btf_name_by_offset(desc_btf, func->name_off);
addr = kallsyms_lookup_name(func_name);
if (!addr) {
verbose(env, "cannot find address for kernel function %s\n",
@@ -1730,13 +1925,14 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id)
desc = &tab->descs[tab->nr_descs++];
desc->func_id = func_id;
- desc->imm = BPF_CAST_CALL(addr) - __bpf_call_base;
- err = btf_distill_func_proto(&env->log, btf_vmlinux,
+ desc->imm = BPF_CALL_IMM(addr);
+ desc->offset = offset;
+ err = btf_distill_func_proto(&env->log, desc_btf,
func_proto, func_name,
&desc->func_model);
if (!err)
sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]),
- kfunc_desc_cmp_by_id, NULL);
+ kfunc_desc_cmp_by_id_off, NULL);
return err;
}
@@ -1807,16 +2003,10 @@ static int add_subprog_and_kfunc(struct bpf_verifier_env *env)
return -EPERM;
}
- if (bpf_pseudo_func(insn)) {
- ret = add_subprog(env, i + insn->imm + 1);
- if (ret >= 0)
- /* remember subprog */
- insn[1].imm = ret;
- } else if (bpf_pseudo_call(insn)) {
+ if (bpf_pseudo_func(insn) || bpf_pseudo_call(insn))
ret = add_subprog(env, i + insn->imm + 1);
- } else {
- ret = add_kfunc_call(env, insn->imm);
- }
+ else
+ ret = add_kfunc_call(env, insn->imm, insn->off);
if (ret < 0)
return ret;
@@ -1899,7 +2089,7 @@ static int mark_reg_read(struct bpf_verifier_env *env,
break;
if (parent->live & REG_LIVE_DONE) {
verbose(env, "verifier BUG type %s var_off %lld off %d\n",
- reg_type_str[parent->type],
+ reg_type_str(env, parent->type),
parent->var_off.value, parent->off);
return -EFAULT;
}
@@ -2083,6 +2273,8 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
return -EINVAL;
}
+ mark_reg_scratched(env, regno);
+
reg = &regs[regno];
rw64 = is_reg64(env, insn, regno, reg, t);
if (t == SRC_OP) {
@@ -2152,12 +2344,17 @@ static int get_prev_insn_idx(struct bpf_verifier_state *st, int i,
static const char *disasm_kfunc_name(void *data, const struct bpf_insn *insn)
{
const struct btf_type *func;
+ struct btf *desc_btf;
if (insn->src_reg != BPF_PSEUDO_KFUNC_CALL)
return NULL;
- func = btf_type_by_id(btf_vmlinux, insn->imm);
- return btf_name_by_offset(btf_vmlinux, func->name_off);
+ desc_btf = find_kfunc_desc_btf(data, insn->imm, insn->off, NULL);
+ if (IS_ERR(desc_btf))
+ return "<error>";
+
+ func = btf_type_by_id(desc_btf, insn->imm);
+ return btf_name_by_offset(desc_btf, func->name_off);
}
/* For given verifier state backtrack_insn() is called from the last insn to
@@ -2182,7 +2379,7 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx,
if (insn->code == 0)
return 0;
- if (env->log.level & BPF_LOG_LEVEL) {
+ if (env->log.level & BPF_LOG_LEVEL2) {
verbose(env, "regs=%x stack=%llx before ", *reg_mask, *stack_mask);
verbose(env, "%d: ", idx);
print_bpf_insn(&cbs, insn, env->allow_ptr_leaks);
@@ -2232,8 +2429,6 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx,
*/
if (insn->src_reg != BPF_REG_FP)
return 0;
- if (BPF_SIZE(insn->code) != BPF_DW)
- return 0;
/* dreg = *(u64 *)[fp - off] was a fill from the stack.
* that [fp - off] slot contains scalar that needs to be
@@ -2256,8 +2451,6 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx,
/* scalars can only be spilled into stack */
if (insn->dst_reg != BPF_REG_FP)
return 0;
- if (BPF_SIZE(insn->code) != BPF_DW)
- return 0;
spi = (-insn->off - 1) / BPF_REG_SIZE;
if (spi >= 64) {
verbose(env, "BUG spi %d\n", spi);
@@ -2373,7 +2566,7 @@ static void mark_all_scalars_precise(struct bpf_verifier_env *env,
reg->precise = true;
}
for (j = 0; j < func->allocated_stack / BPF_REG_SIZE; j++) {
- if (func->stack[j].slot_type[0] != STACK_SPILL)
+ if (!is_spilled_reg(&func->stack[j]))
continue;
reg = &func->stack[j].spilled_ptr;
if (reg->type != SCALAR_VALUE)
@@ -2415,7 +2608,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno,
}
while (spi >= 0) {
- if (func->stack[spi].slot_type[0] != STACK_SPILL) {
+ if (!is_spilled_reg(&func->stack[spi])) {
stack_mask = 0;
break;
}
@@ -2440,7 +2633,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno,
DECLARE_BITMAP(mask, 64);
u32 history = st->jmp_history_cnt;
- if (env->log.level & BPF_LOG_LEVEL)
+ if (env->log.level & BPF_LOG_LEVEL2)
verbose(env, "last_idx %d first_idx %d\n", last_idx, first_idx);
for (i = last_idx;;) {
if (skip_first) {
@@ -2514,7 +2707,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno,
return 0;
}
- if (func->stack[i].slot_type[0] != STACK_SPILL) {
+ if (!is_spilled_reg(&func->stack[i])) {
stack_mask &= ~(1ull << i);
continue;
}
@@ -2527,11 +2720,11 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno,
new_marks = true;
reg->precise = true;
}
- if (env->log.level & BPF_LOG_LEVEL) {
- print_verifier_state(env, func);
- verbose(env, "parent %s regs=%x stack=%llx marks\n",
+ if (env->log.level & BPF_LOG_LEVEL2) {
+ verbose(env, "parent %s regs=%x stack=%llx marks:",
new_marks ? "didn't have" : "already had",
reg_mask, stack_mask);
+ print_verifier_state(env, func, true);
}
if (!reg_mask && !stack_mask)
@@ -2557,9 +2750,8 @@ static int mark_chain_precision_stack(struct bpf_verifier_env *env, int spi)
static bool is_spillable_regtype(enum bpf_reg_type type)
{
- switch (type) {
+ switch (base_type(type)) {
case PTR_TO_MAP_VALUE:
- case PTR_TO_MAP_VALUE_OR_NULL:
case PTR_TO_STACK:
case PTR_TO_CTX:
case PTR_TO_PACKET:
@@ -2568,21 +2760,13 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
case PTR_TO_FLOW_KEYS:
case CONST_PTR_TO_MAP:
case PTR_TO_SOCKET:
- case PTR_TO_SOCKET_OR_NULL:
case PTR_TO_SOCK_COMMON:
- case PTR_TO_SOCK_COMMON_OR_NULL:
case PTR_TO_TCP_SOCK:
- case PTR_TO_TCP_SOCK_OR_NULL:
case PTR_TO_XDP_SOCK:
case PTR_TO_BTF_ID:
- case PTR_TO_BTF_ID_OR_NULL:
- case PTR_TO_RDONLY_BUF:
- case PTR_TO_RDONLY_BUF_OR_NULL:
- case PTR_TO_RDWR_BUF:
- case PTR_TO_RDWR_BUF_OR_NULL:
+ case PTR_TO_BUF:
case PTR_TO_PERCPU_BTF_ID:
case PTR_TO_MEM:
- case PTR_TO_MEM_OR_NULL:
case PTR_TO_FUNC:
case PTR_TO_MAP_KEY:
return true;
@@ -2626,15 +2810,21 @@ static bool __is_pointer_value(bool allow_ptr_leaks,
}
static void save_register_state(struct bpf_func_state *state,
- int spi, struct bpf_reg_state *reg)
+ int spi, struct bpf_reg_state *reg,
+ int size)
{
int i;
state->stack[spi].spilled_ptr = *reg;
- state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
+ if (size == BPF_REG_SIZE)
+ state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
- for (i = 0; i < BPF_REG_SIZE; i++)
- state->stack[spi].slot_type[i] = STACK_SPILL;
+ for (i = BPF_REG_SIZE; i > BPF_REG_SIZE - size; i--)
+ state->stack[spi].slot_type[i - 1] = STACK_SPILL;
+
+ /* size < 8 bytes spill */
+ for (; i; i--)
+ scrub_spilled_slot(&state->stack[spi].slot_type[i - 1]);
}
/* check_stack_{read,write}_fixed_off functions track spill/fill of registers,
@@ -2681,7 +2871,8 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
env->insn_aux_data[insn_idx].sanitize_stack_spill = true;
}
- if (reg && size == BPF_REG_SIZE && register_is_bounded(reg) &&
+ mark_stack_slot_scratched(env, spi);
+ if (reg && !(off % BPF_REG_SIZE) && register_is_bounded(reg) &&
!register_is_null(reg) && env->bpf_capable) {
if (dst_reg != BPF_REG_FP) {
/* The backtracking logic can only recognize explicit
@@ -2694,7 +2885,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
if (err)
return err;
}
- save_register_state(state, spi, reg);
+ save_register_state(state, spi, reg, size);
} else if (reg && is_spillable_regtype(reg->type)) {
/* register containing pointer is being spilled into stack */
if (size != BPF_REG_SIZE) {
@@ -2706,16 +2897,16 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
verbose(env, "cannot spill pointers to stack into stack frame of the caller\n");
return -EINVAL;
}
- save_register_state(state, spi, reg);
+ save_register_state(state, spi, reg, size);
} else {
u8 type = STACK_MISC;
/* regular write of data into stack destroys any spilled ptr */
state->stack[spi].spilled_ptr.type = NOT_INIT;
/* Mark slots as STACK_MISC if they belonged to spilled ptr. */
- if (state->stack[spi].slot_type[0] == STACK_SPILL)
+ if (is_spilled_reg(&state->stack[spi]))
for (i = 0; i < BPF_REG_SIZE; i++)
- state->stack[spi].slot_type[i] = STACK_MISC;
+ scrub_spilled_slot(&state->stack[spi].slot_type[i]);
/* only mark the slot as written if all 8 bytes were written
* otherwise read propagation may incorrectly stop too soon
@@ -2802,6 +2993,7 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env,
slot = -i - 1;
spi = slot / BPF_REG_SIZE;
stype = &state->stack[spi].slot_type[slot % BPF_REG_SIZE];
+ mark_stack_slot_scratched(env, spi);
if (!env->allow_ptr_leaks
&& *stype != NOT_INIT
@@ -2918,31 +3110,52 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
struct bpf_func_state *state = vstate->frame[vstate->curframe];
int i, slot = -off - 1, spi = slot / BPF_REG_SIZE;
struct bpf_reg_state *reg;
- u8 *stype;
+ u8 *stype, type;
stype = reg_state->stack[spi].slot_type;
reg = &reg_state->stack[spi].spilled_ptr;
- if (stype[0] == STACK_SPILL) {
- if (size != BPF_REG_SIZE) {
+ if (is_spilled_reg(&reg_state->stack[spi])) {
+ u8 spill_size = 1;
+
+ for (i = BPF_REG_SIZE - 1; i > 0 && stype[i - 1] == STACK_SPILL; i--)
+ spill_size++;
+
+ if (size != BPF_REG_SIZE || spill_size != BPF_REG_SIZE) {
if (reg->type != SCALAR_VALUE) {
verbose_linfo(env, env->insn_idx, "; ");
verbose(env, "invalid size of register fill\n");
return -EACCES;
}
- if (dst_regno >= 0) {
+
+ mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64);
+ if (dst_regno < 0)
+ return 0;
+
+ if (!(off % BPF_REG_SIZE) && size == spill_size) {
+ /* The earlier check_reg_arg() has decided the
+ * subreg_def for this insn. Save it first.
+ */
+ s32 subreg_def = state->regs[dst_regno].subreg_def;
+
+ state->regs[dst_regno] = *reg;
+ state->regs[dst_regno].subreg_def = subreg_def;
+ } else {
+ for (i = 0; i < size; i++) {
+ type = stype[(slot - i) % BPF_REG_SIZE];
+ if (type == STACK_SPILL)
+ continue;
+ if (type == STACK_MISC)
+ continue;
+ verbose(env, "invalid read from stack off %d+%d size %d\n",
+ off, i, size);
+ return -EACCES;
+ }
mark_reg_unknown(env, state->regs, dst_regno);
- state->regs[dst_regno].live |= REG_LIVE_WRITTEN;
}
- mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64);
+ state->regs[dst_regno].live |= REG_LIVE_WRITTEN;
return 0;
}
- for (i = 1; i < BPF_REG_SIZE; i++) {
- if (stype[(slot - i) % BPF_REG_SIZE] != STACK_SPILL) {
- verbose(env, "corrupted spill memory\n");
- return -EACCES;
- }
- }
if (dst_regno >= 0) {
/* restore register state from stack */
@@ -2965,8 +3178,6 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
}
mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64);
} else {
- u8 type;
-
for (i = 0; i < size; i++) {
type = stype[(slot - i) % BPF_REG_SIZE];
if (type == STACK_MISC)
@@ -3199,11 +3410,8 @@ static int check_mem_region_access(struct bpf_verifier_env *env, u32 regno,
/* We may have adjusted the register pointing to memory region, so we
* need to try adding each of min_value and max_value to off
* to make sure our theoretical access will be safe.
- */
- if (env->log.level & BPF_LOG_LEVEL)
- print_verifier_state(env, state);
-
- /* The minimum value is only important with signed
+ *
+ * The minimum value is only important with signed
* comparisons where we can't assume the floor of a
* value is 0. If we are using signed variables for our
* index'es we need to make sure that whatever we use
@@ -3398,7 +3606,7 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off,
*/
*reg_type = info.reg_type;
- if (*reg_type == PTR_TO_BTF_ID || *reg_type == PTR_TO_BTF_ID_OR_NULL) {
+ if (base_type(*reg_type) == PTR_TO_BTF_ID) {
*btf = info.btf;
*btf_id = info.btf_id;
} else {
@@ -3466,7 +3674,7 @@ static int check_sock_access(struct bpf_verifier_env *env, int insn_idx,
}
verbose(env, "R%d invalid %s access off=%d size=%d\n",
- regno, reg_type_str[reg->type], off, size);
+ regno, reg_type_str(env, reg->type), off, size);
return -EACCES;
}
@@ -3884,7 +4092,22 @@ static void coerce_reg_to_size(struct bpf_reg_state *reg, int size)
static bool bpf_map_is_rdonly(const struct bpf_map *map)
{
- return (map->map_flags & BPF_F_RDONLY_PROG) && map->frozen;
+ /* A map is considered read-only if the following condition are true:
+ *
+ * 1) BPF program side cannot change any of the map content. The
+ * BPF_F_RDONLY_PROG flag is throughout the lifetime of a map
+ * and was set at map creation time.
+ * 2) The map value(s) have been initialized from user space by a
+ * loader and then "frozen", such that no new map update/delete
+ * operations from syscall side are possible for the rest of
+ * the map's lifetime from that point onwards.
+ * 3) Any parallel/pending map update/delete operations from syscall
+ * side have been completed. Only after that point, it's safe to
+ * assume that map value(s) are immutable.
+ */
+ return (map->map_flags & BPF_F_RDONLY_PROG) &&
+ READ_ONCE(map->frozen) &&
+ !bpf_map_write_active(map);
}
static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val)
@@ -4178,15 +4401,30 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
mark_reg_unknown(env, regs, value_regno);
}
}
- } else if (reg->type == PTR_TO_MEM) {
+ } else if (base_type(reg->type) == PTR_TO_MEM) {
+ bool rdonly_mem = type_is_rdonly_mem(reg->type);
+
+ if (type_may_be_null(reg->type)) {
+ verbose(env, "R%d invalid mem access '%s'\n", regno,
+ reg_type_str(env, reg->type));
+ return -EACCES;
+ }
+
+ if (t == BPF_WRITE && rdonly_mem) {
+ verbose(env, "R%d cannot write into %s\n",
+ regno, reg_type_str(env, reg->type));
+ return -EACCES;
+ }
+
if (t == BPF_WRITE && value_regno >= 0 &&
is_pointer_value(env, value_regno)) {
verbose(env, "R%d leaks addr into mem\n", value_regno);
return -EACCES;
}
+
err = check_mem_region_access(env, regno, off, size,
reg->mem_size, false);
- if (!err && t == BPF_READ && value_regno >= 0)
+ if (!err && value_regno >= 0 && (t == BPF_READ || rdonly_mem))
mark_reg_unknown(env, regs, value_regno);
} else if (reg->type == PTR_TO_CTX) {
enum bpf_reg_type reg_type = SCALAR_VALUE;
@@ -4216,7 +4454,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
} else {
mark_reg_known_zero(env, regs,
value_regno);
- if (reg_type_may_be_null(reg_type))
+ if (type_may_be_null(reg_type))
regs[value_regno].id = ++env->id_gen;
/* A load of ctx field could have different
* actual load size with the one encoded in the
@@ -4224,8 +4462,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
* a sub-register.
*/
regs[value_regno].subreg_def = DEF_NOT_SUBREG;
- if (reg_type == PTR_TO_BTF_ID ||
- reg_type == PTR_TO_BTF_ID_OR_NULL) {
+ if (base_type(reg_type) == PTR_TO_BTF_ID) {
regs[value_regno].btf = btf;
regs[value_regno].btf_id = btf_id;
}
@@ -4278,7 +4515,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
} else if (type_is_sk_pointer(reg->type)) {
if (t == BPF_WRITE) {
verbose(env, "R%d cannot write into %s\n",
- regno, reg_type_str[reg->type]);
+ regno, reg_type_str(env, reg->type));
return -EACCES;
}
err = check_sock_access(env, insn_idx, regno, off, size, t);
@@ -4294,26 +4531,32 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
} else if (reg->type == CONST_PTR_TO_MAP) {
err = check_ptr_to_map_access(env, regs, regno, off, size, t,
value_regno);
- } else if (reg->type == PTR_TO_RDONLY_BUF) {
- if (t == BPF_WRITE) {
- verbose(env, "R%d cannot write into %s\n",
- regno, reg_type_str[reg->type]);
- return -EACCES;
+ } else if (base_type(reg->type) == PTR_TO_BUF) {
+ bool rdonly_mem = type_is_rdonly_mem(reg->type);
+ const char *buf_info;
+ u32 *max_access;
+
+ if (rdonly_mem) {
+ if (t == BPF_WRITE) {
+ verbose(env, "R%d cannot write into %s\n",
+ regno, reg_type_str(env, reg->type));
+ return -EACCES;
+ }
+ buf_info = "rdonly";
+ max_access = &env->prog->aux->max_rdonly_access;
+ } else {
+ buf_info = "rdwr";
+ max_access = &env->prog->aux->max_rdwr_access;
}
+
err = check_buffer_access(env, reg, regno, off, size, false,
- "rdonly",
- &env->prog->aux->max_rdonly_access);
- if (!err && value_regno >= 0)
- mark_reg_unknown(env, regs, value_regno);
- } else if (reg->type == PTR_TO_RDWR_BUF) {
- err = check_buffer_access(env, reg, regno, off, size, false,
- "rdwr",
- &env->prog->aux->max_rdwr_access);
- if (!err && t == BPF_READ && value_regno >= 0)
+ buf_info, max_access);
+
+ if (!err && value_regno >= 0 && (rdonly_mem || t == BPF_READ))
mark_reg_unknown(env, regs, value_regno);
} else {
verbose(env, "R%d invalid mem access '%s'\n", regno,
- reg_type_str[reg->type]);
+ reg_type_str(env, reg->type));
return -EACCES;
}
@@ -4364,9 +4607,16 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i
if (insn->imm == BPF_CMPXCHG) {
/* Check comparison of R0 with memory location */
- err = check_reg_arg(env, BPF_REG_0, SRC_OP);
+ const u32 aux_reg = BPF_REG_0;
+
+ err = check_reg_arg(env, aux_reg, SRC_OP);
if (err)
return err;
+
+ if (is_pointer_value(env, aux_reg)) {
+ verbose(env, "R%d leaks addr into mem\n", aux_reg);
+ return -EACCES;
+ }
}
if (is_pointer_value(env, insn->src_reg)) {
@@ -4380,7 +4630,7 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i
is_sk_reg(env, insn->dst_reg)) {
verbose(env, "BPF_ATOMIC stores into R%d %s is not allowed\n",
insn->dst_reg,
- reg_type_str[reg_state(env, insn->dst_reg)->type]);
+ reg_type_str(env, reg_state(env, insn->dst_reg)->type));
return -EACCES;
}
@@ -4401,13 +4651,19 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i
load_reg = -1;
}
- /* check whether we can read the memory */
+ /* Check whether we can read the memory, with second call for fetch
+ * case to simulate the register fill.
+ */
err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
- BPF_SIZE(insn->code), BPF_READ, load_reg, true);
+ BPF_SIZE(insn->code), BPF_READ, -1, true);
+ if (!err && load_reg >= 0)
+ err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
+ BPF_SIZE(insn->code), BPF_READ, load_reg,
+ true);
if (err)
return err;
- /* check whether we can write into the same memory */
+ /* Check whether we can write into the same memory. */
err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
BPF_SIZE(insn->code), BPF_WRITE, -1, true);
if (err)
@@ -4514,17 +4770,17 @@ static int check_stack_range_initialized(
goto mark;
}
- if (state->stack[spi].slot_type[0] == STACK_SPILL &&
+ if (is_spilled_reg(&state->stack[spi]) &&
state->stack[spi].spilled_ptr.type == PTR_TO_BTF_ID)
goto mark;
- if (state->stack[spi].slot_type[0] == STACK_SPILL &&
+ if (is_spilled_reg(&state->stack[spi]) &&
(state->stack[spi].spilled_ptr.type == SCALAR_VALUE ||
env->allow_ptr_leaks)) {
if (clobber) {
__mark_reg_unknown(env, &state->stack[spi].spilled_ptr);
for (j = 0; j < BPF_REG_SIZE; j++)
- state->stack[spi].slot_type[j] = STACK_MISC;
+ scrub_spilled_slot(&state->stack[spi].slot_type[j]);
}
goto mark;
}
@@ -4557,8 +4813,10 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
struct bpf_call_arg_meta *meta)
{
struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ const char *buf_info;
+ u32 *max_access;
- switch (reg->type) {
+ switch (base_type(reg->type)) {
case PTR_TO_PACKET:
case PTR_TO_PACKET_META:
return check_packet_access(env, regno, reg->off, access_size,
@@ -4577,18 +4835,20 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
return check_mem_region_access(env, regno, reg->off,
access_size, reg->mem_size,
zero_size_allowed);
- case PTR_TO_RDONLY_BUF:
- if (meta && meta->raw_mode)
- return -EACCES;
- return check_buffer_access(env, reg, regno, reg->off,
- access_size, zero_size_allowed,
- "rdonly",
- &env->prog->aux->max_rdonly_access);
- case PTR_TO_RDWR_BUF:
+ case PTR_TO_BUF:
+ if (type_is_rdonly_mem(reg->type)) {
+ if (meta && meta->raw_mode)
+ return -EACCES;
+
+ buf_info = "rdonly";
+ max_access = &env->prog->aux->max_rdonly_access;
+ } else {
+ buf_info = "rdwr";
+ max_access = &env->prog->aux->max_rdwr_access;
+ }
return check_buffer_access(env, reg, regno, reg->off,
access_size, zero_size_allowed,
- "rdwr",
- &env->prog->aux->max_rdwr_access);
+ buf_info, max_access);
case PTR_TO_STACK:
return check_stack_range_initialized(
env,
@@ -4600,9 +4860,9 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
register_is_null(reg))
return 0;
- verbose(env, "R%d type=%s expected=%s\n", regno,
- reg_type_str[reg->type],
- reg_type_str[PTR_TO_STACK]);
+ verbose(env, "R%d type=%s ", regno,
+ reg_type_str(env, reg->type));
+ verbose(env, "expected=%s\n", reg_type_str(env, PTR_TO_STACK));
return -EACCES;
}
}
@@ -4613,7 +4873,7 @@ int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
if (register_is_null(reg))
return 0;
- if (reg_type_may_be_null(reg->type)) {
+ if (type_may_be_null(reg->type)) {
/* Assuming that the register contains a value check if the memory
* access is safe. Temporarily save and restore the register's state as
* the conversion shouldn't be visible to a caller.
@@ -4761,9 +5021,8 @@ static int process_timer_func(struct bpf_verifier_env *env, int regno,
static bool arg_type_is_mem_ptr(enum bpf_arg_type type)
{
- return type == ARG_PTR_TO_MEM ||
- type == ARG_PTR_TO_MEM_OR_NULL ||
- type == ARG_PTR_TO_UNINIT_MEM;
+ return base_type(type) == ARG_PTR_TO_MEM ||
+ base_type(type) == ARG_PTR_TO_UNINIT_MEM;
}
static bool arg_type_is_mem_size(enum bpf_arg_type type)
@@ -4813,7 +5072,10 @@ static int resolve_map_arg_type(struct bpf_verifier_env *env,
return -EINVAL;
}
break;
-
+ case BPF_MAP_TYPE_BLOOM_FILTER:
+ if (meta->func_id == BPF_FUNC_map_peek_elem)
+ *arg_type = ARG_PTR_TO_MAP_VALUE;
+ break;
default:
break;
}
@@ -4865,8 +5127,7 @@ static const struct bpf_reg_types mem_types = {
PTR_TO_MAP_KEY,
PTR_TO_MAP_VALUE,
PTR_TO_MEM,
- PTR_TO_RDONLY_BUF,
- PTR_TO_RDWR_BUF,
+ PTR_TO_BUF,
},
};
@@ -4897,31 +5158,26 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
[ARG_PTR_TO_MAP_KEY] = &map_key_value_types,
[ARG_PTR_TO_MAP_VALUE] = &map_key_value_types,
[ARG_PTR_TO_UNINIT_MAP_VALUE] = &map_key_value_types,
- [ARG_PTR_TO_MAP_VALUE_OR_NULL] = &map_key_value_types,
[ARG_CONST_SIZE] = &scalar_types,
[ARG_CONST_SIZE_OR_ZERO] = &scalar_types,
[ARG_CONST_ALLOC_SIZE_OR_ZERO] = &scalar_types,
[ARG_CONST_MAP_PTR] = &const_map_ptr_types,
[ARG_PTR_TO_CTX] = &context_types,
- [ARG_PTR_TO_CTX_OR_NULL] = &context_types,
[ARG_PTR_TO_SOCK_COMMON] = &sock_types,
#ifdef CONFIG_NET
[ARG_PTR_TO_BTF_ID_SOCK_COMMON] = &btf_id_sock_common_types,
#endif
[ARG_PTR_TO_SOCKET] = &fullsock_types,
- [ARG_PTR_TO_SOCKET_OR_NULL] = &fullsock_types,
[ARG_PTR_TO_BTF_ID] = &btf_ptr_types,
[ARG_PTR_TO_SPIN_LOCK] = &spin_lock_types,
[ARG_PTR_TO_MEM] = &mem_types,
- [ARG_PTR_TO_MEM_OR_NULL] = &mem_types,
[ARG_PTR_TO_UNINIT_MEM] = &mem_types,
[ARG_PTR_TO_ALLOC_MEM] = &alloc_mem_types,
- [ARG_PTR_TO_ALLOC_MEM_OR_NULL] = &alloc_mem_types,
[ARG_PTR_TO_INT] = &int_ptr_types,
[ARG_PTR_TO_LONG] = &int_ptr_types,
[ARG_PTR_TO_PERCPU_BTF_ID] = &percpu_btf_ptr_types,
[ARG_PTR_TO_FUNC] = &func_ptr_types,
- [ARG_PTR_TO_STACK_OR_NULL] = &stack_ptr_types,
+ [ARG_PTR_TO_STACK] = &stack_ptr_types,
[ARG_PTR_TO_CONST_STR] = &const_str_ptr_types,
[ARG_PTR_TO_TIMER] = &timer_types,
};
@@ -4935,12 +5191,27 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
const struct bpf_reg_types *compatible;
int i, j;
- compatible = compatible_reg_types[arg_type];
+ compatible = compatible_reg_types[base_type(arg_type)];
if (!compatible) {
verbose(env, "verifier internal error: unsupported arg type %d\n", arg_type);
return -EFAULT;
}
+ /* ARG_PTR_TO_MEM + RDONLY is compatible with PTR_TO_MEM and PTR_TO_MEM + RDONLY,
+ * but ARG_PTR_TO_MEM is compatible only with PTR_TO_MEM and NOT with PTR_TO_MEM + RDONLY
+ *
+ * Same for MAYBE_NULL:
+ *
+ * ARG_PTR_TO_MEM + MAYBE_NULL is compatible with PTR_TO_MEM and PTR_TO_MEM + MAYBE_NULL,
+ * but ARG_PTR_TO_MEM is compatible only with PTR_TO_MEM but NOT with PTR_TO_MEM + MAYBE_NULL
+ *
+ * Therefore we fold these flags depending on the arg_type before comparison.
+ */
+ if (arg_type & MEM_RDONLY)
+ type &= ~MEM_RDONLY;
+ if (arg_type & PTR_MAYBE_NULL)
+ type &= ~PTR_MAYBE_NULL;
+
for (i = 0; i < ARRAY_SIZE(compatible->types); i++) {
expected = compatible->types[i];
if (expected == NOT_INIT)
@@ -4950,14 +5221,14 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
goto found;
}
- verbose(env, "R%d type=%s expected=", regno, reg_type_str[type]);
+ verbose(env, "R%d type=%s expected=", regno, reg_type_str(env, reg->type));
for (j = 0; j + 1 < i; j++)
- verbose(env, "%s, ", reg_type_str[compatible->types[j]]);
- verbose(env, "%s\n", reg_type_str[compatible->types[j]]);
+ verbose(env, "%s, ", reg_type_str(env, compatible->types[j]));
+ verbose(env, "%s\n", reg_type_str(env, compatible->types[j]));
return -EACCES;
found:
- if (type == PTR_TO_BTF_ID) {
+ if (reg->type == PTR_TO_BTF_ID) {
if (!arg_btf_id) {
if (!compatible->btf_id) {
verbose(env, "verifier internal error: missing arg compatible BTF ID\n");
@@ -5016,15 +5287,14 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
return -EACCES;
}
- if (arg_type == ARG_PTR_TO_MAP_VALUE ||
- arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE ||
- arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL) {
+ if (base_type(arg_type) == ARG_PTR_TO_MAP_VALUE ||
+ base_type(arg_type) == ARG_PTR_TO_UNINIT_MAP_VALUE) {
err = resolve_map_arg_type(env, meta, &arg_type);
if (err)
return err;
}
- if (register_is_null(reg) && arg_type_may_be_null(arg_type))
+ if (register_is_null(reg) && type_may_be_null(arg_type))
/* A NULL register has a SCALAR_VALUE type, so skip
* type checking.
*/
@@ -5093,10 +5363,11 @@ skip_type_check:
err = check_helper_mem_access(env, regno,
meta->map_ptr->key_size, false,
NULL);
- } else if (arg_type == ARG_PTR_TO_MAP_VALUE ||
- (arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL &&
- !register_is_null(reg)) ||
- arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE) {
+ } else if (base_type(arg_type) == ARG_PTR_TO_MAP_VALUE ||
+ base_type(arg_type) == ARG_PTR_TO_UNINIT_MAP_VALUE) {
+ if (type_may_be_null(arg_type) && register_is_null(reg))
+ return 0;
+
/* bpf_map_xxx(..., map_ptr, ..., value) call:
* check [value, value + map->value_size) validity
*/
@@ -5388,6 +5659,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
func_id != BPF_FUNC_task_storage_delete)
goto error;
break;
+ case BPF_MAP_TYPE_BLOOM_FILTER:
+ if (func_id != BPF_FUNC_map_peek_elem &&
+ func_id != BPF_FUNC_map_push_elem)
+ goto error;
+ break;
default:
break;
}
@@ -5455,13 +5731,18 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
map->map_type != BPF_MAP_TYPE_SOCKHASH)
goto error;
break;
- case BPF_FUNC_map_peek_elem:
case BPF_FUNC_map_pop_elem:
- case BPF_FUNC_map_push_elem:
if (map->map_type != BPF_MAP_TYPE_QUEUE &&
map->map_type != BPF_MAP_TYPE_STACK)
goto error;
break;
+ case BPF_FUNC_map_peek_elem:
+ case BPF_FUNC_map_push_elem:
+ if (map->map_type != BPF_MAP_TYPE_QUEUE &&
+ map->map_type != BPF_MAP_TYPE_STACK &&
+ map->map_type != BPF_MAP_TYPE_BLOOM_FILTER)
+ goto error;
+ break;
case BPF_FUNC_sk_storage_get:
case BPF_FUNC_sk_storage_delete:
if (map->map_type != BPF_MAP_TYPE_SK_STORAGE)
@@ -5750,6 +6031,7 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn
}
if (insn->code == (BPF_JMP | BPF_CALL) &&
+ insn->src_reg == 0 &&
insn->imm == BPF_FUNC_timer_set_callback) {
struct bpf_verifier_state *async_cb;
@@ -5808,9 +6090,9 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn
if (env->log.level & BPF_LOG_LEVEL) {
verbose(env, "caller:\n");
- print_verifier_state(env, caller);
+ print_verifier_state(env, caller, true);
verbose(env, "callee:\n");
- print_verifier_state(env, callee);
+ print_verifier_state(env, callee, true);
}
return 0;
}
@@ -5901,6 +6183,27 @@ static int set_map_elem_callback_state(struct bpf_verifier_env *env,
return 0;
}
+static int set_loop_callback_state(struct bpf_verifier_env *env,
+ struct bpf_func_state *caller,
+ struct bpf_func_state *callee,
+ int insn_idx)
+{
+ /* bpf_loop(u32 nr_loops, void *callback_fn, void *callback_ctx,
+ * u64 flags);
+ * callback_fn(u32 index, void *callback_ctx);
+ */
+ callee->regs[BPF_REG_1].type = SCALAR_VALUE;
+ callee->regs[BPF_REG_2] = caller->regs[BPF_REG_3];
+
+ /* unused */
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_3]);
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
+
+ callee->in_callback_fn = true;
+ return 0;
+}
+
static int set_timer_callback_state(struct bpf_verifier_env *env,
struct bpf_func_state *caller,
struct bpf_func_state *callee,
@@ -5930,6 +6233,33 @@ static int set_timer_callback_state(struct bpf_verifier_env *env,
return 0;
}
+static int set_find_vma_callback_state(struct bpf_verifier_env *env,
+ struct bpf_func_state *caller,
+ struct bpf_func_state *callee,
+ int insn_idx)
+{
+ /* bpf_find_vma(struct task_struct *task, u64 addr,
+ * void *callback_fn, void *callback_ctx, u64 flags)
+ * (callback_fn)(struct task_struct *task,
+ * struct vm_area_struct *vma, void *callback_ctx);
+ */
+ callee->regs[BPF_REG_1] = caller->regs[BPF_REG_1];
+
+ callee->regs[BPF_REG_2].type = PTR_TO_BTF_ID;
+ __mark_reg_known_zero(&callee->regs[BPF_REG_2]);
+ callee->regs[BPF_REG_2].btf = btf_vmlinux;
+ callee->regs[BPF_REG_2].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_VMA],
+
+ /* pointer to stack or null */
+ callee->regs[BPF_REG_3] = caller->regs[BPF_REG_4];
+
+ /* unused */
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
+ callee->in_callback_fn = true;
+ return 0;
+}
+
static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
{
struct bpf_verifier_state *state = env->cur_state;
@@ -5977,9 +6307,9 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
*insn_idx = callee->callsite + 1;
if (env->log.level & BPF_LOG_LEVEL) {
verbose(env, "returning from callee:\n");
- print_verifier_state(env, callee);
+ print_verifier_state(env, callee, true);
verbose(env, "to caller at %d:\n", *insn_idx);
- print_verifier_state(env, caller);
+ print_verifier_state(env, caller, true);
}
/* clear everything in the callee */
free_func_state(callee);
@@ -6145,13 +6475,11 @@ static int check_bpf_snprintf_call(struct bpf_verifier_env *env,
static int check_get_func_ip(struct bpf_verifier_env *env)
{
- enum bpf_attach_type eatype = env->prog->expected_attach_type;
enum bpf_prog_type type = resolve_prog_type(env->prog);
int func_id = BPF_FUNC_get_func_ip;
if (type == BPF_PROG_TYPE_TRACING) {
- if (eatype != BPF_TRACE_FENTRY && eatype != BPF_TRACE_FEXIT &&
- eatype != BPF_MODIFY_RETURN) {
+ if (!bpf_prog_has_trampoline(env->prog)) {
verbose(env, "func %s#%d supported only for fentry/fexit/fmod_ret programs\n",
func_id_name(func_id), func_id);
return -ENOTSUPP;
@@ -6170,6 +6498,8 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
int *insn_idx_p)
{
const struct bpf_func_proto *fn = NULL;
+ enum bpf_return_type ret_type;
+ enum bpf_type_flag ret_flag;
struct bpf_reg_state *regs;
struct bpf_call_arg_meta meta;
int insn_idx = *insn_idx_p;
@@ -6247,13 +6577,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
return err;
}
- if (func_id == BPF_FUNC_tail_call) {
- err = check_reference_leak(env);
- if (err) {
- verbose(env, "tail_call would lead to reference leak\n");
- return err;
- }
- } else if (is_release_function(func_id)) {
+ if (is_release_function(func_id)) {
err = release_reference(env, meta.ref_obj_id);
if (err) {
verbose(env, "func %s#%d reference has not been acquired before\n",
@@ -6264,35 +6588,47 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
regs = cur_regs(env);
- /* check that flags argument in get_local_storage(map, flags) is 0,
- * this is required because get_local_storage() can't return an error.
- */
- if (func_id == BPF_FUNC_get_local_storage &&
- !register_is_null(&regs[BPF_REG_2])) {
- verbose(env, "get_local_storage() doesn't support non-zero flags\n");
- return -EINVAL;
- }
-
- if (func_id == BPF_FUNC_for_each_map_elem) {
+ switch (func_id) {
+ case BPF_FUNC_tail_call:
+ err = check_reference_leak(env);
+ if (err) {
+ verbose(env, "tail_call would lead to reference leak\n");
+ return err;
+ }
+ break;
+ case BPF_FUNC_get_local_storage:
+ /* check that flags argument in get_local_storage(map, flags) is 0,
+ * this is required because get_local_storage() can't return an error.
+ */
+ if (!register_is_null(&regs[BPF_REG_2])) {
+ verbose(env, "get_local_storage() doesn't support non-zero flags\n");
+ return -EINVAL;
+ }
+ break;
+ case BPF_FUNC_for_each_map_elem:
err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
set_map_elem_callback_state);
- if (err < 0)
- return -EINVAL;
- }
-
- if (func_id == BPF_FUNC_timer_set_callback) {
+ break;
+ case BPF_FUNC_timer_set_callback:
err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
set_timer_callback_state);
- if (err < 0)
- return -EINVAL;
- }
-
- if (func_id == BPF_FUNC_snprintf) {
+ break;
+ case BPF_FUNC_find_vma:
+ err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
+ set_find_vma_callback_state);
+ break;
+ case BPF_FUNC_snprintf:
err = check_bpf_snprintf_call(env, regs);
- if (err < 0)
- return err;
+ break;
+ case BPF_FUNC_loop:
+ err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
+ set_loop_callback_state);
+ break;
}
+ if (err)
+ return err;
+
/* reset caller saved regs */
for (i = 0; i < CALLER_SAVED_REGS; i++) {
mark_reg_not_init(env, regs, caller_saved[i]);
@@ -6303,13 +6639,14 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG;
/* update return register (already marked as written above) */
- if (fn->ret_type == RET_INTEGER) {
+ ret_type = fn->ret_type;
+ ret_flag = type_flag(fn->ret_type);
+ if (ret_type == RET_INTEGER) {
/* sets type to SCALAR_VALUE */
mark_reg_unknown(env, regs, BPF_REG_0);
- } else if (fn->ret_type == RET_VOID) {
+ } else if (ret_type == RET_VOID) {
regs[BPF_REG_0].type = NOT_INIT;
- } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL ||
- fn->ret_type == RET_PTR_TO_MAP_VALUE) {
+ } else if (base_type(ret_type) == RET_PTR_TO_MAP_VALUE) {
/* There is no offset yet applied, variable or fixed */
mark_reg_known_zero(env, regs, BPF_REG_0);
/* remember map_ptr, so that check_map_access()
@@ -6323,28 +6660,25 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
}
regs[BPF_REG_0].map_ptr = meta.map_ptr;
regs[BPF_REG_0].map_uid = meta.map_uid;
- if (fn->ret_type == RET_PTR_TO_MAP_VALUE) {
- regs[BPF_REG_0].type = PTR_TO_MAP_VALUE;
- if (map_value_has_spin_lock(meta.map_ptr))
- regs[BPF_REG_0].id = ++env->id_gen;
- } else {
- regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL;
+ regs[BPF_REG_0].type = PTR_TO_MAP_VALUE | ret_flag;
+ if (!type_may_be_null(ret_type) &&
+ map_value_has_spin_lock(meta.map_ptr)) {
+ regs[BPF_REG_0].id = ++env->id_gen;
}
- } else if (fn->ret_type == RET_PTR_TO_SOCKET_OR_NULL) {
+ } else if (base_type(ret_type) == RET_PTR_TO_SOCKET) {
mark_reg_known_zero(env, regs, BPF_REG_0);
- regs[BPF_REG_0].type = PTR_TO_SOCKET_OR_NULL;
- } else if (fn->ret_type == RET_PTR_TO_SOCK_COMMON_OR_NULL) {
+ regs[BPF_REG_0].type = PTR_TO_SOCKET | ret_flag;
+ } else if (base_type(ret_type) == RET_PTR_TO_SOCK_COMMON) {
mark_reg_known_zero(env, regs, BPF_REG_0);
- regs[BPF_REG_0].type = PTR_TO_SOCK_COMMON_OR_NULL;
- } else if (fn->ret_type == RET_PTR_TO_TCP_SOCK_OR_NULL) {
+ regs[BPF_REG_0].type = PTR_TO_SOCK_COMMON | ret_flag;
+ } else if (base_type(ret_type) == RET_PTR_TO_TCP_SOCK) {
mark_reg_known_zero(env, regs, BPF_REG_0);
- regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL;
- } else if (fn->ret_type == RET_PTR_TO_ALLOC_MEM_OR_NULL) {
+ regs[BPF_REG_0].type = PTR_TO_TCP_SOCK | ret_flag;
+ } else if (base_type(ret_type) == RET_PTR_TO_ALLOC_MEM) {
mark_reg_known_zero(env, regs, BPF_REG_0);
- regs[BPF_REG_0].type = PTR_TO_MEM_OR_NULL;
+ regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag;
regs[BPF_REG_0].mem_size = meta.mem_size;
- } else if (fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL ||
- fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID) {
+ } else if (base_type(ret_type) == RET_PTR_TO_MEM_OR_BTF_ID) {
const struct btf_type *t;
mark_reg_known_zero(env, regs, BPF_REG_0);
@@ -6362,29 +6696,30 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
tname, PTR_ERR(ret));
return -EINVAL;
}
- regs[BPF_REG_0].type =
- fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID ?
- PTR_TO_MEM : PTR_TO_MEM_OR_NULL;
+ regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag;
regs[BPF_REG_0].mem_size = tsize;
} else {
- regs[BPF_REG_0].type =
- fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID ?
- PTR_TO_BTF_ID : PTR_TO_BTF_ID_OR_NULL;
+ /* MEM_RDONLY may be carried from ret_flag, but it
+ * doesn't apply on PTR_TO_BTF_ID. Fold it, otherwise
+ * it will confuse the check of PTR_TO_BTF_ID in
+ * check_mem_access().
+ */
+ ret_flag &= ~MEM_RDONLY;
+
+ regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag;
regs[BPF_REG_0].btf = meta.ret_btf;
regs[BPF_REG_0].btf_id = meta.ret_btf_id;
}
- } else if (fn->ret_type == RET_PTR_TO_BTF_ID_OR_NULL ||
- fn->ret_type == RET_PTR_TO_BTF_ID) {
+ } else if (base_type(ret_type) == RET_PTR_TO_BTF_ID) {
int ret_btf_id;
mark_reg_known_zero(env, regs, BPF_REG_0);
- regs[BPF_REG_0].type = fn->ret_type == RET_PTR_TO_BTF_ID ?
- PTR_TO_BTF_ID :
- PTR_TO_BTF_ID_OR_NULL;
+ regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag;
ret_btf_id = *fn->ret_btf_id;
if (ret_btf_id == 0) {
- verbose(env, "invalid return type %d of func %s#%d\n",
- fn->ret_type, func_id_name(func_id), func_id);
+ verbose(env, "invalid return type %u of func %s#%d\n",
+ base_type(ret_type), func_id_name(func_id),
+ func_id);
return -EINVAL;
}
/* current BPF helper definitions are only coming from
@@ -6393,12 +6728,12 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
regs[BPF_REG_0].btf = btf_vmlinux;
regs[BPF_REG_0].btf_id = ret_btf_id;
} else {
- verbose(env, "unknown return type %d of func %s#%d\n",
- fn->ret_type, func_id_name(func_id), func_id);
+ verbose(env, "unknown return type %u of func %s#%d\n",
+ base_type(ret_type), func_id_name(func_id), func_id);
return -EINVAL;
}
- if (reg_type_may_be_null(regs[BPF_REG_0].type))
+ if (type_may_be_null(regs[BPF_REG_0].type))
regs[BPF_REG_0].id = ++env->id_gen;
if (is_ptr_cast_function(func_id)) {
@@ -6485,23 +6820,33 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn)
struct bpf_reg_state *regs = cur_regs(env);
const char *func_name, *ptr_type_name;
u32 i, nargs, func_id, ptr_type_id;
+ struct module *btf_mod = NULL;
const struct btf_param *args;
+ struct btf *desc_btf;
int err;
+ /* skip for now, but return error when we find this in fixup_kfunc_call */
+ if (!insn->imm)
+ return 0;
+
+ desc_btf = find_kfunc_desc_btf(env, insn->imm, insn->off, &btf_mod);
+ if (IS_ERR(desc_btf))
+ return PTR_ERR(desc_btf);
+
func_id = insn->imm;
- func = btf_type_by_id(btf_vmlinux, func_id);
- func_name = btf_name_by_offset(btf_vmlinux, func->name_off);
- func_proto = btf_type_by_id(btf_vmlinux, func->type);
+ func = btf_type_by_id(desc_btf, func_id);
+ func_name = btf_name_by_offset(desc_btf, func->name_off);
+ func_proto = btf_type_by_id(desc_btf, func->type);
if (!env->ops->check_kfunc_call ||
- !env->ops->check_kfunc_call(func_id)) {
+ !env->ops->check_kfunc_call(func_id, btf_mod)) {
verbose(env, "calling kernel function %s is not allowed\n",
func_name);
return -EACCES;
}
/* Check the arguments */
- err = btf_check_kfunc_arg_match(env, btf_vmlinux, func_id, regs);
+ err = btf_check_kfunc_arg_match(env, desc_btf, func_id, regs);
if (err)
return err;
@@ -6509,15 +6854,15 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn)
mark_reg_not_init(env, regs, caller_saved[i]);
/* Check return type */
- t = btf_type_skip_modifiers(btf_vmlinux, func_proto->type, NULL);
+ t = btf_type_skip_modifiers(desc_btf, func_proto->type, NULL);
if (btf_type_is_scalar(t)) {
mark_reg_unknown(env, regs, BPF_REG_0);
mark_btf_func_reg_size(env, BPF_REG_0, t->size);
} else if (btf_type_is_ptr(t)) {
- ptr_type = btf_type_skip_modifiers(btf_vmlinux, t->type,
+ ptr_type = btf_type_skip_modifiers(desc_btf, t->type,
&ptr_type_id);
if (!btf_type_is_struct(ptr_type)) {
- ptr_type_name = btf_name_by_offset(btf_vmlinux,
+ ptr_type_name = btf_name_by_offset(desc_btf,
ptr_type->name_off);
verbose(env, "kernel function %s returns pointer type %s %s is not supported\n",
func_name, btf_type_str(ptr_type),
@@ -6525,7 +6870,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn)
return -EINVAL;
}
mark_reg_known_zero(env, regs, BPF_REG_0);
- regs[BPF_REG_0].btf = btf_vmlinux;
+ regs[BPF_REG_0].btf = desc_btf;
regs[BPF_REG_0].type = PTR_TO_BTF_ID;
regs[BPF_REG_0].btf_id = ptr_type_id;
mark_btf_func_reg_size(env, BPF_REG_0, sizeof(void *));
@@ -6536,7 +6881,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn)
for (i = 0; i < nargs; i++) {
u32 regno = i + 1;
- t = btf_type_skip_modifiers(btf_vmlinux, args[i].type, NULL);
+ t = btf_type_skip_modifiers(desc_btf, args[i].type, NULL);
if (btf_type_is_ptr(t))
mark_btf_func_reg_size(env, regno, sizeof(void *));
else
@@ -6597,25 +6942,25 @@ static bool check_reg_sane_offset(struct bpf_verifier_env *env,
if (known && (val >= BPF_MAX_VAR_OFF || val <= -BPF_MAX_VAR_OFF)) {
verbose(env, "math between %s pointer and %lld is not allowed\n",
- reg_type_str[type], val);
+ reg_type_str(env, type), val);
return false;
}
if (reg->off >= BPF_MAX_VAR_OFF || reg->off <= -BPF_MAX_VAR_OFF) {
verbose(env, "%s pointer offset %d is not allowed\n",
- reg_type_str[type], reg->off);
+ reg_type_str(env, type), reg->off);
return false;
}
if (smin == S64_MIN) {
verbose(env, "math between %s pointer and register with unbounded min value is not allowed\n",
- reg_type_str[type]);
+ reg_type_str(env, type));
return false;
}
if (smin >= BPF_MAX_VAR_OFF || smin <= -BPF_MAX_VAR_OFF) {
verbose(env, "value %lld makes %s pointer be out of bounds\n",
- smin, reg_type_str[type]);
+ smin, reg_type_str(env, type));
return false;
}
@@ -6992,11 +7337,13 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
return -EACCES;
}
- switch (ptr_reg->type) {
- case PTR_TO_MAP_VALUE_OR_NULL:
+ if (ptr_reg->type & PTR_MAYBE_NULL) {
verbose(env, "R%d pointer arithmetic on %s prohibited, null-check it first\n",
- dst, reg_type_str[ptr_reg->type]);
+ dst, reg_type_str(env, ptr_reg->type));
return -EACCES;
+ }
+
+ switch (base_type(ptr_reg->type)) {
case CONST_PTR_TO_MAP:
/* smin_val represents the known value */
if (known && smin_val == 0 && opcode == BPF_ADD)
@@ -7004,14 +7351,11 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
fallthrough;
case PTR_TO_PACKET_END:
case PTR_TO_SOCKET:
- case PTR_TO_SOCKET_OR_NULL:
case PTR_TO_SOCK_COMMON:
- case PTR_TO_SOCK_COMMON_OR_NULL:
case PTR_TO_TCP_SOCK:
- case PTR_TO_TCP_SOCK_OR_NULL:
case PTR_TO_XDP_SOCK:
verbose(env, "R%d pointer arithmetic on %s prohibited\n",
- dst, reg_type_str[ptr_reg->type]);
+ dst, reg_type_str(env, ptr_reg->type));
return -EACCES;
default:
break;
@@ -7984,12 +8328,12 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
/* Got here implies adding two SCALAR_VALUEs */
if (WARN_ON_ONCE(ptr_reg)) {
- print_verifier_state(env, state);
+ print_verifier_state(env, state, true);
verbose(env, "verifier internal error: unexpected ptr_reg\n");
return -EINVAL;
}
if (WARN_ON(!src_reg)) {
- print_verifier_state(env, state);
+ print_verifier_state(env, state, true);
verbose(env, "verifier internal error: no src_reg\n");
return -EINVAL;
}
@@ -8098,6 +8442,10 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
insn->dst_reg);
}
zext_32_to_64(dst_reg);
+
+ __update_reg_bounds(dst_reg);
+ __reg_deduce_bounds(dst_reg);
+ __reg_bound_offset(dst_reg);
}
} else {
/* case: R = imm
@@ -8212,7 +8560,7 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,
new_range = dst_reg->off;
if (range_right_open)
- new_range--;
+ new_range++;
/* Examples for register markings:
*
@@ -8730,17 +9078,17 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,
struct bpf_reg_state *reg, u32 id,
bool is_null)
{
- if (reg_type_may_be_null(reg->type) && reg->id == id &&
+ if (type_may_be_null(reg->type) && reg->id == id &&
!WARN_ON_ONCE(!reg->id)) {
- /* Old offset (both fixed and variable parts) should
- * have been known-zero, because we don't allow pointer
- * arithmetic on pointers that might be NULL.
- */
if (WARN_ON_ONCE(reg->smin_value || reg->smax_value ||
!tnum_equals_const(reg->var_off, 0) ||
reg->off)) {
- __mark_reg_known_zero(reg);
- reg->off = 0;
+ /* Old offset (both fixed and variable parts) should
+ * have been known-zero, because we don't allow pointer
+ * arithmetic on pointers that might be NULL. If we
+ * see this happening, don't convert the register.
+ */
+ return;
}
if (is_null) {
reg->type = SCALAR_VALUE;
@@ -9108,7 +9456,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
*/
if (!is_jmp32 && BPF_SRC(insn->code) == BPF_K &&
insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) &&
- reg_type_may_be_null(dst_reg->type)) {
+ type_may_be_null(dst_reg->type)) {
/* Mark all identical registers in each branch as either
* safe or unknown depending R == 0 or R != 0 conditional.
*/
@@ -9124,7 +9472,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
return -EACCES;
}
if (env->log.level & BPF_LOG_LEVEL)
- print_verifier_state(env, this_branch->frame[this_branch->curframe]);
+ print_insn_state(env, this_branch->frame[this_branch->curframe]);
return 0;
}
@@ -9163,7 +9511,7 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
mark_reg_known_zero(env, regs, insn->dst_reg);
dst_reg->type = aux->btf_var.reg_type;
- switch (dst_reg->type) {
+ switch (base_type(dst_reg->type)) {
case PTR_TO_MEM:
dst_reg->mem_size = aux->btf_var.mem_size;
break;
@@ -9181,7 +9529,8 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
if (insn->src_reg == BPF_PSEUDO_FUNC) {
struct bpf_prog_aux *aux = env->prog->aux;
- u32 subprogno = insn[1].imm;
+ u32 subprogno = find_subprog(env,
+ env->insn_idx + insn->imm + 1);
if (!aux->func_info) {
verbose(env, "missing btf func_info\n");
@@ -9361,7 +9710,7 @@ static int check_return_code(struct bpf_verifier_env *env)
/* enforce return zero from async callbacks like timer */
if (reg->type != SCALAR_VALUE) {
verbose(env, "In async callback the register R0 is not a known value (%s)\n",
- reg_type_str[reg->type]);
+ reg_type_str(env, reg->type));
return -EINVAL;
}
@@ -9375,7 +9724,7 @@ static int check_return_code(struct bpf_verifier_env *env)
if (is_subprog) {
if (reg->type != SCALAR_VALUE) {
verbose(env, "At subprogram exit the register R0 is not a scalar value (%s)\n",
- reg_type_str[reg->type]);
+ reg_type_str(env, reg->type));
return -EINVAL;
}
return 0;
@@ -9439,7 +9788,7 @@ static int check_return_code(struct bpf_verifier_env *env)
if (reg->type != SCALAR_VALUE) {
verbose(env, "At program exit the register R0 is not a known value (%s)\n",
- reg_type_str[reg->type]);
+ reg_type_str(env, reg->type));
return -EINVAL;
}
@@ -9912,6 +10261,8 @@ static int check_btf_line(struct bpf_verifier_env *env,
nr_linfo = attr->line_info_cnt;
if (!nr_linfo)
return 0;
+ if (nr_linfo > INT_MAX / sizeof(struct bpf_line_info))
+ return -EINVAL;
rec_size = attr->line_info_rec_size;
if (rec_size < MIN_BPF_LINEINFO_SIZE ||
@@ -10020,6 +10371,78 @@ err_free:
return err;
}
+#define MIN_CORE_RELO_SIZE sizeof(struct bpf_core_relo)
+#define MAX_CORE_RELO_SIZE MAX_FUNCINFO_REC_SIZE
+
+static int check_core_relo(struct bpf_verifier_env *env,
+ const union bpf_attr *attr,
+ bpfptr_t uattr)
+{
+ u32 i, nr_core_relo, ncopy, expected_size, rec_size;
+ struct bpf_core_relo core_relo = {};
+ struct bpf_prog *prog = env->prog;
+ const struct btf *btf = prog->aux->btf;
+ struct bpf_core_ctx ctx = {
+ .log = &env->log,
+ .btf = btf,
+ };
+ bpfptr_t u_core_relo;
+ int err;
+
+ nr_core_relo = attr->core_relo_cnt;
+ if (!nr_core_relo)
+ return 0;
+ if (nr_core_relo > INT_MAX / sizeof(struct bpf_core_relo))
+ return -EINVAL;
+
+ rec_size = attr->core_relo_rec_size;
+ if (rec_size < MIN_CORE_RELO_SIZE ||
+ rec_size > MAX_CORE_RELO_SIZE ||
+ rec_size % sizeof(u32))
+ return -EINVAL;
+
+ u_core_relo = make_bpfptr(attr->core_relos, uattr.is_kernel);
+ expected_size = sizeof(struct bpf_core_relo);
+ ncopy = min_t(u32, expected_size, rec_size);
+
+ /* Unlike func_info and line_info, copy and apply each CO-RE
+ * relocation record one at a time.
+ */
+ for (i = 0; i < nr_core_relo; i++) {
+ /* future proofing when sizeof(bpf_core_relo) changes */
+ err = bpf_check_uarg_tail_zero(u_core_relo, expected_size, rec_size);
+ if (err) {
+ if (err == -E2BIG) {
+ verbose(env, "nonzero tailing record in core_relo");
+ if (copy_to_bpfptr_offset(uattr,
+ offsetof(union bpf_attr, core_relo_rec_size),
+ &expected_size, sizeof(expected_size)))
+ err = -EFAULT;
+ }
+ break;
+ }
+
+ if (copy_from_bpfptr(&core_relo, u_core_relo, ncopy)) {
+ err = -EFAULT;
+ break;
+ }
+
+ if (core_relo.insn_off % 8 || core_relo.insn_off / 8 >= prog->len) {
+ verbose(env, "Invalid core_relo[%u].insn_off:%u prog->len:%u\n",
+ i, core_relo.insn_off, prog->len);
+ err = -EINVAL;
+ break;
+ }
+
+ err = bpf_core_apply(&ctx, &core_relo, i,
+ &prog->insnsi[core_relo.insn_off / 8]);
+ if (err)
+ break;
+ bpfptr_add(&u_core_relo, rec_size);
+ }
+ return err;
+}
+
static int check_btf_info(struct bpf_verifier_env *env,
const union bpf_attr *attr,
bpfptr_t uattr)
@@ -10050,6 +10473,10 @@ static int check_btf_info(struct bpf_verifier_env *env,
if (err)
return err;
+ err = check_core_relo(env, attr, uattr);
+ if (err)
+ return err;
+
return 0;
}
@@ -10218,7 +10645,7 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
return true;
if (rcur->type == NOT_INIT)
return false;
- switch (rold->type) {
+ switch (base_type(rold->type)) {
case SCALAR_VALUE:
if (env->explore_alu_limits)
return false;
@@ -10240,6 +10667,22 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
}
case PTR_TO_MAP_KEY:
case PTR_TO_MAP_VALUE:
+ /* a PTR_TO_MAP_VALUE could be safe to use as a
+ * PTR_TO_MAP_VALUE_OR_NULL into the same map.
+ * However, if the old PTR_TO_MAP_VALUE_OR_NULL then got NULL-
+ * checked, doing so could have affected others with the same
+ * id, and we can't check for that because we lost the id when
+ * we converted to a PTR_TO_MAP_VALUE.
+ */
+ if (type_may_be_null(rold->type)) {
+ if (!type_may_be_null(rcur->type))
+ return false;
+ if (memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)))
+ return false;
+ /* Check our ids match any regs they're supposed to */
+ return check_ids(rold->id, rcur->id, idmap);
+ }
+
/* If the new min/max/var_off satisfy the old ones and
* everything else matches, we are OK.
* 'id' is not compared, since it's only used for maps with
@@ -10251,20 +10694,6 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 &&
range_within(rold, rcur) &&
tnum_in(rold->var_off, rcur->var_off);
- case PTR_TO_MAP_VALUE_OR_NULL:
- /* a PTR_TO_MAP_VALUE could be safe to use as a
- * PTR_TO_MAP_VALUE_OR_NULL into the same map.
- * However, if the old PTR_TO_MAP_VALUE_OR_NULL then got NULL-
- * checked, doing so could have affected others with the same
- * id, and we can't check for that because we lost the id when
- * we converted to a PTR_TO_MAP_VALUE.
- */
- if (rcur->type != PTR_TO_MAP_VALUE_OR_NULL)
- return false;
- if (memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)))
- return false;
- /* Check our ids match any regs they're supposed to */
- return check_ids(rold->id, rcur->id, idmap);
case PTR_TO_PACKET_META:
case PTR_TO_PACKET:
if (rcur->type != rold->type)
@@ -10293,11 +10722,8 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
case PTR_TO_PACKET_END:
case PTR_TO_FLOW_KEYS:
case PTR_TO_SOCKET:
- case PTR_TO_SOCKET_OR_NULL:
case PTR_TO_SOCK_COMMON:
- case PTR_TO_SOCK_COMMON_OR_NULL:
case PTR_TO_TCP_SOCK:
- case PTR_TO_TCP_SOCK_OR_NULL:
case PTR_TO_XDP_SOCK:
/* Only valid matches are exact, which memcmp() above
* would have accepted
@@ -10354,9 +10780,9 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
* return false to continue verification of this path
*/
return false;
- if (i % BPF_REG_SIZE)
+ if (i % BPF_REG_SIZE != BPF_REG_SIZE - 1)
continue;
- if (old->stack[spi].slot_type[0] != STACK_SPILL)
+ if (!is_spilled_reg(&old->stack[spi]))
continue;
if (!regsafe(env, &old->stack[spi].spilled_ptr,
&cur->stack[spi].spilled_ptr, idmap))
@@ -10563,7 +10989,7 @@ static int propagate_precision(struct bpf_verifier_env *env,
}
for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
- if (state->stack[i].slot_type[0] != STACK_SPILL)
+ if (!is_spilled_reg(&state->stack[i]))
continue;
state_reg = &state->stack[i].spilled_ptr;
if (state_reg->type != SCALAR_VALUE ||
@@ -10823,17 +11249,13 @@ next:
/* Return true if it's OK to have the same insn return a different type. */
static bool reg_type_mismatch_ok(enum bpf_reg_type type)
{
- switch (type) {
+ switch (base_type(type)) {
case PTR_TO_CTX:
case PTR_TO_SOCKET:
- case PTR_TO_SOCKET_OR_NULL:
case PTR_TO_SOCK_COMMON:
- case PTR_TO_SOCK_COMMON_OR_NULL:
case PTR_TO_TCP_SOCK:
- case PTR_TO_TCP_SOCK_OR_NULL:
case PTR_TO_XDP_SOCK:
case PTR_TO_BTF_ID:
- case PTR_TO_BTF_ID_OR_NULL:
return false;
default:
return true;
@@ -10913,16 +11335,12 @@ static int do_check(struct bpf_verifier_env *env)
if (need_resched())
cond_resched();
- if (env->log.level & BPF_LOG_LEVEL2 ||
- (env->log.level & BPF_LOG_LEVEL && do_print_state)) {
- if (env->log.level & BPF_LOG_LEVEL2)
- verbose(env, "%d:", env->insn_idx);
- else
- verbose(env, "\nfrom %d to %d%s:",
- env->prev_insn_idx, env->insn_idx,
- env->cur_state->speculative ?
- " (speculative execution)" : "");
- print_verifier_state(env, state->frame[state->curframe]);
+ if (env->log.level & BPF_LOG_LEVEL2 && do_print_state) {
+ verbose(env, "\nfrom %d to %d%s:",
+ env->prev_insn_idx, env->insn_idx,
+ env->cur_state->speculative ?
+ " (speculative execution)" : "");
+ print_verifier_state(env, state->frame[state->curframe], true);
do_print_state = false;
}
@@ -10933,9 +11351,15 @@ static int do_check(struct bpf_verifier_env *env)
.private_data = env,
};
+ if (verifier_state_scratched(env))
+ print_insn_state(env, state->frame[state->curframe]);
+
verbose_linfo(env, env->insn_idx, "; ");
+ env->prev_log_len = env->log.len_used;
verbose(env, "%d: ", env->insn_idx);
print_bpf_insn(&cbs, insn, env->allow_ptr_leaks);
+ env->prev_insn_print_len = env->log.len_used - env->prev_log_len;
+ env->prev_log_len = env->log.len_used;
}
if (bpf_prog_is_dev_bound(env->prog->aux)) {
@@ -11057,7 +11481,7 @@ static int do_check(struct bpf_verifier_env *env)
if (is_ctx_reg(env, insn->dst_reg)) {
verbose(env, "BPF_ST stores into R%d %s is not allowed\n",
insn->dst_reg,
- reg_type_str[reg_state(env, insn->dst_reg)->type]);
+ reg_type_str(env, reg_state(env, insn->dst_reg)->type));
return -EACCES;
}
@@ -11074,7 +11498,8 @@ static int do_check(struct bpf_verifier_env *env)
env->jmps_processed++;
if (opcode == BPF_CALL) {
if (BPF_SRC(insn->code) != BPF_K ||
- insn->off != 0 ||
+ (insn->src_reg != BPF_PSEUDO_KFUNC_CALL
+ && insn->off != 0) ||
(insn->src_reg != BPF_REG_0 &&
insn->src_reg != BPF_PSEUDO_CALL &&
insn->src_reg != BPF_PSEUDO_KFUNC_CALL) ||
@@ -11143,6 +11568,7 @@ static int do_check(struct bpf_verifier_env *env)
if (err)
return err;
process_bpf_exit:
+ mark_verifier_state_scratched(env);
update_branch_counts(env, env->cur_state);
err = pop_stack(env, &prev_insn_idx,
&env->insn_idx, pop_log);
@@ -11308,7 +11734,7 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env,
err = -EINVAL;
goto err_put;
}
- aux->btf_var.reg_type = PTR_TO_MEM;
+ aux->btf_var.reg_type = PTR_TO_MEM | MEM_RDONLY;
aux->btf_var.mem_size = tsize;
} else {
aux->btf_var.reg_type = PTR_TO_BTF_ID;
@@ -11433,6 +11859,13 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
}
}
+ if (map_value_has_timer(map)) {
+ if (is_tracing_prog_type(prog_type)) {
+ verbose(env, "tracing progs cannot use bpf_timer yet\n");
+ return -EINVAL;
+ }
+ }
+
if ((bpf_prog_is_dev_bound(prog->aux) || bpf_map_is_dev_bound(map)) &&
!bpf_offload_prog_map_match(prog, map)) {
verbose(env, "offload device mismatch between prog and map\n");
@@ -11461,6 +11894,9 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
}
break;
case BPF_MAP_TYPE_RINGBUF:
+ case BPF_MAP_TYPE_INODE_STORAGE:
+ case BPF_MAP_TYPE_SK_STORAGE:
+ case BPF_MAP_TYPE_TASK_STORAGE:
break;
default:
verbose(env,
@@ -12348,14 +12784,9 @@ static int jit_subprogs(struct bpf_verifier_env *env)
return 0;
for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
- if (bpf_pseudo_func(insn)) {
- env->insn_aux_data[i].call_imm = insn->imm;
- /* subprog is encoded in insn[1].imm */
+ if (!bpf_pseudo_func(insn) && !bpf_pseudo_call(insn))
continue;
- }
- if (!bpf_pseudo_call(insn))
- continue;
/* Upon error here we cannot fall back to interpreter but
* need a hard reject of the program. Thus -EFAULT is
* propagated in any case.
@@ -12376,6 +12807,12 @@ static int jit_subprogs(struct bpf_verifier_env *env)
env->insn_aux_data[i].call_imm = insn->imm;
/* point imm to __bpf_call_base+1 from JITs point of view */
insn->imm = 1;
+ if (bpf_pseudo_func(insn))
+ /* jit (e.g. x86_64) may emit fewer instructions
+ * if it learns a u32 imm is the same as a u64 imm.
+ * Force a non zero here.
+ */
+ insn[1].imm = 1;
}
err = bpf_prog_alloc_jited_linfo(prog);
@@ -12430,6 +12867,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
func[i]->aux->stack_depth = env->subprog_info[i].stack_depth;
func[i]->jit_requested = 1;
func[i]->aux->kfunc_tab = prog->aux->kfunc_tab;
+ func[i]->aux->kfunc_btf_tab = prog->aux->kfunc_btf_tab;
func[i]->aux->linfo = prog->aux->linfo;
func[i]->aux->nr_linfo = prog->aux->nr_linfo;
func[i]->aux->jited_linfo = prog->aux->jited_linfo;
@@ -12459,7 +12897,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
insn = func[i]->insnsi;
for (j = 0; j < func[i]->len; j++, insn++) {
if (bpf_pseudo_func(insn)) {
- subprog = insn[1].imm;
+ subprog = insn->off;
insn[0].imm = (u32)(long)func[subprog]->bpf_func;
insn[1].imm = ((u64)(long)func[subprog]->bpf_func) >> 32;
continue;
@@ -12467,8 +12905,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
if (!bpf_pseudo_call(insn))
continue;
subprog = insn->off;
- insn->imm = BPF_CAST_CALL(func[subprog]->bpf_func) -
- __bpf_call_base;
+ insn->imm = BPF_CALL_IMM(func[subprog]->bpf_func);
}
/* we use the aux data to keep a list of the start addresses
@@ -12511,7 +12948,8 @@ static int jit_subprogs(struct bpf_verifier_env *env)
for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
if (bpf_pseudo_func(insn)) {
insn[0].imm = env->insn_aux_data[i].call_imm;
- insn[1].imm = find_subprog(env, i + insn[0].imm + 1);
+ insn[1].imm = insn->off;
+ insn->off = 0;
continue;
}
if (!bpf_pseudo_call(insn))
@@ -12616,10 +13054,15 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env,
{
const struct bpf_kfunc_desc *desc;
+ if (!insn->imm) {
+ verbose(env, "invalid kernel function call not eliminated in verifier pass\n");
+ return -EINVAL;
+ }
+
/* insn->imm has the btf func_id. Replace it with
* an address (relative to __bpf_base_call).
*/
- desc = find_kfunc_desc(env->prog, insn->imm);
+ desc = find_kfunc_desc(env->prog, insn->imm, insn->off);
if (!desc) {
verbose(env, "verifier internal error: kernel function descriptor not found for func_id %u\n",
insn->imm);
@@ -12637,6 +13080,7 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env,
static int do_misc_fixups(struct bpf_verifier_env *env)
{
struct bpf_prog *prog = env->prog;
+ enum bpf_attach_type eatype = prog->expected_attach_type;
bool expect_blinding = bpf_jit_blinding_enabled(prog);
enum bpf_prog_type prog_type = resolve_prog_type(prog);
struct bpf_insn *insn = prog->insnsi;
@@ -12900,7 +13344,8 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
insn->imm == BPF_FUNC_map_push_elem ||
insn->imm == BPF_FUNC_map_pop_elem ||
insn->imm == BPF_FUNC_map_peek_elem ||
- insn->imm == BPF_FUNC_redirect_map)) {
+ insn->imm == BPF_FUNC_redirect_map ||
+ insn->imm == BPF_FUNC_for_each_map_elem)) {
aux = &env->insn_aux_data[i + delta];
if (bpf_map_ptr_poisoned(aux))
goto patch_call_imm;
@@ -12944,36 +13389,37 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
(int (*)(struct bpf_map *map, void *value))NULL));
BUILD_BUG_ON(!__same_type(ops->map_redirect,
(int (*)(struct bpf_map *map, u32 ifindex, u64 flags))NULL));
+ BUILD_BUG_ON(!__same_type(ops->map_for_each_callback,
+ (int (*)(struct bpf_map *map,
+ bpf_callback_t callback_fn,
+ void *callback_ctx,
+ u64 flags))NULL));
patch_map_ops_generic:
switch (insn->imm) {
case BPF_FUNC_map_lookup_elem:
- insn->imm = BPF_CAST_CALL(ops->map_lookup_elem) -
- __bpf_call_base;
+ insn->imm = BPF_CALL_IMM(ops->map_lookup_elem);
continue;
case BPF_FUNC_map_update_elem:
- insn->imm = BPF_CAST_CALL(ops->map_update_elem) -
- __bpf_call_base;
+ insn->imm = BPF_CALL_IMM(ops->map_update_elem);
continue;
case BPF_FUNC_map_delete_elem:
- insn->imm = BPF_CAST_CALL(ops->map_delete_elem) -
- __bpf_call_base;
+ insn->imm = BPF_CALL_IMM(ops->map_delete_elem);
continue;
case BPF_FUNC_map_push_elem:
- insn->imm = BPF_CAST_CALL(ops->map_push_elem) -
- __bpf_call_base;
+ insn->imm = BPF_CALL_IMM(ops->map_push_elem);
continue;
case BPF_FUNC_map_pop_elem:
- insn->imm = BPF_CAST_CALL(ops->map_pop_elem) -
- __bpf_call_base;
+ insn->imm = BPF_CALL_IMM(ops->map_pop_elem);
continue;
case BPF_FUNC_map_peek_elem:
- insn->imm = BPF_CAST_CALL(ops->map_peek_elem) -
- __bpf_call_base;
+ insn->imm = BPF_CALL_IMM(ops->map_peek_elem);
continue;
case BPF_FUNC_redirect_map:
- insn->imm = BPF_CAST_CALL(ops->map_redirect) -
- __bpf_call_base;
+ insn->imm = BPF_CALL_IMM(ops->map_redirect);
+ continue;
+ case BPF_FUNC_for_each_map_elem:
+ insn->imm = BPF_CALL_IMM(ops->map_for_each_callback);
continue;
}
@@ -13005,11 +13451,79 @@ patch_map_ops_generic:
continue;
}
+ /* Implement bpf_get_func_arg inline. */
+ if (prog_type == BPF_PROG_TYPE_TRACING &&
+ insn->imm == BPF_FUNC_get_func_arg) {
+ /* Load nr_args from ctx - 8 */
+ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8);
+ insn_buf[1] = BPF_JMP32_REG(BPF_JGE, BPF_REG_2, BPF_REG_0, 6);
+ insn_buf[2] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 3);
+ insn_buf[3] = BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_1);
+ insn_buf[4] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_2, 0);
+ insn_buf[5] = BPF_STX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, 0);
+ insn_buf[6] = BPF_MOV64_IMM(BPF_REG_0, 0);
+ insn_buf[7] = BPF_JMP_A(1);
+ insn_buf[8] = BPF_MOV64_IMM(BPF_REG_0, -EINVAL);
+ cnt = 9;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ continue;
+ }
+
+ /* Implement bpf_get_func_ret inline. */
+ if (prog_type == BPF_PROG_TYPE_TRACING &&
+ insn->imm == BPF_FUNC_get_func_ret) {
+ if (eatype == BPF_TRACE_FEXIT ||
+ eatype == BPF_MODIFY_RETURN) {
+ /* Load nr_args from ctx - 8 */
+ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8);
+ insn_buf[1] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_0, 3);
+ insn_buf[2] = BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1);
+ insn_buf[3] = BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, 0);
+ insn_buf[4] = BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_3, 0);
+ insn_buf[5] = BPF_MOV64_IMM(BPF_REG_0, 0);
+ cnt = 6;
+ } else {
+ insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, -EOPNOTSUPP);
+ cnt = 1;
+ }
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ continue;
+ }
+
+ /* Implement get_func_arg_cnt inline. */
+ if (prog_type == BPF_PROG_TYPE_TRACING &&
+ insn->imm == BPF_FUNC_get_func_arg_cnt) {
+ /* Load nr_args from ctx - 8 */
+ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8);
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, 1);
+ if (!new_prog)
+ return -ENOMEM;
+
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ continue;
+ }
+
/* Implement bpf_get_func_ip inline. */
if (prog_type == BPF_PROG_TYPE_TRACING &&
insn->imm == BPF_FUNC_get_func_ip) {
- /* Load IP address from ctx - 8 */
- insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8);
+ /* Load IP address from ctx - 16 */
+ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -16);
new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, 1);
if (!new_prog)
@@ -13123,7 +13637,7 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
mark_reg_known_zero(env, regs, i);
else if (regs[i].type == SCALAR_VALUE)
mark_reg_unknown(env, regs, i);
- else if (regs[i].type == PTR_TO_MEM_OR_NULL) {
+ else if (base_type(regs[i].type) == PTR_TO_MEM) {
const u32 mem_size = regs[i].mem_size;
mark_reg_known_zero(env, regs, i);
@@ -13317,7 +13831,7 @@ BTF_SET_START(btf_non_sleepable_error_inject)
/* Three functions below can be called from sleepable and non-sleepable context.
* Assume non-sleepable from bpf safety point of view.
*/
-BTF_ID(func, __add_to_page_cache_locked)
+BTF_ID(func, __filemap_add_folio)
BTF_ID(func, should_fail_alloc_page)
BTF_ID(func, should_failslab)
BTF_SET_END(btf_non_sleepable_error_inject)
@@ -13711,13 +14225,15 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr)
log->ubuf = (char __user *) (unsigned long) attr->log_buf;
log->len_total = attr->log_size;
- ret = -EINVAL;
/* log attributes have to be sane */
- if (log->len_total < 128 || log->len_total > UINT_MAX >> 2 ||
- !log->level || !log->ubuf || log->level & ~BPF_LOG_MASK)
+ if (!bpf_verifier_log_attr_valid(log)) {
+ ret = -EINVAL;
goto err_unlock;
+ }
}
+ mark_verifier_state_clean(env);
+
if (IS_ERR(btf_vmlinux)) {
/* Either gcc or pahole or kernel are broken. */
verbose(env, "in-kernel BTF is malformed\n");
@@ -13824,6 +14340,7 @@ skip_full_check:
env->verification_time = ktime_get_ns() - start_time;
print_verification_stats(env);
+ env->prog->aux->verified_insns = env->insn_processed;
if (log->level && bpf_verifier_log_full(log))
ret = -ENOSPC;
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index bfbeabc17a9d..6e36e854b512 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -65,6 +65,25 @@ static inline struct cgroup_fs_context *cgroup_fc2context(struct fs_context *fc)
return container_of(kfc, struct cgroup_fs_context, kfc);
}
+struct cgroup_pidlist;
+
+struct cgroup_file_ctx {
+ struct cgroup_namespace *ns;
+
+ struct {
+ void *trigger;
+ } psi;
+
+ struct {
+ bool started;
+ struct css_task_iter iter;
+ } procs;
+
+ struct {
+ struct cgroup_pidlist *pidlist;
+ } procs1;
+};
+
/*
* A cgroup can be associated with multiple css_sets as different tasks may
* belong to different cgroups on different hierarchies. In the other
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 35b920328344..41e0837a5a0b 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -63,9 +63,6 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
for_each_root(root) {
struct cgroup *from_cgrp;
- if (root == &cgrp_dfl_root)
- continue;
-
spin_lock_irq(&css_set_lock);
from_cgrp = task_cgroup_from_root(from, root);
spin_unlock_irq(&css_set_lock);
@@ -397,6 +394,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
* next pid to display, if any
*/
struct kernfs_open_file *of = s->private;
+ struct cgroup_file_ctx *ctx = of->priv;
struct cgroup *cgrp = seq_css(s)->cgroup;
struct cgroup_pidlist *l;
enum cgroup_filetype type = seq_cft(s)->private;
@@ -406,25 +404,24 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
mutex_lock(&cgrp->pidlist_mutex);
/*
- * !NULL @of->priv indicates that this isn't the first start()
- * after open. If the matching pidlist is around, we can use that.
- * Look for it. Note that @of->priv can't be used directly. It
- * could already have been destroyed.
+ * !NULL @ctx->procs1.pidlist indicates that this isn't the first
+ * start() after open. If the matching pidlist is around, we can use
+ * that. Look for it. Note that @ctx->procs1.pidlist can't be used
+ * directly. It could already have been destroyed.
*/
- if (of->priv)
- of->priv = cgroup_pidlist_find(cgrp, type);
+ if (ctx->procs1.pidlist)
+ ctx->procs1.pidlist = cgroup_pidlist_find(cgrp, type);
/*
* Either this is the first start() after open or the matching
* pidlist has been destroyed inbetween. Create a new one.
*/
- if (!of->priv) {
- ret = pidlist_array_load(cgrp, type,
- (struct cgroup_pidlist **)&of->priv);
+ if (!ctx->procs1.pidlist) {
+ ret = pidlist_array_load(cgrp, type, &ctx->procs1.pidlist);
if (ret)
return ERR_PTR(ret);
}
- l = of->priv;
+ l = ctx->procs1.pidlist;
if (pid) {
int end = l->length;
@@ -452,7 +449,8 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
static void cgroup_pidlist_stop(struct seq_file *s, void *v)
{
struct kernfs_open_file *of = s->private;
- struct cgroup_pidlist *l = of->priv;
+ struct cgroup_file_ctx *ctx = of->priv;
+ struct cgroup_pidlist *l = ctx->procs1.pidlist;
if (l)
mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
@@ -463,7 +461,8 @@ static void cgroup_pidlist_stop(struct seq_file *s, void *v)
static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
{
struct kernfs_open_file *of = s->private;
- struct cgroup_pidlist *l = of->priv;
+ struct cgroup_file_ctx *ctx = of->priv;
+ struct cgroup_pidlist *l = ctx->procs1.pidlist;
pid_t *p = v;
pid_t *end = l->list + l->length;
/*
@@ -507,10 +506,11 @@ static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of,
goto out_unlock;
/*
- * Even if we're attaching all tasks in the thread group, we only
- * need to check permissions on one of them.
+ * Even if we're attaching all tasks in the thread group, we only need
+ * to check permissions on one of them. Check permissions using the
+ * credentials from file open to protect against inherited fd attacks.
*/
- cred = current_cred();
+ cred = of->file->f_cred;
tcred = get_task_cred(task);
if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
!uid_eq(cred->euid, tcred->uid) &&
@@ -662,11 +662,9 @@ int proc_cgroupstats_show(struct seq_file *m, void *v)
seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
/*
- * ideally we don't want subsystems moving around while we do this.
- * cgroup_mutex is also necessary to guarantee an atomic snapshot of
- * subsys/hierarchy state.
+ * Grab the subsystems state racily. No need to add avenue to
+ * cgroup_mutex contention.
*/
- mutex_lock(&cgroup_mutex);
for_each_subsys(ss, i)
seq_printf(m, "%s\t%d\t%d\t%d\n",
@@ -674,7 +672,6 @@ int proc_cgroupstats_show(struct seq_file *m, void *v)
atomic_read(&ss->root->nr_cgrps),
cgroup_ssid_enabled(i));
- mutex_unlock(&cgroup_mutex);
return 0;
}
@@ -701,8 +698,6 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
kernfs_type(kn) != KERNFS_DIR)
return -EINVAL;
- mutex_lock(&cgroup_mutex);
-
/*
* We aren't being called from kernfs and there's no guarantee on
* @kn->priv's validity. For this and css_tryget_online_from_dir(),
@@ -710,9 +705,8 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
*/
rcu_read_lock();
cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
- if (!cgrp || cgroup_is_dead(cgrp)) {
+ if (!cgrp || !cgroup_tryget(cgrp)) {
rcu_read_unlock();
- mutex_unlock(&cgroup_mutex);
return -ENOENT;
}
rcu_read_unlock();
@@ -740,7 +734,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
}
css_task_iter_end(&it);
- mutex_unlock(&cgroup_mutex);
+ cgroup_put(cgrp);
return 0;
}
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 881ce1470beb..b31e1465868a 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -30,6 +30,7 @@
#include "cgroup-internal.h"
+#include <linux/bpf-cgroup.h>
#include <linux/cred.h>
#include <linux/errno.h>
#include <linux/init_task.h>
@@ -1740,6 +1741,7 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
struct cgroup *dcgrp = &dst_root->cgrp;
struct cgroup_subsys *ss;
int ssid, i, ret;
+ u16 dfl_disable_ss_mask = 0;
lockdep_assert_held(&cgroup_mutex);
@@ -1756,8 +1758,28 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
/* can't move between two non-dummy roots either */
if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root)
return -EBUSY;
+
+ /*
+ * Collect ssid's that need to be disabled from default
+ * hierarchy.
+ */
+ if (ss->root == &cgrp_dfl_root)
+ dfl_disable_ss_mask |= 1 << ssid;
+
} while_each_subsys_mask();
+ if (dfl_disable_ss_mask) {
+ struct cgroup *scgrp = &cgrp_dfl_root.cgrp;
+
+ /*
+ * Controllers from default hierarchy that need to be rebound
+ * are all disabled together in one go.
+ */
+ cgrp_dfl_root.subsys_mask &= ~dfl_disable_ss_mask;
+ WARN_ON(cgroup_apply_control(scgrp));
+ cgroup_finalize_control(scgrp, 0);
+ }
+
do_each_subsys_mask(ss, ssid, ss_mask) {
struct cgroup_root *src_root = ss->root;
struct cgroup *scgrp = &src_root->cgrp;
@@ -1766,10 +1788,12 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
WARN_ON(!css || cgroup_css(dcgrp, ss));
- /* disable from the source */
- src_root->subsys_mask &= ~(1 << ssid);
- WARN_ON(cgroup_apply_control(scgrp));
- cgroup_finalize_control(scgrp, 0);
+ if (src_root != &cgrp_dfl_root) {
+ /* disable from the source */
+ src_root->subsys_mask &= ~(1 << ssid);
+ WARN_ON(cgroup_apply_control(scgrp));
+ cgroup_finalize_control(scgrp, 0);
+ }
/* rebind */
RCU_INIT_POINTER(scgrp->subsys[ssid], NULL);
@@ -2187,8 +2211,10 @@ static void cgroup_kill_sb(struct super_block *sb)
* And don't kill the default root.
*/
if (list_empty(&root->cgrp.self.children) && root != &cgrp_dfl_root &&
- !percpu_ref_is_dying(&root->cgrp.self.refcnt))
+ !percpu_ref_is_dying(&root->cgrp.self.refcnt)) {
+ cgroup_bpf_offline(&root->cgrp);
percpu_ref_kill(&root->cgrp.self.refcnt);
+ }
cgroup_put(&root->cgrp);
kernfs_kill_sb(sb);
}
@@ -2625,11 +2651,11 @@ void cgroup_migrate_add_src(struct css_set *src_cset,
if (src_cset->dead)
return;
- src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
-
if (!list_empty(&src_cset->mg_preload_node))
return;
+ src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
+
WARN_ON(src_cset->mg_src_cgrp);
WARN_ON(src_cset->mg_dst_cgrp);
WARN_ON(!list_empty(&src_cset->mg_tasks));
@@ -3605,6 +3631,7 @@ static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
size_t nbytes, enum psi_res res)
{
+ struct cgroup_file_ctx *ctx = of->priv;
struct psi_trigger *new;
struct cgroup *cgrp;
struct psi_group *psi;
@@ -3623,7 +3650,7 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
return PTR_ERR(new);
}
- psi_trigger_replace(&of->priv, new);
+ psi_trigger_replace(&ctx->psi.trigger, new);
cgroup_put(cgrp);
@@ -3654,12 +3681,16 @@ static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of,
static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
poll_table *pt)
{
- return psi_trigger_poll(&of->priv, of->file, pt);
+ struct cgroup_file_ctx *ctx = of->priv;
+
+ return psi_trigger_poll(&ctx->psi.trigger, of->file, pt);
}
static void cgroup_pressure_release(struct kernfs_open_file *of)
{
- psi_trigger_replace(&of->priv, NULL);
+ struct cgroup_file_ctx *ctx = of->priv;
+
+ psi_trigger_replace(&ctx->psi.trigger, NULL);
}
bool cgroup_psi_enabled(void)
@@ -3786,24 +3817,43 @@ static ssize_t cgroup_kill_write(struct kernfs_open_file *of, char *buf,
static int cgroup_file_open(struct kernfs_open_file *of)
{
struct cftype *cft = of_cft(of);
+ struct cgroup_file_ctx *ctx;
+ int ret;
- if (cft->open)
- return cft->open(of);
- return 0;
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ ctx->ns = current->nsproxy->cgroup_ns;
+ get_cgroup_ns(ctx->ns);
+ of->priv = ctx;
+
+ if (!cft->open)
+ return 0;
+
+ ret = cft->open(of);
+ if (ret) {
+ put_cgroup_ns(ctx->ns);
+ kfree(ctx);
+ }
+ return ret;
}
static void cgroup_file_release(struct kernfs_open_file *of)
{
struct cftype *cft = of_cft(of);
+ struct cgroup_file_ctx *ctx = of->priv;
if (cft->release)
cft->release(of);
+ put_cgroup_ns(ctx->ns);
+ kfree(ctx);
}
static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
size_t nbytes, loff_t off)
{
- struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
+ struct cgroup_file_ctx *ctx = of->priv;
struct cgroup *cgrp = of->kn->parent->priv;
struct cftype *cft = of_cft(of);
struct cgroup_subsys_state *css;
@@ -3820,7 +3870,7 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
*/
if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) &&
!(cft->flags & CFTYPE_NS_DELEGATABLE) &&
- ns != &init_cgroup_ns && ns->root_cset->dfl_cgrp == cgrp)
+ ctx->ns != &init_cgroup_ns && ctx->ns->root_cset->dfl_cgrp == cgrp)
return -EPERM;
if (cft->write)
@@ -4726,21 +4776,21 @@ void css_task_iter_end(struct css_task_iter *it)
static void cgroup_procs_release(struct kernfs_open_file *of)
{
- if (of->priv) {
- css_task_iter_end(of->priv);
- kfree(of->priv);
- }
+ struct cgroup_file_ctx *ctx = of->priv;
+
+ if (ctx->procs.started)
+ css_task_iter_end(&ctx->procs.iter);
}
static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos)
{
struct kernfs_open_file *of = s->private;
- struct css_task_iter *it = of->priv;
+ struct cgroup_file_ctx *ctx = of->priv;
if (pos)
(*pos)++;
- return css_task_iter_next(it);
+ return css_task_iter_next(&ctx->procs.iter);
}
static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos,
@@ -4748,21 +4798,18 @@ static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos,
{
struct kernfs_open_file *of = s->private;
struct cgroup *cgrp = seq_css(s)->cgroup;
- struct css_task_iter *it = of->priv;
+ struct cgroup_file_ctx *ctx = of->priv;
+ struct css_task_iter *it = &ctx->procs.iter;
/*
* When a seq_file is seeked, it's always traversed sequentially
* from position 0, so we can simply keep iterating on !0 *pos.
*/
- if (!it) {
+ if (!ctx->procs.started) {
if (WARN_ON_ONCE((*pos)))
return ERR_PTR(-EINVAL);
-
- it = kzalloc(sizeof(*it), GFP_KERNEL);
- if (!it)
- return ERR_PTR(-ENOMEM);
- of->priv = it;
css_task_iter_start(&cgrp->self, iter_flags, it);
+ ctx->procs.started = true;
} else if (!(*pos)) {
css_task_iter_end(it);
css_task_iter_start(&cgrp->self, iter_flags, it);
@@ -4813,9 +4860,9 @@ static int cgroup_may_write(const struct cgroup *cgrp, struct super_block *sb)
static int cgroup_procs_write_permission(struct cgroup *src_cgrp,
struct cgroup *dst_cgrp,
- struct super_block *sb)
+ struct super_block *sb,
+ struct cgroup_namespace *ns)
{
- struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
struct cgroup *com_cgrp = src_cgrp;
int ret;
@@ -4844,11 +4891,12 @@ static int cgroup_procs_write_permission(struct cgroup *src_cgrp,
static int cgroup_attach_permissions(struct cgroup *src_cgrp,
struct cgroup *dst_cgrp,
- struct super_block *sb, bool threadgroup)
+ struct super_block *sb, bool threadgroup,
+ struct cgroup_namespace *ns)
{
int ret = 0;
- ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, sb);
+ ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, sb, ns);
if (ret)
return ret;
@@ -4865,8 +4913,10 @@ static int cgroup_attach_permissions(struct cgroup *src_cgrp,
static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
bool threadgroup)
{
+ struct cgroup_file_ctx *ctx = of->priv;
struct cgroup *src_cgrp, *dst_cgrp;
struct task_struct *task;
+ const struct cred *saved_cred;
ssize_t ret;
bool locked;
@@ -4884,9 +4934,16 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
spin_unlock_irq(&css_set_lock);
- /* process and thread migrations follow same delegation rule */
+ /*
+ * Process and thread migrations follow same delegation rule. Check
+ * permissions using the credentials from file open to protect against
+ * inherited fd attacks.
+ */
+ saved_cred = override_creds(of->file->f_cred);
ret = cgroup_attach_permissions(src_cgrp, dst_cgrp,
- of->file->f_path.dentry->d_sb, threadgroup);
+ of->file->f_path.dentry->d_sb,
+ threadgroup, ctx->ns);
+ revert_creds(saved_cred);
if (ret)
goto out_finish;
@@ -5686,7 +5743,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
/* Create the root cgroup state for this subsystem */
ss->root = &cgrp_dfl_root;
- css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss));
+ css = ss->css_alloc(NULL);
/* We don't handle early failures gracefully */
BUG_ON(IS_ERR(css));
init_and_link_css(css, ss, &cgrp_dfl_root.cgrp);
@@ -5909,17 +5966,20 @@ struct cgroup *cgroup_get_from_id(u64 id)
struct kernfs_node *kn;
struct cgroup *cgrp = NULL;
- mutex_lock(&cgroup_mutex);
kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id);
if (!kn)
- goto out_unlock;
+ goto out;
- cgrp = kn->priv;
- if (cgroup_is_dead(cgrp) || !cgroup_tryget(cgrp))
+ rcu_read_lock();
+
+ cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
+ if (cgrp && !cgroup_tryget(cgrp))
cgrp = NULL;
+
+ rcu_read_unlock();
+
kernfs_put(kn);
-out_unlock:
- mutex_unlock(&cgroup_mutex);
+out:
return cgrp;
}
EXPORT_SYMBOL_GPL(cgroup_get_from_id);
@@ -6102,7 +6162,8 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
goto err;
ret = cgroup_attach_permissions(cset->dfl_cgrp, dst_cgrp, sb,
- !(kargs->flags & CLONE_THREAD));
+ !(kargs->flags & CLONE_THREAD),
+ current->nsproxy->cgroup_ns);
if (ret)
goto err;
@@ -6472,30 +6533,34 @@ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
*
* Find the cgroup at @path on the default hierarchy, increment its
* reference count and return it. Returns pointer to the found cgroup on
- * success, ERR_PTR(-ENOENT) if @path doesn't exist and ERR_PTR(-ENOTDIR)
- * if @path points to a non-directory.
+ * success, ERR_PTR(-ENOENT) if @path doesn't exist or if the cgroup has already
+ * been released and ERR_PTR(-ENOTDIR) if @path points to a non-directory.
*/
struct cgroup *cgroup_get_from_path(const char *path)
{
struct kernfs_node *kn;
- struct cgroup *cgrp;
-
- mutex_lock(&cgroup_mutex);
+ struct cgroup *cgrp = ERR_PTR(-ENOENT);
kn = kernfs_walk_and_get(cgrp_dfl_root.cgrp.kn, path);
- if (kn) {
- if (kernfs_type(kn) == KERNFS_DIR) {
- cgrp = kn->priv;
- cgroup_get_live(cgrp);
- } else {
- cgrp = ERR_PTR(-ENOTDIR);
- }
- kernfs_put(kn);
- } else {
- cgrp = ERR_PTR(-ENOENT);
+ if (!kn)
+ goto out;
+
+ if (kernfs_type(kn) != KERNFS_DIR) {
+ cgrp = ERR_PTR(-ENOTDIR);
+ goto out_kernfs;
}
- mutex_unlock(&cgroup_mutex);
+ rcu_read_lock();
+
+ cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
+ if (!cgrp || !cgroup_tryget(cgrp))
+ cgrp = ERR_PTR(-ENOENT);
+
+ rcu_read_unlock();
+
+out_kernfs:
+ kernfs_put(kn);
+out:
return cgrp;
}
EXPORT_SYMBOL_GPL(cgroup_get_from_path);
@@ -6572,118 +6637,57 @@ int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v)
*/
#ifdef CONFIG_SOCK_CGROUP_DATA
-#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID)
-
-DEFINE_SPINLOCK(cgroup_sk_update_lock);
-static bool cgroup_sk_alloc_disabled __read_mostly;
-
-void cgroup_sk_alloc_disable(void)
-{
- if (cgroup_sk_alloc_disabled)
- return;
- pr_info("cgroup: disabling cgroup2 socket matching due to net_prio or net_cls activation\n");
- cgroup_sk_alloc_disabled = true;
-}
-
-#else
-
-#define cgroup_sk_alloc_disabled false
-
-#endif
-
void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
{
- if (cgroup_sk_alloc_disabled) {
- skcd->no_refcnt = 1;
- return;
- }
-
- /* Don't associate the sock with unrelated interrupted task's cgroup. */
- if (in_interrupt())
- return;
+ struct cgroup *cgroup;
rcu_read_lock();
+ /* Don't associate the sock with unrelated interrupted task's cgroup. */
+ if (in_interrupt()) {
+ cgroup = &cgrp_dfl_root.cgrp;
+ cgroup_get(cgroup);
+ goto out;
+ }
while (true) {
struct css_set *cset;
cset = task_css_set(current);
if (likely(cgroup_tryget(cset->dfl_cgrp))) {
- skcd->val = (unsigned long)cset->dfl_cgrp;
- cgroup_bpf_get(cset->dfl_cgrp);
+ cgroup = cset->dfl_cgrp;
break;
}
cpu_relax();
}
-
+out:
+ skcd->cgroup = cgroup;
+ cgroup_bpf_get(cgroup);
rcu_read_unlock();
}
void cgroup_sk_clone(struct sock_cgroup_data *skcd)
{
- if (skcd->val) {
- if (skcd->no_refcnt)
- return;
- /*
- * We might be cloning a socket which is left in an empty
- * cgroup and the cgroup might have already been rmdir'd.
- * Don't use cgroup_get_live().
- */
- cgroup_get(sock_cgroup_ptr(skcd));
- cgroup_bpf_get(sock_cgroup_ptr(skcd));
- }
+ struct cgroup *cgrp = sock_cgroup_ptr(skcd);
+
+ /*
+ * We might be cloning a socket which is left in an empty
+ * cgroup and the cgroup might have already been rmdir'd.
+ * Don't use cgroup_get_live().
+ */
+ cgroup_get(cgrp);
+ cgroup_bpf_get(cgrp);
}
void cgroup_sk_free(struct sock_cgroup_data *skcd)
{
struct cgroup *cgrp = sock_cgroup_ptr(skcd);
- if (skcd->no_refcnt)
- return;
cgroup_bpf_put(cgrp);
cgroup_put(cgrp);
}
#endif /* CONFIG_SOCK_CGROUP_DATA */
-#ifdef CONFIG_CGROUP_BPF
-int cgroup_bpf_attach(struct cgroup *cgrp,
- struct bpf_prog *prog, struct bpf_prog *replace_prog,
- struct bpf_cgroup_link *link,
- enum bpf_attach_type type,
- u32 flags)
-{
- int ret;
-
- mutex_lock(&cgroup_mutex);
- ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags);
- mutex_unlock(&cgroup_mutex);
- return ret;
-}
-
-int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
- enum bpf_attach_type type)
-{
- int ret;
-
- mutex_lock(&cgroup_mutex);
- ret = __cgroup_bpf_detach(cgrp, prog, NULL, type);
- mutex_unlock(&cgroup_mutex);
- return ret;
-}
-
-int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
- union bpf_attr __user *uattr)
-{
- int ret;
-
- mutex_lock(&cgroup_mutex);
- ret = __cgroup_bpf_query(cgrp, attr, uattr);
- mutex_unlock(&cgroup_mutex);
- return ret;
-}
-#endif /* CONFIG_CGROUP_BPF */
-
#ifdef CONFIG_SYSFS
static ssize_t show_delegatable_files(struct cftype *files, char *buf,
ssize_t size, const char *prefix)
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index df1ccf4558f8..dc653ab26e50 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -69,6 +69,13 @@
DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);
+/*
+ * There could be abnormal cpuset configurations for cpu or memory
+ * node binding, add this key to provide a quick low-cost judgement
+ * of the situation.
+ */
+DEFINE_STATIC_KEY_FALSE(cpusets_insane_config_key);
+
/* See "Frequency meter" comments, below. */
struct fmeter {
@@ -311,17 +318,19 @@ static struct cpuset top_cpuset = {
if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))
/*
- * There are two global locks guarding cpuset structures - cpuset_mutex and
+ * There are two global locks guarding cpuset structures - cpuset_rwsem and
* callback_lock. We also require taking task_lock() when dereferencing a
* task's cpuset pointer. See "The task_lock() exception", at the end of this
- * comment.
+ * comment. The cpuset code uses only cpuset_rwsem write lock. Other
+ * kernel subsystems can use cpuset_read_lock()/cpuset_read_unlock() to
+ * prevent change to cpuset structures.
*
* A task must hold both locks to modify cpusets. If a task holds
- * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it
+ * cpuset_rwsem, it blocks others wanting that rwsem, ensuring that it
* is the only task able to also acquire callback_lock and be able to
* modify cpusets. It can perform various checks on the cpuset structure
* first, knowing nothing will change. It can also allocate memory while
- * just holding cpuset_mutex. While it is performing these checks, various
+ * just holding cpuset_rwsem. While it is performing these checks, various
* callback routines can briefly acquire callback_lock to query cpusets.
* Once it is ready to make the changes, it takes callback_lock, blocking
* everyone else.
@@ -370,6 +379,17 @@ static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
+static inline void check_insane_mems_config(nodemask_t *nodes)
+{
+ if (!cpusets_insane_config() &&
+ movable_only_nodes(nodes)) {
+ static_branch_enable(&cpusets_insane_config_key);
+ pr_info("Unsupported (movable nodes only) cpuset configuration detected (nmask=%*pbl)!\n"
+ "Cpuset allocations might fail even with a lot of memory available.\n",
+ nodemask_pr_args(nodes));
+ }
+}
+
/*
* Cgroup v2 behavior is used on the "cpus" and "mems" control files when
* on default hierarchy or when the cpuset_v2_mode flag is set by mounting
@@ -393,7 +413,7 @@ static inline bool is_in_v2_mode(void)
* One way or another, we guarantee to return some non-empty subset
* of cpu_online_mask.
*
- * Call with callback_lock or cpuset_mutex held.
+ * Call with callback_lock or cpuset_rwsem held.
*/
static void guarantee_online_cpus(struct task_struct *tsk,
struct cpumask *pmask)
@@ -435,7 +455,7 @@ out_unlock:
* One way or another, we guarantee to return some non-empty subset
* of node_states[N_MEMORY].
*
- * Call with callback_lock or cpuset_mutex held.
+ * Call with callback_lock or cpuset_rwsem held.
*/
static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
{
@@ -447,7 +467,7 @@ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
/*
* update task's spread flag if cpuset's page/slab spread flag is set
*
- * Call with callback_lock or cpuset_mutex held.
+ * Call with callback_lock or cpuset_rwsem held.
*/
static void cpuset_update_task_spread_flag(struct cpuset *cs,
struct task_struct *tsk)
@@ -468,7 +488,7 @@ static void cpuset_update_task_spread_flag(struct cpuset *cs,
*
* One cpuset is a subset of another if all its allowed CPUs and
* Memory Nodes are a subset of the other, and its exclusive flags
- * are only set if the other's are set. Call holding cpuset_mutex.
+ * are only set if the other's are set. Call holding cpuset_rwsem.
*/
static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
@@ -577,7 +597,7 @@ static inline void free_cpuset(struct cpuset *cs)
* If we replaced the flag and mask values of the current cpuset
* (cur) with those values in the trial cpuset (trial), would
* our various subset and exclusive rules still be valid? Presumes
- * cpuset_mutex held.
+ * cpuset_rwsem held.
*
* 'cur' is the address of an actual, in-use cpuset. Operations
* such as list traversal that depend on the actual address of the
@@ -596,19 +616,11 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
struct cpuset *c, *par;
int ret;
- rcu_read_lock();
-
- /* Each of our child cpusets must be a subset of us */
- ret = -EBUSY;
- cpuset_for_each_child(c, css, cur)
- if (!is_cpuset_subset(c, trial))
- goto out;
-
- /* Remaining checks don't apply to root cpuset */
- ret = 0;
+ /* The checks don't apply to root cpuset */
if (cur == &top_cpuset)
- goto out;
+ return 0;
+ rcu_read_lock();
par = parent_cs(cur);
/* On legacy hierarchy, we must be a subset of our parent cpuset. */
@@ -700,7 +712,7 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr,
rcu_read_unlock();
}
-/* Must be called with cpuset_mutex held. */
+/* Must be called with cpuset_rwsem held. */
static inline int nr_cpusets(void)
{
/* jump label reference count + the top-level cpuset */
@@ -726,7 +738,7 @@ static inline int nr_cpusets(void)
* domains when operating in the severe memory shortage situations
* that could cause allocation failures below.
*
- * Must be called with cpuset_mutex held.
+ * Must be called with cpuset_rwsem held.
*
* The three key local variables below are:
* cp - cpuset pointer, used (together with pos_css) to perform a
@@ -1005,7 +1017,7 @@ partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
* 'cpus' is removed, then call this routine to rebuild the
* scheduler's dynamic sched domains.
*
- * Call with cpuset_mutex held. Takes cpus_read_lock().
+ * Call with cpuset_rwsem held. Takes cpus_read_lock().
*/
static void rebuild_sched_domains_locked(void)
{
@@ -1078,7 +1090,7 @@ void rebuild_sched_domains(void)
* @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
*
* Iterate through each task of @cs updating its cpus_allowed to the
- * effective cpuset's. As this function is called with cpuset_mutex held,
+ * effective cpuset's. As this function is called with cpuset_rwsem held,
* cpuset membership stays stable.
*/
static void update_tasks_cpumask(struct cpuset *cs)
@@ -1347,7 +1359,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
*
* On legacy hierarchy, effective_cpus will be the same with cpu_allowed.
*
- * Called with cpuset_mutex held
+ * Called with cpuset_rwsem held
*/
static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
{
@@ -1704,12 +1716,12 @@ static void *cpuset_being_rebound;
* @cs: the cpuset in which each task's mems_allowed mask needs to be changed
*
* Iterate through each task of @cs updating its mems_allowed to the
- * effective cpuset's. As this function is called with cpuset_mutex held,
+ * effective cpuset's. As this function is called with cpuset_rwsem held,
* cpuset membership stays stable.
*/
static void update_tasks_nodemask(struct cpuset *cs)
{
- static nodemask_t newmems; /* protected by cpuset_mutex */
+ static nodemask_t newmems; /* protected by cpuset_rwsem */
struct css_task_iter it;
struct task_struct *task;
@@ -1722,7 +1734,7 @@ static void update_tasks_nodemask(struct cpuset *cs)
* take while holding tasklist_lock. Forks can happen - the
* mpol_dup() cpuset_being_rebound check will catch such forks,
* and rebind their vma mempolicies too. Because we still hold
- * the global cpuset_mutex, we know that no other rebind effort
+ * the global cpuset_rwsem, we know that no other rebind effort
* will be contending for the global variable cpuset_being_rebound.
* It's ok if we rebind the same mm twice; mpol_rebind_mm()
* is idempotent. Also migrate pages in each mm to new nodes.
@@ -1768,7 +1780,7 @@ static void update_tasks_nodemask(struct cpuset *cs)
*
* On legacy hierarchy, effective_mems will be the same with mems_allowed.
*
- * Called with cpuset_mutex held
+ * Called with cpuset_rwsem held
*/
static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
{
@@ -1821,7 +1833,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
* mempolicies and if the cpuset is marked 'memory_migrate',
* migrate the tasks pages to the new memory.
*
- * Call with cpuset_mutex held. May take callback_lock during call.
+ * Call with cpuset_rwsem held. May take callback_lock during call.
* Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
* lock each such tasks mm->mmap_lock, scan its vma's and rebind
* their mempolicies to the cpusets new mems_allowed.
@@ -1868,6 +1880,8 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
if (retval < 0)
goto done;
+ check_insane_mems_config(&trialcs->mems_allowed);
+
spin_lock_irq(&callback_lock);
cs->mems_allowed = trialcs->mems_allowed;
spin_unlock_irq(&callback_lock);
@@ -1911,7 +1925,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
* @cs: the cpuset in which each task's spread flags needs to be changed
*
* Iterate through each task of @cs updating its spread flags. As this
- * function is called with cpuset_mutex held, cpuset membership stays
+ * function is called with cpuset_rwsem held, cpuset membership stays
* stable.
*/
static void update_tasks_flags(struct cpuset *cs)
@@ -1931,7 +1945,7 @@ static void update_tasks_flags(struct cpuset *cs)
* cs: the cpuset to update
* turning_on: whether the flag is being set or cleared
*
- * Call with cpuset_mutex held.
+ * Call with cpuset_rwsem held.
*/
static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
@@ -1980,7 +1994,7 @@ out:
* cs: the cpuset to update
* new_prs: new partition root state
*
- * Call with cpuset_mutex held.
+ * Call with cpuset_rwsem held.
*/
static int update_prstate(struct cpuset *cs, int new_prs)
{
@@ -2167,7 +2181,7 @@ static int fmeter_getrate(struct fmeter *fmp)
static struct cpuset *cpuset_attach_old_cs;
-/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
+/* Called by cgroups to determine if a cpuset is usable; cpuset_rwsem held */
static int cpuset_can_attach(struct cgroup_taskset *tset)
{
struct cgroup_subsys_state *css;
@@ -2219,7 +2233,7 @@ static void cpuset_cancel_attach(struct cgroup_taskset *tset)
}
/*
- * Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach()
+ * Protected by cpuset_rwsem. cpus_attach is used only by cpuset_attach()
* but we can't allocate it dynamically there. Define it global and
* allocate from cpuset_init().
*/
@@ -2227,7 +2241,7 @@ static cpumask_var_t cpus_attach;
static void cpuset_attach(struct cgroup_taskset *tset)
{
- /* static buf protected by cpuset_mutex */
+ /* static buf protected by cpuset_rwsem */
static nodemask_t cpuset_attach_nodemask_to;
struct task_struct *task;
struct task_struct *leader;
@@ -2417,7 +2431,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
* operation like this one can lead to a deadlock through kernfs
* active_ref protection. Let's break the protection. Losing the
* protection is okay as we check whether @cs is online after
- * grabbing cpuset_mutex anyway. This only happens on the legacy
+ * grabbing cpuset_rwsem anyway. This only happens on the legacy
* hierarchies.
*/
css_get(&cs->css);
@@ -3171,6 +3185,9 @@ update_tasks:
cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
mems_updated = !nodes_equal(new_mems, cs->effective_mems);
+ if (mems_updated)
+ check_insane_mems_config(&new_mems);
+
if (is_in_v2_mode())
hotplug_update_tasks(cs, &new_cpus, &new_mems,
cpus_updated, mems_updated);
@@ -3511,7 +3528,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
bool __cpuset_node_allowed(int node, gfp_t gfp_mask)
{
struct cpuset *cs; /* current cpuset ancestors */
- int allowed; /* is allocation in zone z allowed? */
+ bool allowed; /* is allocation in zone z allowed? */
unsigned long flags;
if (in_interrupt())
@@ -3672,7 +3689,7 @@ void __cpuset_memory_pressure_bump(void)
* - Used for /proc/<pid>/cpuset.
* - No need to task_lock(tsk) on this tsk->cpuset reference, as it
* doesn't really matter if tsk->cpuset changes after we read it,
- * and we take cpuset_mutex, keeping cpuset_attach() from changing it
+ * and we take cpuset_rwsem, keeping cpuset_attach() from changing it
* anyway.
*/
int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
diff --git a/kernel/cgroup/misc.c b/kernel/cgroup/misc.c
index ec02d963cad1..fe3e8a0eb7ed 100644
--- a/kernel/cgroup/misc.c
+++ b/kernel/cgroup/misc.c
@@ -157,13 +157,6 @@ int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg,
new_usage = atomic_long_add_return(amount, &res->usage);
if (new_usage > READ_ONCE(res->max) ||
new_usage > READ_ONCE(misc_res_capacity[type])) {
- if (!res->failed) {
- pr_info("cgroup: charge rejected by the misc controller for %s resource in ",
- misc_res_name[type]);
- pr_cont_cgroup_path(i->css.cgroup);
- pr_cont("\n");
- res->failed = true;
- }
ret = -EBUSY;
goto err_charge;
}
@@ -171,6 +164,11 @@ int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg,
return 0;
err_charge:
+ for (j = i; j; j = parent_misc(j)) {
+ atomic_long_inc(&j->res[type].events);
+ cgroup_file_notify(&j->events_file);
+ }
+
for (j = cg; j != i; j = parent_misc(j))
misc_cg_cancel_charge(type, j, amount);
misc_cg_cancel_charge(type, i, amount);
@@ -335,6 +333,19 @@ static int misc_cg_capacity_show(struct seq_file *sf, void *v)
return 0;
}
+static int misc_events_show(struct seq_file *sf, void *v)
+{
+ struct misc_cg *cg = css_misc(seq_css(sf));
+ unsigned long events, i;
+
+ for (i = 0; i < MISC_CG_RES_TYPES; i++) {
+ events = atomic_long_read(&cg->res[i].events);
+ if (READ_ONCE(misc_res_capacity[i]) || events)
+ seq_printf(sf, "%s.max %lu\n", misc_res_name[i], events);
+ }
+ return 0;
+}
+
/* Misc cgroup interface files */
static struct cftype misc_cg_files[] = {
{
@@ -353,6 +364,12 @@ static struct cftype misc_cg_files[] = {
.seq_show = misc_cg_capacity_show,
.flags = CFTYPE_ONLY_ON_ROOT,
},
+ {
+ .name = "events",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .file_offset = offsetof(struct misc_cg, events_file),
+ .seq_show = misc_events_show,
+ },
{}
};
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index b264ab5652ba..9d331ba44870 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -35,7 +35,7 @@ void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
* instead of NULL, we can tell whether @cgrp is on the list by
* testing the next pointer for NULL.
*/
- if (cgroup_rstat_cpu(cgrp, cpu)->updated_next)
+ if (data_race(cgroup_rstat_cpu(cgrp, cpu)->updated_next))
return;
raw_spin_lock_irqsave(cpu_lock, flags);
@@ -88,6 +88,7 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
struct cgroup *root, int cpu)
{
struct cgroup_rstat_cpu *rstatc;
+ struct cgroup *parent;
if (pos == root)
return NULL;
@@ -96,10 +97,14 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
* We're gonna walk down to the first leaf and visit/remove it. We
* can pick whatever unvisited node as the starting point.
*/
- if (!pos)
+ if (!pos) {
pos = root;
- else
+ /* return NULL if this subtree is not on-list */
+ if (!cgroup_rstat_cpu(pos, cpu)->updated_next)
+ return NULL;
+ } else {
pos = cgroup_parent(pos);
+ }
/* walk down to the first leaf */
while (true) {
@@ -115,33 +120,25 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
* However, due to the way we traverse, @pos will be the first
* child in most cases. The only exception is @root.
*/
- if (rstatc->updated_next) {
- struct cgroup *parent = cgroup_parent(pos);
-
- if (parent) {
- struct cgroup_rstat_cpu *prstatc;
- struct cgroup **nextp;
-
- prstatc = cgroup_rstat_cpu(parent, cpu);
- nextp = &prstatc->updated_children;
- while (true) {
- struct cgroup_rstat_cpu *nrstatc;
-
- nrstatc = cgroup_rstat_cpu(*nextp, cpu);
- if (*nextp == pos)
- break;
- WARN_ON_ONCE(*nextp == parent);
- nextp = &nrstatc->updated_next;
- }
- *nextp = rstatc->updated_next;
- }
+ parent = cgroup_parent(pos);
+ if (parent) {
+ struct cgroup_rstat_cpu *prstatc;
+ struct cgroup **nextp;
- rstatc->updated_next = NULL;
- return pos;
+ prstatc = cgroup_rstat_cpu(parent, cpu);
+ nextp = &prstatc->updated_children;
+ while (*nextp != pos) {
+ struct cgroup_rstat_cpu *nrstatc;
+
+ nrstatc = cgroup_rstat_cpu(*nextp, cpu);
+ WARN_ON_ONCE(*nextp == parent);
+ nextp = &nrstatc->updated_next;
+ }
+ *nextp = rstatc->updated_next;
}
- /* only happens for @root */
- return NULL;
+ rstatc->updated_next = NULL;
+ return pos;
}
/* see cgroup_rstat_flush() */
@@ -433,8 +430,6 @@ static void root_cgroup_cputime(struct task_cputime *cputime)
cputime->sum_exec_runtime += user;
cputime->sum_exec_runtime += sys;
cputime->sum_exec_runtime += cpustat[CPUTIME_STEAL];
- cputime->sum_exec_runtime += cpustat[CPUTIME_GUEST];
- cputime->sum_exec_runtime += cpustat[CPUTIME_GUEST_NICE];
}
}
diff --git a/kernel/compat.c b/kernel/compat.c
index 05adfd6fa8bf..55551989d9da 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -269,24 +269,3 @@ get_compat_sigset(sigset_t *set, const compat_sigset_t __user *compat)
return 0;
}
EXPORT_SYMBOL_GPL(get_compat_sigset);
-
-/*
- * Allocate user-space memory for the duration of a single system call,
- * in order to marshall parameters inside a compat thunk.
- */
-void __user *compat_alloc_user_space(unsigned long len)
-{
- void __user *ptr;
-
- /* If len would occupy more than half of the entire compat space... */
- if (unlikely(len > (((compat_uptr_t)~0) >> 1)))
- return NULL;
-
- ptr = arch_compat_alloc_user_space(len);
-
- if (unlikely(!access_ok(ptr, len)))
- return NULL;
-
- return ptr;
-}
-EXPORT_SYMBOL_GPL(compat_alloc_user_space);
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 192e43a87407..407a2568f35e 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -31,6 +31,7 @@
#include <linux/smpboot.h>
#include <linux/relay.h>
#include <linux/slab.h>
+#include <linux/scs.h>
#include <linux/percpu-rwsem.h>
#include <linux/cpuset.h>
@@ -588,6 +589,12 @@ static int bringup_cpu(unsigned int cpu)
int ret;
/*
+ * Reset stale stack state from the last time this CPU was online.
+ */
+ scs_task_reset(idle);
+ kasan_unpoison_task_stack(idle);
+
+ /*
* Some architectures have to walk the irq descriptors to
* setup the vector space for the cpu which comes online.
* Prevent irq alloc/free across the bringup.
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index eb53f5ec62c9..256cf6db573c 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -6,6 +6,7 @@
#include <linux/buildid.h>
#include <linux/crash_core.h>
+#include <linux/init.h>
#include <linux/utsname.h>
#include <linux/vmalloc.h>
@@ -295,6 +296,16 @@ int __init parse_crashkernel_low(char *cmdline,
"crashkernel=", suffix_tbl[SUFFIX_LOW]);
}
+/*
+ * Add a dummy early_param handler to mark crashkernel= as a known command line
+ * parameter and suppress incorrect warnings in init/main.c.
+ */
+static int __init parse_crashkernel_dummy(char *arg)
+{
+ return 0;
+}
+early_param("crashkernel", parse_crashkernel_dummy);
+
Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type,
void *data, size_t data_len)
{
diff --git a/kernel/cred.c b/kernel/cred.c
index f784e08c2fbd..473d17c431f3 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -225,8 +225,6 @@ struct cred *cred_alloc_blank(void)
#ifdef CONFIG_DEBUG_CREDENTIALS
new->magic = CRED_MAGIC;
#endif
- new->ucounts = get_ucounts(&init_ucounts);
-
if (security_cred_alloc_blank(new, GFP_KERNEL_ACCOUNT) < 0)
goto error;
@@ -501,7 +499,7 @@ int commit_creds(struct cred *new)
inc_rlimit_ucounts(new->ucounts, UCOUNT_RLIMIT_NPROC, 1);
rcu_assign_pointer(task->real_cred, new);
rcu_assign_pointer(task->cred, new);
- if (new->user != old->user)
+ if (new->user != old->user || new->user_ns != old->user_ns)
dec_rlimit_ucounts(old->ucounts, UCOUNT_RLIMIT_NPROC, 1);
alter_cred_subscribers(old, -2);
@@ -669,7 +667,7 @@ int set_cred_ucounts(struct cred *new)
{
struct task_struct *task = current;
const struct cred *old = task->real_cred;
- struct ucounts *old_ucounts = new->ucounts;
+ struct ucounts *new_ucounts, *old_ucounts = new->ucounts;
if (new->user == old->user && new->user_ns == old->user_ns)
return 0;
@@ -678,14 +676,14 @@ int set_cred_ucounts(struct cred *new)
* This optimization is needed because alloc_ucounts() uses locks
* for table lookups.
*/
- if (old_ucounts && old_ucounts->ns == new->user_ns && uid_eq(old_ucounts->uid, new->euid))
+ if (old_ucounts->ns == new->user_ns && uid_eq(old_ucounts->uid, new->euid))
return 0;
- if (!(new->ucounts = alloc_ucounts(new->user_ns, new->euid)))
+ if (!(new_ucounts = alloc_ucounts(new->user_ns, new->euid)))
return -EAGAIN;
- if (old_ucounts)
- put_ucounts(old_ucounts);
+ new->ucounts = new_ucounts;
+ put_ucounts(old_ucounts);
return 0;
}
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index b4aa6bb6b2bd..da06a5553835 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Kernel Debug Core
*
@@ -22,10 +23,6 @@
*
* Original KGDB stub: David Grothe <dave@gcom.com>,
* Tigran Aivazian <tigran@sco.com>
- *
- * This file is licensed under the terms of the GNU General Public License
- * version 2. This program is licensed "as is" without any warranty of any
- * kind, whether express or implied.
*/
#define pr_fmt(fmt) "KGDB: " fmt
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index b6f28fad4307..9d34d2364b5a 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Kernel Debug Core
*
@@ -22,10 +23,6 @@
*
* Original KGDB stub: David Grothe <dave@gcom.com>,
* Tigran Aivazian <tigran@sco.com>
- *
- * This file is licensed under the terms of the GNU General Public License
- * version 2. This program is licensed "as is" without any warranty of any
- * kind, whether express or implied.
*/
#include <linux/kernel.h>
diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c
index 2168f8dacb99..372025cf1ca3 100644
--- a/kernel/debug/kdb/kdb_bp.c
+++ b/kernel/debug/kdb/kdb_bp.c
@@ -523,51 +523,51 @@ static int kdb_ss(int argc, const char **argv)
}
static kdbtab_t bptab[] = {
- { .cmd_name = "bp",
- .cmd_func = kdb_bp,
- .cmd_usage = "[<vaddr>]",
- .cmd_help = "Set/Display breakpoints",
- .cmd_flags = KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS,
+ { .name = "bp",
+ .func = kdb_bp,
+ .usage = "[<vaddr>]",
+ .help = "Set/Display breakpoints",
+ .flags = KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS,
},
- { .cmd_name = "bl",
- .cmd_func = kdb_bp,
- .cmd_usage = "[<vaddr>]",
- .cmd_help = "Display breakpoints",
- .cmd_flags = KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS,
+ { .name = "bl",
+ .func = kdb_bp,
+ .usage = "[<vaddr>]",
+ .help = "Display breakpoints",
+ .flags = KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS,
},
- { .cmd_name = "bc",
- .cmd_func = kdb_bc,
- .cmd_usage = "<bpnum>",
- .cmd_help = "Clear Breakpoint",
- .cmd_flags = KDB_ENABLE_FLOW_CTRL,
+ { .name = "bc",
+ .func = kdb_bc,
+ .usage = "<bpnum>",
+ .help = "Clear Breakpoint",
+ .flags = KDB_ENABLE_FLOW_CTRL,
},
- { .cmd_name = "be",
- .cmd_func = kdb_bc,
- .cmd_usage = "<bpnum>",
- .cmd_help = "Enable Breakpoint",
- .cmd_flags = KDB_ENABLE_FLOW_CTRL,
+ { .name = "be",
+ .func = kdb_bc,
+ .usage = "<bpnum>",
+ .help = "Enable Breakpoint",
+ .flags = KDB_ENABLE_FLOW_CTRL,
},
- { .cmd_name = "bd",
- .cmd_func = kdb_bc,
- .cmd_usage = "<bpnum>",
- .cmd_help = "Disable Breakpoint",
- .cmd_flags = KDB_ENABLE_FLOW_CTRL,
+ { .name = "bd",
+ .func = kdb_bc,
+ .usage = "<bpnum>",
+ .help = "Disable Breakpoint",
+ .flags = KDB_ENABLE_FLOW_CTRL,
},
- { .cmd_name = "ss",
- .cmd_func = kdb_ss,
- .cmd_usage = "",
- .cmd_help = "Single Step",
- .cmd_minlen = 1,
- .cmd_flags = KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS,
+ { .name = "ss",
+ .func = kdb_ss,
+ .usage = "",
+ .help = "Single Step",
+ .minlen = 1,
+ .flags = KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS,
},
};
static kdbtab_t bphcmd = {
- .cmd_name = "bph",
- .cmd_func = kdb_bp,
- .cmd_usage = "[<vaddr>]",
- .cmd_help = "[datar [length]|dataw [length]] Set hw brk",
- .cmd_flags = KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS,
+ .name = "bph",
+ .func = kdb_bp,
+ .usage = "[<vaddr>]",
+ .help = "[datar [length]|dataw [length]] Set hw brk",
+ .flags = KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS,
};
/* Initialize the breakpoint table and register breakpoint commands. */
diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c
index 1f9f0e47aeda..10b454554ab0 100644
--- a/kernel/debug/kdb/kdb_bt.c
+++ b/kernel/debug/kdb/kdb_bt.c
@@ -46,7 +46,7 @@ static void kdb_show_stack(struct task_struct *p, void *addr)
* btp <pid> Kernel stack for <pid>
* btt <address-expression> Kernel stack for task structure at
* <address-expression>
- * bta [DRSTCZEUIMA] All useful processes, optionally
+ * bta [state_chars>|A] All useful processes, optionally
* filtered by state
* btc [<cpu>] The current process on one cpu,
* default is all cpus
@@ -74,7 +74,7 @@ static void kdb_show_stack(struct task_struct *p, void *addr)
*/
static int
-kdb_bt1(struct task_struct *p, unsigned long mask, bool btaprompt)
+kdb_bt1(struct task_struct *p, const char *mask, bool btaprompt)
{
char ch;
@@ -120,7 +120,7 @@ kdb_bt_cpu(unsigned long cpu)
return;
}
- kdb_bt1(kdb_tsk, ~0UL, false);
+ kdb_bt1(kdb_tsk, "A", false);
}
int
@@ -138,8 +138,8 @@ kdb_bt(int argc, const char **argv)
if (strcmp(argv[0], "bta") == 0) {
struct task_struct *g, *p;
unsigned long cpu;
- unsigned long mask = kdb_task_state_string(argc ? argv[1] :
- NULL);
+ const char *mask = argc ? argv[1] : kdbgetenv("PS");
+
if (argc == 0)
kdb_ps_suppressed();
/* Run the active tasks first */
@@ -167,7 +167,7 @@ kdb_bt(int argc, const char **argv)
return diag;
p = find_task_by_pid_ns(pid, &init_pid_ns);
if (p)
- return kdb_bt1(p, ~0UL, false);
+ return kdb_bt1(p, "A", false);
kdb_printf("No process with pid == %ld found\n", pid);
return 0;
} else if (strcmp(argv[0], "btt") == 0) {
@@ -176,7 +176,7 @@ kdb_bt(int argc, const char **argv)
diag = kdbgetularg((char *)argv[1], &addr);
if (diag)
return diag;
- return kdb_bt1((struct task_struct *)addr, ~0UL, false);
+ return kdb_bt1((struct task_struct *)addr, "A", false);
} else if (strcmp(argv[0], "btc") == 0) {
unsigned long cpu = ~0;
if (argc > 1)
@@ -212,7 +212,7 @@ kdb_bt(int argc, const char **argv)
kdb_show_stack(kdb_current_task, (void *)addr);
return 0;
} else {
- return kdb_bt1(kdb_current_task, ~0UL, false);
+ return kdb_bt1(kdb_current_task, "A", false);
}
}
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c
index 0220afda3200..e91fc3e4edd5 100644
--- a/kernel/debug/kdb/kdb_debugger.c
+++ b/kernel/debug/kdb/kdb_debugger.c
@@ -140,7 +140,6 @@ int kdb_stub(struct kgdb_state *ks)
*/
kdb_common_deinit_state();
KDB_STATE_CLEAR(PAGER);
- kdbnearsym_cleanup();
if (error == KDB_CMD_KGDB) {
if (KDB_STATE(DOING_KGDB))
KDB_STATE_CLEAR(DOING_KGDB);
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index d8ee5647b732..0852a537dad4 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -33,7 +33,6 @@
#include <linux/kallsyms.h>
#include <linux/kgdb.h>
#include <linux/kdb.h>
-#include <linux/list.h>
#include <linux/notifier.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
@@ -654,16 +653,17 @@ static void kdb_cmderror(int diag)
* Returns:
* zero for success, a kdb diagnostic if error
*/
-struct defcmd_set {
- int count;
- bool usable;
- char *name;
- char *usage;
- char *help;
- char **command;
+struct kdb_macro {
+ kdbtab_t cmd; /* Macro command */
+ struct list_head statements; /* Associated statement list */
};
-static struct defcmd_set *defcmd_set;
-static int defcmd_set_count;
+
+struct kdb_macro_statement {
+ char *statement; /* Statement text */
+ struct list_head list_node; /* Statement list node */
+};
+
+static struct kdb_macro *kdb_macro;
static bool defcmd_in_progress;
/* Forward references */
@@ -671,53 +671,55 @@ static int kdb_exec_defcmd(int argc, const char **argv);
static int kdb_defcmd2(const char *cmdstr, const char *argv0)
{
- struct defcmd_set *s = defcmd_set + defcmd_set_count - 1;
- char **save_command = s->command;
+ struct kdb_macro_statement *kms;
+
+ if (!kdb_macro)
+ return KDB_NOTIMP;
+
if (strcmp(argv0, "endefcmd") == 0) {
defcmd_in_progress = false;
- if (!s->count)
- s->usable = false;
- if (s->usable)
- /* macros are always safe because when executed each
- * internal command re-enters kdb_parse() and is
- * safety checked individually.
- */
- kdb_register_flags(s->name, kdb_exec_defcmd, s->usage,
- s->help, 0,
- KDB_ENABLE_ALWAYS_SAFE);
+ if (!list_empty(&kdb_macro->statements))
+ kdb_register(&kdb_macro->cmd);
return 0;
}
- if (!s->usable)
- return KDB_NOTIMP;
- s->command = kcalloc(s->count + 1, sizeof(*(s->command)), GFP_KDB);
- if (!s->command) {
- kdb_printf("Could not allocate new kdb_defcmd table for %s\n",
+
+ kms = kmalloc(sizeof(*kms), GFP_KDB);
+ if (!kms) {
+ kdb_printf("Could not allocate new kdb macro command: %s\n",
cmdstr);
- s->usable = false;
return KDB_NOTIMP;
}
- memcpy(s->command, save_command, s->count * sizeof(*(s->command)));
- s->command[s->count++] = kdb_strdup(cmdstr, GFP_KDB);
- kfree(save_command);
+
+ kms->statement = kdb_strdup(cmdstr, GFP_KDB);
+ list_add_tail(&kms->list_node, &kdb_macro->statements);
+
return 0;
}
static int kdb_defcmd(int argc, const char **argv)
{
- struct defcmd_set *save_defcmd_set = defcmd_set, *s;
+ kdbtab_t *mp;
+
if (defcmd_in_progress) {
kdb_printf("kdb: nested defcmd detected, assuming missing "
"endefcmd\n");
kdb_defcmd2("endefcmd", "endefcmd");
}
if (argc == 0) {
- int i;
- for (s = defcmd_set; s < defcmd_set + defcmd_set_count; ++s) {
- kdb_printf("defcmd %s \"%s\" \"%s\"\n", s->name,
- s->usage, s->help);
- for (i = 0; i < s->count; ++i)
- kdb_printf("%s", s->command[i]);
- kdb_printf("endefcmd\n");
+ kdbtab_t *kp;
+ struct kdb_macro *kmp;
+ struct kdb_macro_statement *kms;
+
+ list_for_each_entry(kp, &kdb_cmds_head, list_node) {
+ if (kp->func == kdb_exec_defcmd) {
+ kdb_printf("defcmd %s \"%s\" \"%s\"\n",
+ kp->name, kp->usage, kp->help);
+ kmp = container_of(kp, struct kdb_macro, cmd);
+ list_for_each_entry(kms, &kmp->statements,
+ list_node)
+ kdb_printf("%s", kms->statement);
+ kdb_printf("endefcmd\n");
+ }
}
return 0;
}
@@ -727,45 +729,43 @@ static int kdb_defcmd(int argc, const char **argv)
kdb_printf("Command only available during kdb_init()\n");
return KDB_NOTIMP;
}
- defcmd_set = kmalloc_array(defcmd_set_count + 1, sizeof(*defcmd_set),
- GFP_KDB);
- if (!defcmd_set)
+ kdb_macro = kzalloc(sizeof(*kdb_macro), GFP_KDB);
+ if (!kdb_macro)
goto fail_defcmd;
- memcpy(defcmd_set, save_defcmd_set,
- defcmd_set_count * sizeof(*defcmd_set));
- s = defcmd_set + defcmd_set_count;
- memset(s, 0, sizeof(*s));
- s->usable = true;
- s->name = kdb_strdup(argv[1], GFP_KDB);
- if (!s->name)
+
+ mp = &kdb_macro->cmd;
+ mp->func = kdb_exec_defcmd;
+ mp->minlen = 0;
+ mp->flags = KDB_ENABLE_ALWAYS_SAFE;
+ mp->name = kdb_strdup(argv[1], GFP_KDB);
+ if (!mp->name)
goto fail_name;
- s->usage = kdb_strdup(argv[2], GFP_KDB);
- if (!s->usage)
+ mp->usage = kdb_strdup(argv[2], GFP_KDB);
+ if (!mp->usage)
goto fail_usage;
- s->help = kdb_strdup(argv[3], GFP_KDB);
- if (!s->help)
+ mp->help = kdb_strdup(argv[3], GFP_KDB);
+ if (!mp->help)
goto fail_help;
- if (s->usage[0] == '"') {
- strcpy(s->usage, argv[2]+1);
- s->usage[strlen(s->usage)-1] = '\0';
+ if (mp->usage[0] == '"') {
+ strcpy(mp->usage, argv[2]+1);
+ mp->usage[strlen(mp->usage)-1] = '\0';
}
- if (s->help[0] == '"') {
- strcpy(s->help, argv[3]+1);
- s->help[strlen(s->help)-1] = '\0';
+ if (mp->help[0] == '"') {
+ strcpy(mp->help, argv[3]+1);
+ mp->help[strlen(mp->help)-1] = '\0';
}
- ++defcmd_set_count;
+
+ INIT_LIST_HEAD(&kdb_macro->statements);
defcmd_in_progress = true;
- kfree(save_defcmd_set);
return 0;
fail_help:
- kfree(s->usage);
+ kfree(mp->usage);
fail_usage:
- kfree(s->name);
+ kfree(mp->name);
fail_name:
- kfree(defcmd_set);
+ kfree(kdb_macro);
fail_defcmd:
- kdb_printf("Could not allocate new defcmd_set entry for %s\n", argv[1]);
- defcmd_set = save_defcmd_set;
+ kdb_printf("Could not allocate new kdb_macro entry for %s\n", argv[1]);
return KDB_NOTIMP;
}
@@ -780,25 +780,31 @@ fail_defcmd:
*/
static int kdb_exec_defcmd(int argc, const char **argv)
{
- int i, ret;
- struct defcmd_set *s;
+ int ret;
+ kdbtab_t *kp;
+ struct kdb_macro *kmp;
+ struct kdb_macro_statement *kms;
+
if (argc != 0)
return KDB_ARGCOUNT;
- for (s = defcmd_set, i = 0; i < defcmd_set_count; ++i, ++s) {
- if (strcmp(s->name, argv[0]) == 0)
+
+ list_for_each_entry(kp, &kdb_cmds_head, list_node) {
+ if (strcmp(kp->name, argv[0]) == 0)
break;
}
- if (i == defcmd_set_count) {
+ if (list_entry_is_head(kp, &kdb_cmds_head, list_node)) {
kdb_printf("kdb_exec_defcmd: could not find commands for %s\n",
argv[0]);
return KDB_NOTIMP;
}
- for (i = 0; i < s->count; ++i) {
- /* Recursive use of kdb_parse, do not use argv after
- * this point */
+ kmp = container_of(kp, struct kdb_macro, cmd);
+ list_for_each_entry(kms, &kmp->statements, list_node) {
+ /*
+ * Recursive use of kdb_parse, do not use argv after this point.
+ */
argv = NULL;
- kdb_printf("[%s]kdb> %s\n", s->name, s->command[i]);
- ret = kdb_parse(s->command[i]);
+ kdb_printf("[%s]kdb> %s\n", kmp->cmd.name, kms->statement);
+ ret = kdb_parse(kms->statement);
if (ret)
return ret;
}
@@ -1009,11 +1015,11 @@ int kdb_parse(const char *cmdstr)
* If this command is allowed to be abbreviated,
* check to see if this is it.
*/
- if (tp->cmd_minlen && (strlen(argv[0]) <= tp->cmd_minlen) &&
- (strncmp(argv[0], tp->cmd_name, tp->cmd_minlen) == 0))
+ if (tp->minlen && (strlen(argv[0]) <= tp->minlen) &&
+ (strncmp(argv[0], tp->name, tp->minlen) == 0))
break;
- if (strcmp(argv[0], tp->cmd_name) == 0)
+ if (strcmp(argv[0], tp->name) == 0)
break;
}
@@ -1024,8 +1030,7 @@ int kdb_parse(const char *cmdstr)
*/
if (list_entry_is_head(tp, &kdb_cmds_head, list_node)) {
list_for_each_entry(tp, &kdb_cmds_head, list_node) {
- if (strncmp(argv[0], tp->cmd_name,
- strlen(tp->cmd_name)) == 0)
+ if (strncmp(argv[0], tp->name, strlen(tp->name)) == 0)
break;
}
}
@@ -1033,19 +1038,19 @@ int kdb_parse(const char *cmdstr)
if (!list_entry_is_head(tp, &kdb_cmds_head, list_node)) {
int result;
- if (!kdb_check_flags(tp->cmd_flags, kdb_cmd_enabled, argc <= 1))
+ if (!kdb_check_flags(tp->flags, kdb_cmd_enabled, argc <= 1))
return KDB_NOPERM;
KDB_STATE_SET(CMD);
- result = (*tp->cmd_func)(argc-1, (const char **)argv);
+ result = (*tp->func)(argc-1, (const char **)argv);
if (result && ignore_errors && result > KDB_CMD_GO)
result = 0;
KDB_STATE_CLEAR(CMD);
- if (tp->cmd_flags & KDB_REPEAT_WITH_ARGS)
+ if (tp->flags & KDB_REPEAT_WITH_ARGS)
return result;
- argc = tp->cmd_flags & KDB_REPEAT_NO_ARGS ? 1 : 0;
+ argc = tp->flags & KDB_REPEAT_NO_ARGS ? 1 : 0;
if (argv[argc])
*(argv[argc]) = '\0';
return result;
@@ -2198,8 +2203,8 @@ static void kdb_cpu_status(void)
state = 'D'; /* cpu is online but unresponsive */
} else {
state = ' '; /* cpu is responding to kdb */
- if (kdb_task_state_char(KDB_TSK(i)) == 'I')
- state = 'I'; /* idle task */
+ if (kdb_task_state_char(KDB_TSK(i)) == '-')
+ state = '-'; /* idle task */
}
if (state != prev_state) {
if (prev_state != '?') {
@@ -2266,37 +2271,30 @@ static int kdb_cpu(int argc, const char **argv)
void kdb_ps_suppressed(void)
{
int idle = 0, daemon = 0;
- unsigned long mask_I = kdb_task_state_string("I"),
- mask_M = kdb_task_state_string("M");
unsigned long cpu;
const struct task_struct *p, *g;
for_each_online_cpu(cpu) {
p = kdb_curr_task(cpu);
- if (kdb_task_state(p, mask_I))
+ if (kdb_task_state(p, "-"))
++idle;
}
for_each_process_thread(g, p) {
- if (kdb_task_state(p, mask_M))
+ if (kdb_task_state(p, "ims"))
++daemon;
}
if (idle || daemon) {
if (idle)
- kdb_printf("%d idle process%s (state I)%s\n",
+ kdb_printf("%d idle process%s (state -)%s\n",
idle, idle == 1 ? "" : "es",
daemon ? " and " : "");
if (daemon)
- kdb_printf("%d sleeping system daemon (state M) "
+ kdb_printf("%d sleeping system daemon (state [ims]) "
"process%s", daemon,
daemon == 1 ? "" : "es");
kdb_printf(" suppressed,\nuse 'ps A' to see all.\n");
}
}
-/*
- * kdb_ps - This function implements the 'ps' command which shows a
- * list of the active processes.
- * ps [DRSTCZEUIMA] All processes, optionally filtered by state
- */
void kdb_ps1(const struct task_struct *p)
{
int cpu;
@@ -2325,17 +2323,25 @@ void kdb_ps1(const struct task_struct *p)
}
}
+/*
+ * kdb_ps - This function implements the 'ps' command which shows a
+ * list of the active processes.
+ *
+ * ps [<state_chars>] Show processes, optionally selecting only those whose
+ * state character is found in <state_chars>.
+ */
static int kdb_ps(int argc, const char **argv)
{
struct task_struct *g, *p;
- unsigned long mask, cpu;
+ const char *mask;
+ unsigned long cpu;
if (argc == 0)
kdb_ps_suppressed();
kdb_printf("%-*s Pid Parent [*] cpu State %-*s Command\n",
(int)(2*sizeof(void *))+2, "Task Addr",
(int)(2*sizeof(void *))+2, "Thread");
- mask = kdb_task_state_string(argc ? argv[1] : NULL);
+ mask = argc ? argv[1] : kdbgetenv("PS");
/* Run the active tasks first */
for_each_online_cpu(cpu) {
if (KDB_FLAG(CMD_INTERRUPT))
@@ -2412,12 +2418,12 @@ static int kdb_help(int argc, const char **argv)
char *space = "";
if (KDB_FLAG(CMD_INTERRUPT))
return 0;
- if (!kdb_check_flags(kt->cmd_flags, kdb_cmd_enabled, true))
+ if (!kdb_check_flags(kt->flags, kdb_cmd_enabled, true))
continue;
- if (strlen(kt->cmd_usage) > 20)
+ if (strlen(kt->usage) > 20)
space = "\n ";
- kdb_printf("%-15.15s %-20s%s%s\n", kt->cmd_name,
- kt->cmd_usage, space, kt->cmd_help);
+ kdb_printf("%-15.15s %-20s%s%s\n", kt->name,
+ kt->usage, space, kt->help);
}
return 0;
}
@@ -2613,56 +2619,32 @@ static int kdb_grep_help(int argc, const char **argv)
return 0;
}
-/*
- * kdb_register_flags - This function is used to register a kernel
- * debugger command.
- * Inputs:
- * cmd Command name
- * func Function to execute the command
- * usage A simple usage string showing arguments
- * help A simple help string describing command
- * repeat Does the command auto repeat on enter?
- * Returns:
- * zero for success, one if a duplicate command.
+/**
+ * kdb_register() - This function is used to register a kernel debugger
+ * command.
+ * @cmd: pointer to kdb command
+ *
+ * Note that it's the job of the caller to keep the memory for the cmd
+ * allocated until unregister is called.
*/
-int kdb_register_flags(char *cmd,
- kdb_func_t func,
- char *usage,
- char *help,
- short minlen,
- kdb_cmdflags_t flags)
+int kdb_register(kdbtab_t *cmd)
{
kdbtab_t *kp;
list_for_each_entry(kp, &kdb_cmds_head, list_node) {
- if (strcmp(kp->cmd_name, cmd) == 0) {
- kdb_printf("Duplicate kdb command registered: "
- "%s, func %px help %s\n", cmd, func, help);
+ if (strcmp(kp->name, cmd->name) == 0) {
+ kdb_printf("Duplicate kdb cmd: %s, func %p help %s\n",
+ cmd->name, cmd->func, cmd->help);
return 1;
}
}
- kp = kmalloc(sizeof(*kp), GFP_KDB);
- if (!kp) {
- kdb_printf("Could not allocate new kdb_command table\n");
- return 1;
- }
-
- kp->cmd_name = cmd;
- kp->cmd_func = func;
- kp->cmd_usage = usage;
- kp->cmd_help = help;
- kp->cmd_minlen = minlen;
- kp->cmd_flags = flags;
- kp->is_dynamic = true;
-
- list_add_tail(&kp->list_node, &kdb_cmds_head);
-
+ list_add_tail(&cmd->list_node, &kdb_cmds_head);
return 0;
}
-EXPORT_SYMBOL_GPL(kdb_register_flags);
+EXPORT_SYMBOL_GPL(kdb_register);
-/*
+/**
* kdb_register_table() - This function is used to register a kdb command
* table.
* @kp: pointer to kdb command table
@@ -2676,266 +2658,231 @@ void kdb_register_table(kdbtab_t *kp, size_t len)
}
}
-/*
- * kdb_register - Compatibility register function for commands that do
- * not need to specify a repeat state. Equivalent to
- * kdb_register_flags with flags set to 0.
- * Inputs:
- * cmd Command name
- * func Function to execute the command
- * usage A simple usage string showing arguments
- * help A simple help string describing command
- * Returns:
- * zero for success, one if a duplicate command.
+/**
+ * kdb_unregister() - This function is used to unregister a kernel debugger
+ * command. It is generally called when a module which
+ * implements kdb command is unloaded.
+ * @cmd: pointer to kdb command
*/
-int kdb_register(char *cmd,
- kdb_func_t func,
- char *usage,
- char *help,
- short minlen)
+void kdb_unregister(kdbtab_t *cmd)
{
- return kdb_register_flags(cmd, func, usage, help, minlen, 0);
-}
-EXPORT_SYMBOL_GPL(kdb_register);
-
-/*
- * kdb_unregister - This function is used to unregister a kernel
- * debugger command. It is generally called when a module which
- * implements kdb commands is unloaded.
- * Inputs:
- * cmd Command name
- * Returns:
- * zero for success, one command not registered.
- */
-int kdb_unregister(char *cmd)
-{
- kdbtab_t *kp;
-
- /*
- * find the command.
- */
- list_for_each_entry(kp, &kdb_cmds_head, list_node) {
- if (strcmp(kp->cmd_name, cmd) == 0) {
- list_del(&kp->list_node);
- if (kp->is_dynamic)
- kfree(kp);
- return 0;
- }
- }
-
- /* Couldn't find it. */
- return 1;
+ list_del(&cmd->list_node);
}
EXPORT_SYMBOL_GPL(kdb_unregister);
static kdbtab_t maintab[] = {
- { .cmd_name = "md",
- .cmd_func = kdb_md,
- .cmd_usage = "<vaddr>",
- .cmd_help = "Display Memory Contents, also mdWcN, e.g. md8c1",
- .cmd_minlen = 1,
- .cmd_flags = KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS,
+ { .name = "md",
+ .func = kdb_md,
+ .usage = "<vaddr>",
+ .help = "Display Memory Contents, also mdWcN, e.g. md8c1",
+ .minlen = 1,
+ .flags = KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS,
},
- { .cmd_name = "mdr",
- .cmd_func = kdb_md,
- .cmd_usage = "<vaddr> <bytes>",
- .cmd_help = "Display Raw Memory",
- .cmd_flags = KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS,
+ { .name = "mdr",
+ .func = kdb_md,
+ .usage = "<vaddr> <bytes>",
+ .help = "Display Raw Memory",
+ .flags = KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS,
},
- { .cmd_name = "mdp",
- .cmd_func = kdb_md,
- .cmd_usage = "<paddr> <bytes>",
- .cmd_help = "Display Physical Memory",
- .cmd_flags = KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS,
+ { .name = "mdp",
+ .func = kdb_md,
+ .usage = "<paddr> <bytes>",
+ .help = "Display Physical Memory",
+ .flags = KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS,
},
- { .cmd_name = "mds",
- .cmd_func = kdb_md,
- .cmd_usage = "<vaddr>",
- .cmd_help = "Display Memory Symbolically",
- .cmd_flags = KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS,
+ { .name = "mds",
+ .func = kdb_md,
+ .usage = "<vaddr>",
+ .help = "Display Memory Symbolically",
+ .flags = KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS,
},
- { .cmd_name = "mm",
- .cmd_func = kdb_mm,
- .cmd_usage = "<vaddr> <contents>",
- .cmd_help = "Modify Memory Contents",
- .cmd_flags = KDB_ENABLE_MEM_WRITE | KDB_REPEAT_NO_ARGS,
+ { .name = "mm",
+ .func = kdb_mm,
+ .usage = "<vaddr> <contents>",
+ .help = "Modify Memory Contents",
+ .flags = KDB_ENABLE_MEM_WRITE | KDB_REPEAT_NO_ARGS,
},
- { .cmd_name = "go",
- .cmd_func = kdb_go,
- .cmd_usage = "[<vaddr>]",
- .cmd_help = "Continue Execution",
- .cmd_minlen = 1,
- .cmd_flags = KDB_ENABLE_REG_WRITE |
+ { .name = "go",
+ .func = kdb_go,
+ .usage = "[<vaddr>]",
+ .help = "Continue Execution",
+ .minlen = 1,
+ .flags = KDB_ENABLE_REG_WRITE |
KDB_ENABLE_ALWAYS_SAFE_NO_ARGS,
},
- { .cmd_name = "rd",
- .cmd_func = kdb_rd,
- .cmd_usage = "",
- .cmd_help = "Display Registers",
- .cmd_flags = KDB_ENABLE_REG_READ,
+ { .name = "rd",
+ .func = kdb_rd,
+ .usage = "",
+ .help = "Display Registers",
+ .flags = KDB_ENABLE_REG_READ,
},
- { .cmd_name = "rm",
- .cmd_func = kdb_rm,
- .cmd_usage = "<reg> <contents>",
- .cmd_help = "Modify Registers",
- .cmd_flags = KDB_ENABLE_REG_WRITE,
+ { .name = "rm",
+ .func = kdb_rm,
+ .usage = "<reg> <contents>",
+ .help = "Modify Registers",
+ .flags = KDB_ENABLE_REG_WRITE,
},
- { .cmd_name = "ef",
- .cmd_func = kdb_ef,
- .cmd_usage = "<vaddr>",
- .cmd_help = "Display exception frame",
- .cmd_flags = KDB_ENABLE_MEM_READ,
+ { .name = "ef",
+ .func = kdb_ef,
+ .usage = "<vaddr>",
+ .help = "Display exception frame",
+ .flags = KDB_ENABLE_MEM_READ,
},
- { .cmd_name = "bt",
- .cmd_func = kdb_bt,
- .cmd_usage = "[<vaddr>]",
- .cmd_help = "Stack traceback",
- .cmd_minlen = 1,
- .cmd_flags = KDB_ENABLE_MEM_READ | KDB_ENABLE_INSPECT_NO_ARGS,
+ { .name = "bt",
+ .func = kdb_bt,
+ .usage = "[<vaddr>]",
+ .help = "Stack traceback",
+ .minlen = 1,
+ .flags = KDB_ENABLE_MEM_READ | KDB_ENABLE_INSPECT_NO_ARGS,
},
- { .cmd_name = "btp",
- .cmd_func = kdb_bt,
- .cmd_usage = "<pid>",
- .cmd_help = "Display stack for process <pid>",
- .cmd_flags = KDB_ENABLE_INSPECT,
+ { .name = "btp",
+ .func = kdb_bt,
+ .usage = "<pid>",
+ .help = "Display stack for process <pid>",
+ .flags = KDB_ENABLE_INSPECT,
},
- { .cmd_name = "bta",
- .cmd_func = kdb_bt,
- .cmd_usage = "[D|R|S|T|C|Z|E|U|I|M|A]",
- .cmd_help = "Backtrace all processes matching state flag",
- .cmd_flags = KDB_ENABLE_INSPECT,
+ { .name = "bta",
+ .func = kdb_bt,
+ .usage = "[<state_chars>|A]",
+ .help = "Backtrace all processes whose state matches",
+ .flags = KDB_ENABLE_INSPECT,
},
- { .cmd_name = "btc",
- .cmd_func = kdb_bt,
- .cmd_usage = "",
- .cmd_help = "Backtrace current process on each cpu",
- .cmd_flags = KDB_ENABLE_INSPECT,
+ { .name = "btc",
+ .func = kdb_bt,
+ .usage = "",
+ .help = "Backtrace current process on each cpu",
+ .flags = KDB_ENABLE_INSPECT,
},
- { .cmd_name = "btt",
- .cmd_func = kdb_bt,
- .cmd_usage = "<vaddr>",
- .cmd_help = "Backtrace process given its struct task address",
- .cmd_flags = KDB_ENABLE_MEM_READ | KDB_ENABLE_INSPECT_NO_ARGS,
+ { .name = "btt",
+ .func = kdb_bt,
+ .usage = "<vaddr>",
+ .help = "Backtrace process given its struct task address",
+ .flags = KDB_ENABLE_MEM_READ | KDB_ENABLE_INSPECT_NO_ARGS,
},
- { .cmd_name = "env",
- .cmd_func = kdb_env,
- .cmd_usage = "",
- .cmd_help = "Show environment variables",
- .cmd_flags = KDB_ENABLE_ALWAYS_SAFE,
+ { .name = "env",
+ .func = kdb_env,
+ .usage = "",
+ .help = "Show environment variables",
+ .flags = KDB_ENABLE_ALWAYS_SAFE,
},
- { .cmd_name = "set",
- .cmd_func = kdb_set,
- .cmd_usage = "",
- .cmd_help = "Set environment variables",
- .cmd_flags = KDB_ENABLE_ALWAYS_SAFE,
+ { .name = "set",
+ .func = kdb_set,
+ .usage = "",
+ .help = "Set environment variables",
+ .flags = KDB_ENABLE_ALWAYS_SAFE,
},
- { .cmd_name = "help",
- .cmd_func = kdb_help,
- .cmd_usage = "",
- .cmd_help = "Display Help Message",
- .cmd_minlen = 1,
- .cmd_flags = KDB_ENABLE_ALWAYS_SAFE,
+ { .name = "help",
+ .func = kdb_help,
+ .usage = "",
+ .help = "Display Help Message",
+ .minlen = 1,
+ .flags = KDB_ENABLE_ALWAYS_SAFE,
},
- { .cmd_name = "?",
- .cmd_func = kdb_help,
- .cmd_usage = "",
- .cmd_help = "Display Help Message",
- .cmd_flags = KDB_ENABLE_ALWAYS_SAFE,
+ { .name = "?",
+ .func = kdb_help,
+ .usage = "",
+ .help = "Display Help Message",
+ .flags = KDB_ENABLE_ALWAYS_SAFE,
},
- { .cmd_name = "cpu",
- .cmd_func = kdb_cpu,
- .cmd_usage = "<cpunum>",
- .cmd_help = "Switch to new cpu",
- .cmd_flags = KDB_ENABLE_ALWAYS_SAFE_NO_ARGS,
+ { .name = "cpu",
+ .func = kdb_cpu,
+ .usage = "<cpunum>",
+ .help = "Switch to new cpu",
+ .flags = KDB_ENABLE_ALWAYS_SAFE_NO_ARGS,
},
- { .cmd_name = "kgdb",
- .cmd_func = kdb_kgdb,
- .cmd_usage = "",
- .cmd_help = "Enter kgdb mode",
- .cmd_flags = 0,
+ { .name = "kgdb",
+ .func = kdb_kgdb,
+ .usage = "",
+ .help = "Enter kgdb mode",
+ .flags = 0,
},
- { .cmd_name = "ps",
- .cmd_func = kdb_ps,
- .cmd_usage = "[<flags>|A]",
- .cmd_help = "Display active task list",
- .cmd_flags = KDB_ENABLE_INSPECT,
+ { .name = "ps",
+ .func = kdb_ps,
+ .usage = "[<state_chars>|A]",
+ .help = "Display active task list",
+ .flags = KDB_ENABLE_INSPECT,
},
- { .cmd_name = "pid",
- .cmd_func = kdb_pid,
- .cmd_usage = "<pidnum>",
- .cmd_help = "Switch to another task",
- .cmd_flags = KDB_ENABLE_INSPECT,
+ { .name = "pid",
+ .func = kdb_pid,
+ .usage = "<pidnum>",
+ .help = "Switch to another task",
+ .flags = KDB_ENABLE_INSPECT,
},
- { .cmd_name = "reboot",
- .cmd_func = kdb_reboot,
- .cmd_usage = "",
- .cmd_help = "Reboot the machine immediately",
- .cmd_flags = KDB_ENABLE_REBOOT,
+ { .name = "reboot",
+ .func = kdb_reboot,
+ .usage = "",
+ .help = "Reboot the machine immediately",
+ .flags = KDB_ENABLE_REBOOT,
},
#if defined(CONFIG_MODULES)
- { .cmd_name = "lsmod",
- .cmd_func = kdb_lsmod,
- .cmd_usage = "",
- .cmd_help = "List loaded kernel modules",
- .cmd_flags = KDB_ENABLE_INSPECT,
+ { .name = "lsmod",
+ .func = kdb_lsmod,
+ .usage = "",
+ .help = "List loaded kernel modules",
+ .flags = KDB_ENABLE_INSPECT,
},
#endif
#if defined(CONFIG_MAGIC_SYSRQ)
- { .cmd_name = "sr",
- .cmd_func = kdb_sr,
- .cmd_usage = "<key>",
- .cmd_help = "Magic SysRq key",
- .cmd_flags = KDB_ENABLE_ALWAYS_SAFE,
+ { .name = "sr",
+ .func = kdb_sr,
+ .usage = "<key>",
+ .help = "Magic SysRq key",
+ .flags = KDB_ENABLE_ALWAYS_SAFE,
},
#endif
#if defined(CONFIG_PRINTK)
- { .cmd_name = "dmesg",
- .cmd_func = kdb_dmesg,
- .cmd_usage = "[lines]",
- .cmd_help = "Display syslog buffer",
- .cmd_flags = KDB_ENABLE_ALWAYS_SAFE,
+ { .name = "dmesg",
+ .func = kdb_dmesg,
+ .usage = "[lines]",
+ .help = "Display syslog buffer",
+ .flags = KDB_ENABLE_ALWAYS_SAFE,
},
#endif
- { .cmd_name = "defcmd",
- .cmd_func = kdb_defcmd,
- .cmd_usage = "name \"usage\" \"help\"",
- .cmd_help = "Define a set of commands, down to endefcmd",
- .cmd_flags = KDB_ENABLE_ALWAYS_SAFE,
+ { .name = "defcmd",
+ .func = kdb_defcmd,
+ .usage = "name \"usage\" \"help\"",
+ .help = "Define a set of commands, down to endefcmd",
+ /*
+ * Macros are always safe because when executed each
+ * internal command re-enters kdb_parse() and is safety
+ * checked individually.
+ */
+ .flags = KDB_ENABLE_ALWAYS_SAFE,
},
- { .cmd_name = "kill",
- .cmd_func = kdb_kill,
- .cmd_usage = "<-signal> <pid>",
- .cmd_help = "Send a signal to a process",
- .cmd_flags = KDB_ENABLE_SIGNAL,
+ { .name = "kill",
+ .func = kdb_kill,
+ .usage = "<-signal> <pid>",
+ .help = "Send a signal to a process",
+ .flags = KDB_ENABLE_SIGNAL,
},
- { .cmd_name = "summary",
- .cmd_func = kdb_summary,
- .cmd_usage = "",
- .cmd_help = "Summarize the system",
- .cmd_minlen = 4,
- .cmd_flags = KDB_ENABLE_ALWAYS_SAFE,
+ { .name = "summary",
+ .func = kdb_summary,
+ .usage = "",
+ .help = "Summarize the system",
+ .minlen = 4,
+ .flags = KDB_ENABLE_ALWAYS_SAFE,
},
- { .cmd_name = "per_cpu",
- .cmd_func = kdb_per_cpu,
- .cmd_usage = "<sym> [<bytes>] [<cpu>]",
- .cmd_help = "Display per_cpu variables",
- .cmd_minlen = 3,
- .cmd_flags = KDB_ENABLE_MEM_READ,
+ { .name = "per_cpu",
+ .func = kdb_per_cpu,
+ .usage = "<sym> [<bytes>] [<cpu>]",
+ .help = "Display per_cpu variables",
+ .minlen = 3,
+ .flags = KDB_ENABLE_MEM_READ,
},
- { .cmd_name = "grephelp",
- .cmd_func = kdb_grep_help,
- .cmd_usage = "",
- .cmd_help = "Display help on | grep",
- .cmd_flags = KDB_ENABLE_ALWAYS_SAFE,
+ { .name = "grephelp",
+ .func = kdb_grep_help,
+ .usage = "",
+ .help = "Display help on | grep",
+ .flags = KDB_ENABLE_ALWAYS_SAFE,
},
};
static kdbtab_t nmicmd = {
- .cmd_name = "disable_nmi",
- .cmd_func = kdb_disable_nmi,
- .cmd_usage = "",
- .cmd_help = "Disable NMI entry to KDB",
- .cmd_flags = KDB_ENABLE_ALWAYS_SAFE,
+ .name = "disable_nmi",
+ .func = kdb_disable_nmi,
+ .usage = "",
+ .help = "Disable NMI entry to KDB",
+ .flags = KDB_ENABLE_ALWAYS_SAFE,
};
/* Initialize the kdb command table. */
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index 170c69aedebb..0d2f9feea0a4 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -109,7 +109,6 @@ extern int kdbgetaddrarg(int, const char **, int*, unsigned long *,
long *, char **);
extern int kdbgetsymval(const char *, kdb_symtab_t *);
extern int kdbnearsym(unsigned long, kdb_symtab_t *);
-extern void kdbnearsym_cleanup(void);
extern char *kdb_strdup(const char *str, gfp_t type);
extern void kdb_symbol_print(unsigned long, const kdb_symtab_t *, unsigned int);
@@ -165,19 +164,6 @@ typedef struct _kdb_bp {
#ifdef CONFIG_KGDB_KDB
extern kdb_bp_t kdb_breakpoints[/* KDB_MAXBPT */];
-/* The KDB shell command table */
-typedef struct _kdbtab {
- char *cmd_name; /* Command name */
- kdb_func_t cmd_func; /* Function to execute command */
- char *cmd_usage; /* Usage String for this command */
- char *cmd_help; /* Help message for this command */
- short cmd_minlen; /* Minimum legal # command
- * chars required */
- kdb_cmdflags_t cmd_flags; /* Command behaviour flags */
- struct list_head list_node; /* Command list */
- bool is_dynamic; /* Command table allocation type */
-} kdbtab_t;
-
extern void kdb_register_table(kdbtab_t *kp, size_t len);
extern int kdb_bt(int, const char **); /* KDB display back trace */
@@ -204,10 +190,8 @@ extern char kdb_grep_string[];
extern int kdb_grep_leading;
extern int kdb_grep_trailing;
extern char *kdb_cmds[];
-extern unsigned long kdb_task_state_string(const char *);
extern char kdb_task_state_char (const struct task_struct *);
-extern unsigned long kdb_task_state(const struct task_struct *p,
- unsigned long mask);
+extern bool kdb_task_state(const struct task_struct *p, const char *mask);
extern void kdb_ps_suppressed(void);
extern void kdb_ps1(const struct task_struct *p);
extern void kdb_send_sig(struct task_struct *p, int sig);
@@ -233,10 +217,6 @@ extern struct task_struct *kdb_curr_task(int);
#define GFP_KDB (in_dbg_master() ? GFP_ATOMIC : GFP_KERNEL)
-extern void *debug_kmalloc(size_t size, gfp_t flags);
-extern void debug_kfree(void *);
-extern void debug_kusage(void);
-
extern struct task_struct *kdb_current_task;
extern struct pt_regs *kdb_current_regs;
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c
index 9f50d22d68e6..df2bface866e 100644
--- a/kernel/debug/kdb/kdb_support.c
+++ b/kernel/debug/kdb/kdb_support.c
@@ -10,7 +10,6 @@
* 03/02/13 added new 2.5 kallsyms <xavier.bru@bull.net>
*/
-#include <stdarg.h>
#include <linux/types.h>
#include <linux/sched.h>
#include <linux/mm.h>
@@ -25,6 +24,7 @@
#include <linux/uaccess.h>
#include <linux/kdb.h>
#include <linux/slab.h>
+#include <linux/ctype.h>
#include "kdb_private.h"
/*
@@ -52,48 +52,48 @@ int kdbgetsymval(const char *symname, kdb_symtab_t *symtab)
}
EXPORT_SYMBOL(kdbgetsymval);
-static char *kdb_name_table[100]; /* arbitrary size */
-
-/*
- * kdbnearsym - Return the name of the symbol with the nearest address
- * less than 'addr'.
+/**
+ * kdbnearsym() - Return the name of the symbol with the nearest address
+ * less than @addr.
+ * @addr: Address to check for near symbol
+ * @symtab: Structure to receive results
*
- * Parameters:
- * addr Address to check for symbol near
- * symtab Structure to receive results
- * Returns:
- * 0 No sections contain this address, symtab zero filled
- * 1 Address mapped to module/symbol/section, data in symtab
- * Remarks:
- * 2.6 kallsyms has a "feature" where it unpacks the name into a
- * string. If that string is reused before the caller expects it
- * then the caller sees its string change without warning. To
- * avoid cluttering up the main kdb code with lots of kdb_strdup,
- * tests and kfree calls, kdbnearsym maintains an LRU list of the
- * last few unique strings. The list is sized large enough to
- * hold active strings, no kdb caller of kdbnearsym makes more
- * than ~20 later calls before using a saved value.
+ * WARNING: This function may return a pointer to a single statically
+ * allocated buffer (namebuf). kdb's unusual calling context (single
+ * threaded, all other CPUs halted) provides us sufficient locking for
+ * this to be safe. The only constraint imposed by the static buffer is
+ * that the caller must consume any previous reply prior to another call
+ * to lookup a new symbol.
+ *
+ * Note that, strictly speaking, some architectures may re-enter the kdb
+ * trap if the system turns out to be very badly damaged and this breaks
+ * the single-threaded assumption above. In these circumstances successful
+ * continuation and exit from the inner trap is unlikely to work and any
+ * user attempting this receives a prominent warning before being allowed
+ * to progress. In these circumstances we remain memory safe because
+ * namebuf[KSYM_NAME_LEN-1] will never change from '\0' although we do
+ * tolerate the possibility of garbled symbol display from the outer kdb
+ * trap.
+ *
+ * Return:
+ * * 0 - No sections contain this address, symtab zero filled
+ * * 1 - Address mapped to module/symbol/section, data in symtab
*/
int kdbnearsym(unsigned long addr, kdb_symtab_t *symtab)
{
int ret = 0;
unsigned long symbolsize = 0;
unsigned long offset = 0;
-#define knt1_size 128 /* must be >= kallsyms table size */
- char *knt1 = NULL;
+ static char namebuf[KSYM_NAME_LEN];
kdb_dbg_printf(AR, "addr=0x%lx, symtab=%px\n", addr, symtab);
memset(symtab, 0, sizeof(*symtab));
if (addr < 4096)
goto out;
- knt1 = debug_kmalloc(knt1_size, GFP_ATOMIC);
- if (!knt1) {
- kdb_func_printf("addr=0x%lx cannot kmalloc knt1\n", addr);
- goto out;
- }
+
symtab->sym_name = kallsyms_lookup(addr, &symbolsize , &offset,
- (char **)(&symtab->mod_name), knt1);
+ (char **)(&symtab->mod_name), namebuf);
if (offset > 8*1024*1024) {
symtab->sym_name = NULL;
addr = offset = symbolsize = 0;
@@ -102,63 +102,14 @@ int kdbnearsym(unsigned long addr, kdb_symtab_t *symtab)
symtab->sym_end = symtab->sym_start + symbolsize;
ret = symtab->sym_name != NULL && *(symtab->sym_name) != '\0';
- if (ret) {
- int i;
- /* Another 2.6 kallsyms "feature". Sometimes the sym_name is
- * set but the buffer passed into kallsyms_lookup is not used,
- * so it contains garbage. The caller has to work out which
- * buffer needs to be saved.
- *
- * What was Rusty smoking when he wrote that code?
- */
- if (symtab->sym_name != knt1) {
- strncpy(knt1, symtab->sym_name, knt1_size);
- knt1[knt1_size-1] = '\0';
- }
- for (i = 0; i < ARRAY_SIZE(kdb_name_table); ++i) {
- if (kdb_name_table[i] &&
- strcmp(kdb_name_table[i], knt1) == 0)
- break;
- }
- if (i >= ARRAY_SIZE(kdb_name_table)) {
- debug_kfree(kdb_name_table[0]);
- memmove(kdb_name_table, kdb_name_table+1,
- sizeof(kdb_name_table[0]) *
- (ARRAY_SIZE(kdb_name_table)-1));
- } else {
- debug_kfree(knt1);
- knt1 = kdb_name_table[i];
- memmove(kdb_name_table+i, kdb_name_table+i+1,
- sizeof(kdb_name_table[0]) *
- (ARRAY_SIZE(kdb_name_table)-i-1));
- }
- i = ARRAY_SIZE(kdb_name_table) - 1;
- kdb_name_table[i] = knt1;
- symtab->sym_name = kdb_name_table[i];
- knt1 = NULL;
- }
-
if (symtab->mod_name == NULL)
symtab->mod_name = "kernel";
kdb_dbg_printf(AR, "returns %d symtab->sym_start=0x%lx, symtab->mod_name=%px, symtab->sym_name=%px (%s)\n",
ret, symtab->sym_start, symtab->mod_name, symtab->sym_name, symtab->sym_name);
-
out:
- debug_kfree(knt1);
return ret;
}
-void kdbnearsym_cleanup(void)
-{
- int i;
- for (i = 0; i < ARRAY_SIZE(kdb_name_table); ++i) {
- if (kdb_name_table[i]) {
- debug_kfree(kdb_name_table[i]);
- kdb_name_table[i] = NULL;
- }
- }
-}
-
static char ks_namebuf[KSYM_NAME_LEN+1], ks_namebuf_prev[KSYM_NAME_LEN+1];
/*
@@ -523,82 +474,7 @@ int kdb_putword(unsigned long addr, unsigned long word, size_t size)
return diag;
}
-/*
- * kdb_task_state_string - Convert a string containing any of the
- * letters DRSTCZEUIMA to a mask for the process state field and
- * return the value. If no argument is supplied, return the mask
- * that corresponds to environment variable PS, DRSTCZEU by
- * default.
- * Inputs:
- * s String to convert
- * Returns:
- * Mask for process state.
- * Notes:
- * The mask folds data from several sources into a single long value, so
- * be careful not to overlap the bits. TASK_* bits are in the LSB,
- * special cases like UNRUNNABLE are in the MSB. As of 2.6.10-rc1 there
- * is no overlap between TASK_* and EXIT_* but that may not always be
- * true, so EXIT_* bits are shifted left 16 bits before being stored in
- * the mask.
- */
-
-/* unrunnable is < 0 */
-#define UNRUNNABLE (1UL << (8*sizeof(unsigned long) - 1))
-#define RUNNING (1UL << (8*sizeof(unsigned long) - 2))
-#define IDLE (1UL << (8*sizeof(unsigned long) - 3))
-#define DAEMON (1UL << (8*sizeof(unsigned long) - 4))
-unsigned long kdb_task_state_string(const char *s)
-{
- long res = 0;
- if (!s) {
- s = kdbgetenv("PS");
- if (!s)
- s = "DRSTCZEU"; /* default value for ps */
- }
- while (*s) {
- switch (*s) {
- case 'D':
- res |= TASK_UNINTERRUPTIBLE;
- break;
- case 'R':
- res |= RUNNING;
- break;
- case 'S':
- res |= TASK_INTERRUPTIBLE;
- break;
- case 'T':
- res |= TASK_STOPPED;
- break;
- case 'C':
- res |= TASK_TRACED;
- break;
- case 'Z':
- res |= EXIT_ZOMBIE << 16;
- break;
- case 'E':
- res |= EXIT_DEAD << 16;
- break;
- case 'U':
- res |= UNRUNNABLE;
- break;
- case 'I':
- res |= IDLE;
- break;
- case 'M':
- res |= DAEMON;
- break;
- case 'A':
- res = ~0UL;
- break;
- default:
- kdb_func_printf("unknown flag '%c' ignored\n", *s);
- break;
- }
- ++s;
- }
- return res;
-}
/*
* kdb_task_state_char - Return the character that represents the task state.
@@ -609,7 +485,6 @@ unsigned long kdb_task_state_string(const char *s)
*/
char kdb_task_state_char (const struct task_struct *p)
{
- unsigned int p_state;
unsigned long tmp;
char state;
int cpu;
@@ -618,25 +493,18 @@ char kdb_task_state_char (const struct task_struct *p)
copy_from_kernel_nofault(&tmp, (char *)p, sizeof(unsigned long)))
return 'E';
- cpu = kdb_process_cpu(p);
- p_state = READ_ONCE(p->__state);
- state = (p_state == 0) ? 'R' :
- (p_state < 0) ? 'U' :
- (p_state & TASK_UNINTERRUPTIBLE) ? 'D' :
- (p_state & TASK_STOPPED) ? 'T' :
- (p_state & TASK_TRACED) ? 'C' :
- (p->exit_state & EXIT_ZOMBIE) ? 'Z' :
- (p->exit_state & EXIT_DEAD) ? 'E' :
- (p_state & TASK_INTERRUPTIBLE) ? 'S' : '?';
+ state = task_state_to_char((struct task_struct *) p);
+
if (is_idle_task(p)) {
/* Idle task. Is it really idle, apart from the kdb
* interrupt? */
+ cpu = kdb_process_cpu(p);
if (!kdb_task_has_cpu(p) || kgdb_info[cpu].irq_depth == 1) {
if (cpu != kdb_initial_cpu)
- state = 'I'; /* idle task */
+ state = '-'; /* idle task */
}
- } else if (!p->mm && state == 'S') {
- state = 'M'; /* sleeping system daemon */
+ } else if (!p->mm && strchr("IMS", state)) {
+ state = tolower(state); /* sleeping system daemon */
}
return state;
}
@@ -646,238 +514,28 @@ char kdb_task_state_char (const struct task_struct *p)
* given by the mask.
* Inputs:
* p struct task for the process
- * mask mask from kdb_task_state_string to select processes
+ * mask set of characters used to select processes; both NULL
+ * and the empty string mean adopt a default filter, which
+ * is to suppress sleeping system daemons and the idle tasks
* Returns:
* True if the process matches at least one criteria defined by the mask.
*/
-unsigned long kdb_task_state(const struct task_struct *p, unsigned long mask)
+bool kdb_task_state(const struct task_struct *p, const char *mask)
{
- char state[] = { kdb_task_state_char(p), '\0' };
- return (mask & kdb_task_state_string(state)) != 0;
-}
-
-/* Last ditch allocator for debugging, so we can still debug even when
- * the GFP_ATOMIC pool has been exhausted. The algorithms are tuned
- * for space usage, not for speed. One smallish memory pool, the free
- * chain is always in ascending address order to allow coalescing,
- * allocations are done in brute force best fit.
- */
-
-struct debug_alloc_header {
- u32 next; /* offset of next header from start of pool */
- u32 size;
- void *caller;
-};
-
-/* The memory returned by this allocator must be aligned, which means
- * so must the header size. Do not assume that sizeof(struct
- * debug_alloc_header) is a multiple of the alignment, explicitly
- * calculate the overhead of this header, including the alignment.
- * The rest of this code must not use sizeof() on any header or
- * pointer to a header.
- */
-#define dah_align 8
-#define dah_overhead ALIGN(sizeof(struct debug_alloc_header), dah_align)
-
-static u64 debug_alloc_pool_aligned[256*1024/dah_align]; /* 256K pool */
-static char *debug_alloc_pool = (char *)debug_alloc_pool_aligned;
-static u32 dah_first, dah_first_call = 1, dah_used, dah_used_max;
-
-/* Locking is awkward. The debug code is called from all contexts,
- * including non maskable interrupts. A normal spinlock is not safe
- * in NMI context. Try to get the debug allocator lock, if it cannot
- * be obtained after a second then give up. If the lock could not be
- * previously obtained on this cpu then only try once.
- *
- * sparse has no annotation for "this function _sometimes_ acquires a
- * lock", so fudge the acquire/release notation.
- */
-static DEFINE_SPINLOCK(dap_lock);
-static int get_dap_lock(void)
- __acquires(dap_lock)
-{
- static int dap_locked = -1;
- int count;
- if (dap_locked == smp_processor_id())
- count = 1;
- else
- count = 1000;
- while (1) {
- if (spin_trylock(&dap_lock)) {
- dap_locked = -1;
- return 1;
- }
- if (!count--)
- break;
- udelay(1000);
- }
- dap_locked = smp_processor_id();
- __acquire(dap_lock);
- return 0;
-}
+ char state = kdb_task_state_char(p);
-void *debug_kmalloc(size_t size, gfp_t flags)
-{
- unsigned int rem, h_offset;
- struct debug_alloc_header *best, *bestprev, *prev, *h;
- void *p = NULL;
- if (!get_dap_lock()) {
- __release(dap_lock); /* we never actually got it */
- return NULL;
- }
- h = (struct debug_alloc_header *)(debug_alloc_pool + dah_first);
- if (dah_first_call) {
- h->size = sizeof(debug_alloc_pool_aligned) - dah_overhead;
- dah_first_call = 0;
- }
- size = ALIGN(size, dah_align);
- prev = best = bestprev = NULL;
- while (1) {
- if (h->size >= size && (!best || h->size < best->size)) {
- best = h;
- bestprev = prev;
- if (h->size == size)
- break;
- }
- if (!h->next)
- break;
- prev = h;
- h = (struct debug_alloc_header *)(debug_alloc_pool + h->next);
- }
- if (!best)
- goto out;
- rem = best->size - size;
- /* The pool must always contain at least one header */
- if (best->next == 0 && bestprev == NULL && rem < dah_overhead)
- goto out;
- if (rem >= dah_overhead) {
- best->size = size;
- h_offset = ((char *)best - debug_alloc_pool) +
- dah_overhead + best->size;
- h = (struct debug_alloc_header *)(debug_alloc_pool + h_offset);
- h->size = rem - dah_overhead;
- h->next = best->next;
- } else
- h_offset = best->next;
- best->caller = __builtin_return_address(0);
- dah_used += best->size;
- dah_used_max = max(dah_used, dah_used_max);
- if (bestprev)
- bestprev->next = h_offset;
- else
- dah_first = h_offset;
- p = (char *)best + dah_overhead;
- memset(p, POISON_INUSE, best->size - 1);
- *((char *)p + best->size - 1) = POISON_END;
-out:
- spin_unlock(&dap_lock);
- return p;
-}
+ /* If there is no mask, then we will filter code that runs when the
+ * scheduler is idling and any system daemons that are currently
+ * sleeping.
+ */
+ if (!mask || mask[0] == '\0')
+ return !strchr("-ims", state);
-void debug_kfree(void *p)
-{
- struct debug_alloc_header *h;
- unsigned int h_offset;
- if (!p)
- return;
- if ((char *)p < debug_alloc_pool ||
- (char *)p >= debug_alloc_pool + sizeof(debug_alloc_pool_aligned)) {
- kfree(p);
- return;
- }
- if (!get_dap_lock()) {
- __release(dap_lock); /* we never actually got it */
- return; /* memory leak, cannot be helped */
- }
- h = (struct debug_alloc_header *)((char *)p - dah_overhead);
- memset(p, POISON_FREE, h->size - 1);
- *((char *)p + h->size - 1) = POISON_END;
- h->caller = NULL;
- dah_used -= h->size;
- h_offset = (char *)h - debug_alloc_pool;
- if (h_offset < dah_first) {
- h->next = dah_first;
- dah_first = h_offset;
- } else {
- struct debug_alloc_header *prev;
- unsigned int prev_offset;
- prev = (struct debug_alloc_header *)(debug_alloc_pool +
- dah_first);
- while (1) {
- if (!prev->next || prev->next > h_offset)
- break;
- prev = (struct debug_alloc_header *)
- (debug_alloc_pool + prev->next);
- }
- prev_offset = (char *)prev - debug_alloc_pool;
- if (prev_offset + dah_overhead + prev->size == h_offset) {
- prev->size += dah_overhead + h->size;
- memset(h, POISON_FREE, dah_overhead - 1);
- *((char *)h + dah_overhead - 1) = POISON_END;
- h = prev;
- h_offset = prev_offset;
- } else {
- h->next = prev->next;
- prev->next = h_offset;
- }
- }
- if (h_offset + dah_overhead + h->size == h->next) {
- struct debug_alloc_header *next;
- next = (struct debug_alloc_header *)
- (debug_alloc_pool + h->next);
- h->size += dah_overhead + next->size;
- h->next = next->next;
- memset(next, POISON_FREE, dah_overhead - 1);
- *((char *)next + dah_overhead - 1) = POISON_END;
- }
- spin_unlock(&dap_lock);
-}
+ /* A is a special case that matches all states */
+ if (strchr(mask, 'A'))
+ return true;
-void debug_kusage(void)
-{
- struct debug_alloc_header *h_free, *h_used;
-#ifdef CONFIG_IA64
- /* FIXME: using dah for ia64 unwind always results in a memory leak.
- * Fix that memory leak first, then set debug_kusage_one_time = 1 for
- * all architectures.
- */
- static int debug_kusage_one_time;
-#else
- static int debug_kusage_one_time = 1;
-#endif
- if (!get_dap_lock()) {
- __release(dap_lock); /* we never actually got it */
- return;
- }
- h_free = (struct debug_alloc_header *)(debug_alloc_pool + dah_first);
- if (dah_first == 0 &&
- (h_free->size == sizeof(debug_alloc_pool_aligned) - dah_overhead ||
- dah_first_call))
- goto out;
- if (!debug_kusage_one_time)
- goto out;
- debug_kusage_one_time = 0;
- kdb_func_printf("debug_kmalloc memory leak dah_first %d\n", dah_first);
- if (dah_first) {
- h_used = (struct debug_alloc_header *)debug_alloc_pool;
- kdb_func_printf("h_used %px size %d\n", h_used, h_used->size);
- }
- do {
- h_used = (struct debug_alloc_header *)
- ((char *)h_free + dah_overhead + h_free->size);
- kdb_func_printf("h_used %px size %d caller %px\n",
- h_used, h_used->size, h_used->caller);
- h_free = (struct debug_alloc_header *)
- (debug_alloc_pool + h_free->next);
- } while (h_free->next);
- h_used = (struct debug_alloc_header *)
- ((char *)h_free + dah_overhead + h_free->size);
- if ((char *)h_used - debug_alloc_pool !=
- sizeof(debug_alloc_pool_aligned))
- kdb_func_printf("h_used %px size %d caller %px\n",
- h_used, h_used->size, h_used->caller);
-out:
- spin_unlock(&dap_lock);
+ return strchr(mask, state);
}
/* Maintain a small stack of kdb_flags to allow recursion without disturbing
diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c
index 25fc85a7aebe..375fb3c9538d 100644
--- a/kernel/dma/coherent.c
+++ b/kernel/dma/coherent.c
@@ -40,7 +40,6 @@ static struct dma_coherent_mem *dma_init_coherent_memory(phys_addr_t phys_addr,
{
struct dma_coherent_mem *dma_mem;
int pages = size >> PAGE_SHIFT;
- int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long);
void *mem_base;
if (!size)
@@ -53,7 +52,7 @@ static struct dma_coherent_mem *dma_init_coherent_memory(phys_addr_t phys_addr,
dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
if (!dma_mem)
goto out_unmap_membase;
- dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
+ dma_mem->bitmap = bitmap_zalloc(pages, GFP_KERNEL);
if (!dma_mem->bitmap)
goto out_free_dma_mem;
@@ -81,7 +80,7 @@ static void dma_release_coherent_memory(struct dma_coherent_mem *mem)
return;
memunmap(mem->virt_base);
- kfree(mem->bitmap);
+ bitmap_free(mem->bitmap);
kfree(mem);
}
diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index 6c90c69e5311..7a14ca29c377 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -552,7 +552,7 @@ static void active_cacheline_remove(struct dma_debug_entry *entry)
* Wrapper function for adding an entry to the hash.
* This function takes care of locking itself.
*/
-static void add_dma_entry(struct dma_debug_entry *entry)
+static void add_dma_entry(struct dma_debug_entry *entry, unsigned long attrs)
{
struct hash_bucket *bucket;
unsigned long flags;
@@ -566,8 +566,9 @@ static void add_dma_entry(struct dma_debug_entry *entry)
if (rc == -ENOMEM) {
pr_err("cacheline tracking ENOMEM, dma-debug disabled\n");
global_disable = true;
- } else if (rc == -EEXIST) {
- pr_err("cacheline tracking EEXIST, overlapping mappings aren't supported\n");
+ } else if (rc == -EEXIST && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) {
+ err_printk(entry->dev, entry,
+ "cacheline tracking EEXIST, overlapping mappings aren't supported\n");
}
}
@@ -1190,7 +1191,8 @@ void debug_dma_map_single(struct device *dev, const void *addr,
EXPORT_SYMBOL(debug_dma_map_single);
void debug_dma_map_page(struct device *dev, struct page *page, size_t offset,
- size_t size, int direction, dma_addr_t dma_addr)
+ size_t size, int direction, dma_addr_t dma_addr,
+ unsigned long attrs)
{
struct dma_debug_entry *entry;
@@ -1221,7 +1223,7 @@ void debug_dma_map_page(struct device *dev, struct page *page, size_t offset,
check_for_illegal_area(dev, addr, size);
}
- add_dma_entry(entry);
+ add_dma_entry(entry, attrs);
}
void debug_dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
@@ -1279,7 +1281,8 @@ void debug_dma_unmap_page(struct device *dev, dma_addr_t addr,
}
void debug_dma_map_sg(struct device *dev, struct scatterlist *sg,
- int nents, int mapped_ents, int direction)
+ int nents, int mapped_ents, int direction,
+ unsigned long attrs)
{
struct dma_debug_entry *entry;
struct scatterlist *s;
@@ -1288,6 +1291,12 @@ void debug_dma_map_sg(struct device *dev, struct scatterlist *sg,
if (unlikely(dma_debug_disabled()))
return;
+ for_each_sg(sg, s, nents, i) {
+ check_for_stack(dev, sg_page(s), s->offset);
+ if (!PageHighMem(sg_page(s)))
+ check_for_illegal_area(dev, sg_virt(s), s->length);
+ }
+
for_each_sg(sg, s, mapped_ents, i) {
entry = dma_entry_alloc();
if (!entry)
@@ -1303,15 +1312,9 @@ void debug_dma_map_sg(struct device *dev, struct scatterlist *sg,
entry->sg_call_ents = nents;
entry->sg_mapped_ents = mapped_ents;
- check_for_stack(dev, sg_page(s), s->offset);
-
- if (!PageHighMem(sg_page(s))) {
- check_for_illegal_area(dev, sg_virt(s), sg_dma_len(s));
- }
-
check_sg_segment(dev, s);
- add_dma_entry(entry);
+ add_dma_entry(entry, attrs);
}
}
@@ -1367,7 +1370,8 @@ void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist,
}
void debug_dma_alloc_coherent(struct device *dev, size_t size,
- dma_addr_t dma_addr, void *virt)
+ dma_addr_t dma_addr, void *virt,
+ unsigned long attrs)
{
struct dma_debug_entry *entry;
@@ -1397,7 +1401,7 @@ void debug_dma_alloc_coherent(struct device *dev, size_t size,
else
entry->pfn = page_to_pfn(virt_to_page(virt));
- add_dma_entry(entry);
+ add_dma_entry(entry, attrs);
}
void debug_dma_free_coherent(struct device *dev, size_t size,
@@ -1428,7 +1432,8 @@ void debug_dma_free_coherent(struct device *dev, size_t size,
}
void debug_dma_map_resource(struct device *dev, phys_addr_t addr, size_t size,
- int direction, dma_addr_t dma_addr)
+ int direction, dma_addr_t dma_addr,
+ unsigned long attrs)
{
struct dma_debug_entry *entry;
@@ -1448,7 +1453,7 @@ void debug_dma_map_resource(struct device *dev, phys_addr_t addr, size_t size,
entry->direction = direction;
entry->map_err_type = MAP_ERR_NOT_CHECKED;
- add_dma_entry(entry);
+ add_dma_entry(entry, attrs);
}
void debug_dma_unmap_resource(struct device *dev, dma_addr_t dma_addr,
diff --git a/kernel/dma/debug.h b/kernel/dma/debug.h
index 83643b3010b2..f525197d3cae 100644
--- a/kernel/dma/debug.h
+++ b/kernel/dma/debug.h
@@ -11,26 +11,30 @@
#ifdef CONFIG_DMA_API_DEBUG
extern void debug_dma_map_page(struct device *dev, struct page *page,
size_t offset, size_t size,
- int direction, dma_addr_t dma_addr);
+ int direction, dma_addr_t dma_addr,
+ unsigned long attrs);
extern void debug_dma_unmap_page(struct device *dev, dma_addr_t addr,
size_t size, int direction);
extern void debug_dma_map_sg(struct device *dev, struct scatterlist *sg,
- int nents, int mapped_ents, int direction);
+ int nents, int mapped_ents, int direction,
+ unsigned long attrs);
extern void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist,
int nelems, int dir);
extern void debug_dma_alloc_coherent(struct device *dev, size_t size,
- dma_addr_t dma_addr, void *virt);
+ dma_addr_t dma_addr, void *virt,
+ unsigned long attrs);
extern void debug_dma_free_coherent(struct device *dev, size_t size,
void *virt, dma_addr_t addr);
extern void debug_dma_map_resource(struct device *dev, phys_addr_t addr,
size_t size, int direction,
- dma_addr_t dma_addr);
+ dma_addr_t dma_addr,
+ unsigned long attrs);
extern void debug_dma_unmap_resource(struct device *dev, dma_addr_t dma_addr,
size_t size, int direction);
@@ -53,7 +57,8 @@ extern void debug_dma_sync_sg_for_device(struct device *dev,
#else /* CONFIG_DMA_API_DEBUG */
static inline void debug_dma_map_page(struct device *dev, struct page *page,
size_t offset, size_t size,
- int direction, dma_addr_t dma_addr)
+ int direction, dma_addr_t dma_addr,
+ unsigned long attrs)
{
}
@@ -63,7 +68,8 @@ static inline void debug_dma_unmap_page(struct device *dev, dma_addr_t addr,
}
static inline void debug_dma_map_sg(struct device *dev, struct scatterlist *sg,
- int nents, int mapped_ents, int direction)
+ int nents, int mapped_ents, int direction,
+ unsigned long attrs)
{
}
@@ -74,7 +80,8 @@ static inline void debug_dma_unmap_sg(struct device *dev,
}
static inline void debug_dma_alloc_coherent(struct device *dev, size_t size,
- dma_addr_t dma_addr, void *virt)
+ dma_addr_t dma_addr, void *virt,
+ unsigned long attrs)
{
}
@@ -85,7 +92,8 @@ static inline void debug_dma_free_coherent(struct device *dev, size_t size,
static inline void debug_dma_map_resource(struct device *dev, phys_addr_t addr,
size_t size, int direction,
- dma_addr_t dma_addr)
+ dma_addr_t dma_addr,
+ unsigned long attrs)
{
}
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 4c6c5e0635e3..50f48e9e4598 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -75,15 +75,45 @@ static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size)
min_not_zero(dev->coherent_dma_mask, dev->bus_dma_limit);
}
+static int dma_set_decrypted(struct device *dev, void *vaddr, size_t size)
+{
+ if (!force_dma_unencrypted(dev))
+ return 0;
+ return set_memory_decrypted((unsigned long)vaddr, 1 << get_order(size));
+}
+
+static int dma_set_encrypted(struct device *dev, void *vaddr, size_t size)
+{
+ int ret;
+
+ if (!force_dma_unencrypted(dev))
+ return 0;
+ ret = set_memory_encrypted((unsigned long)vaddr, 1 << get_order(size));
+ if (ret)
+ pr_warn_ratelimited("leaking DMA memory that can't be re-encrypted\n");
+ return ret;
+}
+
static void __dma_direct_free_pages(struct device *dev, struct page *page,
size_t size)
{
- if (IS_ENABLED(CONFIG_DMA_RESTRICTED_POOL) &&
- swiotlb_free(dev, page, size))
+ if (swiotlb_free(dev, page, size))
return;
dma_free_contiguous(dev, page, size);
}
+static struct page *dma_direct_alloc_swiotlb(struct device *dev, size_t size)
+{
+ struct page *page = swiotlb_alloc(dev, size);
+
+ if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
+ swiotlb_free(dev, page, size);
+ return NULL;
+ }
+
+ return page;
+}
+
static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
gfp_t gfp)
{
@@ -93,18 +123,11 @@ static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
WARN_ON_ONCE(!PAGE_ALIGNED(size));
+ if (is_swiotlb_for_alloc(dev))
+ return dma_direct_alloc_swiotlb(dev, size);
+
gfp |= dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask,
&phys_limit);
- if (IS_ENABLED(CONFIG_DMA_RESTRICTED_POOL) &&
- is_swiotlb_for_alloc(dev)) {
- page = swiotlb_alloc(dev, size);
- if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
- __dma_direct_free_pages(dev, page, size);
- return NULL;
- }
- return page;
- }
-
page = dma_alloc_contiguous(dev, size, gfp);
if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
dma_free_contiguous(dev, page, size);
@@ -133,6 +156,15 @@ again:
return page;
}
+/*
+ * Check if a potentially blocking operations needs to dip into the atomic
+ * pools for the given device/gfp.
+ */
+static bool dma_direct_use_pool(struct device *dev, gfp_t gfp)
+{
+ return !gfpflags_allow_blocking(gfp) && !is_swiotlb_for_alloc(dev);
+}
+
static void *dma_direct_alloc_from_pool(struct device *dev, size_t size,
dma_addr_t *dma_handle, gfp_t gfp)
{
@@ -140,6 +172,9 @@ static void *dma_direct_alloc_from_pool(struct device *dev, size_t size,
u64 phys_mask;
void *ret;
+ if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_DMA_COHERENT_POOL)))
+ return NULL;
+
gfp |= dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask,
&phys_mask);
page = dma_alloc_from_pool(dev, size, &ret, gfp, dma_coherent_ok);
@@ -149,64 +184,103 @@ static void *dma_direct_alloc_from_pool(struct device *dev, size_t size,
return ret;
}
+static void *dma_direct_alloc_no_mapping(struct device *dev, size_t size,
+ dma_addr_t *dma_handle, gfp_t gfp)
+{
+ struct page *page;
+
+ page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO);
+ if (!page)
+ return NULL;
+
+ /* remove any dirty cache lines on the kernel alias */
+ if (!PageHighMem(page))
+ arch_dma_prep_coherent(page, size);
+
+ /* return the page pointer as the opaque cookie */
+ *dma_handle = phys_to_dma_direct(dev, page_to_phys(page));
+ return page;
+}
+
void *dma_direct_alloc(struct device *dev, size_t size,
dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs)
{
+ bool remap = false, set_uncached = false;
struct page *page;
void *ret;
- int err;
size = PAGE_ALIGN(size);
if (attrs & DMA_ATTR_NO_WARN)
gfp |= __GFP_NOWARN;
if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) &&
- !force_dma_unencrypted(dev) && !is_swiotlb_for_alloc(dev)) {
- page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO);
- if (!page)
- return NULL;
- /* remove any dirty cache lines on the kernel alias */
- if (!PageHighMem(page))
- arch_dma_prep_coherent(page, size);
- *dma_handle = phys_to_dma_direct(dev, page_to_phys(page));
- /* return the page pointer as the opaque cookie */
- return page;
- }
+ !force_dma_unencrypted(dev) && !is_swiotlb_for_alloc(dev))
+ return dma_direct_alloc_no_mapping(dev, size, dma_handle, gfp);
- if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) &&
- !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
- !IS_ENABLED(CONFIG_DMA_GLOBAL_POOL) &&
- !dev_is_dma_coherent(dev) &&
- !is_swiotlb_for_alloc(dev))
- return arch_dma_alloc(dev, size, dma_handle, gfp, attrs);
+ if (!dev_is_dma_coherent(dev)) {
+ /*
+ * Fallback to the arch handler if it exists. This should
+ * eventually go away.
+ */
+ if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) &&
+ !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
+ !IS_ENABLED(CONFIG_DMA_GLOBAL_POOL) &&
+ !is_swiotlb_for_alloc(dev))
+ return arch_dma_alloc(dev, size, dma_handle, gfp,
+ attrs);
- if (IS_ENABLED(CONFIG_DMA_GLOBAL_POOL) &&
- !dev_is_dma_coherent(dev))
- return dma_alloc_from_global_coherent(dev, size, dma_handle);
+ /*
+ * If there is a global pool, always allocate from it for
+ * non-coherent devices.
+ */
+ if (IS_ENABLED(CONFIG_DMA_GLOBAL_POOL))
+ return dma_alloc_from_global_coherent(dev, size,
+ dma_handle);
+
+ /*
+ * Otherwise remap if the architecture is asking for it. But
+ * given that remapping memory is a blocking operation we'll
+ * instead have to dip into the atomic pools.
+ */
+ remap = IS_ENABLED(CONFIG_DMA_DIRECT_REMAP);
+ if (remap) {
+ if (dma_direct_use_pool(dev, gfp))
+ return dma_direct_alloc_from_pool(dev, size,
+ dma_handle, gfp);
+ } else {
+ if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED))
+ return NULL;
+ set_uncached = true;
+ }
+ }
/*
- * Remapping or decrypting memory may block. If either is required and
- * we can't block, allocate the memory from the atomic pools.
- * If restricted DMA (i.e., is_swiotlb_for_alloc) is required, one must
- * set up another device coherent pool by shared-dma-pool and use
- * dma_alloc_from_dev_coherent instead.
+ * Decrypting memory may block, so allocate the memory from the atomic
+ * pools if we can't block.
*/
- if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) &&
- !gfpflags_allow_blocking(gfp) &&
- (force_dma_unencrypted(dev) ||
- (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
- !dev_is_dma_coherent(dev))) &&
- !is_swiotlb_for_alloc(dev))
+ if (force_dma_unencrypted(dev) && dma_direct_use_pool(dev, gfp))
return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp);
/* we always manually zero the memory once we are done */
page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO);
if (!page)
return NULL;
+ if (PageHighMem(page)) {
+ /*
+ * Depending on the cma= arguments and per-arch setup,
+ * dma_alloc_contiguous could return highmem pages.
+ * Without remapping there is no way to return them here, so
+ * log an error and fail.
+ */
+ if (!IS_ENABLED(CONFIG_DMA_REMAP)) {
+ dev_info(dev, "Rejecting highmem page from CMA.\n");
+ goto out_free_pages;
+ }
+ remap = true;
+ set_uncached = false;
+ }
- if ((IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
- !dev_is_dma_coherent(dev)) ||
- (IS_ENABLED(CONFIG_DMA_REMAP) && PageHighMem(page))) {
+ if (remap) {
/* remove any dirty cache lines on the kernel alias */
arch_dma_prep_coherent(page, size);
@@ -216,56 +290,27 @@ void *dma_direct_alloc(struct device *dev, size_t size,
__builtin_return_address(0));
if (!ret)
goto out_free_pages;
- if (force_dma_unencrypted(dev)) {
- err = set_memory_decrypted((unsigned long)ret,
- 1 << get_order(size));
- if (err)
- goto out_free_pages;
- }
- memset(ret, 0, size);
- goto done;
- }
-
- if (PageHighMem(page)) {
- /*
- * Depending on the cma= arguments and per-arch setup
- * dma_alloc_contiguous could return highmem pages.
- * Without remapping there is no way to return them here,
- * so log an error and fail.
- */
- dev_info(dev, "Rejecting highmem page from CMA.\n");
- goto out_free_pages;
- }
-
- ret = page_address(page);
- if (force_dma_unencrypted(dev)) {
- err = set_memory_decrypted((unsigned long)ret,
- 1 << get_order(size));
- if (err)
+ } else {
+ ret = page_address(page);
+ if (dma_set_decrypted(dev, ret, size))
goto out_free_pages;
}
memset(ret, 0, size);
- if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) &&
- !dev_is_dma_coherent(dev)) {
+ if (set_uncached) {
arch_dma_prep_coherent(page, size);
ret = arch_dma_set_uncached(ret, size);
if (IS_ERR(ret))
goto out_encrypt_pages;
}
-done:
+
*dma_handle = phys_to_dma_direct(dev, page_to_phys(page));
return ret;
out_encrypt_pages:
- if (force_dma_unencrypted(dev)) {
- err = set_memory_encrypted((unsigned long)page_address(page),
- 1 << get_order(size));
- /* If memory cannot be re-encrypted, it must be leaked */
- if (err)
- return NULL;
- }
+ if (dma_set_encrypted(dev, page_address(page), size))
+ return NULL;
out_free_pages:
__dma_direct_free_pages(dev, page, size);
return NULL;
@@ -304,13 +349,14 @@ void dma_direct_free(struct device *dev, size_t size,
dma_free_from_pool(dev, cpu_addr, PAGE_ALIGN(size)))
return;
- if (force_dma_unencrypted(dev))
- set_memory_encrypted((unsigned long)cpu_addr, 1 << page_order);
-
- if (IS_ENABLED(CONFIG_DMA_REMAP) && is_vmalloc_addr(cpu_addr))
+ if (IS_ENABLED(CONFIG_DMA_REMAP) && is_vmalloc_addr(cpu_addr)) {
vunmap(cpu_addr);
- else if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_CLEAR_UNCACHED))
- arch_dma_clear_uncached(cpu_addr, size);
+ } else {
+ if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_CLEAR_UNCACHED))
+ arch_dma_clear_uncached(cpu_addr, size);
+ if (dma_set_encrypted(dev, cpu_addr, 1 << page_order))
+ return;
+ }
__dma_direct_free_pages(dev, dma_direct_to_page(dev, dma_addr), size);
}
@@ -321,9 +367,7 @@ struct page *dma_direct_alloc_pages(struct device *dev, size_t size,
struct page *page;
void *ret;
- if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) &&
- force_dma_unencrypted(dev) && !gfpflags_allow_blocking(gfp) &&
- !is_swiotlb_for_alloc(dev))
+ if (force_dma_unencrypted(dev) && dma_direct_use_pool(dev, gfp))
return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp);
page = __dma_direct_alloc_pages(dev, size, gfp);
@@ -341,11 +385,8 @@ struct page *dma_direct_alloc_pages(struct device *dev, size_t size,
}
ret = page_address(page);
- if (force_dma_unencrypted(dev)) {
- if (set_memory_decrypted((unsigned long)ret,
- 1 << get_order(size)))
- goto out_free_pages;
- }
+ if (dma_set_decrypted(dev, ret, size))
+ goto out_free_pages;
memset(ret, 0, size);
*dma_handle = phys_to_dma_direct(dev, page_to_phys(page));
return page;
@@ -366,9 +407,8 @@ void dma_direct_free_pages(struct device *dev, size_t size,
dma_free_from_pool(dev, vaddr, size))
return;
- if (force_dma_unencrypted(dev))
- set_memory_encrypted((unsigned long)vaddr, 1 << page_order);
-
+ if (dma_set_encrypted(dev, vaddr, 1 << page_order))
+ return;
__dma_direct_free_pages(dev, page, size);
}
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 7ee5284bff58..9478eccd1c8e 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -156,7 +156,7 @@ dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page,
addr = dma_direct_map_page(dev, page, offset, size, dir, attrs);
else
addr = ops->map_page(dev, page, offset, size, dir, attrs);
- debug_dma_map_page(dev, page, offset, size, dir, addr);
+ debug_dma_map_page(dev, page, offset, size, dir, addr, attrs);
return addr;
}
@@ -195,7 +195,7 @@ static int __dma_map_sg_attrs(struct device *dev, struct scatterlist *sg,
ents = ops->map_sg(dev, sg, nents, dir, attrs);
if (ents > 0)
- debug_dma_map_sg(dev, sg, nents, ents, dir);
+ debug_dma_map_sg(dev, sg, nents, ents, dir, attrs);
else if (WARN_ON_ONCE(ents != -EINVAL && ents != -ENOMEM &&
ents != -EIO))
return -EIO;
@@ -206,7 +206,8 @@ static int __dma_map_sg_attrs(struct device *dev, struct scatterlist *sg,
/**
* dma_map_sg_attrs - Map the given buffer for DMA
* @dev: The device for which to perform the DMA operation
- * @sg: The sg_table object describing the buffer
+ * @sg: The sg_table object describing the buffer
+ * @nents: Number of entries to map
* @dir: DMA direction
* @attrs: Optional DMA attributes for the map operation
*
@@ -248,12 +249,12 @@ EXPORT_SYMBOL(dma_map_sg_attrs);
* Returns 0 on success or a negative error code on error. The following
* error codes are supported with the given meaning:
*
- * -EINVAL - An invalid argument, unaligned access or other error
- * in usage. Will not succeed if retried.
- * -ENOMEM - Insufficient resources (like memory or IOVA space) to
- * complete the mapping. Should succeed if retried later.
- * -EIO - Legacy error code with an unknown meaning. eg. this is
- * returned if a lower level call returned DMA_MAPPING_ERROR.
+ * -EINVAL An invalid argument, unaligned access or other error
+ * in usage. Will not succeed if retried.
+ * -ENOMEM Insufficient resources (like memory or IOVA space) to
+ * complete the mapping. Should succeed if retried later.
+ * -EIO Legacy error code with an unknown meaning. eg. this is
+ * returned if a lower level call returned DMA_MAPPING_ERROR.
*/
int dma_map_sgtable(struct device *dev, struct sg_table *sgt,
enum dma_data_direction dir, unsigned long attrs)
@@ -295,16 +296,12 @@ dma_addr_t dma_map_resource(struct device *dev, phys_addr_t phys_addr,
if (WARN_ON_ONCE(!dev->dma_mask))
return DMA_MAPPING_ERROR;
- /* Don't allow RAM to be mapped */
- if (WARN_ON_ONCE(pfn_valid(PHYS_PFN(phys_addr))))
- return DMA_MAPPING_ERROR;
-
if (dma_map_direct(dev, ops))
addr = dma_direct_map_resource(dev, phys_addr, size, dir, attrs);
else if (ops->map_resource)
addr = ops->map_resource(dev, phys_addr, size, dir, attrs);
- debug_dma_map_resource(dev, phys_addr, size, dir, addr);
+ debug_dma_map_resource(dev, phys_addr, size, dir, addr, attrs);
return addr;
}
EXPORT_SYMBOL(dma_map_resource);
@@ -509,7 +506,7 @@ void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle,
else
return NULL;
- debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr);
+ debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr, attrs);
return cpu_addr;
}
EXPORT_SYMBOL(dma_alloc_attrs);
@@ -565,7 +562,7 @@ struct page *dma_alloc_pages(struct device *dev, size_t size,
struct page *page = __dma_alloc_pages(dev, size, dma_handle, dir, gfp);
if (page)
- debug_dma_map_page(dev, page, 0, size, dir, *dma_handle);
+ debug_dma_map_page(dev, page, 0, size, dir, *dma_handle, 0);
return page;
}
EXPORT_SYMBOL_GPL(dma_alloc_pages);
@@ -643,7 +640,7 @@ struct sg_table *dma_alloc_noncontiguous(struct device *dev, size_t size,
if (sgt) {
sgt->nents = 1;
- debug_dma_map_sg(dev, sgt->sgl, sgt->orig_nents, 1, dir);
+ debug_dma_map_sg(dev, sgt->sgl, sgt->orig_nents, 1, dir, attrs);
}
return sgt;
}
diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c
index 5f84e6cdb78e..4d40dcce7604 100644
--- a/kernel/dma/pool.c
+++ b/kernel/dma/pool.c
@@ -203,7 +203,7 @@ static int __init dma_atomic_pool_init(void)
GFP_KERNEL);
if (!atomic_pool_kernel)
ret = -ENOMEM;
- if (IS_ENABLED(CONFIG_ZONE_DMA)) {
+ if (has_managed_dma()) {
atomic_pool_dma = __dma_atomic_pool_init(atomic_pool_size,
GFP_KERNEL | GFP_DMA);
if (!atomic_pool_dma)
@@ -226,7 +226,7 @@ static inline struct gen_pool *dma_guess_pool(struct gen_pool *prev, gfp_t gfp)
if (prev == NULL) {
if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32))
return atomic_pool_dma32;
- if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp & GFP_DMA))
+ if (atomic_pool_dma && (gfp & GFP_DMA))
return atomic_pool_dma;
return atomic_pool_kernel;
}
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 87c40517e822..8e840fbbed7c 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -34,7 +34,7 @@
#include <linux/highmem.h>
#include <linux/gfp.h>
#include <linux/scatterlist.h>
-#include <linux/mem_encrypt.h>
+#include <linux/cc_platform.h>
#include <linux/set_memory.h>
#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>
@@ -247,7 +247,7 @@ swiotlb_init(int verbose)
return;
fail_free_mem:
- memblock_free_early(__pa(tlb), bytes);
+ memblock_free(tlb, bytes);
fail:
pr_warn("Cannot allocate buffer");
}
@@ -459,7 +459,7 @@ static unsigned int wrap_index(struct io_tlb_mem *mem, unsigned int index)
* allocate a buffer from that IO TLB pool.
*/
static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr,
- size_t alloc_size)
+ size_t alloc_size, unsigned int alloc_align_mask)
{
struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
unsigned long boundary_mask = dma_get_seg_boundary(dev);
@@ -483,6 +483,7 @@ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr,
stride = (iotlb_align_mask >> IO_TLB_SHIFT) + 1;
if (alloc_size >= PAGE_SIZE)
stride = max(stride, stride << (PAGE_SHIFT - IO_TLB_SHIFT));
+ stride = max(stride, (alloc_align_mask >> IO_TLB_SHIFT) + 1);
spin_lock_irqsave(&mem->lock, flags);
if (unlikely(nslots > mem->nslabs - mem->used))
@@ -541,7 +542,8 @@ found:
phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
size_t mapping_size, size_t alloc_size,
- enum dma_data_direction dir, unsigned long attrs)
+ unsigned int alloc_align_mask, enum dma_data_direction dir,
+ unsigned long attrs)
{
struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
unsigned int offset = swiotlb_align_offset(dev, orig_addr);
@@ -552,7 +554,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
if (!mem)
panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer");
- if (mem_encrypt_active())
+ if (cc_platform_has(CC_ATTR_MEM_ENCRYPT))
pr_warn_once("Memory encryption is active and system is using DMA bounce buffers\n");
if (mapping_size > alloc_size) {
@@ -561,7 +563,8 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
return (phys_addr_t)DMA_MAPPING_ERROR;
}
- index = swiotlb_find_slots(dev, orig_addr, alloc_size + offset);
+ index = swiotlb_find_slots(dev, orig_addr,
+ alloc_size + offset, alloc_align_mask);
if (index == -1) {
if (!(attrs & DMA_ATTR_NO_WARN))
dev_warn_ratelimited(dev,
@@ -675,7 +678,7 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size,
trace_swiotlb_bounced(dev, phys_to_dma(dev, paddr), size,
swiotlb_force);
- swiotlb_addr = swiotlb_tbl_map_single(dev, paddr, size, size, dir,
+ swiotlb_addr = swiotlb_tbl_map_single(dev, paddr, size, size, 0, dir,
attrs);
if (swiotlb_addr == (phys_addr_t)DMA_MAPPING_ERROR)
return DMA_MAPPING_ERROR;
@@ -759,7 +762,7 @@ struct page *swiotlb_alloc(struct device *dev, size_t size)
if (!mem)
return NULL;
- index = swiotlb_find_slots(dev, 0, size);
+ index = swiotlb_find_slots(dev, 0, size, 0);
if (index == -1)
return NULL;
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index bf16395b9e13..bad713684c2e 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -171,10 +171,8 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL))
handle_signal_work(regs, ti_work);
- if (ti_work & _TIF_NOTIFY_RESUME) {
+ if (ti_work & _TIF_NOTIFY_RESUME)
tracehook_notify_resume(regs);
- rseq_handle_notify_resume(NULL, regs);
- }
/* Architecture specific TIF work */
arch_exit_to_user_mode_work(regs, ti_work);
@@ -189,7 +187,7 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
/* Check if any of the above work has queued a deferred wakeup */
tick_nohz_user_enter_prepare();
- ti_work = READ_ONCE(current_thread_info()->flags);
+ ti_work = read_thread_flags();
}
/* Return the latest work state for arch_exit_to_user_mode() */
@@ -198,7 +196,7 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
static void exit_to_user_mode_prepare(struct pt_regs *regs)
{
- unsigned long ti_work = READ_ONCE(current_thread_info()->flags);
+ unsigned long ti_work = read_thread_flags();
lockdep_assert_irqs_disabled();
diff --git a/kernel/entry/kvm.c b/kernel/entry/kvm.c
index 49972ee99aff..96d476e06c77 100644
--- a/kernel/entry/kvm.c
+++ b/kernel/entry/kvm.c
@@ -26,7 +26,7 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work)
if (ret)
return ret;
- ti_work = READ_ONCE(current_thread_info()->flags);
+ ti_work = read_thread_flags();
} while (ti_work & XFER_TO_GUEST_MODE_WORK || need_resched());
return 0;
}
@@ -43,7 +43,7 @@ int xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu)
* disabled in the inner loop before going into guest mode. No need
* to disable interrupts here.
*/
- ti_work = READ_ONCE(current_thread_info()->flags);
+ ti_work = read_thread_flags();
if (!(ti_work & XFER_TO_GUEST_MODE_WORK))
return 0;
diff --git a/kernel/entry/syscall_user_dispatch.c b/kernel/entry/syscall_user_dispatch.c
index c240302f56e2..0b6379adff6b 100644
--- a/kernel/entry/syscall_user_dispatch.c
+++ b/kernel/entry/syscall_user_dispatch.c
@@ -47,14 +47,18 @@ bool syscall_user_dispatch(struct pt_regs *regs)
* access_ok() is performed once, at prctl time, when
* the selector is loaded by userspace.
*/
- if (unlikely(__get_user(state, sd->selector)))
- do_exit(SIGSEGV);
+ if (unlikely(__get_user(state, sd->selector))) {
+ force_exit_sig(SIGSEGV);
+ return true;
+ }
if (likely(state == SYSCALL_DISPATCH_FILTER_ALLOW))
return false;
- if (state != SYSCALL_DISPATCH_FILTER_BLOCK)
- do_exit(SIGSYS);
+ if (state != SYSCALL_DISPATCH_FILTER_BLOCK) {
+ force_exit_sig(SIGSYS);
+ return true;
+ }
}
sd->on_dispatch = true;
diff --git a/kernel/events/Makefile b/kernel/events/Makefile
index 3c022e33c109..8591c180b52b 100644
--- a/kernel/events/Makefile
+++ b/kernel/events/Makefile
@@ -1,10 +1,5 @@
# SPDX-License-Identifier: GPL-2.0
-ifdef CONFIG_FUNCTION_TRACER
-CFLAGS_REMOVE_core.o = $(CC_FLAGS_FTRACE)
-endif
-
obj-y := core.o ring_buffer.o callchain.o
obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
obj-$(CONFIG_UPROBES) += uprobes.o
-
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 011cc5069b7b..fc18664f49b0 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1808,6 +1808,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
list_add_rcu(&event->event_entry, &ctx->event_list);
ctx->nr_events++;
+ if (event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT)
+ ctx->nr_user++;
if (event->attr.inherit_stat)
ctx->nr_stat++;
@@ -1999,6 +2001,8 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
event->attach_state &= ~PERF_ATTACH_CONTEXT;
ctx->nr_events--;
+ if (event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT)
+ ctx->nr_user--;
if (event->attr.inherit_stat)
ctx->nr_stat--;
@@ -3707,6 +3711,29 @@ static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx,
return 0;
}
+static inline bool event_update_userpage(struct perf_event *event)
+{
+ if (likely(!atomic_read(&event->mmap_count)))
+ return false;
+
+ perf_event_update_time(event);
+ perf_set_shadow_time(event, event->ctx);
+ perf_event_update_userpage(event);
+
+ return true;
+}
+
+static inline void group_update_userpage(struct perf_event *group_event)
+{
+ struct perf_event *event;
+
+ if (!event_update_userpage(group_event))
+ return;
+
+ for_each_sibling_event(event, group_event)
+ event_update_userpage(event);
+}
+
static int merge_sched_in(struct perf_event *event, void *data)
{
struct perf_event_context *ctx = event->ctx;
@@ -3725,14 +3752,15 @@ static int merge_sched_in(struct perf_event *event, void *data)
}
if (event->state == PERF_EVENT_STATE_INACTIVE) {
+ *can_add_hw = 0;
if (event->attr.pinned) {
perf_cgroup_event_disable(event, ctx);
perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
+ } else {
+ ctx->rotate_necessary = 1;
+ perf_mux_hrtimer_restart(cpuctx);
+ group_update_userpage(event);
}
-
- *can_add_hw = 0;
- ctx->rotate_necessary = 1;
- perf_mux_hrtimer_restart(cpuctx);
}
return 0;
@@ -6324,6 +6352,8 @@ accounting:
ring_buffer_attach(event, rb);
+ perf_event_update_time(event);
+ perf_set_shadow_time(event, event->ctx);
perf_event_init_userpage(event);
perf_event_update_userpage(event);
} else {
@@ -6495,26 +6525,43 @@ static void perf_pending_event(struct irq_work *entry)
perf_swevent_put_recursion_context(rctx);
}
-/*
- * We assume there is only KVM supporting the callbacks.
- * Later on, we might change it to a list if there is
- * another virtualization implementation supporting the callbacks.
- */
-struct perf_guest_info_callbacks *perf_guest_cbs;
+#ifdef CONFIG_GUEST_PERF_EVENTS
+struct perf_guest_info_callbacks __rcu *perf_guest_cbs;
+
+DEFINE_STATIC_CALL_RET0(__perf_guest_state, *perf_guest_cbs->state);
+DEFINE_STATIC_CALL_RET0(__perf_guest_get_ip, *perf_guest_cbs->get_ip);
+DEFINE_STATIC_CALL_RET0(__perf_guest_handle_intel_pt_intr, *perf_guest_cbs->handle_intel_pt_intr);
-int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
+void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
{
- perf_guest_cbs = cbs;
- return 0;
+ if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs)))
+ return;
+
+ rcu_assign_pointer(perf_guest_cbs, cbs);
+ static_call_update(__perf_guest_state, cbs->state);
+ static_call_update(__perf_guest_get_ip, cbs->get_ip);
+
+ /* Implementing ->handle_intel_pt_intr is optional. */
+ if (cbs->handle_intel_pt_intr)
+ static_call_update(__perf_guest_handle_intel_pt_intr,
+ cbs->handle_intel_pt_intr);
}
EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
-int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
+void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
{
- perf_guest_cbs = NULL;
- return 0;
+ if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs) != cbs))
+ return;
+
+ rcu_assign_pointer(perf_guest_cbs, NULL);
+ static_call_update(__perf_guest_state, (void *)&__static_call_return0);
+ static_call_update(__perf_guest_get_ip, (void *)&__static_call_return0);
+ static_call_update(__perf_guest_handle_intel_pt_intr,
+ (void *)&__static_call_return0);
+ synchronize_rcu();
}
EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
+#endif
static void
perf_output_sample_regs(struct perf_output_handle *handle,
@@ -7128,7 +7175,6 @@ void perf_output_sample(struct perf_output_handle *handle,
static u64 perf_virt_to_phys(u64 virt)
{
u64 phys_addr = 0;
- struct page *p = NULL;
if (!virt)
return 0;
@@ -7147,14 +7193,15 @@ static u64 perf_virt_to_phys(u64 virt)
* If failed, leave phys_addr as 0.
*/
if (current->mm != NULL) {
+ struct page *p;
+
pagefault_disable();
- if (get_user_page_fast_only(virt, 0, &p))
+ if (get_user_page_fast_only(virt, 0, &p)) {
phys_addr = page_to_phys(p) + virt % PAGE_SIZE;
+ put_page(p);
+ }
pagefault_enable();
}
-
- if (p)
- put_page(p);
}
return phys_addr;
@@ -8320,8 +8367,6 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
else
flags = MAP_PRIVATE;
- if (vma->vm_flags & VM_DENYWRITE)
- flags |= MAP_DENYWRITE;
if (vma->vm_flags & VM_LOCKED)
flags |= MAP_LOCKED;
if (is_vm_hugetlb_page(vma))
@@ -9075,6 +9120,36 @@ static void perf_log_itrace_start(struct perf_event *event)
perf_output_end(&handle);
}
+void perf_report_aux_output_id(struct perf_event *event, u64 hw_id)
+{
+ struct perf_output_handle handle;
+ struct perf_sample_data sample;
+ struct perf_aux_event {
+ struct perf_event_header header;
+ u64 hw_id;
+ } rec;
+ int ret;
+
+ if (event->parent)
+ event = event->parent;
+
+ rec.header.type = PERF_RECORD_AUX_OUTPUT_HW_ID;
+ rec.header.misc = 0;
+ rec.header.size = sizeof(rec);
+ rec.hw_id = hw_id;
+
+ perf_event_header__init_id(&rec.header, &sample, event);
+ ret = perf_output_begin(&handle, &sample, event, rec.header.size);
+
+ if (ret)
+ return;
+
+ perf_output_put(&handle, rec);
+ perf_event__output_id_sample(event, &handle, &sample);
+
+ perf_output_end(&handle);
+}
+
static int
__perf_event_account_interrupt(struct perf_event *event, int throttle)
{
@@ -9705,6 +9780,9 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
continue;
if (event->attr.config != entry->type)
continue;
+ /* Cannot deliver synchronous signal to other task. */
+ if (event->attr.sigtrap)
+ continue;
if (perf_tp_event_match(event, &data, regs))
perf_swevent_event(event, count, &data, regs);
}
@@ -10195,7 +10273,7 @@ static void perf_event_addr_filters_apply(struct perf_event *event)
return;
if (ifh->nr_file_filters) {
- mm = get_task_mm(event->ctx->task);
+ mm = get_task_mm(task);
if (!mm)
goto restart;
@@ -13437,3 +13515,5 @@ struct cgroup_subsys perf_event_cgrp_subsys = {
.threaded = true,
};
#endif /* CONFIG_CGROUP_PERF */
+
+DEFINE_STATIC_CALL_RET0(perf_snapshot_branch_stack, perf_snapshot_branch_stack_t);
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 228801e20788..082832738c8f 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -205,12 +205,7 @@ DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user)
static inline int get_recursion_context(int *recursion)
{
- unsigned int pc = preempt_count();
- unsigned char rctx = 0;
-
- rctx += !!(pc & (NMI_MASK));
- rctx += !!(pc & (NMI_MASK | HARDIRQ_MASK));
- rctx += !!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET));
+ unsigned char rctx = interrupt_context_level();
if (recursion[rctx])
return -1;
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index af24dc3febbe..6357c3580d07 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -167,7 +167,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
addr + PAGE_SIZE);
if (new_page) {
- err = mem_cgroup_charge(new_page, vma->vm_mm, GFP_KERNEL);
+ err = mem_cgroup_charge(page_folio(new_page), vma->vm_mm,
+ GFP_KERNEL);
if (err)
return err;
}
diff --git a/kernel/exit.c b/kernel/exit.c
index 91a43e57a32e..f702a6a63686 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -48,7 +48,6 @@
#include <linux/pipe_fs_i.h>
#include <linux/audit.h> /* for audit_free() */
#include <linux/resource.h>
-#include <linux/blkdev.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/tracehook.h>
#include <linux/fs_struct.h>
@@ -64,6 +63,7 @@
#include <linux/rcuwait.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
+#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <asm/unistd.h>
@@ -168,6 +168,7 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
{
struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
+ kprobe_flush_task(tsk);
perf_event_delayed_put(tsk);
trace_sched_process_free(tsk);
put_task_struct(tsk);
@@ -339,6 +340,46 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
}
}
+static void coredump_task_exit(struct task_struct *tsk)
+{
+ struct core_state *core_state;
+
+ /*
+ * Serialize with any possible pending coredump.
+ * We must hold siglock around checking core_state
+ * and setting PF_POSTCOREDUMP. The core-inducing thread
+ * will increment ->nr_threads for each thread in the
+ * group without PF_POSTCOREDUMP set.
+ */
+ spin_lock_irq(&tsk->sighand->siglock);
+ tsk->flags |= PF_POSTCOREDUMP;
+ core_state = tsk->signal->core_state;
+ spin_unlock_irq(&tsk->sighand->siglock);
+ if (core_state) {
+ struct core_thread self;
+
+ self.task = current;
+ if (self.task->flags & PF_SIGNALED)
+ self.next = xchg(&core_state->dumper.next, &self);
+ else
+ self.task = NULL;
+ /*
+ * Implies mb(), the result of xchg() must be visible
+ * to core_state->dumper.
+ */
+ if (atomic_dec_and_test(&core_state->nr_threads))
+ complete(&core_state->startup);
+
+ for (;;) {
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ if (!self.task) /* see coredump_finish() */
+ break;
+ freezable_schedule();
+ }
+ __set_current_state(TASK_RUNNING);
+ }
+}
+
#ifdef CONFIG_MEMCG
/*
* A task is exiting. If it owned this mm, find a new owner for the mm.
@@ -434,47 +475,12 @@ assign_new_owner:
static void exit_mm(void)
{
struct mm_struct *mm = current->mm;
- struct core_state *core_state;
exit_mm_release(current, mm);
if (!mm)
return;
sync_mm_rss(mm);
- /*
- * Serialize with any possible pending coredump.
- * We must hold mmap_lock around checking core_state
- * and clearing tsk->mm. The core-inducing thread
- * will increment ->nr_threads for each thread in the
- * group with ->mm != NULL.
- */
mmap_read_lock(mm);
- core_state = mm->core_state;
- if (core_state) {
- struct core_thread self;
-
- mmap_read_unlock(mm);
-
- self.task = current;
- if (self.task->flags & PF_SIGNALED)
- self.next = xchg(&core_state->dumper.next, &self);
- else
- self.task = NULL;
- /*
- * Implies mb(), the result of xchg() must be visible
- * to core_state->dumper.
- */
- if (atomic_dec_and_test(&core_state->nr_threads))
- complete(&core_state->startup);
-
- for (;;) {
- set_current_state(TASK_UNINTERRUPTIBLE);
- if (!self.task) /* see coredump_finish() */
- break;
- freezable_schedule();
- }
- __set_current_state(TASK_RUNNING);
- mmap_read_lock(mm);
- }
mmgrab(mm);
BUG_ON(mm != current->active_mm);
/* more a memory barrier than a real lock */
@@ -762,6 +768,7 @@ void __noreturn do_exit(long code)
profile_task_exit(tsk);
kcov_task_exit(tsk);
+ coredump_task_exit(tsk);
ptrace_event(PTRACE_EVENT_EXIT, code);
validate_creds_for_do_exit(tsk);
diff --git a/kernel/extable.c b/kernel/extable.c
index b0ea5eb0c3b4..b6f330f0fe74 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -62,40 +62,13 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr)
return e;
}
-int init_kernel_text(unsigned long addr)
-{
- if (addr >= (unsigned long)_sinittext &&
- addr < (unsigned long)_einittext)
- return 1;
- return 0;
-}
-
int notrace core_kernel_text(unsigned long addr)
{
- if (addr >= (unsigned long)_stext &&
- addr < (unsigned long)_etext)
+ if (is_kernel_text(addr))
return 1;
- if (system_state < SYSTEM_RUNNING &&
- init_kernel_text(addr))
- return 1;
- return 0;
-}
-
-/**
- * core_kernel_data - tell if addr points to kernel data
- * @addr: address to test
- *
- * Returns true if @addr passed in is from the core kernel data
- * section.
- *
- * Note: On some archs it may return true for core RODATA, and false
- * for others. But will always be true for core RW data.
- */
-int core_kernel_data(unsigned long addr)
-{
- if (addr >= (unsigned long)_sdata &&
- addr < (unsigned long)_edata)
+ if (system_state < SYSTEM_FREEING_INITMEM &&
+ is_kernel_inittext(addr))
return 1;
return 0;
}
@@ -112,7 +85,7 @@ int __kernel_text_address(unsigned long addr)
* Since we are after the module-symbols check, there's
* no danger of address overlap:
*/
- if (init_kernel_text(addr))
+ if (is_kernel_inittext(addr))
return 1;
return 0;
}
diff --git a/kernel/fork.c b/kernel/fork.c
index 695d1343a254..1c989cc4208a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -42,6 +42,7 @@
#include <linux/mmu_notifier.h>
#include <linux/fs.h>
#include <linux/mm.h>
+#include <linux/mm_inline.h>
#include <linux/vmacache.h>
#include <linux/nsproxy.h>
#include <linux/capability.h>
@@ -76,7 +77,6 @@
#include <linux/taskstats_kern.h>
#include <linux/random.h>
#include <linux/tty.h>
-#include <linux/blkdev.h>
#include <linux/fs_struct.h>
#include <linux/magic.h>
#include <linux/perf_event.h>
@@ -366,12 +366,14 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
*new = data_race(*orig);
INIT_LIST_HEAD(&new->anon_vma_chain);
new->vm_next = new->vm_prev = NULL;
+ dup_vma_anon_name(orig, new);
}
return new;
}
void vm_area_free(struct vm_area_struct *vma)
{
+ free_vma_anon_name(vma);
kmem_cache_free(vm_area_cachep, vma);
}
@@ -471,6 +473,20 @@ void free_task(struct task_struct *tsk)
}
EXPORT_SYMBOL(free_task);
+static void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm)
+{
+ struct file *exe_file;
+
+ exe_file = get_mm_exe_file(oldmm);
+ RCU_INIT_POINTER(mm->exe_file, exe_file);
+ /*
+ * We depend on the oldmm having properly denied write access to the
+ * exe_file already.
+ */
+ if (exe_file && deny_write_access(exe_file))
+ pr_warn_once("deny_write_access() failed in %s\n", __func__);
+}
+
#ifdef CONFIG_MMU
static __latent_entropy int dup_mmap(struct mm_struct *mm,
struct mm_struct *oldmm)
@@ -494,7 +510,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING);
/* No ordering required: file already has been exposed. */
- RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
+ dup_mm_exe_file(mm, oldmm);
mm->total_vm = oldmm->total_vm;
mm->data_vm = oldmm->data_vm;
@@ -557,12 +573,9 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT);
file = tmp->vm_file;
if (file) {
- struct inode *inode = file_inode(file);
struct address_space *mapping = file->f_mapping;
get_file(file);
- if (tmp->vm_flags & VM_DENYWRITE)
- put_write_access(inode);
i_mmap_lock_write(mapping);
if (tmp->vm_flags & VM_SHARED)
mapping_allow_writable(mapping);
@@ -640,7 +653,7 @@ static inline void mm_free_pgd(struct mm_struct *mm)
static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
{
mmap_write_lock(oldmm);
- RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
+ dup_mm_exe_file(mm, oldmm);
mmap_write_unlock(oldmm);
return 0;
}
@@ -1033,7 +1046,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
seqcount_init(&mm->write_protect_seq);
mmap_init_lock(mm);
INIT_LIST_HEAD(&mm->mmlist);
- mm->core_state = NULL;
mm_pgtables_bytes_init(mm);
mm->map_count = 0;
mm->locked_vm = 0;
@@ -1052,6 +1064,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
mm->pmd_huge_pte = NULL;
#endif
mm_init_uprobes_state(mm);
+ hugetlb_count_init(mm);
if (current->mm) {
mm->flags = current->mm->flags & MMF_INIT_MASK;
@@ -1150,11 +1163,11 @@ void mmput_async(struct mm_struct *mm)
*
* Main users are mmput() and sys_execve(). Callers prevent concurrent
* invocations: in mmput() nobody alive left, in execve task is single
- * threaded. sys_prctl(PR_SET_MM_MAP/EXE_FILE) also needs to set the
- * mm->exe_file, but does so without using set_mm_exe_file() in order
- * to avoid the need for any locks.
+ * threaded.
+ *
+ * Can only fail if new_exe_file != NULL.
*/
-void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
+int set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
{
struct file *old_exe_file;
@@ -1165,11 +1178,73 @@ void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
*/
old_exe_file = rcu_dereference_raw(mm->exe_file);
- if (new_exe_file)
+ if (new_exe_file) {
+ /*
+ * We expect the caller (i.e., sys_execve) to already denied
+ * write access, so this is unlikely to fail.
+ */
+ if (unlikely(deny_write_access(new_exe_file)))
+ return -EACCES;
get_file(new_exe_file);
+ }
rcu_assign_pointer(mm->exe_file, new_exe_file);
- if (old_exe_file)
+ if (old_exe_file) {
+ allow_write_access(old_exe_file);
fput(old_exe_file);
+ }
+ return 0;
+}
+
+/**
+ * replace_mm_exe_file - replace a reference to the mm's executable file
+ *
+ * This changes mm's executable file (shown as symlink /proc/[pid]/exe),
+ * dealing with concurrent invocation and without grabbing the mmap lock in
+ * write mode.
+ *
+ * Main user is sys_prctl(PR_SET_MM_MAP/EXE_FILE).
+ */
+int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
+{
+ struct vm_area_struct *vma;
+ struct file *old_exe_file;
+ int ret = 0;
+
+ /* Forbid mm->exe_file change if old file still mapped. */
+ old_exe_file = get_mm_exe_file(mm);
+ if (old_exe_file) {
+ mmap_read_lock(mm);
+ for (vma = mm->mmap; vma && !ret; vma = vma->vm_next) {
+ if (!vma->vm_file)
+ continue;
+ if (path_equal(&vma->vm_file->f_path,
+ &old_exe_file->f_path))
+ ret = -EBUSY;
+ }
+ mmap_read_unlock(mm);
+ fput(old_exe_file);
+ if (ret)
+ return ret;
+ }
+
+ /* set the new file, lockless */
+ ret = deny_write_access(new_exe_file);
+ if (ret)
+ return -EACCES;
+ get_file(new_exe_file);
+
+ old_exe_file = xchg(&mm->exe_file, new_exe_file);
+ if (old_exe_file) {
+ /*
+ * Don't race with dup_mmap() getting the file and disallowing
+ * write access while someone might open the file writable.
+ */
+ mmap_read_lock(mm);
+ allow_write_access(old_exe_file);
+ fput(old_exe_file);
+ mmap_read_unlock(mm);
+ }
+ return 0;
}
/**
@@ -1189,7 +1264,6 @@ struct file *get_mm_exe_file(struct mm_struct *mm)
rcu_read_unlock();
return exe_file;
}
-EXPORT_SYMBOL(get_mm_exe_file);
/**
* get_task_exe_file - acquire a reference to the task's executable file
@@ -1212,7 +1286,6 @@ struct file *get_task_exe_file(struct task_struct *task)
task_unlock(task);
return exe_file;
}
-EXPORT_SYMBOL(get_task_exe_file);
/**
* get_task_mm - acquire a reference to the task's mm
@@ -1320,8 +1393,7 @@ static void mm_release(struct task_struct *tsk, struct mm_struct *mm)
* purposes.
*/
if (tsk->clear_child_tid) {
- if (!(tsk->signal->flags & SIGNAL_GROUP_COREDUMP) &&
- atomic_read(&mm->mm_users) > 1) {
+ if (atomic_read(&mm->mm_users) > 1) {
/*
* We don't check the error code - if userspace has
* not set up a proper pointer then tough luck.
@@ -1487,32 +1559,6 @@ out:
return error;
}
-static int copy_io(unsigned long clone_flags, struct task_struct *tsk)
-{
-#ifdef CONFIG_BLOCK
- struct io_context *ioc = current->io_context;
- struct io_context *new_ioc;
-
- if (!ioc)
- return 0;
- /*
- * Share io context with parent, if CLONE_IO is set
- */
- if (clone_flags & CLONE_IO) {
- ioc_task_link(ioc);
- tsk->io_context = ioc;
- } else if (ioprio_valid(ioc->ioprio)) {
- new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE);
- if (unlikely(!new_ioc))
- return -ENOMEM;
-
- new_ioc->ioprio = ioc->ioprio;
- put_io_context(new_ioc);
- }
-#endif
- return 0;
-}
-
static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
{
struct sighand_struct *sig;
@@ -2208,6 +2254,7 @@ static __latent_entropy struct task_struct *copy_process(
p->pdeath_signal = 0;
INIT_LIST_HEAD(&p->thread_group);
p->task_works = NULL;
+ clear_posix_cputimers_work(p);
#ifdef CONFIG_KRETPROBES
p->kretprobe_instances.first = NULL;
@@ -2333,7 +2380,7 @@ static __latent_entropy struct task_struct *copy_process(
write_unlock_irq(&tasklist_lock);
proc_fork_connector(p);
- sched_post_fork(p);
+ sched_post_fork(p, args);
cgroup_post_fork(p, args);
perf_event_fork(p);
@@ -2955,7 +3002,7 @@ int unshare_fd(unsigned long unshare_flags, unsigned int max_fds,
int ksys_unshare(unsigned long unshare_flags)
{
struct fs_struct *fs, *new_fs = NULL;
- struct files_struct *fd, *new_fd = NULL;
+ struct files_struct *new_fd = NULL;
struct cred *new_cred = NULL;
struct nsproxy *new_nsproxy = NULL;
int do_sysvsem = 0;
@@ -3042,11 +3089,8 @@ int ksys_unshare(unsigned long unshare_flags)
spin_unlock(&fs->lock);
}
- if (new_fd) {
- fd = current->files;
- current->files = new_fd;
- new_fd = fd;
- }
+ if (new_fd)
+ swap(current->files, new_fd);
task_unlock(current);
diff --git a/kernel/futex.c b/kernel/futex.c
deleted file mode 100644
index e7b4c6121da4..000000000000
--- a/kernel/futex.c
+++ /dev/null
@@ -1,4240 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Fast Userspace Mutexes (which I call "Futexes!").
- * (C) Rusty Russell, IBM 2002
- *
- * Generalized futexes, futex requeueing, misc fixes by Ingo Molnar
- * (C) Copyright 2003 Red Hat Inc, All Rights Reserved
- *
- * Removed page pinning, fix privately mapped COW pages and other cleanups
- * (C) Copyright 2003, 2004 Jamie Lokier
- *
- * Robust futex support started by Ingo Molnar
- * (C) Copyright 2006 Red Hat Inc, All Rights Reserved
- * Thanks to Thomas Gleixner for suggestions, analysis and fixes.
- *
- * PI-futex support started by Ingo Molnar and Thomas Gleixner
- * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
- * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
- *
- * PRIVATE futexes by Eric Dumazet
- * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
- *
- * Requeue-PI support by Darren Hart <dvhltc@us.ibm.com>
- * Copyright (C) IBM Corporation, 2009
- * Thanks to Thomas Gleixner for conceptual design and careful reviews.
- *
- * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
- * enough at me, Linus for the original (flawed) idea, Matthew
- * Kirkwood for proof-of-concept implementation.
- *
- * "The futexes are also cursed."
- * "But they come in a choice of three flavours!"
- */
-#include <linux/compat.h>
-#include <linux/jhash.h>
-#include <linux/pagemap.h>
-#include <linux/syscalls.h>
-#include <linux/freezer.h>
-#include <linux/memblock.h>
-#include <linux/fault-inject.h>
-#include <linux/time_namespace.h>
-
-#include <asm/futex.h>
-
-#include "locking/rtmutex_common.h"
-
-/*
- * READ this before attempting to hack on futexes!
- *
- * Basic futex operation and ordering guarantees
- * =============================================
- *
- * The waiter reads the futex value in user space and calls
- * futex_wait(). This function computes the hash bucket and acquires
- * the hash bucket lock. After that it reads the futex user space value
- * again and verifies that the data has not changed. If it has not changed
- * it enqueues itself into the hash bucket, releases the hash bucket lock
- * and schedules.
- *
- * The waker side modifies the user space value of the futex and calls
- * futex_wake(). This function computes the hash bucket and acquires the
- * hash bucket lock. Then it looks for waiters on that futex in the hash
- * bucket and wakes them.
- *
- * In futex wake up scenarios where no tasks are blocked on a futex, taking
- * the hb spinlock can be avoided and simply return. In order for this
- * optimization to work, ordering guarantees must exist so that the waiter
- * being added to the list is acknowledged when the list is concurrently being
- * checked by the waker, avoiding scenarios like the following:
- *
- * CPU 0 CPU 1
- * val = *futex;
- * sys_futex(WAIT, futex, val);
- * futex_wait(futex, val);
- * uval = *futex;
- * *futex = newval;
- * sys_futex(WAKE, futex);
- * futex_wake(futex);
- * if (queue_empty())
- * return;
- * if (uval == val)
- * lock(hash_bucket(futex));
- * queue();
- * unlock(hash_bucket(futex));
- * schedule();
- *
- * This would cause the waiter on CPU 0 to wait forever because it
- * missed the transition of the user space value from val to newval
- * and the waker did not find the waiter in the hash bucket queue.
- *
- * The correct serialization ensures that a waiter either observes
- * the changed user space value before blocking or is woken by a
- * concurrent waker:
- *
- * CPU 0 CPU 1
- * val = *futex;
- * sys_futex(WAIT, futex, val);
- * futex_wait(futex, val);
- *
- * waiters++; (a)
- * smp_mb(); (A) <-- paired with -.
- * |
- * lock(hash_bucket(futex)); |
- * |
- * uval = *futex; |
- * | *futex = newval;
- * | sys_futex(WAKE, futex);
- * | futex_wake(futex);
- * |
- * `--------> smp_mb(); (B)
- * if (uval == val)
- * queue();
- * unlock(hash_bucket(futex));
- * schedule(); if (waiters)
- * lock(hash_bucket(futex));
- * else wake_waiters(futex);
- * waiters--; (b) unlock(hash_bucket(futex));
- *
- * Where (A) orders the waiters increment and the futex value read through
- * atomic operations (see hb_waiters_inc) and where (B) orders the write
- * to futex and the waiters read (see hb_waiters_pending()).
- *
- * This yields the following case (where X:=waiters, Y:=futex):
- *
- * X = Y = 0
- *
- * w[X]=1 w[Y]=1
- * MB MB
- * r[Y]=y r[X]=x
- *
- * Which guarantees that x==0 && y==0 is impossible; which translates back into
- * the guarantee that we cannot both miss the futex variable change and the
- * enqueue.
- *
- * Note that a new waiter is accounted for in (a) even when it is possible that
- * the wait call can return error, in which case we backtrack from it in (b).
- * Refer to the comment in queue_lock().
- *
- * Similarly, in order to account for waiters being requeued on another
- * address we always increment the waiters for the destination bucket before
- * acquiring the lock. It then decrements them again after releasing it -
- * the code that actually moves the futex(es) between hash buckets (requeue_futex)
- * will do the additional required waiter count housekeeping. This is done for
- * double_lock_hb() and double_unlock_hb(), respectively.
- */
-
-#ifdef CONFIG_HAVE_FUTEX_CMPXCHG
-#define futex_cmpxchg_enabled 1
-#else
-static int __read_mostly futex_cmpxchg_enabled;
-#endif
-
-/*
- * Futex flags used to encode options to functions and preserve them across
- * restarts.
- */
-#ifdef CONFIG_MMU
-# define FLAGS_SHARED 0x01
-#else
-/*
- * NOMMU does not have per process address space. Let the compiler optimize
- * code away.
- */
-# define FLAGS_SHARED 0x00
-#endif
-#define FLAGS_CLOCKRT 0x02
-#define FLAGS_HAS_TIMEOUT 0x04
-
-/*
- * Priority Inheritance state:
- */
-struct futex_pi_state {
- /*
- * list of 'owned' pi_state instances - these have to be
- * cleaned up in do_exit() if the task exits prematurely:
- */
- struct list_head list;
-
- /*
- * The PI object:
- */
- struct rt_mutex_base pi_mutex;
-
- struct task_struct *owner;
- refcount_t refcount;
-
- union futex_key key;
-} __randomize_layout;
-
-/**
- * struct futex_q - The hashed futex queue entry, one per waiting task
- * @list: priority-sorted list of tasks waiting on this futex
- * @task: the task waiting on the futex
- * @lock_ptr: the hash bucket lock
- * @key: the key the futex is hashed on
- * @pi_state: optional priority inheritance state
- * @rt_waiter: rt_waiter storage for use with requeue_pi
- * @requeue_pi_key: the requeue_pi target futex key
- * @bitset: bitset for the optional bitmasked wakeup
- * @requeue_state: State field for futex_requeue_pi()
- * @requeue_wait: RCU wait for futex_requeue_pi() (RT only)
- *
- * We use this hashed waitqueue, instead of a normal wait_queue_entry_t, so
- * we can wake only the relevant ones (hashed queues may be shared).
- *
- * A futex_q has a woken state, just like tasks have TASK_RUNNING.
- * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
- * The order of wakeup is always to make the first condition true, then
- * the second.
- *
- * PI futexes are typically woken before they are removed from the hash list via
- * the rt_mutex code. See unqueue_me_pi().
- */
-struct futex_q {
- struct plist_node list;
-
- struct task_struct *task;
- spinlock_t *lock_ptr;
- union futex_key key;
- struct futex_pi_state *pi_state;
- struct rt_mutex_waiter *rt_waiter;
- union futex_key *requeue_pi_key;
- u32 bitset;
- atomic_t requeue_state;
-#ifdef CONFIG_PREEMPT_RT
- struct rcuwait requeue_wait;
-#endif
-} __randomize_layout;
-
-/*
- * On PREEMPT_RT, the hash bucket lock is a 'sleeping' spinlock with an
- * underlying rtmutex. The task which is about to be requeued could have
- * just woken up (timeout, signal). After the wake up the task has to
- * acquire hash bucket lock, which is held by the requeue code. As a task
- * can only be blocked on _ONE_ rtmutex at a time, the proxy lock blocking
- * and the hash bucket lock blocking would collide and corrupt state.
- *
- * On !PREEMPT_RT this is not a problem and everything could be serialized
- * on hash bucket lock, but aside of having the benefit of common code,
- * this allows to avoid doing the requeue when the task is already on the
- * way out and taking the hash bucket lock of the original uaddr1 when the
- * requeue has been completed.
- *
- * The following state transitions are valid:
- *
- * On the waiter side:
- * Q_REQUEUE_PI_NONE -> Q_REQUEUE_PI_IGNORE
- * Q_REQUEUE_PI_IN_PROGRESS -> Q_REQUEUE_PI_WAIT
- *
- * On the requeue side:
- * Q_REQUEUE_PI_NONE -> Q_REQUEUE_PI_INPROGRESS
- * Q_REQUEUE_PI_IN_PROGRESS -> Q_REQUEUE_PI_DONE/LOCKED
- * Q_REQUEUE_PI_IN_PROGRESS -> Q_REQUEUE_PI_NONE (requeue failed)
- * Q_REQUEUE_PI_WAIT -> Q_REQUEUE_PI_DONE/LOCKED
- * Q_REQUEUE_PI_WAIT -> Q_REQUEUE_PI_IGNORE (requeue failed)
- *
- * The requeue side ignores a waiter with state Q_REQUEUE_PI_IGNORE as this
- * signals that the waiter is already on the way out. It also means that
- * the waiter is still on the 'wait' futex, i.e. uaddr1.
- *
- * The waiter side signals early wakeup to the requeue side either through
- * setting state to Q_REQUEUE_PI_IGNORE or to Q_REQUEUE_PI_WAIT depending
- * on the current state. In case of Q_REQUEUE_PI_IGNORE it can immediately
- * proceed to take the hash bucket lock of uaddr1. If it set state to WAIT,
- * which means the wakeup is interleaving with a requeue in progress it has
- * to wait for the requeue side to change the state. Either to DONE/LOCKED
- * or to IGNORE. DONE/LOCKED means the waiter q is now on the uaddr2 futex
- * and either blocked (DONE) or has acquired it (LOCKED). IGNORE is set by
- * the requeue side when the requeue attempt failed via deadlock detection
- * and therefore the waiter q is still on the uaddr1 futex.
- */
-enum {
- Q_REQUEUE_PI_NONE = 0,
- Q_REQUEUE_PI_IGNORE,
- Q_REQUEUE_PI_IN_PROGRESS,
- Q_REQUEUE_PI_WAIT,
- Q_REQUEUE_PI_DONE,
- Q_REQUEUE_PI_LOCKED,
-};
-
-static const struct futex_q futex_q_init = {
- /* list gets initialized in queue_me()*/
- .key = FUTEX_KEY_INIT,
- .bitset = FUTEX_BITSET_MATCH_ANY,
- .requeue_state = ATOMIC_INIT(Q_REQUEUE_PI_NONE),
-};
-
-/*
- * Hash buckets are shared by all the futex_keys that hash to the same
- * location. Each key may have multiple futex_q structures, one for each task
- * waiting on a futex.
- */
-struct futex_hash_bucket {
- atomic_t waiters;
- spinlock_t lock;
- struct plist_head chain;
-} ____cacheline_aligned_in_smp;
-
-/*
- * The base of the bucket array and its size are always used together
- * (after initialization only in hash_futex()), so ensure that they
- * reside in the same cacheline.
- */
-static struct {
- struct futex_hash_bucket *queues;
- unsigned long hashsize;
-} __futex_data __read_mostly __aligned(2*sizeof(long));
-#define futex_queues (__futex_data.queues)
-#define futex_hashsize (__futex_data.hashsize)
-
-
-/*
- * Fault injections for futexes.
- */
-#ifdef CONFIG_FAIL_FUTEX
-
-static struct {
- struct fault_attr attr;
-
- bool ignore_private;
-} fail_futex = {
- .attr = FAULT_ATTR_INITIALIZER,
- .ignore_private = false,
-};
-
-static int __init setup_fail_futex(char *str)
-{
- return setup_fault_attr(&fail_futex.attr, str);
-}
-__setup("fail_futex=", setup_fail_futex);
-
-static bool should_fail_futex(bool fshared)
-{
- if (fail_futex.ignore_private && !fshared)
- return false;
-
- return should_fail(&fail_futex.attr, 1);
-}
-
-#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
-
-static int __init fail_futex_debugfs(void)
-{
- umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
- struct dentry *dir;
-
- dir = fault_create_debugfs_attr("fail_futex", NULL,
- &fail_futex.attr);
- if (IS_ERR(dir))
- return PTR_ERR(dir);
-
- debugfs_create_bool("ignore-private", mode, dir,
- &fail_futex.ignore_private);
- return 0;
-}
-
-late_initcall(fail_futex_debugfs);
-
-#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
-
-#else
-static inline bool should_fail_futex(bool fshared)
-{
- return false;
-}
-#endif /* CONFIG_FAIL_FUTEX */
-
-#ifdef CONFIG_COMPAT
-static void compat_exit_robust_list(struct task_struct *curr);
-#endif
-
-/*
- * Reflects a new waiter being added to the waitqueue.
- */
-static inline void hb_waiters_inc(struct futex_hash_bucket *hb)
-{
-#ifdef CONFIG_SMP
- atomic_inc(&hb->waiters);
- /*
- * Full barrier (A), see the ordering comment above.
- */
- smp_mb__after_atomic();
-#endif
-}
-
-/*
- * Reflects a waiter being removed from the waitqueue by wakeup
- * paths.
- */
-static inline void hb_waiters_dec(struct futex_hash_bucket *hb)
-{
-#ifdef CONFIG_SMP
- atomic_dec(&hb->waiters);
-#endif
-}
-
-static inline int hb_waiters_pending(struct futex_hash_bucket *hb)
-{
-#ifdef CONFIG_SMP
- /*
- * Full barrier (B), see the ordering comment above.
- */
- smp_mb();
- return atomic_read(&hb->waiters);
-#else
- return 1;
-#endif
-}
-
-/**
- * hash_futex - Return the hash bucket in the global hash
- * @key: Pointer to the futex key for which the hash is calculated
- *
- * We hash on the keys returned from get_futex_key (see below) and return the
- * corresponding hash bucket in the global hash.
- */
-static struct futex_hash_bucket *hash_futex(union futex_key *key)
-{
- u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4,
- key->both.offset);
-
- return &futex_queues[hash & (futex_hashsize - 1)];
-}
-
-
-/**
- * match_futex - Check whether two futex keys are equal
- * @key1: Pointer to key1
- * @key2: Pointer to key2
- *
- * Return 1 if two futex_keys are equal, 0 otherwise.
- */
-static inline int match_futex(union futex_key *key1, union futex_key *key2)
-{
- return (key1 && key2
- && key1->both.word == key2->both.word
- && key1->both.ptr == key2->both.ptr
- && key1->both.offset == key2->both.offset);
-}
-
-enum futex_access {
- FUTEX_READ,
- FUTEX_WRITE
-};
-
-/**
- * futex_setup_timer - set up the sleeping hrtimer.
- * @time: ptr to the given timeout value
- * @timeout: the hrtimer_sleeper structure to be set up
- * @flags: futex flags
- * @range_ns: optional range in ns
- *
- * Return: Initialized hrtimer_sleeper structure or NULL if no timeout
- * value given
- */
-static inline struct hrtimer_sleeper *
-futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
- int flags, u64 range_ns)
-{
- if (!time)
- return NULL;
-
- hrtimer_init_sleeper_on_stack(timeout, (flags & FLAGS_CLOCKRT) ?
- CLOCK_REALTIME : CLOCK_MONOTONIC,
- HRTIMER_MODE_ABS);
- /*
- * If range_ns is 0, calling hrtimer_set_expires_range_ns() is
- * effectively the same as calling hrtimer_set_expires().
- */
- hrtimer_set_expires_range_ns(&timeout->timer, *time, range_ns);
-
- return timeout;
-}
-
-/*
- * Generate a machine wide unique identifier for this inode.
- *
- * This relies on u64 not wrapping in the life-time of the machine; which with
- * 1ns resolution means almost 585 years.
- *
- * This further relies on the fact that a well formed program will not unmap
- * the file while it has a (shared) futex waiting on it. This mapping will have
- * a file reference which pins the mount and inode.
- *
- * If for some reason an inode gets evicted and read back in again, it will get
- * a new sequence number and will _NOT_ match, even though it is the exact same
- * file.
- *
- * It is important that match_futex() will never have a false-positive, esp.
- * for PI futexes that can mess up the state. The above argues that false-negatives
- * are only possible for malformed programs.
- */
-static u64 get_inode_sequence_number(struct inode *inode)
-{
- static atomic64_t i_seq;
- u64 old;
-
- /* Does the inode already have a sequence number? */
- old = atomic64_read(&inode->i_sequence);
- if (likely(old))
- return old;
-
- for (;;) {
- u64 new = atomic64_add_return(1, &i_seq);
- if (WARN_ON_ONCE(!new))
- continue;
-
- old = atomic64_cmpxchg_relaxed(&inode->i_sequence, 0, new);
- if (old)
- return old;
- return new;
- }
-}
-
-/**
- * get_futex_key() - Get parameters which are the keys for a futex
- * @uaddr: virtual address of the futex
- * @fshared: false for a PROCESS_PRIVATE futex, true for PROCESS_SHARED
- * @key: address where result is stored.
- * @rw: mapping needs to be read/write (values: FUTEX_READ,
- * FUTEX_WRITE)
- *
- * Return: a negative error code or 0
- *
- * The key words are stored in @key on success.
- *
- * For shared mappings (when @fshared), the key is:
- *
- * ( inode->i_sequence, page->index, offset_within_page )
- *
- * [ also see get_inode_sequence_number() ]
- *
- * For private mappings (or when !@fshared), the key is:
- *
- * ( current->mm, address, 0 )
- *
- * This allows (cross process, where applicable) identification of the futex
- * without keeping the page pinned for the duration of the FUTEX_WAIT.
- *
- * lock_page() might sleep, the caller should not hold a spinlock.
- */
-static int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key,
- enum futex_access rw)
-{
- unsigned long address = (unsigned long)uaddr;
- struct mm_struct *mm = current->mm;
- struct page *page, *tail;
- struct address_space *mapping;
- int err, ro = 0;
-
- /*
- * The futex address must be "naturally" aligned.
- */
- key->both.offset = address % PAGE_SIZE;
- if (unlikely((address % sizeof(u32)) != 0))
- return -EINVAL;
- address -= key->both.offset;
-
- if (unlikely(!access_ok(uaddr, sizeof(u32))))
- return -EFAULT;
-
- if (unlikely(should_fail_futex(fshared)))
- return -EFAULT;
-
- /*
- * PROCESS_PRIVATE futexes are fast.
- * As the mm cannot disappear under us and the 'key' only needs
- * virtual address, we dont even have to find the underlying vma.
- * Note : We do have to check 'uaddr' is a valid user address,
- * but access_ok() should be faster than find_vma()
- */
- if (!fshared) {
- key->private.mm = mm;
- key->private.address = address;
- return 0;
- }
-
-again:
- /* Ignore any VERIFY_READ mapping (futex common case) */
- if (unlikely(should_fail_futex(true)))
- return -EFAULT;
-
- err = get_user_pages_fast(address, 1, FOLL_WRITE, &page);
- /*
- * If write access is not required (eg. FUTEX_WAIT), try
- * and get read-only access.
- */
- if (err == -EFAULT && rw == FUTEX_READ) {
- err = get_user_pages_fast(address, 1, 0, &page);
- ro = 1;
- }
- if (err < 0)
- return err;
- else
- err = 0;
-
- /*
- * The treatment of mapping from this point on is critical. The page
- * lock protects many things but in this context the page lock
- * stabilizes mapping, prevents inode freeing in the shared
- * file-backed region case and guards against movement to swap cache.
- *
- * Strictly speaking the page lock is not needed in all cases being
- * considered here and page lock forces unnecessarily serialization
- * From this point on, mapping will be re-verified if necessary and
- * page lock will be acquired only if it is unavoidable
- *
- * Mapping checks require the head page for any compound page so the
- * head page and mapping is looked up now. For anonymous pages, it
- * does not matter if the page splits in the future as the key is
- * based on the address. For filesystem-backed pages, the tail is
- * required as the index of the page determines the key. For
- * base pages, there is no tail page and tail == page.
- */
- tail = page;
- page = compound_head(page);
- mapping = READ_ONCE(page->mapping);
-
- /*
- * If page->mapping is NULL, then it cannot be a PageAnon
- * page; but it might be the ZERO_PAGE or in the gate area or
- * in a special mapping (all cases which we are happy to fail);
- * or it may have been a good file page when get_user_pages_fast
- * found it, but truncated or holepunched or subjected to
- * invalidate_complete_page2 before we got the page lock (also
- * cases which we are happy to fail). And we hold a reference,
- * so refcount care in invalidate_complete_page's remove_mapping
- * prevents drop_caches from setting mapping to NULL beneath us.
- *
- * The case we do have to guard against is when memory pressure made
- * shmem_writepage move it from filecache to swapcache beneath us:
- * an unlikely race, but we do need to retry for page->mapping.
- */
- if (unlikely(!mapping)) {
- int shmem_swizzled;
-
- /*
- * Page lock is required to identify which special case above
- * applies. If this is really a shmem page then the page lock
- * will prevent unexpected transitions.
- */
- lock_page(page);
- shmem_swizzled = PageSwapCache(page) || page->mapping;
- unlock_page(page);
- put_page(page);
-
- if (shmem_swizzled)
- goto again;
-
- return -EFAULT;
- }
-
- /*
- * Private mappings are handled in a simple way.
- *
- * If the futex key is stored on an anonymous page, then the associated
- * object is the mm which is implicitly pinned by the calling process.
- *
- * NOTE: When userspace waits on a MAP_SHARED mapping, even if
- * it's a read-only handle, it's expected that futexes attach to
- * the object not the particular process.
- */
- if (PageAnon(page)) {
- /*
- * A RO anonymous page will never change and thus doesn't make
- * sense for futex operations.
- */
- if (unlikely(should_fail_futex(true)) || ro) {
- err = -EFAULT;
- goto out;
- }
-
- key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
- key->private.mm = mm;
- key->private.address = address;
-
- } else {
- struct inode *inode;
-
- /*
- * The associated futex object in this case is the inode and
- * the page->mapping must be traversed. Ordinarily this should
- * be stabilised under page lock but it's not strictly
- * necessary in this case as we just want to pin the inode, not
- * update the radix tree or anything like that.
- *
- * The RCU read lock is taken as the inode is finally freed
- * under RCU. If the mapping still matches expectations then the
- * mapping->host can be safely accessed as being a valid inode.
- */
- rcu_read_lock();
-
- if (READ_ONCE(page->mapping) != mapping) {
- rcu_read_unlock();
- put_page(page);
-
- goto again;
- }
-
- inode = READ_ONCE(mapping->host);
- if (!inode) {
- rcu_read_unlock();
- put_page(page);
-
- goto again;
- }
-
- key->both.offset |= FUT_OFF_INODE; /* inode-based key */
- key->shared.i_seq = get_inode_sequence_number(inode);
- key->shared.pgoff = page_to_pgoff(tail);
- rcu_read_unlock();
- }
-
-out:
- put_page(page);
- return err;
-}
-
-/**
- * fault_in_user_writeable() - Fault in user address and verify RW access
- * @uaddr: pointer to faulting user space address
- *
- * Slow path to fixup the fault we just took in the atomic write
- * access to @uaddr.
- *
- * We have no generic implementation of a non-destructive write to the
- * user address. We know that we faulted in the atomic pagefault
- * disabled section so we can as well avoid the #PF overhead by
- * calling get_user_pages() right away.
- */
-static int fault_in_user_writeable(u32 __user *uaddr)
-{
- struct mm_struct *mm = current->mm;
- int ret;
-
- mmap_read_lock(mm);
- ret = fixup_user_fault(mm, (unsigned long)uaddr,
- FAULT_FLAG_WRITE, NULL);
- mmap_read_unlock(mm);
-
- return ret < 0 ? ret : 0;
-}
-
-/**
- * futex_top_waiter() - Return the highest priority waiter on a futex
- * @hb: the hash bucket the futex_q's reside in
- * @key: the futex key (to distinguish it from other futex futex_q's)
- *
- * Must be called with the hb lock held.
- */
-static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb,
- union futex_key *key)
-{
- struct futex_q *this;
-
- plist_for_each_entry(this, &hb->chain, list) {
- if (match_futex(&this->key, key))
- return this;
- }
- return NULL;
-}
-
-static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr,
- u32 uval, u32 newval)
-{
- int ret;
-
- pagefault_disable();
- ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval);
- pagefault_enable();
-
- return ret;
-}
-
-static int get_futex_value_locked(u32 *dest, u32 __user *from)
-{
- int ret;
-
- pagefault_disable();
- ret = __get_user(*dest, from);
- pagefault_enable();
-
- return ret ? -EFAULT : 0;
-}
-
-
-/*
- * PI code:
- */
-static int refill_pi_state_cache(void)
-{
- struct futex_pi_state *pi_state;
-
- if (likely(current->pi_state_cache))
- return 0;
-
- pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);
-
- if (!pi_state)
- return -ENOMEM;
-
- INIT_LIST_HEAD(&pi_state->list);
- /* pi_mutex gets initialized later */
- pi_state->owner = NULL;
- refcount_set(&pi_state->refcount, 1);
- pi_state->key = FUTEX_KEY_INIT;
-
- current->pi_state_cache = pi_state;
-
- return 0;
-}
-
-static struct futex_pi_state *alloc_pi_state(void)
-{
- struct futex_pi_state *pi_state = current->pi_state_cache;
-
- WARN_ON(!pi_state);
- current->pi_state_cache = NULL;
-
- return pi_state;
-}
-
-static void pi_state_update_owner(struct futex_pi_state *pi_state,
- struct task_struct *new_owner)
-{
- struct task_struct *old_owner = pi_state->owner;
-
- lockdep_assert_held(&pi_state->pi_mutex.wait_lock);
-
- if (old_owner) {
- raw_spin_lock(&old_owner->pi_lock);
- WARN_ON(list_empty(&pi_state->list));
- list_del_init(&pi_state->list);
- raw_spin_unlock(&old_owner->pi_lock);
- }
-
- if (new_owner) {
- raw_spin_lock(&new_owner->pi_lock);
- WARN_ON(!list_empty(&pi_state->list));
- list_add(&pi_state->list, &new_owner->pi_state_list);
- pi_state->owner = new_owner;
- raw_spin_unlock(&new_owner->pi_lock);
- }
-}
-
-static void get_pi_state(struct futex_pi_state *pi_state)
-{
- WARN_ON_ONCE(!refcount_inc_not_zero(&pi_state->refcount));
-}
-
-/*
- * Drops a reference to the pi_state object and frees or caches it
- * when the last reference is gone.
- */
-static void put_pi_state(struct futex_pi_state *pi_state)
-{
- if (!pi_state)
- return;
-
- if (!refcount_dec_and_test(&pi_state->refcount))
- return;
-
- /*
- * If pi_state->owner is NULL, the owner is most probably dying
- * and has cleaned up the pi_state already
- */
- if (pi_state->owner) {
- unsigned long flags;
-
- raw_spin_lock_irqsave(&pi_state->pi_mutex.wait_lock, flags);
- pi_state_update_owner(pi_state, NULL);
- rt_mutex_proxy_unlock(&pi_state->pi_mutex);
- raw_spin_unlock_irqrestore(&pi_state->pi_mutex.wait_lock, flags);
- }
-
- if (current->pi_state_cache) {
- kfree(pi_state);
- } else {
- /*
- * pi_state->list is already empty.
- * clear pi_state->owner.
- * refcount is at 0 - put it back to 1.
- */
- pi_state->owner = NULL;
- refcount_set(&pi_state->refcount, 1);
- current->pi_state_cache = pi_state;
- }
-}
-
-#ifdef CONFIG_FUTEX_PI
-
-/*
- * This task is holding PI mutexes at exit time => bad.
- * Kernel cleans up PI-state, but userspace is likely hosed.
- * (Robust-futex cleanup is separate and might save the day for userspace.)
- */
-static void exit_pi_state_list(struct task_struct *curr)
-{
- struct list_head *next, *head = &curr->pi_state_list;
- struct futex_pi_state *pi_state;
- struct futex_hash_bucket *hb;
- union futex_key key = FUTEX_KEY_INIT;
-
- if (!futex_cmpxchg_enabled)
- return;
- /*
- * We are a ZOMBIE and nobody can enqueue itself on
- * pi_state_list anymore, but we have to be careful
- * versus waiters unqueueing themselves:
- */
- raw_spin_lock_irq(&curr->pi_lock);
- while (!list_empty(head)) {
- next = head->next;
- pi_state = list_entry(next, struct futex_pi_state, list);
- key = pi_state->key;
- hb = hash_futex(&key);
-
- /*
- * We can race against put_pi_state() removing itself from the
- * list (a waiter going away). put_pi_state() will first
- * decrement the reference count and then modify the list, so
- * its possible to see the list entry but fail this reference
- * acquire.
- *
- * In that case; drop the locks to let put_pi_state() make
- * progress and retry the loop.
- */
- if (!refcount_inc_not_zero(&pi_state->refcount)) {
- raw_spin_unlock_irq(&curr->pi_lock);
- cpu_relax();
- raw_spin_lock_irq(&curr->pi_lock);
- continue;
- }
- raw_spin_unlock_irq(&curr->pi_lock);
-
- spin_lock(&hb->lock);
- raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
- raw_spin_lock(&curr->pi_lock);
- /*
- * We dropped the pi-lock, so re-check whether this
- * task still owns the PI-state:
- */
- if (head->next != next) {
- /* retain curr->pi_lock for the loop invariant */
- raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
- spin_unlock(&hb->lock);
- put_pi_state(pi_state);
- continue;
- }
-
- WARN_ON(pi_state->owner != curr);
- WARN_ON(list_empty(&pi_state->list));
- list_del_init(&pi_state->list);
- pi_state->owner = NULL;
-
- raw_spin_unlock(&curr->pi_lock);
- raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
- spin_unlock(&hb->lock);
-
- rt_mutex_futex_unlock(&pi_state->pi_mutex);
- put_pi_state(pi_state);
-
- raw_spin_lock_irq(&curr->pi_lock);
- }
- raw_spin_unlock_irq(&curr->pi_lock);
-}
-#else
-static inline void exit_pi_state_list(struct task_struct *curr) { }
-#endif
-
-/*
- * We need to check the following states:
- *
- * Waiter | pi_state | pi->owner | uTID | uODIED | ?
- *
- * [1] NULL | --- | --- | 0 | 0/1 | Valid
- * [2] NULL | --- | --- | >0 | 0/1 | Valid
- *
- * [3] Found | NULL | -- | Any | 0/1 | Invalid
- *
- * [4] Found | Found | NULL | 0 | 1 | Valid
- * [5] Found | Found | NULL | >0 | 1 | Invalid
- *
- * [6] Found | Found | task | 0 | 1 | Valid
- *
- * [7] Found | Found | NULL | Any | 0 | Invalid
- *
- * [8] Found | Found | task | ==taskTID | 0/1 | Valid
- * [9] Found | Found | task | 0 | 0 | Invalid
- * [10] Found | Found | task | !=taskTID | 0/1 | Invalid
- *
- * [1] Indicates that the kernel can acquire the futex atomically. We
- * came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit.
- *
- * [2] Valid, if TID does not belong to a kernel thread. If no matching
- * thread is found then it indicates that the owner TID has died.
- *
- * [3] Invalid. The waiter is queued on a non PI futex
- *
- * [4] Valid state after exit_robust_list(), which sets the user space
- * value to FUTEX_WAITERS | FUTEX_OWNER_DIED.
- *
- * [5] The user space value got manipulated between exit_robust_list()
- * and exit_pi_state_list()
- *
- * [6] Valid state after exit_pi_state_list() which sets the new owner in
- * the pi_state but cannot access the user space value.
- *
- * [7] pi_state->owner can only be NULL when the OWNER_DIED bit is set.
- *
- * [8] Owner and user space value match
- *
- * [9] There is no transient state which sets the user space TID to 0
- * except exit_robust_list(), but this is indicated by the
- * FUTEX_OWNER_DIED bit. See [4]
- *
- * [10] There is no transient state which leaves owner and user space
- * TID out of sync. Except one error case where the kernel is denied
- * write access to the user address, see fixup_pi_state_owner().
- *
- *
- * Serialization and lifetime rules:
- *
- * hb->lock:
- *
- * hb -> futex_q, relation
- * futex_q -> pi_state, relation
- *
- * (cannot be raw because hb can contain arbitrary amount
- * of futex_q's)
- *
- * pi_mutex->wait_lock:
- *
- * {uval, pi_state}
- *
- * (and pi_mutex 'obviously')
- *
- * p->pi_lock:
- *
- * p->pi_state_list -> pi_state->list, relation
- * pi_mutex->owner -> pi_state->owner, relation
- *
- * pi_state->refcount:
- *
- * pi_state lifetime
- *
- *
- * Lock order:
- *
- * hb->lock
- * pi_mutex->wait_lock
- * p->pi_lock
- *
- */
-
-/*
- * Validate that the existing waiter has a pi_state and sanity check
- * the pi_state against the user space value. If correct, attach to
- * it.
- */
-static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
- struct futex_pi_state *pi_state,
- struct futex_pi_state **ps)
-{
- pid_t pid = uval & FUTEX_TID_MASK;
- u32 uval2;
- int ret;
-
- /*
- * Userspace might have messed up non-PI and PI futexes [3]
- */
- if (unlikely(!pi_state))
- return -EINVAL;
-
- /*
- * We get here with hb->lock held, and having found a
- * futex_top_waiter(). This means that futex_lock_pi() of said futex_q
- * has dropped the hb->lock in between queue_me() and unqueue_me_pi(),
- * which in turn means that futex_lock_pi() still has a reference on
- * our pi_state.
- *
- * The waiter holding a reference on @pi_state also protects against
- * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi()
- * and futex_wait_requeue_pi() as it cannot go to 0 and consequently
- * free pi_state before we can take a reference ourselves.
- */
- WARN_ON(!refcount_read(&pi_state->refcount));
-
- /*
- * Now that we have a pi_state, we can acquire wait_lock
- * and do the state validation.
- */
- raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
-
- /*
- * Since {uval, pi_state} is serialized by wait_lock, and our current
- * uval was read without holding it, it can have changed. Verify it
- * still is what we expect it to be, otherwise retry the entire
- * operation.
- */
- if (get_futex_value_locked(&uval2, uaddr))
- goto out_efault;
-
- if (uval != uval2)
- goto out_eagain;
-
- /*
- * Handle the owner died case:
- */
- if (uval & FUTEX_OWNER_DIED) {
- /*
- * exit_pi_state_list sets owner to NULL and wakes the
- * topmost waiter. The task which acquires the
- * pi_state->rt_mutex will fixup owner.
- */
- if (!pi_state->owner) {
- /*
- * No pi state owner, but the user space TID
- * is not 0. Inconsistent state. [5]
- */
- if (pid)
- goto out_einval;
- /*
- * Take a ref on the state and return success. [4]
- */
- goto out_attach;
- }
-
- /*
- * If TID is 0, then either the dying owner has not
- * yet executed exit_pi_state_list() or some waiter
- * acquired the rtmutex in the pi state, but did not
- * yet fixup the TID in user space.
- *
- * Take a ref on the state and return success. [6]
- */
- if (!pid)
- goto out_attach;
- } else {
- /*
- * If the owner died bit is not set, then the pi_state
- * must have an owner. [7]
- */
- if (!pi_state->owner)
- goto out_einval;
- }
-
- /*
- * Bail out if user space manipulated the futex value. If pi
- * state exists then the owner TID must be the same as the
- * user space TID. [9/10]
- */
- if (pid != task_pid_vnr(pi_state->owner))
- goto out_einval;
-
-out_attach:
- get_pi_state(pi_state);
- raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
- *ps = pi_state;
- return 0;
-
-out_einval:
- ret = -EINVAL;
- goto out_error;
-
-out_eagain:
- ret = -EAGAIN;
- goto out_error;
-
-out_efault:
- ret = -EFAULT;
- goto out_error;
-
-out_error:
- raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
- return ret;
-}
-
-/**
- * wait_for_owner_exiting - Block until the owner has exited
- * @ret: owner's current futex lock status
- * @exiting: Pointer to the exiting task
- *
- * Caller must hold a refcount on @exiting.
- */
-static void wait_for_owner_exiting(int ret, struct task_struct *exiting)
-{
- if (ret != -EBUSY) {
- WARN_ON_ONCE(exiting);
- return;
- }
-
- if (WARN_ON_ONCE(ret == -EBUSY && !exiting))
- return;
-
- mutex_lock(&exiting->futex_exit_mutex);
- /*
- * No point in doing state checking here. If the waiter got here
- * while the task was in exec()->exec_futex_release() then it can
- * have any FUTEX_STATE_* value when the waiter has acquired the
- * mutex. OK, if running, EXITING or DEAD if it reached exit()
- * already. Highly unlikely and not a problem. Just one more round
- * through the futex maze.
- */
- mutex_unlock(&exiting->futex_exit_mutex);
-
- put_task_struct(exiting);
-}
-
-static int handle_exit_race(u32 __user *uaddr, u32 uval,
- struct task_struct *tsk)
-{
- u32 uval2;
-
- /*
- * If the futex exit state is not yet FUTEX_STATE_DEAD, tell the
- * caller that the alleged owner is busy.
- */
- if (tsk && tsk->futex_state != FUTEX_STATE_DEAD)
- return -EBUSY;
-
- /*
- * Reread the user space value to handle the following situation:
- *
- * CPU0 CPU1
- *
- * sys_exit() sys_futex()
- * do_exit() futex_lock_pi()
- * futex_lock_pi_atomic()
- * exit_signals(tsk) No waiters:
- * tsk->flags |= PF_EXITING; *uaddr == 0x00000PID
- * mm_release(tsk) Set waiter bit
- * exit_robust_list(tsk) { *uaddr = 0x80000PID;
- * Set owner died attach_to_pi_owner() {
- * *uaddr = 0xC0000000; tsk = get_task(PID);
- * } if (!tsk->flags & PF_EXITING) {
- * ... attach();
- * tsk->futex_state = } else {
- * FUTEX_STATE_DEAD; if (tsk->futex_state !=
- * FUTEX_STATE_DEAD)
- * return -EAGAIN;
- * return -ESRCH; <--- FAIL
- * }
- *
- * Returning ESRCH unconditionally is wrong here because the
- * user space value has been changed by the exiting task.
- *
- * The same logic applies to the case where the exiting task is
- * already gone.
- */
- if (get_futex_value_locked(&uval2, uaddr))
- return -EFAULT;
-
- /* If the user space value has changed, try again. */
- if (uval2 != uval)
- return -EAGAIN;
-
- /*
- * The exiting task did not have a robust list, the robust list was
- * corrupted or the user space value in *uaddr is simply bogus.
- * Give up and tell user space.
- */
- return -ESRCH;
-}
-
-/*
- * Lookup the task for the TID provided from user space and attach to
- * it after doing proper sanity checks.
- */
-static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
- struct futex_pi_state **ps,
- struct task_struct **exiting)
-{
- pid_t pid = uval & FUTEX_TID_MASK;
- struct futex_pi_state *pi_state;
- struct task_struct *p;
-
- /*
- * We are the first waiter - try to look up the real owner and attach
- * the new pi_state to it, but bail out when TID = 0 [1]
- *
- * The !pid check is paranoid. None of the call sites should end up
- * with pid == 0, but better safe than sorry. Let the caller retry
- */
- if (!pid)
- return -EAGAIN;
- p = find_get_task_by_vpid(pid);
- if (!p)
- return handle_exit_race(uaddr, uval, NULL);
-
- if (unlikely(p->flags & PF_KTHREAD)) {
- put_task_struct(p);
- return -EPERM;
- }
-
- /*
- * We need to look at the task state to figure out, whether the
- * task is exiting. To protect against the change of the task state
- * in futex_exit_release(), we do this protected by p->pi_lock:
- */
- raw_spin_lock_irq(&p->pi_lock);
- if (unlikely(p->futex_state != FUTEX_STATE_OK)) {
- /*
- * The task is on the way out. When the futex state is
- * FUTEX_STATE_DEAD, we know that the task has finished
- * the cleanup:
- */
- int ret = handle_exit_race(uaddr, uval, p);
-
- raw_spin_unlock_irq(&p->pi_lock);
- /*
- * If the owner task is between FUTEX_STATE_EXITING and
- * FUTEX_STATE_DEAD then store the task pointer and keep
- * the reference on the task struct. The calling code will
- * drop all locks, wait for the task to reach
- * FUTEX_STATE_DEAD and then drop the refcount. This is
- * required to prevent a live lock when the current task
- * preempted the exiting task between the two states.
- */
- if (ret == -EBUSY)
- *exiting = p;
- else
- put_task_struct(p);
- return ret;
- }
-
- /*
- * No existing pi state. First waiter. [2]
- *
- * This creates pi_state, we have hb->lock held, this means nothing can
- * observe this state, wait_lock is irrelevant.
- */
- pi_state = alloc_pi_state();
-
- /*
- * Initialize the pi_mutex in locked state and make @p
- * the owner of it:
- */
- rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
-
- /* Store the key for possible exit cleanups: */
- pi_state->key = *key;
-
- WARN_ON(!list_empty(&pi_state->list));
- list_add(&pi_state->list, &p->pi_state_list);
- /*
- * Assignment without holding pi_state->pi_mutex.wait_lock is safe
- * because there is no concurrency as the object is not published yet.
- */
- pi_state->owner = p;
- raw_spin_unlock_irq(&p->pi_lock);
-
- put_task_struct(p);
-
- *ps = pi_state;
-
- return 0;
-}
-
-static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
-{
- int err;
- u32 curval;
-
- if (unlikely(should_fail_futex(true)))
- return -EFAULT;
-
- err = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval);
- if (unlikely(err))
- return err;
-
- /* If user space value changed, let the caller retry */
- return curval != uval ? -EAGAIN : 0;
-}
-
-/**
- * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
- * @uaddr: the pi futex user address
- * @hb: the pi futex hash bucket
- * @key: the futex key associated with uaddr and hb
- * @ps: the pi_state pointer where we store the result of the
- * lookup
- * @task: the task to perform the atomic lock work for. This will
- * be "current" except in the case of requeue pi.
- * @exiting: Pointer to store the task pointer of the owner task
- * which is in the middle of exiting
- * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
- *
- * Return:
- * - 0 - ready to wait;
- * - 1 - acquired the lock;
- * - <0 - error
- *
- * The hb->lock must be held by the caller.
- *
- * @exiting is only set when the return value is -EBUSY. If so, this holds
- * a refcount on the exiting task on return and the caller needs to drop it
- * after waiting for the exit to complete.
- */
-static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
- union futex_key *key,
- struct futex_pi_state **ps,
- struct task_struct *task,
- struct task_struct **exiting,
- int set_waiters)
-{
- u32 uval, newval, vpid = task_pid_vnr(task);
- struct futex_q *top_waiter;
- int ret;
-
- /*
- * Read the user space value first so we can validate a few
- * things before proceeding further.
- */
- if (get_futex_value_locked(&uval, uaddr))
- return -EFAULT;
-
- if (unlikely(should_fail_futex(true)))
- return -EFAULT;
-
- /*
- * Detect deadlocks.
- */
- if ((unlikely((uval & FUTEX_TID_MASK) == vpid)))
- return -EDEADLK;
-
- if ((unlikely(should_fail_futex(true))))
- return -EDEADLK;
-
- /*
- * Lookup existing state first. If it exists, try to attach to
- * its pi_state.
- */
- top_waiter = futex_top_waiter(hb, key);
- if (top_waiter)
- return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);
-
- /*
- * No waiter and user TID is 0. We are here because the
- * waiters or the owner died bit is set or called from
- * requeue_cmp_pi or for whatever reason something took the
- * syscall.
- */
- if (!(uval & FUTEX_TID_MASK)) {
- /*
- * We take over the futex. No other waiters and the user space
- * TID is 0. We preserve the owner died bit.
- */
- newval = uval & FUTEX_OWNER_DIED;
- newval |= vpid;
-
- /* The futex requeue_pi code can enforce the waiters bit */
- if (set_waiters)
- newval |= FUTEX_WAITERS;
-
- ret = lock_pi_update_atomic(uaddr, uval, newval);
- /* If the take over worked, return 1 */
- return ret < 0 ? ret : 1;
- }
-
- /*
- * First waiter. Set the waiters bit before attaching ourself to
- * the owner. If owner tries to unlock, it will be forced into
- * the kernel and blocked on hb->lock.
- */
- newval = uval | FUTEX_WAITERS;
- ret = lock_pi_update_atomic(uaddr, uval, newval);
- if (ret)
- return ret;
- /*
- * If the update of the user space value succeeded, we try to
- * attach to the owner. If that fails, no harm done, we only
- * set the FUTEX_WAITERS bit in the user space variable.
- */
- return attach_to_pi_owner(uaddr, newval, key, ps, exiting);
-}
-
-/**
- * __unqueue_futex() - Remove the futex_q from its futex_hash_bucket
- * @q: The futex_q to unqueue
- *
- * The q->lock_ptr must not be NULL and must be held by the caller.
- */
-static void __unqueue_futex(struct futex_q *q)
-{
- struct futex_hash_bucket *hb;
-
- if (WARN_ON_SMP(!q->lock_ptr) || WARN_ON(plist_node_empty(&q->list)))
- return;
- lockdep_assert_held(q->lock_ptr);
-
- hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
- plist_del(&q->list, &hb->chain);
- hb_waiters_dec(hb);
-}
-
-/*
- * The hash bucket lock must be held when this is called.
- * Afterwards, the futex_q must not be accessed. Callers
- * must ensure to later call wake_up_q() for the actual
- * wakeups to occur.
- */
-static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)
-{
- struct task_struct *p = q->task;
-
- if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n"))
- return;
-
- get_task_struct(p);
- __unqueue_futex(q);
- /*
- * The waiting task can free the futex_q as soon as q->lock_ptr = NULL
- * is written, without taking any locks. This is possible in the event
- * of a spurious wakeup, for example. A memory barrier is required here
- * to prevent the following store to lock_ptr from getting ahead of the
- * plist_del in __unqueue_futex().
- */
- smp_store_release(&q->lock_ptr, NULL);
-
- /*
- * Queue the task for later wakeup for after we've released
- * the hb->lock.
- */
- wake_q_add_safe(wake_q, p);
-}
-
-/*
- * Caller must hold a reference on @pi_state.
- */
-static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state)
-{
- struct rt_mutex_waiter *top_waiter;
- struct task_struct *new_owner;
- bool postunlock = false;
- DEFINE_RT_WAKE_Q(wqh);
- u32 curval, newval;
- int ret = 0;
-
- top_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex);
- if (WARN_ON_ONCE(!top_waiter)) {
- /*
- * As per the comment in futex_unlock_pi() this should not happen.
- *
- * When this happens, give up our locks and try again, giving
- * the futex_lock_pi() instance time to complete, either by
- * waiting on the rtmutex or removing itself from the futex
- * queue.
- */
- ret = -EAGAIN;
- goto out_unlock;
- }
-
- new_owner = top_waiter->task;
-
- /*
- * We pass it to the next owner. The WAITERS bit is always kept
- * enabled while there is PI state around. We cleanup the owner
- * died bit, because we are the owner.
- */
- newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
-
- if (unlikely(should_fail_futex(true))) {
- ret = -EFAULT;
- goto out_unlock;
- }
-
- ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval);
- if (!ret && (curval != uval)) {
- /*
- * If a unconditional UNLOCK_PI operation (user space did not
- * try the TID->0 transition) raced with a waiter setting the
- * FUTEX_WAITERS flag between get_user() and locking the hash
- * bucket lock, retry the operation.
- */
- if ((FUTEX_TID_MASK & curval) == uval)
- ret = -EAGAIN;
- else
- ret = -EINVAL;
- }
-
- if (!ret) {
- /*
- * This is a point of no return; once we modified the uval
- * there is no going back and subsequent operations must
- * not fail.
- */
- pi_state_update_owner(pi_state, new_owner);
- postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wqh);
- }
-
-out_unlock:
- raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
-
- if (postunlock)
- rt_mutex_postunlock(&wqh);
-
- return ret;
-}
-
-/*
- * Express the locking dependencies for lockdep:
- */
-static inline void
-double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
-{
- if (hb1 <= hb2) {
- spin_lock(&hb1->lock);
- if (hb1 < hb2)
- spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING);
- } else { /* hb1 > hb2 */
- spin_lock(&hb2->lock);
- spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING);
- }
-}
-
-static inline void
-double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
-{
- spin_unlock(&hb1->lock);
- if (hb1 != hb2)
- spin_unlock(&hb2->lock);
-}
-
-/*
- * Wake up waiters matching bitset queued on this futex (uaddr).
- */
-static int
-futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
-{
- struct futex_hash_bucket *hb;
- struct futex_q *this, *next;
- union futex_key key = FUTEX_KEY_INIT;
- int ret;
- DEFINE_WAKE_Q(wake_q);
-
- if (!bitset)
- return -EINVAL;
-
- ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_READ);
- if (unlikely(ret != 0))
- return ret;
-
- hb = hash_futex(&key);
-
- /* Make sure we really have tasks to wakeup */
- if (!hb_waiters_pending(hb))
- return ret;
-
- spin_lock(&hb->lock);
-
- plist_for_each_entry_safe(this, next, &hb->chain, list) {
- if (match_futex (&this->key, &key)) {
- if (this->pi_state || this->rt_waiter) {
- ret = -EINVAL;
- break;
- }
-
- /* Check if one of the bits is set in both bitsets */
- if (!(this->bitset & bitset))
- continue;
-
- mark_wake_futex(&wake_q, this);
- if (++ret >= nr_wake)
- break;
- }
- }
-
- spin_unlock(&hb->lock);
- wake_up_q(&wake_q);
- return ret;
-}
-
-static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr)
-{
- unsigned int op = (encoded_op & 0x70000000) >> 28;
- unsigned int cmp = (encoded_op & 0x0f000000) >> 24;
- int oparg = sign_extend32((encoded_op & 0x00fff000) >> 12, 11);
- int cmparg = sign_extend32(encoded_op & 0x00000fff, 11);
- int oldval, ret;
-
- if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) {
- if (oparg < 0 || oparg > 31) {
- char comm[sizeof(current->comm)];
- /*
- * kill this print and return -EINVAL when userspace
- * is sane again
- */
- pr_info_ratelimited("futex_wake_op: %s tries to shift op by %d; fix this program\n",
- get_task_comm(comm, current), oparg);
- oparg &= 31;
- }
- oparg = 1 << oparg;
- }
-
- pagefault_disable();
- ret = arch_futex_atomic_op_inuser(op, oparg, &oldval, uaddr);
- pagefault_enable();
- if (ret)
- return ret;
-
- switch (cmp) {
- case FUTEX_OP_CMP_EQ:
- return oldval == cmparg;
- case FUTEX_OP_CMP_NE:
- return oldval != cmparg;
- case FUTEX_OP_CMP_LT:
- return oldval < cmparg;
- case FUTEX_OP_CMP_GE:
- return oldval >= cmparg;
- case FUTEX_OP_CMP_LE:
- return oldval <= cmparg;
- case FUTEX_OP_CMP_GT:
- return oldval > cmparg;
- default:
- return -ENOSYS;
- }
-}
-
-/*
- * Wake up all waiters hashed on the physical page that is mapped
- * to this virtual address:
- */
-static int
-futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
- int nr_wake, int nr_wake2, int op)
-{
- union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
- struct futex_hash_bucket *hb1, *hb2;
- struct futex_q *this, *next;
- int ret, op_ret;
- DEFINE_WAKE_Q(wake_q);
-
-retry:
- ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ);
- if (unlikely(ret != 0))
- return ret;
- ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE);
- if (unlikely(ret != 0))
- return ret;
-
- hb1 = hash_futex(&key1);
- hb2 = hash_futex(&key2);
-
-retry_private:
- double_lock_hb(hb1, hb2);
- op_ret = futex_atomic_op_inuser(op, uaddr2);
- if (unlikely(op_ret < 0)) {
- double_unlock_hb(hb1, hb2);
-
- if (!IS_ENABLED(CONFIG_MMU) ||
- unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) {
- /*
- * we don't get EFAULT from MMU faults if we don't have
- * an MMU, but we might get them from range checking
- */
- ret = op_ret;
- return ret;
- }
-
- if (op_ret == -EFAULT) {
- ret = fault_in_user_writeable(uaddr2);
- if (ret)
- return ret;
- }
-
- cond_resched();
- if (!(flags & FLAGS_SHARED))
- goto retry_private;
- goto retry;
- }
-
- plist_for_each_entry_safe(this, next, &hb1->chain, list) {
- if (match_futex (&this->key, &key1)) {
- if (this->pi_state || this->rt_waiter) {
- ret = -EINVAL;
- goto out_unlock;
- }
- mark_wake_futex(&wake_q, this);
- if (++ret >= nr_wake)
- break;
- }
- }
-
- if (op_ret > 0) {
- op_ret = 0;
- plist_for_each_entry_safe(this, next, &hb2->chain, list) {
- if (match_futex (&this->key, &key2)) {
- if (this->pi_state || this->rt_waiter) {
- ret = -EINVAL;
- goto out_unlock;
- }
- mark_wake_futex(&wake_q, this);
- if (++op_ret >= nr_wake2)
- break;
- }
- }
- ret += op_ret;
- }
-
-out_unlock:
- double_unlock_hb(hb1, hb2);
- wake_up_q(&wake_q);
- return ret;
-}
-
-/**
- * requeue_futex() - Requeue a futex_q from one hb to another
- * @q: the futex_q to requeue
- * @hb1: the source hash_bucket
- * @hb2: the target hash_bucket
- * @key2: the new key for the requeued futex_q
- */
-static inline
-void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
- struct futex_hash_bucket *hb2, union futex_key *key2)
-{
-
- /*
- * If key1 and key2 hash to the same bucket, no need to
- * requeue.
- */
- if (likely(&hb1->chain != &hb2->chain)) {
- plist_del(&q->list, &hb1->chain);
- hb_waiters_dec(hb1);
- hb_waiters_inc(hb2);
- plist_add(&q->list, &hb2->chain);
- q->lock_ptr = &hb2->lock;
- }
- q->key = *key2;
-}
-
-static inline bool futex_requeue_pi_prepare(struct futex_q *q,
- struct futex_pi_state *pi_state)
-{
- int old, new;
-
- /*
- * Set state to Q_REQUEUE_PI_IN_PROGRESS unless an early wakeup has
- * already set Q_REQUEUE_PI_IGNORE to signal that requeue should
- * ignore the waiter.
- */
- old = atomic_read_acquire(&q->requeue_state);
- do {
- if (old == Q_REQUEUE_PI_IGNORE)
- return false;
-
- /*
- * futex_proxy_trylock_atomic() might have set it to
- * IN_PROGRESS and a interleaved early wake to WAIT.
- *
- * It was considered to have an extra state for that
- * trylock, but that would just add more conditionals
- * all over the place for a dubious value.
- */
- if (old != Q_REQUEUE_PI_NONE)
- break;
-
- new = Q_REQUEUE_PI_IN_PROGRESS;
- } while (!atomic_try_cmpxchg(&q->requeue_state, &old, new));
-
- q->pi_state = pi_state;
- return true;
-}
-
-static inline void futex_requeue_pi_complete(struct futex_q *q, int locked)
-{
- int old, new;
-
- old = atomic_read_acquire(&q->requeue_state);
- do {
- if (old == Q_REQUEUE_PI_IGNORE)
- return;
-
- if (locked >= 0) {
- /* Requeue succeeded. Set DONE or LOCKED */
- WARN_ON_ONCE(old != Q_REQUEUE_PI_IN_PROGRESS &&
- old != Q_REQUEUE_PI_WAIT);
- new = Q_REQUEUE_PI_DONE + locked;
- } else if (old == Q_REQUEUE_PI_IN_PROGRESS) {
- /* Deadlock, no early wakeup interleave */
- new = Q_REQUEUE_PI_NONE;
- } else {
- /* Deadlock, early wakeup interleave. */
- WARN_ON_ONCE(old != Q_REQUEUE_PI_WAIT);
- new = Q_REQUEUE_PI_IGNORE;
- }
- } while (!atomic_try_cmpxchg(&q->requeue_state, &old, new));
-
-#ifdef CONFIG_PREEMPT_RT
- /* If the waiter interleaved with the requeue let it know */
- if (unlikely(old == Q_REQUEUE_PI_WAIT))
- rcuwait_wake_up(&q->requeue_wait);
-#endif
-}
-
-static inline int futex_requeue_pi_wakeup_sync(struct futex_q *q)
-{
- int old, new;
-
- old = atomic_read_acquire(&q->requeue_state);
- do {
- /* Is requeue done already? */
- if (old >= Q_REQUEUE_PI_DONE)
- return old;
-
- /*
- * If not done, then tell the requeue code to either ignore
- * the waiter or to wake it up once the requeue is done.
- */
- new = Q_REQUEUE_PI_WAIT;
- if (old == Q_REQUEUE_PI_NONE)
- new = Q_REQUEUE_PI_IGNORE;
- } while (!atomic_try_cmpxchg(&q->requeue_state, &old, new));
-
- /* If the requeue was in progress, wait for it to complete */
- if (old == Q_REQUEUE_PI_IN_PROGRESS) {
-#ifdef CONFIG_PREEMPT_RT
- rcuwait_wait_event(&q->requeue_wait,
- atomic_read(&q->requeue_state) != Q_REQUEUE_PI_WAIT,
- TASK_UNINTERRUPTIBLE);
-#else
- (void)atomic_cond_read_relaxed(&q->requeue_state, VAL != Q_REQUEUE_PI_WAIT);
-#endif
- }
-
- /*
- * Requeue is now either prohibited or complete. Reread state
- * because during the wait above it might have changed. Nothing
- * will modify q->requeue_state after this point.
- */
- return atomic_read(&q->requeue_state);
-}
-
-/**
- * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue
- * @q: the futex_q
- * @key: the key of the requeue target futex
- * @hb: the hash_bucket of the requeue target futex
- *
- * During futex_requeue, with requeue_pi=1, it is possible to acquire the
- * target futex if it is uncontended or via a lock steal. Set the futex_q key
- * to the requeue target futex so the waiter can detect the wakeup on the right
- * futex, but remove it from the hb and NULL the rt_waiter so it can detect
- * atomic lock acquisition. Set the q->lock_ptr to the requeue target hb->lock
- * to protect access to the pi_state to fixup the owner later. Must be called
- * with both q->lock_ptr and hb->lock held.
- */
-static inline
-void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
- struct futex_hash_bucket *hb)
-{
- q->key = *key;
-
- __unqueue_futex(q);
-
- WARN_ON(!q->rt_waiter);
- q->rt_waiter = NULL;
-
- q->lock_ptr = &hb->lock;
-
- /* Signal locked state to the waiter */
- futex_requeue_pi_complete(q, 1);
- wake_up_state(q->task, TASK_NORMAL);
-}
-
-/**
- * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter
- * @pifutex: the user address of the to futex
- * @hb1: the from futex hash bucket, must be locked by the caller
- * @hb2: the to futex hash bucket, must be locked by the caller
- * @key1: the from futex key
- * @key2: the to futex key
- * @ps: address to store the pi_state pointer
- * @exiting: Pointer to store the task pointer of the owner task
- * which is in the middle of exiting
- * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
- *
- * Try and get the lock on behalf of the top waiter if we can do it atomically.
- * Wake the top waiter if we succeed. If the caller specified set_waiters,
- * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.
- * hb1 and hb2 must be held by the caller.
- *
- * @exiting is only set when the return value is -EBUSY. If so, this holds
- * a refcount on the exiting task on return and the caller needs to drop it
- * after waiting for the exit to complete.
- *
- * Return:
- * - 0 - failed to acquire the lock atomically;
- * - >0 - acquired the lock, return value is vpid of the top_waiter
- * - <0 - error
- */
-static int
-futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1,
- struct futex_hash_bucket *hb2, union futex_key *key1,
- union futex_key *key2, struct futex_pi_state **ps,
- struct task_struct **exiting, int set_waiters)
-{
- struct futex_q *top_waiter = NULL;
- u32 curval;
- int ret, vpid;
-
- if (get_futex_value_locked(&curval, pifutex))
- return -EFAULT;
-
- if (unlikely(should_fail_futex(true)))
- return -EFAULT;
-
- /*
- * Find the top_waiter and determine if there are additional waiters.
- * If the caller intends to requeue more than 1 waiter to pifutex,
- * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now,
- * as we have means to handle the possible fault. If not, don't set
- * the bit unnecessarily as it will force the subsequent unlock to enter
- * the kernel.
- */
- top_waiter = futex_top_waiter(hb1, key1);
-
- /* There are no waiters, nothing for us to do. */
- if (!top_waiter)
- return 0;
-
- /*
- * Ensure that this is a waiter sitting in futex_wait_requeue_pi()
- * and waiting on the 'waitqueue' futex which is always !PI.
- */
- if (!top_waiter->rt_waiter || top_waiter->pi_state)
- ret = -EINVAL;
-
- /* Ensure we requeue to the expected futex. */
- if (!match_futex(top_waiter->requeue_pi_key, key2))
- return -EINVAL;
-
- /* Ensure that this does not race against an early wakeup */
- if (!futex_requeue_pi_prepare(top_waiter, NULL))
- return -EAGAIN;
-
- /*
- * Try to take the lock for top_waiter. Set the FUTEX_WAITERS bit in
- * the contended case or if set_waiters is 1. The pi_state is returned
- * in ps in contended cases.
- */
- vpid = task_pid_vnr(top_waiter->task);
- ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
- exiting, set_waiters);
- if (ret == 1) {
- /* Dequeue, wake up and update top_waiter::requeue_state */
- requeue_pi_wake_futex(top_waiter, key2, hb2);
- return vpid;
- } else if (ret < 0) {
- /* Rewind top_waiter::requeue_state */
- futex_requeue_pi_complete(top_waiter, ret);
- } else {
- /*
- * futex_lock_pi_atomic() did not acquire the user space
- * futex, but managed to establish the proxy lock and pi
- * state. top_waiter::requeue_state cannot be fixed up here
- * because the waiter is not enqueued on the rtmutex
- * yet. This is handled at the callsite depending on the
- * result of rt_mutex_start_proxy_lock() which is
- * guaranteed to be reached with this function returning 0.
- */
- }
- return ret;
-}
-
-/**
- * futex_requeue() - Requeue waiters from uaddr1 to uaddr2
- * @uaddr1: source futex user address
- * @flags: futex flags (FLAGS_SHARED, etc.)
- * @uaddr2: target futex user address
- * @nr_wake: number of waiters to wake (must be 1 for requeue_pi)
- * @nr_requeue: number of waiters to requeue (0-INT_MAX)
- * @cmpval: @uaddr1 expected value (or %NULL)
- * @requeue_pi: if we are attempting to requeue from a non-pi futex to a
- * pi futex (pi to pi requeue is not supported)
- *
- * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
- * uaddr2 atomically on behalf of the top waiter.
- *
- * Return:
- * - >=0 - on success, the number of tasks requeued or woken;
- * - <0 - on error
- */
-static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
- u32 __user *uaddr2, int nr_wake, int nr_requeue,
- u32 *cmpval, int requeue_pi)
-{
- union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
- int task_count = 0, ret;
- struct futex_pi_state *pi_state = NULL;
- struct futex_hash_bucket *hb1, *hb2;
- struct futex_q *this, *next;
- DEFINE_WAKE_Q(wake_q);
-
- if (nr_wake < 0 || nr_requeue < 0)
- return -EINVAL;
-
- /*
- * When PI not supported: return -ENOSYS if requeue_pi is true,
- * consequently the compiler knows requeue_pi is always false past
- * this point which will optimize away all the conditional code
- * further down.
- */
- if (!IS_ENABLED(CONFIG_FUTEX_PI) && requeue_pi)
- return -ENOSYS;
-
- if (requeue_pi) {
- /*
- * Requeue PI only works on two distinct uaddrs. This
- * check is only valid for private futexes. See below.
- */
- if (uaddr1 == uaddr2)
- return -EINVAL;
-
- /*
- * futex_requeue() allows the caller to define the number
- * of waiters to wake up via the @nr_wake argument. With
- * REQUEUE_PI, waking up more than one waiter is creating
- * more problems than it solves. Waking up a waiter makes
- * only sense if the PI futex @uaddr2 is uncontended as
- * this allows the requeue code to acquire the futex
- * @uaddr2 before waking the waiter. The waiter can then
- * return to user space without further action. A secondary
- * wakeup would just make the futex_wait_requeue_pi()
- * handling more complex, because that code would have to
- * look up pi_state and do more or less all the handling
- * which the requeue code has to do for the to be requeued
- * waiters. So restrict the number of waiters to wake to
- * one, and only wake it up when the PI futex is
- * uncontended. Otherwise requeue it and let the unlock of
- * the PI futex handle the wakeup.
- *
- * All REQUEUE_PI users, e.g. pthread_cond_signal() and
- * pthread_cond_broadcast() must use nr_wake=1.
- */
- if (nr_wake != 1)
- return -EINVAL;
-
- /*
- * requeue_pi requires a pi_state, try to allocate it now
- * without any locks in case it fails.
- */
- if (refill_pi_state_cache())
- return -ENOMEM;
- }
-
-retry:
- ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ);
- if (unlikely(ret != 0))
- return ret;
- ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2,
- requeue_pi ? FUTEX_WRITE : FUTEX_READ);
- if (unlikely(ret != 0))
- return ret;
-
- /*
- * The check above which compares uaddrs is not sufficient for
- * shared futexes. We need to compare the keys:
- */
- if (requeue_pi && match_futex(&key1, &key2))
- return -EINVAL;
-
- hb1 = hash_futex(&key1);
- hb2 = hash_futex(&key2);
-
-retry_private:
- hb_waiters_inc(hb2);
- double_lock_hb(hb1, hb2);
-
- if (likely(cmpval != NULL)) {
- u32 curval;
-
- ret = get_futex_value_locked(&curval, uaddr1);
-
- if (unlikely(ret)) {
- double_unlock_hb(hb1, hb2);
- hb_waiters_dec(hb2);
-
- ret = get_user(curval, uaddr1);
- if (ret)
- return ret;
-
- if (!(flags & FLAGS_SHARED))
- goto retry_private;
-
- goto retry;
- }
- if (curval != *cmpval) {
- ret = -EAGAIN;
- goto out_unlock;
- }
- }
-
- if (requeue_pi) {
- struct task_struct *exiting = NULL;
-
- /*
- * Attempt to acquire uaddr2 and wake the top waiter. If we
- * intend to requeue waiters, force setting the FUTEX_WAITERS
- * bit. We force this here where we are able to easily handle
- * faults rather in the requeue loop below.
- *
- * Updates topwaiter::requeue_state if a top waiter exists.
- */
- ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1,
- &key2, &pi_state,
- &exiting, nr_requeue);
-
- /*
- * At this point the top_waiter has either taken uaddr2 or is
- * waiting on it. If the former, then the pi_state will not
- * exist yet, look it up one more time to ensure we have a
- * reference to it. If the lock was taken, @ret contains the
- * VPID of the top waiter task.
- * If the lock was not taken, we have pi_state and an initial
- * refcount on it. In case of an error we have nothing.
- *
- * The top waiter's requeue_state is up to date:
- *
- * - If the lock was acquired atomically (ret > 0), then
- * the state is Q_REQUEUE_PI_LOCKED.
- *
- * - If the trylock failed with an error (ret < 0) then
- * the state is either Q_REQUEUE_PI_NONE, i.e. "nothing
- * happened", or Q_REQUEUE_PI_IGNORE when there was an
- * interleaved early wakeup.
- *
- * - If the trylock did not succeed (ret == 0) then the
- * state is either Q_REQUEUE_PI_IN_PROGRESS or
- * Q_REQUEUE_PI_WAIT if an early wakeup interleaved.
- * This will be cleaned up in the loop below, which
- * cannot fail because futex_proxy_trylock_atomic() did
- * the same sanity checks for requeue_pi as the loop
- * below does.
- */
- if (ret > 0) {
- WARN_ON(pi_state);
- task_count++;
- /*
- * If futex_proxy_trylock_atomic() acquired the
- * user space futex, then the user space value
- * @uaddr2 has been set to the @hb1's top waiter
- * task VPID. This task is guaranteed to be alive
- * and cannot be exiting because it is either
- * sleeping or blocked on @hb2 lock.
- *
- * The @uaddr2 futex cannot have waiters either as
- * otherwise futex_proxy_trylock_atomic() would not
- * have succeeded.
- *
- * In order to requeue waiters to @hb2, pi state is
- * required. Hand in the VPID value (@ret) and
- * allocate PI state with an initial refcount on
- * it.
- */
- ret = attach_to_pi_owner(uaddr2, ret, &key2, &pi_state,
- &exiting);
- WARN_ON(ret);
- }
-
- switch (ret) {
- case 0:
- /* We hold a reference on the pi state. */
- break;
-
- /*
- * If the above failed, then pi_state is NULL and
- * waiter::requeue_state is correct.
- */
- case -EFAULT:
- double_unlock_hb(hb1, hb2);
- hb_waiters_dec(hb2);
- ret = fault_in_user_writeable(uaddr2);
- if (!ret)
- goto retry;
- return ret;
- case -EBUSY:
- case -EAGAIN:
- /*
- * Two reasons for this:
- * - EBUSY: Owner is exiting and we just wait for the
- * exit to complete.
- * - EAGAIN: The user space value changed.
- */
- double_unlock_hb(hb1, hb2);
- hb_waiters_dec(hb2);
- /*
- * Handle the case where the owner is in the middle of
- * exiting. Wait for the exit to complete otherwise
- * this task might loop forever, aka. live lock.
- */
- wait_for_owner_exiting(ret, exiting);
- cond_resched();
- goto retry;
- default:
- goto out_unlock;
- }
- }
-
- plist_for_each_entry_safe(this, next, &hb1->chain, list) {
- if (task_count - nr_wake >= nr_requeue)
- break;
-
- if (!match_futex(&this->key, &key1))
- continue;
-
- /*
- * FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI should always
- * be paired with each other and no other futex ops.
- *
- * We should never be requeueing a futex_q with a pi_state,
- * which is awaiting a futex_unlock_pi().
- */
- if ((requeue_pi && !this->rt_waiter) ||
- (!requeue_pi && this->rt_waiter) ||
- this->pi_state) {
- ret = -EINVAL;
- break;
- }
-
- /* Plain futexes just wake or requeue and are done */
- if (!requeue_pi) {
- if (++task_count <= nr_wake)
- mark_wake_futex(&wake_q, this);
- else
- requeue_futex(this, hb1, hb2, &key2);
- continue;
- }
-
- /* Ensure we requeue to the expected futex for requeue_pi. */
- if (!match_futex(this->requeue_pi_key, &key2)) {
- ret = -EINVAL;
- break;
- }
-
- /*
- * Requeue nr_requeue waiters and possibly one more in the case
- * of requeue_pi if we couldn't acquire the lock atomically.
- *
- * Prepare the waiter to take the rt_mutex. Take a refcount
- * on the pi_state and store the pointer in the futex_q
- * object of the waiter.
- */
- get_pi_state(pi_state);
-
- /* Don't requeue when the waiter is already on the way out. */
- if (!futex_requeue_pi_prepare(this, pi_state)) {
- /*
- * Early woken waiter signaled that it is on the
- * way out. Drop the pi_state reference and try the
- * next waiter. @this->pi_state is still NULL.
- */
- put_pi_state(pi_state);
- continue;
- }
-
- ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
- this->rt_waiter,
- this->task);
-
- if (ret == 1) {
- /*
- * We got the lock. We do neither drop the refcount
- * on pi_state nor clear this->pi_state because the
- * waiter needs the pi_state for cleaning up the
- * user space value. It will drop the refcount
- * after doing so. this::requeue_state is updated
- * in the wakeup as well.
- */
- requeue_pi_wake_futex(this, &key2, hb2);
- task_count++;
- } else if (!ret) {
- /* Waiter is queued, move it to hb2 */
- requeue_futex(this, hb1, hb2, &key2);
- futex_requeue_pi_complete(this, 0);
- task_count++;
- } else {
- /*
- * rt_mutex_start_proxy_lock() detected a potential
- * deadlock when we tried to queue that waiter.
- * Drop the pi_state reference which we took above
- * and remove the pointer to the state from the
- * waiters futex_q object.
- */
- this->pi_state = NULL;
- put_pi_state(pi_state);
- futex_requeue_pi_complete(this, ret);
- /*
- * We stop queueing more waiters and let user space
- * deal with the mess.
- */
- break;
- }
- }
-
- /*
- * We took an extra initial reference to the pi_state either in
- * futex_proxy_trylock_atomic() or in attach_to_pi_owner(). We need
- * to drop it here again.
- */
- put_pi_state(pi_state);
-
-out_unlock:
- double_unlock_hb(hb1, hb2);
- wake_up_q(&wake_q);
- hb_waiters_dec(hb2);
- return ret ? ret : task_count;
-}
-
-/* The key must be already stored in q->key. */
-static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
- __acquires(&hb->lock)
-{
- struct futex_hash_bucket *hb;
-
- hb = hash_futex(&q->key);
-
- /*
- * Increment the counter before taking the lock so that
- * a potential waker won't miss a to-be-slept task that is
- * waiting for the spinlock. This is safe as all queue_lock()
- * users end up calling queue_me(). Similarly, for housekeeping,
- * decrement the counter at queue_unlock() when some error has
- * occurred and we don't end up adding the task to the list.
- */
- hb_waiters_inc(hb); /* implies smp_mb(); (A) */
-
- q->lock_ptr = &hb->lock;
-
- spin_lock(&hb->lock);
- return hb;
-}
-
-static inline void
-queue_unlock(struct futex_hash_bucket *hb)
- __releases(&hb->lock)
-{
- spin_unlock(&hb->lock);
- hb_waiters_dec(hb);
-}
-
-static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
-{
- int prio;
-
- /*
- * The priority used to register this element is
- * - either the real thread-priority for the real-time threads
- * (i.e. threads with a priority lower than MAX_RT_PRIO)
- * - or MAX_RT_PRIO for non-RT threads.
- * Thus, all RT-threads are woken first in priority order, and
- * the others are woken last, in FIFO order.
- */
- prio = min(current->normal_prio, MAX_RT_PRIO);
-
- plist_node_init(&q->list, prio);
- plist_add(&q->list, &hb->chain);
- q->task = current;
-}
-
-/**
- * queue_me() - Enqueue the futex_q on the futex_hash_bucket
- * @q: The futex_q to enqueue
- * @hb: The destination hash bucket
- *
- * The hb->lock must be held by the caller, and is released here. A call to
- * queue_me() is typically paired with exactly one call to unqueue_me(). The
- * exceptions involve the PI related operations, which may use unqueue_me_pi()
- * or nothing if the unqueue is done as part of the wake process and the unqueue
- * state is implicit in the state of woken task (see futex_wait_requeue_pi() for
- * an example).
- */
-static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
- __releases(&hb->lock)
-{
- __queue_me(q, hb);
- spin_unlock(&hb->lock);
-}
-
-/**
- * unqueue_me() - Remove the futex_q from its futex_hash_bucket
- * @q: The futex_q to unqueue
- *
- * The q->lock_ptr must not be held by the caller. A call to unqueue_me() must
- * be paired with exactly one earlier call to queue_me().
- *
- * Return:
- * - 1 - if the futex_q was still queued (and we removed unqueued it);
- * - 0 - if the futex_q was already removed by the waking thread
- */
-static int unqueue_me(struct futex_q *q)
-{
- spinlock_t *lock_ptr;
- int ret = 0;
-
- /* In the common case we don't take the spinlock, which is nice. */
-retry:
- /*
- * q->lock_ptr can change between this read and the following spin_lock.
- * Use READ_ONCE to forbid the compiler from reloading q->lock_ptr and
- * optimizing lock_ptr out of the logic below.
- */
- lock_ptr = READ_ONCE(q->lock_ptr);
- if (lock_ptr != NULL) {
- spin_lock(lock_ptr);
- /*
- * q->lock_ptr can change between reading it and
- * spin_lock(), causing us to take the wrong lock. This
- * corrects the race condition.
- *
- * Reasoning goes like this: if we have the wrong lock,
- * q->lock_ptr must have changed (maybe several times)
- * between reading it and the spin_lock(). It can
- * change again after the spin_lock() but only if it was
- * already changed before the spin_lock(). It cannot,
- * however, change back to the original value. Therefore
- * we can detect whether we acquired the correct lock.
- */
- if (unlikely(lock_ptr != q->lock_ptr)) {
- spin_unlock(lock_ptr);
- goto retry;
- }
- __unqueue_futex(q);
-
- BUG_ON(q->pi_state);
-
- spin_unlock(lock_ptr);
- ret = 1;
- }
-
- return ret;
-}
-
-/*
- * PI futexes can not be requeued and must remove themselves from the
- * hash bucket. The hash bucket lock (i.e. lock_ptr) is held.
- */
-static void unqueue_me_pi(struct futex_q *q)
-{
- __unqueue_futex(q);
-
- BUG_ON(!q->pi_state);
- put_pi_state(q->pi_state);
- q->pi_state = NULL;
-}
-
-static int __fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
- struct task_struct *argowner)
-{
- struct futex_pi_state *pi_state = q->pi_state;
- struct task_struct *oldowner, *newowner;
- u32 uval, curval, newval, newtid;
- int err = 0;
-
- oldowner = pi_state->owner;
-
- /*
- * We are here because either:
- *
- * - we stole the lock and pi_state->owner needs updating to reflect
- * that (@argowner == current),
- *
- * or:
- *
- * - someone stole our lock and we need to fix things to point to the
- * new owner (@argowner == NULL).
- *
- * Either way, we have to replace the TID in the user space variable.
- * This must be atomic as we have to preserve the owner died bit here.
- *
- * Note: We write the user space value _before_ changing the pi_state
- * because we can fault here. Imagine swapped out pages or a fork
- * that marked all the anonymous memory readonly for cow.
- *
- * Modifying pi_state _before_ the user space value would leave the
- * pi_state in an inconsistent state when we fault here, because we
- * need to drop the locks to handle the fault. This might be observed
- * in the PID checks when attaching to PI state .
- */
-retry:
- if (!argowner) {
- if (oldowner != current) {
- /*
- * We raced against a concurrent self; things are
- * already fixed up. Nothing to do.
- */
- return 0;
- }
-
- if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) {
- /* We got the lock. pi_state is correct. Tell caller. */
- return 1;
- }
-
- /*
- * The trylock just failed, so either there is an owner or
- * there is a higher priority waiter than this one.
- */
- newowner = rt_mutex_owner(&pi_state->pi_mutex);
- /*
- * If the higher priority waiter has not yet taken over the
- * rtmutex then newowner is NULL. We can't return here with
- * that state because it's inconsistent vs. the user space
- * state. So drop the locks and try again. It's a valid
- * situation and not any different from the other retry
- * conditions.
- */
- if (unlikely(!newowner)) {
- err = -EAGAIN;
- goto handle_err;
- }
- } else {
- WARN_ON_ONCE(argowner != current);
- if (oldowner == current) {
- /*
- * We raced against a concurrent self; things are
- * already fixed up. Nothing to do.
- */
- return 1;
- }
- newowner = argowner;
- }
-
- newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
- /* Owner died? */
- if (!pi_state->owner)
- newtid |= FUTEX_OWNER_DIED;
-
- err = get_futex_value_locked(&uval, uaddr);
- if (err)
- goto handle_err;
-
- for (;;) {
- newval = (uval & FUTEX_OWNER_DIED) | newtid;
-
- err = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval);
- if (err)
- goto handle_err;
-
- if (curval == uval)
- break;
- uval = curval;
- }
-
- /*
- * We fixed up user space. Now we need to fix the pi_state
- * itself.
- */
- pi_state_update_owner(pi_state, newowner);
-
- return argowner == current;
-
- /*
- * In order to reschedule or handle a page fault, we need to drop the
- * locks here. In the case of a fault, this gives the other task
- * (either the highest priority waiter itself or the task which stole
- * the rtmutex) the chance to try the fixup of the pi_state. So once we
- * are back from handling the fault we need to check the pi_state after
- * reacquiring the locks and before trying to do another fixup. When
- * the fixup has been done already we simply return.
- *
- * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
- * drop hb->lock since the caller owns the hb -> futex_q relation.
- * Dropping the pi_mutex->wait_lock requires the state revalidate.
- */
-handle_err:
- raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
- spin_unlock(q->lock_ptr);
-
- switch (err) {
- case -EFAULT:
- err = fault_in_user_writeable(uaddr);
- break;
-
- case -EAGAIN:
- cond_resched();
- err = 0;
- break;
-
- default:
- WARN_ON_ONCE(1);
- break;
- }
-
- spin_lock(q->lock_ptr);
- raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
-
- /*
- * Check if someone else fixed it for us:
- */
- if (pi_state->owner != oldowner)
- return argowner == current;
-
- /* Retry if err was -EAGAIN or the fault in succeeded */
- if (!err)
- goto retry;
-
- /*
- * fault_in_user_writeable() failed so user state is immutable. At
- * best we can make the kernel state consistent but user state will
- * be most likely hosed and any subsequent unlock operation will be
- * rejected due to PI futex rule [10].
- *
- * Ensure that the rtmutex owner is also the pi_state owner despite
- * the user space value claiming something different. There is no
- * point in unlocking the rtmutex if current is the owner as it
- * would need to wait until the next waiter has taken the rtmutex
- * to guarantee consistent state. Keep it simple. Userspace asked
- * for this wreckaged state.
- *
- * The rtmutex has an owner - either current or some other
- * task. See the EAGAIN loop above.
- */
- pi_state_update_owner(pi_state, rt_mutex_owner(&pi_state->pi_mutex));
-
- return err;
-}
-
-static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
- struct task_struct *argowner)
-{
- struct futex_pi_state *pi_state = q->pi_state;
- int ret;
-
- lockdep_assert_held(q->lock_ptr);
-
- raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
- ret = __fixup_pi_state_owner(uaddr, q, argowner);
- raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
- return ret;
-}
-
-static long futex_wait_restart(struct restart_block *restart);
-
-/**
- * fixup_owner() - Post lock pi_state and corner case management
- * @uaddr: user address of the futex
- * @q: futex_q (contains pi_state and access to the rt_mutex)
- * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0)
- *
- * After attempting to lock an rt_mutex, this function is called to cleanup
- * the pi_state owner as well as handle race conditions that may allow us to
- * acquire the lock. Must be called with the hb lock held.
- *
- * Return:
- * - 1 - success, lock taken;
- * - 0 - success, lock not taken;
- * - <0 - on error (-EFAULT)
- */
-static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
-{
- if (locked) {
- /*
- * Got the lock. We might not be the anticipated owner if we
- * did a lock-steal - fix up the PI-state in that case:
- *
- * Speculative pi_state->owner read (we don't hold wait_lock);
- * since we own the lock pi_state->owner == current is the
- * stable state, anything else needs more attention.
- */
- if (q->pi_state->owner != current)
- return fixup_pi_state_owner(uaddr, q, current);
- return 1;
- }
-
- /*
- * If we didn't get the lock; check if anybody stole it from us. In
- * that case, we need to fix up the uval to point to them instead of
- * us, otherwise bad things happen. [10]
- *
- * Another speculative read; pi_state->owner == current is unstable
- * but needs our attention.
- */
- if (q->pi_state->owner == current)
- return fixup_pi_state_owner(uaddr, q, NULL);
-
- /*
- * Paranoia check. If we did not take the lock, then we should not be
- * the owner of the rt_mutex. Warn and establish consistent state.
- */
- if (WARN_ON_ONCE(rt_mutex_owner(&q->pi_state->pi_mutex) == current))
- return fixup_pi_state_owner(uaddr, q, current);
-
- return 0;
-}
-
-/**
- * futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal
- * @hb: the futex hash bucket, must be locked by the caller
- * @q: the futex_q to queue up on
- * @timeout: the prepared hrtimer_sleeper, or null for no timeout
- */
-static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
- struct hrtimer_sleeper *timeout)
-{
- /*
- * The task state is guaranteed to be set before another task can
- * wake it. set_current_state() is implemented using smp_store_mb() and
- * queue_me() calls spin_unlock() upon completion, both serializing
- * access to the hash list and forcing another memory barrier.
- */
- set_current_state(TASK_INTERRUPTIBLE);
- queue_me(q, hb);
-
- /* Arm the timer */
- if (timeout)
- hrtimer_sleeper_start_expires(timeout, HRTIMER_MODE_ABS);
-
- /*
- * If we have been removed from the hash list, then another task
- * has tried to wake us, and we can skip the call to schedule().
- */
- if (likely(!plist_node_empty(&q->list))) {
- /*
- * If the timer has already expired, current will already be
- * flagged for rescheduling. Only call schedule if there
- * is no timeout, or if it has yet to expire.
- */
- if (!timeout || timeout->task)
- freezable_schedule();
- }
- __set_current_state(TASK_RUNNING);
-}
-
-/**
- * futex_wait_setup() - Prepare to wait on a futex
- * @uaddr: the futex userspace address
- * @val: the expected value
- * @flags: futex flags (FLAGS_SHARED, etc.)
- * @q: the associated futex_q
- * @hb: storage for hash_bucket pointer to be returned to caller
- *
- * Setup the futex_q and locate the hash_bucket. Get the futex value and
- * compare it with the expected value. Handle atomic faults internally.
- * Return with the hb lock held on success, and unlocked on failure.
- *
- * Return:
- * - 0 - uaddr contains val and hb has been locked;
- * - <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked
- */
-static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
- struct futex_q *q, struct futex_hash_bucket **hb)
-{
- u32 uval;
- int ret;
-
- /*
- * Access the page AFTER the hash-bucket is locked.
- * Order is important:
- *
- * Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val);
- * Userspace waker: if (cond(var)) { var = new; futex_wake(&var); }
- *
- * The basic logical guarantee of a futex is that it blocks ONLY
- * if cond(var) is known to be true at the time of blocking, for
- * any cond. If we locked the hash-bucket after testing *uaddr, that
- * would open a race condition where we could block indefinitely with
- * cond(var) false, which would violate the guarantee.
- *
- * On the other hand, we insert q and release the hash-bucket only
- * after testing *uaddr. This guarantees that futex_wait() will NOT
- * absorb a wakeup if *uaddr does not match the desired values
- * while the syscall executes.
- */
-retry:
- ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, FUTEX_READ);
- if (unlikely(ret != 0))
- return ret;
-
-retry_private:
- *hb = queue_lock(q);
-
- ret = get_futex_value_locked(&uval, uaddr);
-
- if (ret) {
- queue_unlock(*hb);
-
- ret = get_user(uval, uaddr);
- if (ret)
- return ret;
-
- if (!(flags & FLAGS_SHARED))
- goto retry_private;
-
- goto retry;
- }
-
- if (uval != val) {
- queue_unlock(*hb);
- ret = -EWOULDBLOCK;
- }
-
- return ret;
-}
-
-static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
- ktime_t *abs_time, u32 bitset)
-{
- struct hrtimer_sleeper timeout, *to;
- struct restart_block *restart;
- struct futex_hash_bucket *hb;
- struct futex_q q = futex_q_init;
- int ret;
-
- if (!bitset)
- return -EINVAL;
- q.bitset = bitset;
-
- to = futex_setup_timer(abs_time, &timeout, flags,
- current->timer_slack_ns);
-retry:
- /*
- * Prepare to wait on uaddr. On success, it holds hb->lock and q
- * is initialized.
- */
- ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
- if (ret)
- goto out;
-
- /* queue_me and wait for wakeup, timeout, or a signal. */
- futex_wait_queue_me(hb, &q, to);
-
- /* If we were woken (and unqueued), we succeeded, whatever. */
- ret = 0;
- if (!unqueue_me(&q))
- goto out;
- ret = -ETIMEDOUT;
- if (to && !to->task)
- goto out;
-
- /*
- * We expect signal_pending(current), but we might be the
- * victim of a spurious wakeup as well.
- */
- if (!signal_pending(current))
- goto retry;
-
- ret = -ERESTARTSYS;
- if (!abs_time)
- goto out;
-
- restart = &current->restart_block;
- restart->futex.uaddr = uaddr;
- restart->futex.val = val;
- restart->futex.time = *abs_time;
- restart->futex.bitset = bitset;
- restart->futex.flags = flags | FLAGS_HAS_TIMEOUT;
-
- ret = set_restart_fn(restart, futex_wait_restart);
-
-out:
- if (to) {
- hrtimer_cancel(&to->timer);
- destroy_hrtimer_on_stack(&to->timer);
- }
- return ret;
-}
-
-
-static long futex_wait_restart(struct restart_block *restart)
-{
- u32 __user *uaddr = restart->futex.uaddr;
- ktime_t t, *tp = NULL;
-
- if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
- t = restart->futex.time;
- tp = &t;
- }
- restart->fn = do_no_restart_syscall;
-
- return (long)futex_wait(uaddr, restart->futex.flags,
- restart->futex.val, tp, restart->futex.bitset);
-}
-
-
-/*
- * Userspace tried a 0 -> TID atomic transition of the futex value
- * and failed. The kernel side here does the whole locking operation:
- * if there are waiters then it will block as a consequence of relying
- * on rt-mutexes, it does PI, etc. (Due to races the kernel might see
- * a 0 value of the futex too.).
- *
- * Also serves as futex trylock_pi()'ing, and due semantics.
- */
-static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
- ktime_t *time, int trylock)
-{
- struct hrtimer_sleeper timeout, *to;
- struct task_struct *exiting = NULL;
- struct rt_mutex_waiter rt_waiter;
- struct futex_hash_bucket *hb;
- struct futex_q q = futex_q_init;
- int res, ret;
-
- if (!IS_ENABLED(CONFIG_FUTEX_PI))
- return -ENOSYS;
-
- if (refill_pi_state_cache())
- return -ENOMEM;
-
- to = futex_setup_timer(time, &timeout, flags, 0);
-
-retry:
- ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, FUTEX_WRITE);
- if (unlikely(ret != 0))
- goto out;
-
-retry_private:
- hb = queue_lock(&q);
-
- ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current,
- &exiting, 0);
- if (unlikely(ret)) {
- /*
- * Atomic work succeeded and we got the lock,
- * or failed. Either way, we do _not_ block.
- */
- switch (ret) {
- case 1:
- /* We got the lock. */
- ret = 0;
- goto out_unlock_put_key;
- case -EFAULT:
- goto uaddr_faulted;
- case -EBUSY:
- case -EAGAIN:
- /*
- * Two reasons for this:
- * - EBUSY: Task is exiting and we just wait for the
- * exit to complete.
- * - EAGAIN: The user space value changed.
- */
- queue_unlock(hb);
- /*
- * Handle the case where the owner is in the middle of
- * exiting. Wait for the exit to complete otherwise
- * this task might loop forever, aka. live lock.
- */
- wait_for_owner_exiting(ret, exiting);
- cond_resched();
- goto retry;
- default:
- goto out_unlock_put_key;
- }
- }
-
- WARN_ON(!q.pi_state);
-
- /*
- * Only actually queue now that the atomic ops are done:
- */
- __queue_me(&q, hb);
-
- if (trylock) {
- ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
- /* Fixup the trylock return value: */
- ret = ret ? 0 : -EWOULDBLOCK;
- goto no_block;
- }
-
- rt_mutex_init_waiter(&rt_waiter);
-
- /*
- * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not
- * hold it while doing rt_mutex_start_proxy(), because then it will
- * include hb->lock in the blocking chain, even through we'll not in
- * fact hold it while blocking. This will lead it to report -EDEADLK
- * and BUG when futex_unlock_pi() interleaves with this.
- *
- * Therefore acquire wait_lock while holding hb->lock, but drop the
- * latter before calling __rt_mutex_start_proxy_lock(). This
- * interleaves with futex_unlock_pi() -- which does a similar lock
- * handoff -- such that the latter can observe the futex_q::pi_state
- * before __rt_mutex_start_proxy_lock() is done.
- */
- raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
- spin_unlock(q.lock_ptr);
- /*
- * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter
- * such that futex_unlock_pi() is guaranteed to observe the waiter when
- * it sees the futex_q::pi_state.
- */
- ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
- raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);
-
- if (ret) {
- if (ret == 1)
- ret = 0;
- goto cleanup;
- }
-
- if (unlikely(to))
- hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS);
-
- ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
-
-cleanup:
- spin_lock(q.lock_ptr);
- /*
- * If we failed to acquire the lock (deadlock/signal/timeout), we must
- * first acquire the hb->lock before removing the lock from the
- * rt_mutex waitqueue, such that we can keep the hb and rt_mutex wait
- * lists consistent.
- *
- * In particular; it is important that futex_unlock_pi() can not
- * observe this inconsistency.
- */
- if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
- ret = 0;
-
-no_block:
- /*
- * Fixup the pi_state owner and possibly acquire the lock if we
- * haven't already.
- */
- res = fixup_owner(uaddr, &q, !ret);
- /*
- * If fixup_owner() returned an error, propagate that. If it acquired
- * the lock, clear our -ETIMEDOUT or -EINTR.
- */
- if (res)
- ret = (res < 0) ? res : 0;
-
- unqueue_me_pi(&q);
- spin_unlock(q.lock_ptr);
- goto out;
-
-out_unlock_put_key:
- queue_unlock(hb);
-
-out:
- if (to) {
- hrtimer_cancel(&to->timer);
- destroy_hrtimer_on_stack(&to->timer);
- }
- return ret != -EINTR ? ret : -ERESTARTNOINTR;
-
-uaddr_faulted:
- queue_unlock(hb);
-
- ret = fault_in_user_writeable(uaddr);
- if (ret)
- goto out;
-
- if (!(flags & FLAGS_SHARED))
- goto retry_private;
-
- goto retry;
-}
-
-/*
- * Userspace attempted a TID -> 0 atomic transition, and failed.
- * This is the in-kernel slowpath: we look up the PI state (if any),
- * and do the rt-mutex unlock.
- */
-static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
-{
- u32 curval, uval, vpid = task_pid_vnr(current);
- union futex_key key = FUTEX_KEY_INIT;
- struct futex_hash_bucket *hb;
- struct futex_q *top_waiter;
- int ret;
-
- if (!IS_ENABLED(CONFIG_FUTEX_PI))
- return -ENOSYS;
-
-retry:
- if (get_user(uval, uaddr))
- return -EFAULT;
- /*
- * We release only a lock we actually own:
- */
- if ((uval & FUTEX_TID_MASK) != vpid)
- return -EPERM;
-
- ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_WRITE);
- if (ret)
- return ret;
-
- hb = hash_futex(&key);
- spin_lock(&hb->lock);
-
- /*
- * Check waiters first. We do not trust user space values at
- * all and we at least want to know if user space fiddled
- * with the futex value instead of blindly unlocking.
- */
- top_waiter = futex_top_waiter(hb, &key);
- if (top_waiter) {
- struct futex_pi_state *pi_state = top_waiter->pi_state;
-
- ret = -EINVAL;
- if (!pi_state)
- goto out_unlock;
-
- /*
- * If current does not own the pi_state then the futex is
- * inconsistent and user space fiddled with the futex value.
- */
- if (pi_state->owner != current)
- goto out_unlock;
-
- get_pi_state(pi_state);
- /*
- * By taking wait_lock while still holding hb->lock, we ensure
- * there is no point where we hold neither; and therefore
- * wake_futex_pi() must observe a state consistent with what we
- * observed.
- *
- * In particular; this forces __rt_mutex_start_proxy() to
- * complete such that we're guaranteed to observe the
- * rt_waiter. Also see the WARN in wake_futex_pi().
- */
- raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
- spin_unlock(&hb->lock);
-
- /* drops pi_state->pi_mutex.wait_lock */
- ret = wake_futex_pi(uaddr, uval, pi_state);
-
- put_pi_state(pi_state);
-
- /*
- * Success, we're done! No tricky corner cases.
- */
- if (!ret)
- return ret;
- /*
- * The atomic access to the futex value generated a
- * pagefault, so retry the user-access and the wakeup:
- */
- if (ret == -EFAULT)
- goto pi_faulted;
- /*
- * A unconditional UNLOCK_PI op raced against a waiter
- * setting the FUTEX_WAITERS bit. Try again.
- */
- if (ret == -EAGAIN)
- goto pi_retry;
- /*
- * wake_futex_pi has detected invalid state. Tell user
- * space.
- */
- return ret;
- }
-
- /*
- * We have no kernel internal state, i.e. no waiters in the
- * kernel. Waiters which are about to queue themselves are stuck
- * on hb->lock. So we can safely ignore them. We do neither
- * preserve the WAITERS bit not the OWNER_DIED one. We are the
- * owner.
- */
- if ((ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))) {
- spin_unlock(&hb->lock);
- switch (ret) {
- case -EFAULT:
- goto pi_faulted;
-
- case -EAGAIN:
- goto pi_retry;
-
- default:
- WARN_ON_ONCE(1);
- return ret;
- }
- }
-
- /*
- * If uval has changed, let user space handle it.
- */
- ret = (curval == uval) ? 0 : -EAGAIN;
-
-out_unlock:
- spin_unlock(&hb->lock);
- return ret;
-
-pi_retry:
- cond_resched();
- goto retry;
-
-pi_faulted:
-
- ret = fault_in_user_writeable(uaddr);
- if (!ret)
- goto retry;
-
- return ret;
-}
-
-/**
- * handle_early_requeue_pi_wakeup() - Handle early wakeup on the initial futex
- * @hb: the hash_bucket futex_q was original enqueued on
- * @q: the futex_q woken while waiting to be requeued
- * @timeout: the timeout associated with the wait (NULL if none)
- *
- * Determine the cause for the early wakeup.
- *
- * Return:
- * -EWOULDBLOCK or -ETIMEDOUT or -ERESTARTNOINTR
- */
-static inline
-int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
- struct futex_q *q,
- struct hrtimer_sleeper *timeout)
-{
- int ret;
-
- /*
- * With the hb lock held, we avoid races while we process the wakeup.
- * We only need to hold hb (and not hb2) to ensure atomicity as the
- * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb.
- * It can't be requeued from uaddr2 to something else since we don't
- * support a PI aware source futex for requeue.
- */
- WARN_ON_ONCE(&hb->lock != q->lock_ptr);
-
- /*
- * We were woken prior to requeue by a timeout or a signal.
- * Unqueue the futex_q and determine which it was.
- */
- plist_del(&q->list, &hb->chain);
- hb_waiters_dec(hb);
-
- /* Handle spurious wakeups gracefully */
- ret = -EWOULDBLOCK;
- if (timeout && !timeout->task)
- ret = -ETIMEDOUT;
- else if (signal_pending(current))
- ret = -ERESTARTNOINTR;
- return ret;
-}
-
-/**
- * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
- * @uaddr: the futex we initially wait on (non-pi)
- * @flags: futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be
- * the same type, no requeueing from private to shared, etc.
- * @val: the expected value of uaddr
- * @abs_time: absolute timeout
- * @bitset: 32 bit wakeup bitset set by userspace, defaults to all
- * @uaddr2: the pi futex we will take prior to returning to user-space
- *
- * The caller will wait on uaddr and will be requeued by futex_requeue() to
- * uaddr2 which must be PI aware and unique from uaddr. Normal wakeup will wake
- * on uaddr2 and complete the acquisition of the rt_mutex prior to returning to
- * userspace. This ensures the rt_mutex maintains an owner when it has waiters;
- * without one, the pi logic would not know which task to boost/deboost, if
- * there was a need to.
- *
- * We call schedule in futex_wait_queue_me() when we enqueue and return there
- * via the following--
- * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
- * 2) wakeup on uaddr2 after a requeue
- * 3) signal
- * 4) timeout
- *
- * If 3, cleanup and return -ERESTARTNOINTR.
- *
- * If 2, we may then block on trying to take the rt_mutex and return via:
- * 5) successful lock
- * 6) signal
- * 7) timeout
- * 8) other lock acquisition failure
- *
- * If 6, return -EWOULDBLOCK (restarting the syscall would do the same).
- *
- * If 4 or 7, we cleanup and return with -ETIMEDOUT.
- *
- * Return:
- * - 0 - On success;
- * - <0 - On error
- */
-static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
- u32 val, ktime_t *abs_time, u32 bitset,
- u32 __user *uaddr2)
-{
- struct hrtimer_sleeper timeout, *to;
- struct rt_mutex_waiter rt_waiter;
- struct futex_hash_bucket *hb;
- union futex_key key2 = FUTEX_KEY_INIT;
- struct futex_q q = futex_q_init;
- struct rt_mutex_base *pi_mutex;
- int res, ret;
-
- if (!IS_ENABLED(CONFIG_FUTEX_PI))
- return -ENOSYS;
-
- if (uaddr == uaddr2)
- return -EINVAL;
-
- if (!bitset)
- return -EINVAL;
-
- to = futex_setup_timer(abs_time, &timeout, flags,
- current->timer_slack_ns);
-
- /*
- * The waiter is allocated on our stack, manipulated by the requeue
- * code while we sleep on uaddr.
- */
- rt_mutex_init_waiter(&rt_waiter);
-
- ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE);
- if (unlikely(ret != 0))
- goto out;
-
- q.bitset = bitset;
- q.rt_waiter = &rt_waiter;
- q.requeue_pi_key = &key2;
-
- /*
- * Prepare to wait on uaddr. On success, it holds hb->lock and q
- * is initialized.
- */
- ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
- if (ret)
- goto out;
-
- /*
- * The check above which compares uaddrs is not sufficient for
- * shared futexes. We need to compare the keys:
- */
- if (match_futex(&q.key, &key2)) {
- queue_unlock(hb);
- ret = -EINVAL;
- goto out;
- }
-
- /* Queue the futex_q, drop the hb lock, wait for wakeup. */
- futex_wait_queue_me(hb, &q, to);
-
- switch (futex_requeue_pi_wakeup_sync(&q)) {
- case Q_REQUEUE_PI_IGNORE:
- /* The waiter is still on uaddr1 */
- spin_lock(&hb->lock);
- ret = handle_early_requeue_pi_wakeup(hb, &q, to);
- spin_unlock(&hb->lock);
- break;
-
- case Q_REQUEUE_PI_LOCKED:
- /* The requeue acquired the lock */
- if (q.pi_state && (q.pi_state->owner != current)) {
- spin_lock(q.lock_ptr);
- ret = fixup_owner(uaddr2, &q, true);
- /*
- * Drop the reference to the pi state which the
- * requeue_pi() code acquired for us.
- */
- put_pi_state(q.pi_state);
- spin_unlock(q.lock_ptr);
- /*
- * Adjust the return value. It's either -EFAULT or
- * success (1) but the caller expects 0 for success.
- */
- ret = ret < 0 ? ret : 0;
- }
- break;
-
- case Q_REQUEUE_PI_DONE:
- /* Requeue completed. Current is 'pi_blocked_on' the rtmutex */
- pi_mutex = &q.pi_state->pi_mutex;
- ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter);
-
- /* Current is not longer pi_blocked_on */
- spin_lock(q.lock_ptr);
- if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter))
- ret = 0;
-
- debug_rt_mutex_free_waiter(&rt_waiter);
- /*
- * Fixup the pi_state owner and possibly acquire the lock if we
- * haven't already.
- */
- res = fixup_owner(uaddr2, &q, !ret);
- /*
- * If fixup_owner() returned an error, propagate that. If it
- * acquired the lock, clear -ETIMEDOUT or -EINTR.
- */
- if (res)
- ret = (res < 0) ? res : 0;
-
- unqueue_me_pi(&q);
- spin_unlock(q.lock_ptr);
-
- if (ret == -EINTR) {
- /*
- * We've already been requeued, but cannot restart
- * by calling futex_lock_pi() directly. We could
- * restart this syscall, but it would detect that
- * the user space "val" changed and return
- * -EWOULDBLOCK. Save the overhead of the restart
- * and return -EWOULDBLOCK directly.
- */
- ret = -EWOULDBLOCK;
- }
- break;
- default:
- BUG();
- }
-
-out:
- if (to) {
- hrtimer_cancel(&to->timer);
- destroy_hrtimer_on_stack(&to->timer);
- }
- return ret;
-}
-
-/*
- * Support for robust futexes: the kernel cleans up held futexes at
- * thread exit time.
- *
- * Implementation: user-space maintains a per-thread list of locks it
- * is holding. Upon do_exit(), the kernel carefully walks this list,
- * and marks all locks that are owned by this thread with the
- * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is
- * always manipulated with the lock held, so the list is private and
- * per-thread. Userspace also maintains a per-thread 'list_op_pending'
- * field, to allow the kernel to clean up if the thread dies after
- * acquiring the lock, but just before it could have added itself to
- * the list. There can only be one such pending lock.
- */
-
-/**
- * sys_set_robust_list() - Set the robust-futex list head of a task
- * @head: pointer to the list-head
- * @len: length of the list-head, as userspace expects
- */
-SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
- size_t, len)
-{
- if (!futex_cmpxchg_enabled)
- return -ENOSYS;
- /*
- * The kernel knows only one size for now:
- */
- if (unlikely(len != sizeof(*head)))
- return -EINVAL;
-
- current->robust_list = head;
-
- return 0;
-}
-
-/**
- * sys_get_robust_list() - Get the robust-futex list head of a task
- * @pid: pid of the process [zero for current task]
- * @head_ptr: pointer to a list-head pointer, the kernel fills it in
- * @len_ptr: pointer to a length field, the kernel fills in the header size
- */
-SYSCALL_DEFINE3(get_robust_list, int, pid,
- struct robust_list_head __user * __user *, head_ptr,
- size_t __user *, len_ptr)
-{
- struct robust_list_head __user *head;
- unsigned long ret;
- struct task_struct *p;
-
- if (!futex_cmpxchg_enabled)
- return -ENOSYS;
-
- rcu_read_lock();
-
- ret = -ESRCH;
- if (!pid)
- p = current;
- else {
- p = find_task_by_vpid(pid);
- if (!p)
- goto err_unlock;
- }
-
- ret = -EPERM;
- if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
- goto err_unlock;
-
- head = p->robust_list;
- rcu_read_unlock();
-
- if (put_user(sizeof(*head), len_ptr))
- return -EFAULT;
- return put_user(head, head_ptr);
-
-err_unlock:
- rcu_read_unlock();
-
- return ret;
-}
-
-/* Constants for the pending_op argument of handle_futex_death */
-#define HANDLE_DEATH_PENDING true
-#define HANDLE_DEATH_LIST false
-
-/*
- * Process a futex-list entry, check whether it's owned by the
- * dying task, and do notification if so:
- */
-static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr,
- bool pi, bool pending_op)
-{
- u32 uval, nval, mval;
- int err;
-
- /* Futex address must be 32bit aligned */
- if ((((unsigned long)uaddr) % sizeof(*uaddr)) != 0)
- return -1;
-
-retry:
- if (get_user(uval, uaddr))
- return -1;
-
- /*
- * Special case for regular (non PI) futexes. The unlock path in
- * user space has two race scenarios:
- *
- * 1. The unlock path releases the user space futex value and
- * before it can execute the futex() syscall to wake up
- * waiters it is killed.
- *
- * 2. A woken up waiter is killed before it can acquire the
- * futex in user space.
- *
- * In both cases the TID validation below prevents a wakeup of
- * potential waiters which can cause these waiters to block
- * forever.
- *
- * In both cases the following conditions are met:
- *
- * 1) task->robust_list->list_op_pending != NULL
- * @pending_op == true
- * 2) User space futex value == 0
- * 3) Regular futex: @pi == false
- *
- * If these conditions are met, it is safe to attempt waking up a
- * potential waiter without touching the user space futex value and
- * trying to set the OWNER_DIED bit. The user space futex value is
- * uncontended and the rest of the user space mutex state is
- * consistent, so a woken waiter will just take over the
- * uncontended futex. Setting the OWNER_DIED bit would create
- * inconsistent state and malfunction of the user space owner died
- * handling.
- */
- if (pending_op && !pi && !uval) {
- futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
- return 0;
- }
-
- if ((uval & FUTEX_TID_MASK) != task_pid_vnr(curr))
- return 0;
-
- /*
- * Ok, this dying thread is truly holding a futex
- * of interest. Set the OWNER_DIED bit atomically
- * via cmpxchg, and if the value had FUTEX_WAITERS
- * set, wake up a waiter (if any). (We have to do a
- * futex_wake() even if OWNER_DIED is already set -
- * to handle the rare but possible case of recursive
- * thread-death.) The rest of the cleanup is done in
- * userspace.
- */
- mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
-
- /*
- * We are not holding a lock here, but we want to have
- * the pagefault_disable/enable() protection because
- * we want to handle the fault gracefully. If the
- * access fails we try to fault in the futex with R/W
- * verification via get_user_pages. get_user() above
- * does not guarantee R/W access. If that fails we
- * give up and leave the futex locked.
- */
- if ((err = cmpxchg_futex_value_locked(&nval, uaddr, uval, mval))) {
- switch (err) {
- case -EFAULT:
- if (fault_in_user_writeable(uaddr))
- return -1;
- goto retry;
-
- case -EAGAIN:
- cond_resched();
- goto retry;
-
- default:
- WARN_ON_ONCE(1);
- return err;
- }
- }
-
- if (nval != uval)
- goto retry;
-
- /*
- * Wake robust non-PI futexes here. The wakeup of
- * PI futexes happens in exit_pi_state():
- */
- if (!pi && (uval & FUTEX_WAITERS))
- futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
-
- return 0;
-}
-
-/*
- * Fetch a robust-list pointer. Bit 0 signals PI futexes:
- */
-static inline int fetch_robust_entry(struct robust_list __user **entry,
- struct robust_list __user * __user *head,
- unsigned int *pi)
-{
- unsigned long uentry;
-
- if (get_user(uentry, (unsigned long __user *)head))
- return -EFAULT;
-
- *entry = (void __user *)(uentry & ~1UL);
- *pi = uentry & 1;
-
- return 0;
-}
-
-/*
- * Walk curr->robust_list (very carefully, it's a userspace list!)
- * and mark any locks found there dead, and notify any waiters.
- *
- * We silently return on any sign of list-walking problem.
- */
-static void exit_robust_list(struct task_struct *curr)
-{
- struct robust_list_head __user *head = curr->robust_list;
- struct robust_list __user *entry, *next_entry, *pending;
- unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
- unsigned int next_pi;
- unsigned long futex_offset;
- int rc;
-
- if (!futex_cmpxchg_enabled)
- return;
-
- /*
- * Fetch the list head (which was registered earlier, via
- * sys_set_robust_list()):
- */
- if (fetch_robust_entry(&entry, &head->list.next, &pi))
- return;
- /*
- * Fetch the relative futex offset:
- */
- if (get_user(futex_offset, &head->futex_offset))
- return;
- /*
- * Fetch any possibly pending lock-add first, and handle it
- * if it exists:
- */
- if (fetch_robust_entry(&pending, &head->list_op_pending, &pip))
- return;
-
- next_entry = NULL; /* avoid warning with gcc */
- while (entry != &head->list) {
- /*
- * Fetch the next entry in the list before calling
- * handle_futex_death:
- */
- rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi);
- /*
- * A pending lock might already be on the list, so
- * don't process it twice:
- */
- if (entry != pending) {
- if (handle_futex_death((void __user *)entry + futex_offset,
- curr, pi, HANDLE_DEATH_LIST))
- return;
- }
- if (rc)
- return;
- entry = next_entry;
- pi = next_pi;
- /*
- * Avoid excessively long or circular lists:
- */
- if (!--limit)
- break;
-
- cond_resched();
- }
-
- if (pending) {
- handle_futex_death((void __user *)pending + futex_offset,
- curr, pip, HANDLE_DEATH_PENDING);
- }
-}
-
-static void futex_cleanup(struct task_struct *tsk)
-{
- if (unlikely(tsk->robust_list)) {
- exit_robust_list(tsk);
- tsk->robust_list = NULL;
- }
-
-#ifdef CONFIG_COMPAT
- if (unlikely(tsk->compat_robust_list)) {
- compat_exit_robust_list(tsk);
- tsk->compat_robust_list = NULL;
- }
-#endif
-
- if (unlikely(!list_empty(&tsk->pi_state_list)))
- exit_pi_state_list(tsk);
-}
-
-/**
- * futex_exit_recursive - Set the tasks futex state to FUTEX_STATE_DEAD
- * @tsk: task to set the state on
- *
- * Set the futex exit state of the task lockless. The futex waiter code
- * observes that state when a task is exiting and loops until the task has
- * actually finished the futex cleanup. The worst case for this is that the
- * waiter runs through the wait loop until the state becomes visible.
- *
- * This is called from the recursive fault handling path in do_exit().
- *
- * This is best effort. Either the futex exit code has run already or
- * not. If the OWNER_DIED bit has been set on the futex then the waiter can
- * take it over. If not, the problem is pushed back to user space. If the
- * futex exit code did not run yet, then an already queued waiter might
- * block forever, but there is nothing which can be done about that.
- */
-void futex_exit_recursive(struct task_struct *tsk)
-{
- /* If the state is FUTEX_STATE_EXITING then futex_exit_mutex is held */
- if (tsk->futex_state == FUTEX_STATE_EXITING)
- mutex_unlock(&tsk->futex_exit_mutex);
- tsk->futex_state = FUTEX_STATE_DEAD;
-}
-
-static void futex_cleanup_begin(struct task_struct *tsk)
-{
- /*
- * Prevent various race issues against a concurrent incoming waiter
- * including live locks by forcing the waiter to block on
- * tsk->futex_exit_mutex when it observes FUTEX_STATE_EXITING in
- * attach_to_pi_owner().
- */
- mutex_lock(&tsk->futex_exit_mutex);
-
- /*
- * Switch the state to FUTEX_STATE_EXITING under tsk->pi_lock.
- *
- * This ensures that all subsequent checks of tsk->futex_state in
- * attach_to_pi_owner() must observe FUTEX_STATE_EXITING with
- * tsk->pi_lock held.
- *
- * It guarantees also that a pi_state which was queued right before
- * the state change under tsk->pi_lock by a concurrent waiter must
- * be observed in exit_pi_state_list().
- */
- raw_spin_lock_irq(&tsk->pi_lock);
- tsk->futex_state = FUTEX_STATE_EXITING;
- raw_spin_unlock_irq(&tsk->pi_lock);
-}
-
-static void futex_cleanup_end(struct task_struct *tsk, int state)
-{
- /*
- * Lockless store. The only side effect is that an observer might
- * take another loop until it becomes visible.
- */
- tsk->futex_state = state;
- /*
- * Drop the exit protection. This unblocks waiters which observed
- * FUTEX_STATE_EXITING to reevaluate the state.
- */
- mutex_unlock(&tsk->futex_exit_mutex);
-}
-
-void futex_exec_release(struct task_struct *tsk)
-{
- /*
- * The state handling is done for consistency, but in the case of
- * exec() there is no way to prevent further damage as the PID stays
- * the same. But for the unlikely and arguably buggy case that a
- * futex is held on exec(), this provides at least as much state
- * consistency protection which is possible.
- */
- futex_cleanup_begin(tsk);
- futex_cleanup(tsk);
- /*
- * Reset the state to FUTEX_STATE_OK. The task is alive and about
- * exec a new binary.
- */
- futex_cleanup_end(tsk, FUTEX_STATE_OK);
-}
-
-void futex_exit_release(struct task_struct *tsk)
-{
- futex_cleanup_begin(tsk);
- futex_cleanup(tsk);
- futex_cleanup_end(tsk, FUTEX_STATE_DEAD);
-}
-
-long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
- u32 __user *uaddr2, u32 val2, u32 val3)
-{
- int cmd = op & FUTEX_CMD_MASK;
- unsigned int flags = 0;
-
- if (!(op & FUTEX_PRIVATE_FLAG))
- flags |= FLAGS_SHARED;
-
- if (op & FUTEX_CLOCK_REALTIME) {
- flags |= FLAGS_CLOCKRT;
- if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI &&
- cmd != FUTEX_LOCK_PI2)
- return -ENOSYS;
- }
-
- switch (cmd) {
- case FUTEX_LOCK_PI:
- case FUTEX_LOCK_PI2:
- case FUTEX_UNLOCK_PI:
- case FUTEX_TRYLOCK_PI:
- case FUTEX_WAIT_REQUEUE_PI:
- case FUTEX_CMP_REQUEUE_PI:
- if (!futex_cmpxchg_enabled)
- return -ENOSYS;
- }
-
- switch (cmd) {
- case FUTEX_WAIT:
- val3 = FUTEX_BITSET_MATCH_ANY;
- fallthrough;
- case FUTEX_WAIT_BITSET:
- return futex_wait(uaddr, flags, val, timeout, val3);
- case FUTEX_WAKE:
- val3 = FUTEX_BITSET_MATCH_ANY;
- fallthrough;
- case FUTEX_WAKE_BITSET:
- return futex_wake(uaddr, flags, val, val3);
- case FUTEX_REQUEUE:
- return futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0);
- case FUTEX_CMP_REQUEUE:
- return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0);
- case FUTEX_WAKE_OP:
- return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
- case FUTEX_LOCK_PI:
- flags |= FLAGS_CLOCKRT;
- fallthrough;
- case FUTEX_LOCK_PI2:
- return futex_lock_pi(uaddr, flags, timeout, 0);
- case FUTEX_UNLOCK_PI:
- return futex_unlock_pi(uaddr, flags);
- case FUTEX_TRYLOCK_PI:
- return futex_lock_pi(uaddr, flags, NULL, 1);
- case FUTEX_WAIT_REQUEUE_PI:
- val3 = FUTEX_BITSET_MATCH_ANY;
- return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
- uaddr2);
- case FUTEX_CMP_REQUEUE_PI:
- return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);
- }
- return -ENOSYS;
-}
-
-static __always_inline bool futex_cmd_has_timeout(u32 cmd)
-{
- switch (cmd) {
- case FUTEX_WAIT:
- case FUTEX_LOCK_PI:
- case FUTEX_LOCK_PI2:
- case FUTEX_WAIT_BITSET:
- case FUTEX_WAIT_REQUEUE_PI:
- return true;
- }
- return false;
-}
-
-static __always_inline int
-futex_init_timeout(u32 cmd, u32 op, struct timespec64 *ts, ktime_t *t)
-{
- if (!timespec64_valid(ts))
- return -EINVAL;
-
- *t = timespec64_to_ktime(*ts);
- if (cmd == FUTEX_WAIT)
- *t = ktime_add_safe(ktime_get(), *t);
- else if (cmd != FUTEX_LOCK_PI && !(op & FUTEX_CLOCK_REALTIME))
- *t = timens_ktime_to_host(CLOCK_MONOTONIC, *t);
- return 0;
-}
-
-SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
- const struct __kernel_timespec __user *, utime,
- u32 __user *, uaddr2, u32, val3)
-{
- int ret, cmd = op & FUTEX_CMD_MASK;
- ktime_t t, *tp = NULL;
- struct timespec64 ts;
-
- if (utime && futex_cmd_has_timeout(cmd)) {
- if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG))))
- return -EFAULT;
- if (get_timespec64(&ts, utime))
- return -EFAULT;
- ret = futex_init_timeout(cmd, op, &ts, &t);
- if (ret)
- return ret;
- tp = &t;
- }
-
- return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3);
-}
-
-#ifdef CONFIG_COMPAT
-/*
- * Fetch a robust-list pointer. Bit 0 signals PI futexes:
- */
-static inline int
-compat_fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
- compat_uptr_t __user *head, unsigned int *pi)
-{
- if (get_user(*uentry, head))
- return -EFAULT;
-
- *entry = compat_ptr((*uentry) & ~1);
- *pi = (unsigned int)(*uentry) & 1;
-
- return 0;
-}
-
-static void __user *futex_uaddr(struct robust_list __user *entry,
- compat_long_t futex_offset)
-{
- compat_uptr_t base = ptr_to_compat(entry);
- void __user *uaddr = compat_ptr(base + futex_offset);
-
- return uaddr;
-}
-
-/*
- * Walk curr->robust_list (very carefully, it's a userspace list!)
- * and mark any locks found there dead, and notify any waiters.
- *
- * We silently return on any sign of list-walking problem.
- */
-static void compat_exit_robust_list(struct task_struct *curr)
-{
- struct compat_robust_list_head __user *head = curr->compat_robust_list;
- struct robust_list __user *entry, *next_entry, *pending;
- unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
- unsigned int next_pi;
- compat_uptr_t uentry, next_uentry, upending;
- compat_long_t futex_offset;
- int rc;
-
- if (!futex_cmpxchg_enabled)
- return;
-
- /*
- * Fetch the list head (which was registered earlier, via
- * sys_set_robust_list()):
- */
- if (compat_fetch_robust_entry(&uentry, &entry, &head->list.next, &pi))
- return;
- /*
- * Fetch the relative futex offset:
- */
- if (get_user(futex_offset, &head->futex_offset))
- return;
- /*
- * Fetch any possibly pending lock-add first, and handle it
- * if it exists:
- */
- if (compat_fetch_robust_entry(&upending, &pending,
- &head->list_op_pending, &pip))
- return;
-
- next_entry = NULL; /* avoid warning with gcc */
- while (entry != (struct robust_list __user *) &head->list) {
- /*
- * Fetch the next entry in the list before calling
- * handle_futex_death:
- */
- rc = compat_fetch_robust_entry(&next_uentry, &next_entry,
- (compat_uptr_t __user *)&entry->next, &next_pi);
- /*
- * A pending lock might already be on the list, so
- * dont process it twice:
- */
- if (entry != pending) {
- void __user *uaddr = futex_uaddr(entry, futex_offset);
-
- if (handle_futex_death(uaddr, curr, pi,
- HANDLE_DEATH_LIST))
- return;
- }
- if (rc)
- return;
- uentry = next_uentry;
- entry = next_entry;
- pi = next_pi;
- /*
- * Avoid excessively long or circular lists:
- */
- if (!--limit)
- break;
-
- cond_resched();
- }
- if (pending) {
- void __user *uaddr = futex_uaddr(pending, futex_offset);
-
- handle_futex_death(uaddr, curr, pip, HANDLE_DEATH_PENDING);
- }
-}
-
-COMPAT_SYSCALL_DEFINE2(set_robust_list,
- struct compat_robust_list_head __user *, head,
- compat_size_t, len)
-{
- if (!futex_cmpxchg_enabled)
- return -ENOSYS;
-
- if (unlikely(len != sizeof(*head)))
- return -EINVAL;
-
- current->compat_robust_list = head;
-
- return 0;
-}
-
-COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid,
- compat_uptr_t __user *, head_ptr,
- compat_size_t __user *, len_ptr)
-{
- struct compat_robust_list_head __user *head;
- unsigned long ret;
- struct task_struct *p;
-
- if (!futex_cmpxchg_enabled)
- return -ENOSYS;
-
- rcu_read_lock();
-
- ret = -ESRCH;
- if (!pid)
- p = current;
- else {
- p = find_task_by_vpid(pid);
- if (!p)
- goto err_unlock;
- }
-
- ret = -EPERM;
- if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
- goto err_unlock;
-
- head = p->compat_robust_list;
- rcu_read_unlock();
-
- if (put_user(sizeof(*head), len_ptr))
- return -EFAULT;
- return put_user(ptr_to_compat(head), head_ptr);
-
-err_unlock:
- rcu_read_unlock();
-
- return ret;
-}
-#endif /* CONFIG_COMPAT */
-
-#ifdef CONFIG_COMPAT_32BIT_TIME
-SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val,
- const struct old_timespec32 __user *, utime, u32 __user *, uaddr2,
- u32, val3)
-{
- int ret, cmd = op & FUTEX_CMD_MASK;
- ktime_t t, *tp = NULL;
- struct timespec64 ts;
-
- if (utime && futex_cmd_has_timeout(cmd)) {
- if (get_old_timespec32(&ts, utime))
- return -EFAULT;
- ret = futex_init_timeout(cmd, op, &ts, &t);
- if (ret)
- return ret;
- tp = &t;
- }
-
- return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3);
-}
-#endif /* CONFIG_COMPAT_32BIT_TIME */
-
-static void __init futex_detect_cmpxchg(void)
-{
-#ifndef CONFIG_HAVE_FUTEX_CMPXCHG
- u32 curval;
-
- /*
- * This will fail and we want it. Some arch implementations do
- * runtime detection of the futex_atomic_cmpxchg_inatomic()
- * functionality. We want to know that before we call in any
- * of the complex code paths. Also we want to prevent
- * registration of robust lists in that case. NULL is
- * guaranteed to fault and we get -EFAULT on functional
- * implementation, the non-functional ones will return
- * -ENOSYS.
- */
- if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT)
- futex_cmpxchg_enabled = 1;
-#endif
-}
-
-static int __init futex_init(void)
-{
- unsigned int futex_shift;
- unsigned long i;
-
-#if CONFIG_BASE_SMALL
- futex_hashsize = 16;
-#else
- futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus());
-#endif
-
- futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues),
- futex_hashsize, 0,
- futex_hashsize < 256 ? HASH_SMALL : 0,
- &futex_shift, NULL,
- futex_hashsize, futex_hashsize);
- futex_hashsize = 1UL << futex_shift;
-
- futex_detect_cmpxchg();
-
- for (i = 0; i < futex_hashsize; i++) {
- atomic_set(&futex_queues[i].waiters, 0);
- plist_head_init(&futex_queues[i].chain);
- spin_lock_init(&futex_queues[i].lock);
- }
-
- return 0;
-}
-core_initcall(futex_init);
diff --git a/kernel/futex/Makefile b/kernel/futex/Makefile
new file mode 100644
index 000000000000..b77188d1fa07
--- /dev/null
+++ b/kernel/futex/Makefile
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0
+
+obj-y += core.o syscalls.o pi.o requeue.o waitwake.o
diff --git a/kernel/futex/core.c b/kernel/futex/core.c
new file mode 100644
index 000000000000..926c2bb752bc
--- /dev/null
+++ b/kernel/futex/core.c
@@ -0,0 +1,1141 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Fast Userspace Mutexes (which I call "Futexes!").
+ * (C) Rusty Russell, IBM 2002
+ *
+ * Generalized futexes, futex requeueing, misc fixes by Ingo Molnar
+ * (C) Copyright 2003 Red Hat Inc, All Rights Reserved
+ *
+ * Removed page pinning, fix privately mapped COW pages and other cleanups
+ * (C) Copyright 2003, 2004 Jamie Lokier
+ *
+ * Robust futex support started by Ingo Molnar
+ * (C) Copyright 2006 Red Hat Inc, All Rights Reserved
+ * Thanks to Thomas Gleixner for suggestions, analysis and fixes.
+ *
+ * PI-futex support started by Ingo Molnar and Thomas Gleixner
+ * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ *
+ * PRIVATE futexes by Eric Dumazet
+ * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
+ *
+ * Requeue-PI support by Darren Hart <dvhltc@us.ibm.com>
+ * Copyright (C) IBM Corporation, 2009
+ * Thanks to Thomas Gleixner for conceptual design and careful reviews.
+ *
+ * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
+ * enough at me, Linus for the original (flawed) idea, Matthew
+ * Kirkwood for proof-of-concept implementation.
+ *
+ * "The futexes are also cursed."
+ * "But they come in a choice of three flavours!"
+ */
+#include <linux/compat.h>
+#include <linux/jhash.h>
+#include <linux/pagemap.h>
+#include <linux/memblock.h>
+#include <linux/fault-inject.h>
+#include <linux/slab.h>
+
+#include "futex.h"
+#include "../locking/rtmutex_common.h"
+
+/*
+ * The base of the bucket array and its size are always used together
+ * (after initialization only in futex_hash()), so ensure that they
+ * reside in the same cacheline.
+ */
+static struct {
+ struct futex_hash_bucket *queues;
+ unsigned long hashsize;
+} __futex_data __read_mostly __aligned(2*sizeof(long));
+#define futex_queues (__futex_data.queues)
+#define futex_hashsize (__futex_data.hashsize)
+
+
+/*
+ * Fault injections for futexes.
+ */
+#ifdef CONFIG_FAIL_FUTEX
+
+static struct {
+ struct fault_attr attr;
+
+ bool ignore_private;
+} fail_futex = {
+ .attr = FAULT_ATTR_INITIALIZER,
+ .ignore_private = false,
+};
+
+static int __init setup_fail_futex(char *str)
+{
+ return setup_fault_attr(&fail_futex.attr, str);
+}
+__setup("fail_futex=", setup_fail_futex);
+
+bool should_fail_futex(bool fshared)
+{
+ if (fail_futex.ignore_private && !fshared)
+ return false;
+
+ return should_fail(&fail_futex.attr, 1);
+}
+
+#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
+
+static int __init fail_futex_debugfs(void)
+{
+ umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
+ struct dentry *dir;
+
+ dir = fault_create_debugfs_attr("fail_futex", NULL,
+ &fail_futex.attr);
+ if (IS_ERR(dir))
+ return PTR_ERR(dir);
+
+ debugfs_create_bool("ignore-private", mode, dir,
+ &fail_futex.ignore_private);
+ return 0;
+}
+
+late_initcall(fail_futex_debugfs);
+
+#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
+
+#endif /* CONFIG_FAIL_FUTEX */
+
+/**
+ * futex_hash - Return the hash bucket in the global hash
+ * @key: Pointer to the futex key for which the hash is calculated
+ *
+ * We hash on the keys returned from get_futex_key (see below) and return the
+ * corresponding hash bucket in the global hash.
+ */
+struct futex_hash_bucket *futex_hash(union futex_key *key)
+{
+ u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4,
+ key->both.offset);
+
+ return &futex_queues[hash & (futex_hashsize - 1)];
+}
+
+
+/**
+ * futex_setup_timer - set up the sleeping hrtimer.
+ * @time: ptr to the given timeout value
+ * @timeout: the hrtimer_sleeper structure to be set up
+ * @flags: futex flags
+ * @range_ns: optional range in ns
+ *
+ * Return: Initialized hrtimer_sleeper structure or NULL if no timeout
+ * value given
+ */
+struct hrtimer_sleeper *
+futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
+ int flags, u64 range_ns)
+{
+ if (!time)
+ return NULL;
+
+ hrtimer_init_sleeper_on_stack(timeout, (flags & FLAGS_CLOCKRT) ?
+ CLOCK_REALTIME : CLOCK_MONOTONIC,
+ HRTIMER_MODE_ABS);
+ /*
+ * If range_ns is 0, calling hrtimer_set_expires_range_ns() is
+ * effectively the same as calling hrtimer_set_expires().
+ */
+ hrtimer_set_expires_range_ns(&timeout->timer, *time, range_ns);
+
+ return timeout;
+}
+
+/*
+ * Generate a machine wide unique identifier for this inode.
+ *
+ * This relies on u64 not wrapping in the life-time of the machine; which with
+ * 1ns resolution means almost 585 years.
+ *
+ * This further relies on the fact that a well formed program will not unmap
+ * the file while it has a (shared) futex waiting on it. This mapping will have
+ * a file reference which pins the mount and inode.
+ *
+ * If for some reason an inode gets evicted and read back in again, it will get
+ * a new sequence number and will _NOT_ match, even though it is the exact same
+ * file.
+ *
+ * It is important that futex_match() will never have a false-positive, esp.
+ * for PI futexes that can mess up the state. The above argues that false-negatives
+ * are only possible for malformed programs.
+ */
+static u64 get_inode_sequence_number(struct inode *inode)
+{
+ static atomic64_t i_seq;
+ u64 old;
+
+ /* Does the inode already have a sequence number? */
+ old = atomic64_read(&inode->i_sequence);
+ if (likely(old))
+ return old;
+
+ for (;;) {
+ u64 new = atomic64_add_return(1, &i_seq);
+ if (WARN_ON_ONCE(!new))
+ continue;
+
+ old = atomic64_cmpxchg_relaxed(&inode->i_sequence, 0, new);
+ if (old)
+ return old;
+ return new;
+ }
+}
+
+/**
+ * get_futex_key() - Get parameters which are the keys for a futex
+ * @uaddr: virtual address of the futex
+ * @fshared: false for a PROCESS_PRIVATE futex, true for PROCESS_SHARED
+ * @key: address where result is stored.
+ * @rw: mapping needs to be read/write (values: FUTEX_READ,
+ * FUTEX_WRITE)
+ *
+ * Return: a negative error code or 0
+ *
+ * The key words are stored in @key on success.
+ *
+ * For shared mappings (when @fshared), the key is:
+ *
+ * ( inode->i_sequence, page->index, offset_within_page )
+ *
+ * [ also see get_inode_sequence_number() ]
+ *
+ * For private mappings (or when !@fshared), the key is:
+ *
+ * ( current->mm, address, 0 )
+ *
+ * This allows (cross process, where applicable) identification of the futex
+ * without keeping the page pinned for the duration of the FUTEX_WAIT.
+ *
+ * lock_page() might sleep, the caller should not hold a spinlock.
+ */
+int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key,
+ enum futex_access rw)
+{
+ unsigned long address = (unsigned long)uaddr;
+ struct mm_struct *mm = current->mm;
+ struct page *page, *tail;
+ struct address_space *mapping;
+ int err, ro = 0;
+
+ /*
+ * The futex address must be "naturally" aligned.
+ */
+ key->both.offset = address % PAGE_SIZE;
+ if (unlikely((address % sizeof(u32)) != 0))
+ return -EINVAL;
+ address -= key->both.offset;
+
+ if (unlikely(!access_ok(uaddr, sizeof(u32))))
+ return -EFAULT;
+
+ if (unlikely(should_fail_futex(fshared)))
+ return -EFAULT;
+
+ /*
+ * PROCESS_PRIVATE futexes are fast.
+ * As the mm cannot disappear under us and the 'key' only needs
+ * virtual address, we dont even have to find the underlying vma.
+ * Note : We do have to check 'uaddr' is a valid user address,
+ * but access_ok() should be faster than find_vma()
+ */
+ if (!fshared) {
+ key->private.mm = mm;
+ key->private.address = address;
+ return 0;
+ }
+
+again:
+ /* Ignore any VERIFY_READ mapping (futex common case) */
+ if (unlikely(should_fail_futex(true)))
+ return -EFAULT;
+
+ err = get_user_pages_fast(address, 1, FOLL_WRITE, &page);
+ /*
+ * If write access is not required (eg. FUTEX_WAIT), try
+ * and get read-only access.
+ */
+ if (err == -EFAULT && rw == FUTEX_READ) {
+ err = get_user_pages_fast(address, 1, 0, &page);
+ ro = 1;
+ }
+ if (err < 0)
+ return err;
+ else
+ err = 0;
+
+ /*
+ * The treatment of mapping from this point on is critical. The page
+ * lock protects many things but in this context the page lock
+ * stabilizes mapping, prevents inode freeing in the shared
+ * file-backed region case and guards against movement to swap cache.
+ *
+ * Strictly speaking the page lock is not needed in all cases being
+ * considered here and page lock forces unnecessarily serialization
+ * From this point on, mapping will be re-verified if necessary and
+ * page lock will be acquired only if it is unavoidable
+ *
+ * Mapping checks require the head page for any compound page so the
+ * head page and mapping is looked up now. For anonymous pages, it
+ * does not matter if the page splits in the future as the key is
+ * based on the address. For filesystem-backed pages, the tail is
+ * required as the index of the page determines the key. For
+ * base pages, there is no tail page and tail == page.
+ */
+ tail = page;
+ page = compound_head(page);
+ mapping = READ_ONCE(page->mapping);
+
+ /*
+ * If page->mapping is NULL, then it cannot be a PageAnon
+ * page; but it might be the ZERO_PAGE or in the gate area or
+ * in a special mapping (all cases which we are happy to fail);
+ * or it may have been a good file page when get_user_pages_fast
+ * found it, but truncated or holepunched or subjected to
+ * invalidate_complete_page2 before we got the page lock (also
+ * cases which we are happy to fail). And we hold a reference,
+ * so refcount care in invalidate_complete_page's remove_mapping
+ * prevents drop_caches from setting mapping to NULL beneath us.
+ *
+ * The case we do have to guard against is when memory pressure made
+ * shmem_writepage move it from filecache to swapcache beneath us:
+ * an unlikely race, but we do need to retry for page->mapping.
+ */
+ if (unlikely(!mapping)) {
+ int shmem_swizzled;
+
+ /*
+ * Page lock is required to identify which special case above
+ * applies. If this is really a shmem page then the page lock
+ * will prevent unexpected transitions.
+ */
+ lock_page(page);
+ shmem_swizzled = PageSwapCache(page) || page->mapping;
+ unlock_page(page);
+ put_page(page);
+
+ if (shmem_swizzled)
+ goto again;
+
+ return -EFAULT;
+ }
+
+ /*
+ * Private mappings are handled in a simple way.
+ *
+ * If the futex key is stored on an anonymous page, then the associated
+ * object is the mm which is implicitly pinned by the calling process.
+ *
+ * NOTE: When userspace waits on a MAP_SHARED mapping, even if
+ * it's a read-only handle, it's expected that futexes attach to
+ * the object not the particular process.
+ */
+ if (PageAnon(page)) {
+ /*
+ * A RO anonymous page will never change and thus doesn't make
+ * sense for futex operations.
+ */
+ if (unlikely(should_fail_futex(true)) || ro) {
+ err = -EFAULT;
+ goto out;
+ }
+
+ key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
+ key->private.mm = mm;
+ key->private.address = address;
+
+ } else {
+ struct inode *inode;
+
+ /*
+ * The associated futex object in this case is the inode and
+ * the page->mapping must be traversed. Ordinarily this should
+ * be stabilised under page lock but it's not strictly
+ * necessary in this case as we just want to pin the inode, not
+ * update the radix tree or anything like that.
+ *
+ * The RCU read lock is taken as the inode is finally freed
+ * under RCU. If the mapping still matches expectations then the
+ * mapping->host can be safely accessed as being a valid inode.
+ */
+ rcu_read_lock();
+
+ if (READ_ONCE(page->mapping) != mapping) {
+ rcu_read_unlock();
+ put_page(page);
+
+ goto again;
+ }
+
+ inode = READ_ONCE(mapping->host);
+ if (!inode) {
+ rcu_read_unlock();
+ put_page(page);
+
+ goto again;
+ }
+
+ key->both.offset |= FUT_OFF_INODE; /* inode-based key */
+ key->shared.i_seq = get_inode_sequence_number(inode);
+ key->shared.pgoff = page_to_pgoff(tail);
+ rcu_read_unlock();
+ }
+
+out:
+ put_page(page);
+ return err;
+}
+
+/**
+ * fault_in_user_writeable() - Fault in user address and verify RW access
+ * @uaddr: pointer to faulting user space address
+ *
+ * Slow path to fixup the fault we just took in the atomic write
+ * access to @uaddr.
+ *
+ * We have no generic implementation of a non-destructive write to the
+ * user address. We know that we faulted in the atomic pagefault
+ * disabled section so we can as well avoid the #PF overhead by
+ * calling get_user_pages() right away.
+ */
+int fault_in_user_writeable(u32 __user *uaddr)
+{
+ struct mm_struct *mm = current->mm;
+ int ret;
+
+ mmap_read_lock(mm);
+ ret = fixup_user_fault(mm, (unsigned long)uaddr,
+ FAULT_FLAG_WRITE, NULL);
+ mmap_read_unlock(mm);
+
+ return ret < 0 ? ret : 0;
+}
+
+/**
+ * futex_top_waiter() - Return the highest priority waiter on a futex
+ * @hb: the hash bucket the futex_q's reside in
+ * @key: the futex key (to distinguish it from other futex futex_q's)
+ *
+ * Must be called with the hb lock held.
+ */
+struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, union futex_key *key)
+{
+ struct futex_q *this;
+
+ plist_for_each_entry(this, &hb->chain, list) {
+ if (futex_match(&this->key, key))
+ return this;
+ }
+ return NULL;
+}
+
+int futex_cmpxchg_value_locked(u32 *curval, u32 __user *uaddr, u32 uval, u32 newval)
+{
+ int ret;
+
+ pagefault_disable();
+ ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval);
+ pagefault_enable();
+
+ return ret;
+}
+
+int futex_get_value_locked(u32 *dest, u32 __user *from)
+{
+ int ret;
+
+ pagefault_disable();
+ ret = __get_user(*dest, from);
+ pagefault_enable();
+
+ return ret ? -EFAULT : 0;
+}
+
+/**
+ * wait_for_owner_exiting - Block until the owner has exited
+ * @ret: owner's current futex lock status
+ * @exiting: Pointer to the exiting task
+ *
+ * Caller must hold a refcount on @exiting.
+ */
+void wait_for_owner_exiting(int ret, struct task_struct *exiting)
+{
+ if (ret != -EBUSY) {
+ WARN_ON_ONCE(exiting);
+ return;
+ }
+
+ if (WARN_ON_ONCE(ret == -EBUSY && !exiting))
+ return;
+
+ mutex_lock(&exiting->futex_exit_mutex);
+ /*
+ * No point in doing state checking here. If the waiter got here
+ * while the task was in exec()->exec_futex_release() then it can
+ * have any FUTEX_STATE_* value when the waiter has acquired the
+ * mutex. OK, if running, EXITING or DEAD if it reached exit()
+ * already. Highly unlikely and not a problem. Just one more round
+ * through the futex maze.
+ */
+ mutex_unlock(&exiting->futex_exit_mutex);
+
+ put_task_struct(exiting);
+}
+
+/**
+ * __futex_unqueue() - Remove the futex_q from its futex_hash_bucket
+ * @q: The futex_q to unqueue
+ *
+ * The q->lock_ptr must not be NULL and must be held by the caller.
+ */
+void __futex_unqueue(struct futex_q *q)
+{
+ struct futex_hash_bucket *hb;
+
+ if (WARN_ON_SMP(!q->lock_ptr) || WARN_ON(plist_node_empty(&q->list)))
+ return;
+ lockdep_assert_held(q->lock_ptr);
+
+ hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
+ plist_del(&q->list, &hb->chain);
+ futex_hb_waiters_dec(hb);
+}
+
+/* The key must be already stored in q->key. */
+struct futex_hash_bucket *futex_q_lock(struct futex_q *q)
+ __acquires(&hb->lock)
+{
+ struct futex_hash_bucket *hb;
+
+ hb = futex_hash(&q->key);
+
+ /*
+ * Increment the counter before taking the lock so that
+ * a potential waker won't miss a to-be-slept task that is
+ * waiting for the spinlock. This is safe as all futex_q_lock()
+ * users end up calling futex_queue(). Similarly, for housekeeping,
+ * decrement the counter at futex_q_unlock() when some error has
+ * occurred and we don't end up adding the task to the list.
+ */
+ futex_hb_waiters_inc(hb); /* implies smp_mb(); (A) */
+
+ q->lock_ptr = &hb->lock;
+
+ spin_lock(&hb->lock);
+ return hb;
+}
+
+void futex_q_unlock(struct futex_hash_bucket *hb)
+ __releases(&hb->lock)
+{
+ spin_unlock(&hb->lock);
+ futex_hb_waiters_dec(hb);
+}
+
+void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb)
+{
+ int prio;
+
+ /*
+ * The priority used to register this element is
+ * - either the real thread-priority for the real-time threads
+ * (i.e. threads with a priority lower than MAX_RT_PRIO)
+ * - or MAX_RT_PRIO for non-RT threads.
+ * Thus, all RT-threads are woken first in priority order, and
+ * the others are woken last, in FIFO order.
+ */
+ prio = min(current->normal_prio, MAX_RT_PRIO);
+
+ plist_node_init(&q->list, prio);
+ plist_add(&q->list, &hb->chain);
+ q->task = current;
+}
+
+/**
+ * futex_unqueue() - Remove the futex_q from its futex_hash_bucket
+ * @q: The futex_q to unqueue
+ *
+ * The q->lock_ptr must not be held by the caller. A call to futex_unqueue() must
+ * be paired with exactly one earlier call to futex_queue().
+ *
+ * Return:
+ * - 1 - if the futex_q was still queued (and we removed unqueued it);
+ * - 0 - if the futex_q was already removed by the waking thread
+ */
+int futex_unqueue(struct futex_q *q)
+{
+ spinlock_t *lock_ptr;
+ int ret = 0;
+
+ /* In the common case we don't take the spinlock, which is nice. */
+retry:
+ /*
+ * q->lock_ptr can change between this read and the following spin_lock.
+ * Use READ_ONCE to forbid the compiler from reloading q->lock_ptr and
+ * optimizing lock_ptr out of the logic below.
+ */
+ lock_ptr = READ_ONCE(q->lock_ptr);
+ if (lock_ptr != NULL) {
+ spin_lock(lock_ptr);
+ /*
+ * q->lock_ptr can change between reading it and
+ * spin_lock(), causing us to take the wrong lock. This
+ * corrects the race condition.
+ *
+ * Reasoning goes like this: if we have the wrong lock,
+ * q->lock_ptr must have changed (maybe several times)
+ * between reading it and the spin_lock(). It can
+ * change again after the spin_lock() but only if it was
+ * already changed before the spin_lock(). It cannot,
+ * however, change back to the original value. Therefore
+ * we can detect whether we acquired the correct lock.
+ */
+ if (unlikely(lock_ptr != q->lock_ptr)) {
+ spin_unlock(lock_ptr);
+ goto retry;
+ }
+ __futex_unqueue(q);
+
+ BUG_ON(q->pi_state);
+
+ spin_unlock(lock_ptr);
+ ret = 1;
+ }
+
+ return ret;
+}
+
+/*
+ * PI futexes can not be requeued and must remove themselves from the
+ * hash bucket. The hash bucket lock (i.e. lock_ptr) is held.
+ */
+void futex_unqueue_pi(struct futex_q *q)
+{
+ __futex_unqueue(q);
+
+ BUG_ON(!q->pi_state);
+ put_pi_state(q->pi_state);
+ q->pi_state = NULL;
+}
+
+/* Constants for the pending_op argument of handle_futex_death */
+#define HANDLE_DEATH_PENDING true
+#define HANDLE_DEATH_LIST false
+
+/*
+ * Process a futex-list entry, check whether it's owned by the
+ * dying task, and do notification if so:
+ */
+static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr,
+ bool pi, bool pending_op)
+{
+ u32 uval, nval, mval;
+ int err;
+
+ /* Futex address must be 32bit aligned */
+ if ((((unsigned long)uaddr) % sizeof(*uaddr)) != 0)
+ return -1;
+
+retry:
+ if (get_user(uval, uaddr))
+ return -1;
+
+ /*
+ * Special case for regular (non PI) futexes. The unlock path in
+ * user space has two race scenarios:
+ *
+ * 1. The unlock path releases the user space futex value and
+ * before it can execute the futex() syscall to wake up
+ * waiters it is killed.
+ *
+ * 2. A woken up waiter is killed before it can acquire the
+ * futex in user space.
+ *
+ * In both cases the TID validation below prevents a wakeup of
+ * potential waiters which can cause these waiters to block
+ * forever.
+ *
+ * In both cases the following conditions are met:
+ *
+ * 1) task->robust_list->list_op_pending != NULL
+ * @pending_op == true
+ * 2) User space futex value == 0
+ * 3) Regular futex: @pi == false
+ *
+ * If these conditions are met, it is safe to attempt waking up a
+ * potential waiter without touching the user space futex value and
+ * trying to set the OWNER_DIED bit. The user space futex value is
+ * uncontended and the rest of the user space mutex state is
+ * consistent, so a woken waiter will just take over the
+ * uncontended futex. Setting the OWNER_DIED bit would create
+ * inconsistent state and malfunction of the user space owner died
+ * handling.
+ */
+ if (pending_op && !pi && !uval) {
+ futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
+ return 0;
+ }
+
+ if ((uval & FUTEX_TID_MASK) != task_pid_vnr(curr))
+ return 0;
+
+ /*
+ * Ok, this dying thread is truly holding a futex
+ * of interest. Set the OWNER_DIED bit atomically
+ * via cmpxchg, and if the value had FUTEX_WAITERS
+ * set, wake up a waiter (if any). (We have to do a
+ * futex_wake() even if OWNER_DIED is already set -
+ * to handle the rare but possible case of recursive
+ * thread-death.) The rest of the cleanup is done in
+ * userspace.
+ */
+ mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
+
+ /*
+ * We are not holding a lock here, but we want to have
+ * the pagefault_disable/enable() protection because
+ * we want to handle the fault gracefully. If the
+ * access fails we try to fault in the futex with R/W
+ * verification via get_user_pages. get_user() above
+ * does not guarantee R/W access. If that fails we
+ * give up and leave the futex locked.
+ */
+ if ((err = futex_cmpxchg_value_locked(&nval, uaddr, uval, mval))) {
+ switch (err) {
+ case -EFAULT:
+ if (fault_in_user_writeable(uaddr))
+ return -1;
+ goto retry;
+
+ case -EAGAIN:
+ cond_resched();
+ goto retry;
+
+ default:
+ WARN_ON_ONCE(1);
+ return err;
+ }
+ }
+
+ if (nval != uval)
+ goto retry;
+
+ /*
+ * Wake robust non-PI futexes here. The wakeup of
+ * PI futexes happens in exit_pi_state():
+ */
+ if (!pi && (uval & FUTEX_WAITERS))
+ futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
+
+ return 0;
+}
+
+/*
+ * Fetch a robust-list pointer. Bit 0 signals PI futexes:
+ */
+static inline int fetch_robust_entry(struct robust_list __user **entry,
+ struct robust_list __user * __user *head,
+ unsigned int *pi)
+{
+ unsigned long uentry;
+
+ if (get_user(uentry, (unsigned long __user *)head))
+ return -EFAULT;
+
+ *entry = (void __user *)(uentry & ~1UL);
+ *pi = uentry & 1;
+
+ return 0;
+}
+
+/*
+ * Walk curr->robust_list (very carefully, it's a userspace list!)
+ * and mark any locks found there dead, and notify any waiters.
+ *
+ * We silently return on any sign of list-walking problem.
+ */
+static void exit_robust_list(struct task_struct *curr)
+{
+ struct robust_list_head __user *head = curr->robust_list;
+ struct robust_list __user *entry, *next_entry, *pending;
+ unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
+ unsigned int next_pi;
+ unsigned long futex_offset;
+ int rc;
+
+ /*
+ * Fetch the list head (which was registered earlier, via
+ * sys_set_robust_list()):
+ */
+ if (fetch_robust_entry(&entry, &head->list.next, &pi))
+ return;
+ /*
+ * Fetch the relative futex offset:
+ */
+ if (get_user(futex_offset, &head->futex_offset))
+ return;
+ /*
+ * Fetch any possibly pending lock-add first, and handle it
+ * if it exists:
+ */
+ if (fetch_robust_entry(&pending, &head->list_op_pending, &pip))
+ return;
+
+ next_entry = NULL; /* avoid warning with gcc */
+ while (entry != &head->list) {
+ /*
+ * Fetch the next entry in the list before calling
+ * handle_futex_death:
+ */
+ rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi);
+ /*
+ * A pending lock might already be on the list, so
+ * don't process it twice:
+ */
+ if (entry != pending) {
+ if (handle_futex_death((void __user *)entry + futex_offset,
+ curr, pi, HANDLE_DEATH_LIST))
+ return;
+ }
+ if (rc)
+ return;
+ entry = next_entry;
+ pi = next_pi;
+ /*
+ * Avoid excessively long or circular lists:
+ */
+ if (!--limit)
+ break;
+
+ cond_resched();
+ }
+
+ if (pending) {
+ handle_futex_death((void __user *)pending + futex_offset,
+ curr, pip, HANDLE_DEATH_PENDING);
+ }
+}
+
+#ifdef CONFIG_COMPAT
+static void __user *futex_uaddr(struct robust_list __user *entry,
+ compat_long_t futex_offset)
+{
+ compat_uptr_t base = ptr_to_compat(entry);
+ void __user *uaddr = compat_ptr(base + futex_offset);
+
+ return uaddr;
+}
+
+/*
+ * Fetch a robust-list pointer. Bit 0 signals PI futexes:
+ */
+static inline int
+compat_fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
+ compat_uptr_t __user *head, unsigned int *pi)
+{
+ if (get_user(*uentry, head))
+ return -EFAULT;
+
+ *entry = compat_ptr((*uentry) & ~1);
+ *pi = (unsigned int)(*uentry) & 1;
+
+ return 0;
+}
+
+/*
+ * Walk curr->robust_list (very carefully, it's a userspace list!)
+ * and mark any locks found there dead, and notify any waiters.
+ *
+ * We silently return on any sign of list-walking problem.
+ */
+static void compat_exit_robust_list(struct task_struct *curr)
+{
+ struct compat_robust_list_head __user *head = curr->compat_robust_list;
+ struct robust_list __user *entry, *next_entry, *pending;
+ unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
+ unsigned int next_pi;
+ compat_uptr_t uentry, next_uentry, upending;
+ compat_long_t futex_offset;
+ int rc;
+
+ /*
+ * Fetch the list head (which was registered earlier, via
+ * sys_set_robust_list()):
+ */
+ if (compat_fetch_robust_entry(&uentry, &entry, &head->list.next, &pi))
+ return;
+ /*
+ * Fetch the relative futex offset:
+ */
+ if (get_user(futex_offset, &head->futex_offset))
+ return;
+ /*
+ * Fetch any possibly pending lock-add first, and handle it
+ * if it exists:
+ */
+ if (compat_fetch_robust_entry(&upending, &pending,
+ &head->list_op_pending, &pip))
+ return;
+
+ next_entry = NULL; /* avoid warning with gcc */
+ while (entry != (struct robust_list __user *) &head->list) {
+ /*
+ * Fetch the next entry in the list before calling
+ * handle_futex_death:
+ */
+ rc = compat_fetch_robust_entry(&next_uentry, &next_entry,
+ (compat_uptr_t __user *)&entry->next, &next_pi);
+ /*
+ * A pending lock might already be on the list, so
+ * dont process it twice:
+ */
+ if (entry != pending) {
+ void __user *uaddr = futex_uaddr(entry, futex_offset);
+
+ if (handle_futex_death(uaddr, curr, pi,
+ HANDLE_DEATH_LIST))
+ return;
+ }
+ if (rc)
+ return;
+ uentry = next_uentry;
+ entry = next_entry;
+ pi = next_pi;
+ /*
+ * Avoid excessively long or circular lists:
+ */
+ if (!--limit)
+ break;
+
+ cond_resched();
+ }
+ if (pending) {
+ void __user *uaddr = futex_uaddr(pending, futex_offset);
+
+ handle_futex_death(uaddr, curr, pip, HANDLE_DEATH_PENDING);
+ }
+}
+#endif
+
+#ifdef CONFIG_FUTEX_PI
+
+/*
+ * This task is holding PI mutexes at exit time => bad.
+ * Kernel cleans up PI-state, but userspace is likely hosed.
+ * (Robust-futex cleanup is separate and might save the day for userspace.)
+ */
+static void exit_pi_state_list(struct task_struct *curr)
+{
+ struct list_head *next, *head = &curr->pi_state_list;
+ struct futex_pi_state *pi_state;
+ struct futex_hash_bucket *hb;
+ union futex_key key = FUTEX_KEY_INIT;
+
+ /*
+ * We are a ZOMBIE and nobody can enqueue itself on
+ * pi_state_list anymore, but we have to be careful
+ * versus waiters unqueueing themselves:
+ */
+ raw_spin_lock_irq(&curr->pi_lock);
+ while (!list_empty(head)) {
+ next = head->next;
+ pi_state = list_entry(next, struct futex_pi_state, list);
+ key = pi_state->key;
+ hb = futex_hash(&key);
+
+ /*
+ * We can race against put_pi_state() removing itself from the
+ * list (a waiter going away). put_pi_state() will first
+ * decrement the reference count and then modify the list, so
+ * its possible to see the list entry but fail this reference
+ * acquire.
+ *
+ * In that case; drop the locks to let put_pi_state() make
+ * progress and retry the loop.
+ */
+ if (!refcount_inc_not_zero(&pi_state->refcount)) {
+ raw_spin_unlock_irq(&curr->pi_lock);
+ cpu_relax();
+ raw_spin_lock_irq(&curr->pi_lock);
+ continue;
+ }
+ raw_spin_unlock_irq(&curr->pi_lock);
+
+ spin_lock(&hb->lock);
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+ raw_spin_lock(&curr->pi_lock);
+ /*
+ * We dropped the pi-lock, so re-check whether this
+ * task still owns the PI-state:
+ */
+ if (head->next != next) {
+ /* retain curr->pi_lock for the loop invariant */
+ raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
+ spin_unlock(&hb->lock);
+ put_pi_state(pi_state);
+ continue;
+ }
+
+ WARN_ON(pi_state->owner != curr);
+ WARN_ON(list_empty(&pi_state->list));
+ list_del_init(&pi_state->list);
+ pi_state->owner = NULL;
+
+ raw_spin_unlock(&curr->pi_lock);
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+ spin_unlock(&hb->lock);
+
+ rt_mutex_futex_unlock(&pi_state->pi_mutex);
+ put_pi_state(pi_state);
+
+ raw_spin_lock_irq(&curr->pi_lock);
+ }
+ raw_spin_unlock_irq(&curr->pi_lock);
+}
+#else
+static inline void exit_pi_state_list(struct task_struct *curr) { }
+#endif
+
+static void futex_cleanup(struct task_struct *tsk)
+{
+ if (unlikely(tsk->robust_list)) {
+ exit_robust_list(tsk);
+ tsk->robust_list = NULL;
+ }
+
+#ifdef CONFIG_COMPAT
+ if (unlikely(tsk->compat_robust_list)) {
+ compat_exit_robust_list(tsk);
+ tsk->compat_robust_list = NULL;
+ }
+#endif
+
+ if (unlikely(!list_empty(&tsk->pi_state_list)))
+ exit_pi_state_list(tsk);
+}
+
+/**
+ * futex_exit_recursive - Set the tasks futex state to FUTEX_STATE_DEAD
+ * @tsk: task to set the state on
+ *
+ * Set the futex exit state of the task lockless. The futex waiter code
+ * observes that state when a task is exiting and loops until the task has
+ * actually finished the futex cleanup. The worst case for this is that the
+ * waiter runs through the wait loop until the state becomes visible.
+ *
+ * This is called from the recursive fault handling path in do_exit().
+ *
+ * This is best effort. Either the futex exit code has run already or
+ * not. If the OWNER_DIED bit has been set on the futex then the waiter can
+ * take it over. If not, the problem is pushed back to user space. If the
+ * futex exit code did not run yet, then an already queued waiter might
+ * block forever, but there is nothing which can be done about that.
+ */
+void futex_exit_recursive(struct task_struct *tsk)
+{
+ /* If the state is FUTEX_STATE_EXITING then futex_exit_mutex is held */
+ if (tsk->futex_state == FUTEX_STATE_EXITING)
+ mutex_unlock(&tsk->futex_exit_mutex);
+ tsk->futex_state = FUTEX_STATE_DEAD;
+}
+
+static void futex_cleanup_begin(struct task_struct *tsk)
+{
+ /*
+ * Prevent various race issues against a concurrent incoming waiter
+ * including live locks by forcing the waiter to block on
+ * tsk->futex_exit_mutex when it observes FUTEX_STATE_EXITING in
+ * attach_to_pi_owner().
+ */
+ mutex_lock(&tsk->futex_exit_mutex);
+
+ /*
+ * Switch the state to FUTEX_STATE_EXITING under tsk->pi_lock.
+ *
+ * This ensures that all subsequent checks of tsk->futex_state in
+ * attach_to_pi_owner() must observe FUTEX_STATE_EXITING with
+ * tsk->pi_lock held.
+ *
+ * It guarantees also that a pi_state which was queued right before
+ * the state change under tsk->pi_lock by a concurrent waiter must
+ * be observed in exit_pi_state_list().
+ */
+ raw_spin_lock_irq(&tsk->pi_lock);
+ tsk->futex_state = FUTEX_STATE_EXITING;
+ raw_spin_unlock_irq(&tsk->pi_lock);
+}
+
+static void futex_cleanup_end(struct task_struct *tsk, int state)
+{
+ /*
+ * Lockless store. The only side effect is that an observer might
+ * take another loop until it becomes visible.
+ */
+ tsk->futex_state = state;
+ /*
+ * Drop the exit protection. This unblocks waiters which observed
+ * FUTEX_STATE_EXITING to reevaluate the state.
+ */
+ mutex_unlock(&tsk->futex_exit_mutex);
+}
+
+void futex_exec_release(struct task_struct *tsk)
+{
+ /*
+ * The state handling is done for consistency, but in the case of
+ * exec() there is no way to prevent further damage as the PID stays
+ * the same. But for the unlikely and arguably buggy case that a
+ * futex is held on exec(), this provides at least as much state
+ * consistency protection which is possible.
+ */
+ futex_cleanup_begin(tsk);
+ futex_cleanup(tsk);
+ /*
+ * Reset the state to FUTEX_STATE_OK. The task is alive and about
+ * exec a new binary.
+ */
+ futex_cleanup_end(tsk, FUTEX_STATE_OK);
+}
+
+void futex_exit_release(struct task_struct *tsk)
+{
+ futex_cleanup_begin(tsk);
+ futex_cleanup(tsk);
+ futex_cleanup_end(tsk, FUTEX_STATE_DEAD);
+}
+
+static int __init futex_init(void)
+{
+ unsigned int futex_shift;
+ unsigned long i;
+
+#if CONFIG_BASE_SMALL
+ futex_hashsize = 16;
+#else
+ futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus());
+#endif
+
+ futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues),
+ futex_hashsize, 0,
+ futex_hashsize < 256 ? HASH_SMALL : 0,
+ &futex_shift, NULL,
+ futex_hashsize, futex_hashsize);
+ futex_hashsize = 1UL << futex_shift;
+
+ for (i = 0; i < futex_hashsize; i++) {
+ atomic_set(&futex_queues[i].waiters, 0);
+ plist_head_init(&futex_queues[i].chain);
+ spin_lock_init(&futex_queues[i].lock);
+ }
+
+ return 0;
+}
+core_initcall(futex_init);
diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h
new file mode 100644
index 000000000000..c264cbeab71c
--- /dev/null
+++ b/kernel/futex/futex.h
@@ -0,0 +1,293 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _FUTEX_H
+#define _FUTEX_H
+
+#include <linux/futex.h>
+#include <linux/sched/wake_q.h>
+
+#ifdef CONFIG_PREEMPT_RT
+#include <linux/rcuwait.h>
+#endif
+
+#include <asm/futex.h>
+
+/*
+ * Futex flags used to encode options to functions and preserve them across
+ * restarts.
+ */
+#ifdef CONFIG_MMU
+# define FLAGS_SHARED 0x01
+#else
+/*
+ * NOMMU does not have per process address space. Let the compiler optimize
+ * code away.
+ */
+# define FLAGS_SHARED 0x00
+#endif
+#define FLAGS_CLOCKRT 0x02
+#define FLAGS_HAS_TIMEOUT 0x04
+
+#ifdef CONFIG_FAIL_FUTEX
+extern bool should_fail_futex(bool fshared);
+#else
+static inline bool should_fail_futex(bool fshared)
+{
+ return false;
+}
+#endif
+
+/*
+ * Hash buckets are shared by all the futex_keys that hash to the same
+ * location. Each key may have multiple futex_q structures, one for each task
+ * waiting on a futex.
+ */
+struct futex_hash_bucket {
+ atomic_t waiters;
+ spinlock_t lock;
+ struct plist_head chain;
+} ____cacheline_aligned_in_smp;
+
+/*
+ * Priority Inheritance state:
+ */
+struct futex_pi_state {
+ /*
+ * list of 'owned' pi_state instances - these have to be
+ * cleaned up in do_exit() if the task exits prematurely:
+ */
+ struct list_head list;
+
+ /*
+ * The PI object:
+ */
+ struct rt_mutex_base pi_mutex;
+
+ struct task_struct *owner;
+ refcount_t refcount;
+
+ union futex_key key;
+} __randomize_layout;
+
+/**
+ * struct futex_q - The hashed futex queue entry, one per waiting task
+ * @list: priority-sorted list of tasks waiting on this futex
+ * @task: the task waiting on the futex
+ * @lock_ptr: the hash bucket lock
+ * @key: the key the futex is hashed on
+ * @pi_state: optional priority inheritance state
+ * @rt_waiter: rt_waiter storage for use with requeue_pi
+ * @requeue_pi_key: the requeue_pi target futex key
+ * @bitset: bitset for the optional bitmasked wakeup
+ * @requeue_state: State field for futex_requeue_pi()
+ * @requeue_wait: RCU wait for futex_requeue_pi() (RT only)
+ *
+ * We use this hashed waitqueue, instead of a normal wait_queue_entry_t, so
+ * we can wake only the relevant ones (hashed queues may be shared).
+ *
+ * A futex_q has a woken state, just like tasks have TASK_RUNNING.
+ * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
+ * The order of wakeup is always to make the first condition true, then
+ * the second.
+ *
+ * PI futexes are typically woken before they are removed from the hash list via
+ * the rt_mutex code. See futex_unqueue_pi().
+ */
+struct futex_q {
+ struct plist_node list;
+
+ struct task_struct *task;
+ spinlock_t *lock_ptr;
+ union futex_key key;
+ struct futex_pi_state *pi_state;
+ struct rt_mutex_waiter *rt_waiter;
+ union futex_key *requeue_pi_key;
+ u32 bitset;
+ atomic_t requeue_state;
+#ifdef CONFIG_PREEMPT_RT
+ struct rcuwait requeue_wait;
+#endif
+} __randomize_layout;
+
+extern const struct futex_q futex_q_init;
+
+enum futex_access {
+ FUTEX_READ,
+ FUTEX_WRITE
+};
+
+extern int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key,
+ enum futex_access rw);
+
+extern struct hrtimer_sleeper *
+futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
+ int flags, u64 range_ns);
+
+extern struct futex_hash_bucket *futex_hash(union futex_key *key);
+
+/**
+ * futex_match - Check whether two futex keys are equal
+ * @key1: Pointer to key1
+ * @key2: Pointer to key2
+ *
+ * Return 1 if two futex_keys are equal, 0 otherwise.
+ */
+static inline int futex_match(union futex_key *key1, union futex_key *key2)
+{
+ return (key1 && key2
+ && key1->both.word == key2->both.word
+ && key1->both.ptr == key2->both.ptr
+ && key1->both.offset == key2->both.offset);
+}
+
+extern int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
+ struct futex_q *q, struct futex_hash_bucket **hb);
+extern void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q,
+ struct hrtimer_sleeper *timeout);
+extern void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q);
+
+extern int fault_in_user_writeable(u32 __user *uaddr);
+extern int futex_cmpxchg_value_locked(u32 *curval, u32 __user *uaddr, u32 uval, u32 newval);
+extern int futex_get_value_locked(u32 *dest, u32 __user *from);
+extern struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, union futex_key *key);
+
+extern void __futex_unqueue(struct futex_q *q);
+extern void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb);
+extern int futex_unqueue(struct futex_q *q);
+
+/**
+ * futex_queue() - Enqueue the futex_q on the futex_hash_bucket
+ * @q: The futex_q to enqueue
+ * @hb: The destination hash bucket
+ *
+ * The hb->lock must be held by the caller, and is released here. A call to
+ * futex_queue() is typically paired with exactly one call to futex_unqueue(). The
+ * exceptions involve the PI related operations, which may use futex_unqueue_pi()
+ * or nothing if the unqueue is done as part of the wake process and the unqueue
+ * state is implicit in the state of woken task (see futex_wait_requeue_pi() for
+ * an example).
+ */
+static inline void futex_queue(struct futex_q *q, struct futex_hash_bucket *hb)
+ __releases(&hb->lock)
+{
+ __futex_queue(q, hb);
+ spin_unlock(&hb->lock);
+}
+
+extern void futex_unqueue_pi(struct futex_q *q);
+
+extern void wait_for_owner_exiting(int ret, struct task_struct *exiting);
+
+/*
+ * Reflects a new waiter being added to the waitqueue.
+ */
+static inline void futex_hb_waiters_inc(struct futex_hash_bucket *hb)
+{
+#ifdef CONFIG_SMP
+ atomic_inc(&hb->waiters);
+ /*
+ * Full barrier (A), see the ordering comment above.
+ */
+ smp_mb__after_atomic();
+#endif
+}
+
+/*
+ * Reflects a waiter being removed from the waitqueue by wakeup
+ * paths.
+ */
+static inline void futex_hb_waiters_dec(struct futex_hash_bucket *hb)
+{
+#ifdef CONFIG_SMP
+ atomic_dec(&hb->waiters);
+#endif
+}
+
+static inline int futex_hb_waiters_pending(struct futex_hash_bucket *hb)
+{
+#ifdef CONFIG_SMP
+ /*
+ * Full barrier (B), see the ordering comment above.
+ */
+ smp_mb();
+ return atomic_read(&hb->waiters);
+#else
+ return 1;
+#endif
+}
+
+extern struct futex_hash_bucket *futex_q_lock(struct futex_q *q);
+extern void futex_q_unlock(struct futex_hash_bucket *hb);
+
+
+extern int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
+ union futex_key *key,
+ struct futex_pi_state **ps,
+ struct task_struct *task,
+ struct task_struct **exiting,
+ int set_waiters);
+
+extern int refill_pi_state_cache(void);
+extern void get_pi_state(struct futex_pi_state *pi_state);
+extern void put_pi_state(struct futex_pi_state *pi_state);
+extern int fixup_pi_owner(u32 __user *uaddr, struct futex_q *q, int locked);
+
+/*
+ * Express the locking dependencies for lockdep:
+ */
+static inline void
+double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
+{
+ if (hb1 > hb2)
+ swap(hb1, hb2);
+
+ spin_lock(&hb1->lock);
+ if (hb1 != hb2)
+ spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING);
+}
+
+static inline void
+double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
+{
+ spin_unlock(&hb1->lock);
+ if (hb1 != hb2)
+ spin_unlock(&hb2->lock);
+}
+
+/* syscalls */
+
+extern int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, u32
+ val, ktime_t *abs_time, u32 bitset, u32 __user
+ *uaddr2);
+
+extern int futex_requeue(u32 __user *uaddr1, unsigned int flags,
+ u32 __user *uaddr2, int nr_wake, int nr_requeue,
+ u32 *cmpval, int requeue_pi);
+
+extern int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
+ ktime_t *abs_time, u32 bitset);
+
+/**
+ * struct futex_vector - Auxiliary struct for futex_waitv()
+ * @w: Userspace provided data
+ * @q: Kernel side data
+ *
+ * Struct used to build an array with all data need for futex_waitv()
+ */
+struct futex_vector {
+ struct futex_waitv w;
+ struct futex_q q;
+};
+
+extern int futex_wait_multiple(struct futex_vector *vs, unsigned int count,
+ struct hrtimer_sleeper *to);
+
+extern int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset);
+
+extern int futex_wake_op(u32 __user *uaddr1, unsigned int flags,
+ u32 __user *uaddr2, int nr_wake, int nr_wake2, int op);
+
+extern int futex_unlock_pi(u32 __user *uaddr, unsigned int flags);
+
+extern int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock);
+
+#endif /* _FUTEX_H */
diff --git a/kernel/futex/pi.c b/kernel/futex/pi.c
new file mode 100644
index 000000000000..183b28c32c83
--- /dev/null
+++ b/kernel/futex/pi.c
@@ -0,0 +1,1233 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <linux/slab.h>
+#include <linux/sched/task.h>
+
+#include "futex.h"
+#include "../locking/rtmutex_common.h"
+
+/*
+ * PI code:
+ */
+int refill_pi_state_cache(void)
+{
+ struct futex_pi_state *pi_state;
+
+ if (likely(current->pi_state_cache))
+ return 0;
+
+ pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);
+
+ if (!pi_state)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&pi_state->list);
+ /* pi_mutex gets initialized later */
+ pi_state->owner = NULL;
+ refcount_set(&pi_state->refcount, 1);
+ pi_state->key = FUTEX_KEY_INIT;
+
+ current->pi_state_cache = pi_state;
+
+ return 0;
+}
+
+static struct futex_pi_state *alloc_pi_state(void)
+{
+ struct futex_pi_state *pi_state = current->pi_state_cache;
+
+ WARN_ON(!pi_state);
+ current->pi_state_cache = NULL;
+
+ return pi_state;
+}
+
+static void pi_state_update_owner(struct futex_pi_state *pi_state,
+ struct task_struct *new_owner)
+{
+ struct task_struct *old_owner = pi_state->owner;
+
+ lockdep_assert_held(&pi_state->pi_mutex.wait_lock);
+
+ if (old_owner) {
+ raw_spin_lock(&old_owner->pi_lock);
+ WARN_ON(list_empty(&pi_state->list));
+ list_del_init(&pi_state->list);
+ raw_spin_unlock(&old_owner->pi_lock);
+ }
+
+ if (new_owner) {
+ raw_spin_lock(&new_owner->pi_lock);
+ WARN_ON(!list_empty(&pi_state->list));
+ list_add(&pi_state->list, &new_owner->pi_state_list);
+ pi_state->owner = new_owner;
+ raw_spin_unlock(&new_owner->pi_lock);
+ }
+}
+
+void get_pi_state(struct futex_pi_state *pi_state)
+{
+ WARN_ON_ONCE(!refcount_inc_not_zero(&pi_state->refcount));
+}
+
+/*
+ * Drops a reference to the pi_state object and frees or caches it
+ * when the last reference is gone.
+ */
+void put_pi_state(struct futex_pi_state *pi_state)
+{
+ if (!pi_state)
+ return;
+
+ if (!refcount_dec_and_test(&pi_state->refcount))
+ return;
+
+ /*
+ * If pi_state->owner is NULL, the owner is most probably dying
+ * and has cleaned up the pi_state already
+ */
+ if (pi_state->owner) {
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&pi_state->pi_mutex.wait_lock, flags);
+ pi_state_update_owner(pi_state, NULL);
+ rt_mutex_proxy_unlock(&pi_state->pi_mutex);
+ raw_spin_unlock_irqrestore(&pi_state->pi_mutex.wait_lock, flags);
+ }
+
+ if (current->pi_state_cache) {
+ kfree(pi_state);
+ } else {
+ /*
+ * pi_state->list is already empty.
+ * clear pi_state->owner.
+ * refcount is at 0 - put it back to 1.
+ */
+ pi_state->owner = NULL;
+ refcount_set(&pi_state->refcount, 1);
+ current->pi_state_cache = pi_state;
+ }
+}
+
+/*
+ * We need to check the following states:
+ *
+ * Waiter | pi_state | pi->owner | uTID | uODIED | ?
+ *
+ * [1] NULL | --- | --- | 0 | 0/1 | Valid
+ * [2] NULL | --- | --- | >0 | 0/1 | Valid
+ *
+ * [3] Found | NULL | -- | Any | 0/1 | Invalid
+ *
+ * [4] Found | Found | NULL | 0 | 1 | Valid
+ * [5] Found | Found | NULL | >0 | 1 | Invalid
+ *
+ * [6] Found | Found | task | 0 | 1 | Valid
+ *
+ * [7] Found | Found | NULL | Any | 0 | Invalid
+ *
+ * [8] Found | Found | task | ==taskTID | 0/1 | Valid
+ * [9] Found | Found | task | 0 | 0 | Invalid
+ * [10] Found | Found | task | !=taskTID | 0/1 | Invalid
+ *
+ * [1] Indicates that the kernel can acquire the futex atomically. We
+ * came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit.
+ *
+ * [2] Valid, if TID does not belong to a kernel thread. If no matching
+ * thread is found then it indicates that the owner TID has died.
+ *
+ * [3] Invalid. The waiter is queued on a non PI futex
+ *
+ * [4] Valid state after exit_robust_list(), which sets the user space
+ * value to FUTEX_WAITERS | FUTEX_OWNER_DIED.
+ *
+ * [5] The user space value got manipulated between exit_robust_list()
+ * and exit_pi_state_list()
+ *
+ * [6] Valid state after exit_pi_state_list() which sets the new owner in
+ * the pi_state but cannot access the user space value.
+ *
+ * [7] pi_state->owner can only be NULL when the OWNER_DIED bit is set.
+ *
+ * [8] Owner and user space value match
+ *
+ * [9] There is no transient state which sets the user space TID to 0
+ * except exit_robust_list(), but this is indicated by the
+ * FUTEX_OWNER_DIED bit. See [4]
+ *
+ * [10] There is no transient state which leaves owner and user space
+ * TID out of sync. Except one error case where the kernel is denied
+ * write access to the user address, see fixup_pi_state_owner().
+ *
+ *
+ * Serialization and lifetime rules:
+ *
+ * hb->lock:
+ *
+ * hb -> futex_q, relation
+ * futex_q -> pi_state, relation
+ *
+ * (cannot be raw because hb can contain arbitrary amount
+ * of futex_q's)
+ *
+ * pi_mutex->wait_lock:
+ *
+ * {uval, pi_state}
+ *
+ * (and pi_mutex 'obviously')
+ *
+ * p->pi_lock:
+ *
+ * p->pi_state_list -> pi_state->list, relation
+ * pi_mutex->owner -> pi_state->owner, relation
+ *
+ * pi_state->refcount:
+ *
+ * pi_state lifetime
+ *
+ *
+ * Lock order:
+ *
+ * hb->lock
+ * pi_mutex->wait_lock
+ * p->pi_lock
+ *
+ */
+
+/*
+ * Validate that the existing waiter has a pi_state and sanity check
+ * the pi_state against the user space value. If correct, attach to
+ * it.
+ */
+static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
+ struct futex_pi_state *pi_state,
+ struct futex_pi_state **ps)
+{
+ pid_t pid = uval & FUTEX_TID_MASK;
+ u32 uval2;
+ int ret;
+
+ /*
+ * Userspace might have messed up non-PI and PI futexes [3]
+ */
+ if (unlikely(!pi_state))
+ return -EINVAL;
+
+ /*
+ * We get here with hb->lock held, and having found a
+ * futex_top_waiter(). This means that futex_lock_pi() of said futex_q
+ * has dropped the hb->lock in between futex_queue() and futex_unqueue_pi(),
+ * which in turn means that futex_lock_pi() still has a reference on
+ * our pi_state.
+ *
+ * The waiter holding a reference on @pi_state also protects against
+ * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi()
+ * and futex_wait_requeue_pi() as it cannot go to 0 and consequently
+ * free pi_state before we can take a reference ourselves.
+ */
+ WARN_ON(!refcount_read(&pi_state->refcount));
+
+ /*
+ * Now that we have a pi_state, we can acquire wait_lock
+ * and do the state validation.
+ */
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+
+ /*
+ * Since {uval, pi_state} is serialized by wait_lock, and our current
+ * uval was read without holding it, it can have changed. Verify it
+ * still is what we expect it to be, otherwise retry the entire
+ * operation.
+ */
+ if (futex_get_value_locked(&uval2, uaddr))
+ goto out_efault;
+
+ if (uval != uval2)
+ goto out_eagain;
+
+ /*
+ * Handle the owner died case:
+ */
+ if (uval & FUTEX_OWNER_DIED) {
+ /*
+ * exit_pi_state_list sets owner to NULL and wakes the
+ * topmost waiter. The task which acquires the
+ * pi_state->rt_mutex will fixup owner.
+ */
+ if (!pi_state->owner) {
+ /*
+ * No pi state owner, but the user space TID
+ * is not 0. Inconsistent state. [5]
+ */
+ if (pid)
+ goto out_einval;
+ /*
+ * Take a ref on the state and return success. [4]
+ */
+ goto out_attach;
+ }
+
+ /*
+ * If TID is 0, then either the dying owner has not
+ * yet executed exit_pi_state_list() or some waiter
+ * acquired the rtmutex in the pi state, but did not
+ * yet fixup the TID in user space.
+ *
+ * Take a ref on the state and return success. [6]
+ */
+ if (!pid)
+ goto out_attach;
+ } else {
+ /*
+ * If the owner died bit is not set, then the pi_state
+ * must have an owner. [7]
+ */
+ if (!pi_state->owner)
+ goto out_einval;
+ }
+
+ /*
+ * Bail out if user space manipulated the futex value. If pi
+ * state exists then the owner TID must be the same as the
+ * user space TID. [9/10]
+ */
+ if (pid != task_pid_vnr(pi_state->owner))
+ goto out_einval;
+
+out_attach:
+ get_pi_state(pi_state);
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+ *ps = pi_state;
+ return 0;
+
+out_einval:
+ ret = -EINVAL;
+ goto out_error;
+
+out_eagain:
+ ret = -EAGAIN;
+ goto out_error;
+
+out_efault:
+ ret = -EFAULT;
+ goto out_error;
+
+out_error:
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+ return ret;
+}
+
+static int handle_exit_race(u32 __user *uaddr, u32 uval,
+ struct task_struct *tsk)
+{
+ u32 uval2;
+
+ /*
+ * If the futex exit state is not yet FUTEX_STATE_DEAD, tell the
+ * caller that the alleged owner is busy.
+ */
+ if (tsk && tsk->futex_state != FUTEX_STATE_DEAD)
+ return -EBUSY;
+
+ /*
+ * Reread the user space value to handle the following situation:
+ *
+ * CPU0 CPU1
+ *
+ * sys_exit() sys_futex()
+ * do_exit() futex_lock_pi()
+ * futex_lock_pi_atomic()
+ * exit_signals(tsk) No waiters:
+ * tsk->flags |= PF_EXITING; *uaddr == 0x00000PID
+ * mm_release(tsk) Set waiter bit
+ * exit_robust_list(tsk) { *uaddr = 0x80000PID;
+ * Set owner died attach_to_pi_owner() {
+ * *uaddr = 0xC0000000; tsk = get_task(PID);
+ * } if (!tsk->flags & PF_EXITING) {
+ * ... attach();
+ * tsk->futex_state = } else {
+ * FUTEX_STATE_DEAD; if (tsk->futex_state !=
+ * FUTEX_STATE_DEAD)
+ * return -EAGAIN;
+ * return -ESRCH; <--- FAIL
+ * }
+ *
+ * Returning ESRCH unconditionally is wrong here because the
+ * user space value has been changed by the exiting task.
+ *
+ * The same logic applies to the case where the exiting task is
+ * already gone.
+ */
+ if (futex_get_value_locked(&uval2, uaddr))
+ return -EFAULT;
+
+ /* If the user space value has changed, try again. */
+ if (uval2 != uval)
+ return -EAGAIN;
+
+ /*
+ * The exiting task did not have a robust list, the robust list was
+ * corrupted or the user space value in *uaddr is simply bogus.
+ * Give up and tell user space.
+ */
+ return -ESRCH;
+}
+
+static void __attach_to_pi_owner(struct task_struct *p, union futex_key *key,
+ struct futex_pi_state **ps)
+{
+ /*
+ * No existing pi state. First waiter. [2]
+ *
+ * This creates pi_state, we have hb->lock held, this means nothing can
+ * observe this state, wait_lock is irrelevant.
+ */
+ struct futex_pi_state *pi_state = alloc_pi_state();
+
+ /*
+ * Initialize the pi_mutex in locked state and make @p
+ * the owner of it:
+ */
+ rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
+
+ /* Store the key for possible exit cleanups: */
+ pi_state->key = *key;
+
+ WARN_ON(!list_empty(&pi_state->list));
+ list_add(&pi_state->list, &p->pi_state_list);
+ /*
+ * Assignment without holding pi_state->pi_mutex.wait_lock is safe
+ * because there is no concurrency as the object is not published yet.
+ */
+ pi_state->owner = p;
+
+ *ps = pi_state;
+}
+/*
+ * Lookup the task for the TID provided from user space and attach to
+ * it after doing proper sanity checks.
+ */
+static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
+ struct futex_pi_state **ps,
+ struct task_struct **exiting)
+{
+ pid_t pid = uval & FUTEX_TID_MASK;
+ struct task_struct *p;
+
+ /*
+ * We are the first waiter - try to look up the real owner and attach
+ * the new pi_state to it, but bail out when TID = 0 [1]
+ *
+ * The !pid check is paranoid. None of the call sites should end up
+ * with pid == 0, but better safe than sorry. Let the caller retry
+ */
+ if (!pid)
+ return -EAGAIN;
+ p = find_get_task_by_vpid(pid);
+ if (!p)
+ return handle_exit_race(uaddr, uval, NULL);
+
+ if (unlikely(p->flags & PF_KTHREAD)) {
+ put_task_struct(p);
+ return -EPERM;
+ }
+
+ /*
+ * We need to look at the task state to figure out, whether the
+ * task is exiting. To protect against the change of the task state
+ * in futex_exit_release(), we do this protected by p->pi_lock:
+ */
+ raw_spin_lock_irq(&p->pi_lock);
+ if (unlikely(p->futex_state != FUTEX_STATE_OK)) {
+ /*
+ * The task is on the way out. When the futex state is
+ * FUTEX_STATE_DEAD, we know that the task has finished
+ * the cleanup:
+ */
+ int ret = handle_exit_race(uaddr, uval, p);
+
+ raw_spin_unlock_irq(&p->pi_lock);
+ /*
+ * If the owner task is between FUTEX_STATE_EXITING and
+ * FUTEX_STATE_DEAD then store the task pointer and keep
+ * the reference on the task struct. The calling code will
+ * drop all locks, wait for the task to reach
+ * FUTEX_STATE_DEAD and then drop the refcount. This is
+ * required to prevent a live lock when the current task
+ * preempted the exiting task between the two states.
+ */
+ if (ret == -EBUSY)
+ *exiting = p;
+ else
+ put_task_struct(p);
+ return ret;
+ }
+
+ __attach_to_pi_owner(p, key, ps);
+ raw_spin_unlock_irq(&p->pi_lock);
+
+ put_task_struct(p);
+
+ return 0;
+}
+
+static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
+{
+ int err;
+ u32 curval;
+
+ if (unlikely(should_fail_futex(true)))
+ return -EFAULT;
+
+ err = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
+ if (unlikely(err))
+ return err;
+
+ /* If user space value changed, let the caller retry */
+ return curval != uval ? -EAGAIN : 0;
+}
+
+/**
+ * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
+ * @uaddr: the pi futex user address
+ * @hb: the pi futex hash bucket
+ * @key: the futex key associated with uaddr and hb
+ * @ps: the pi_state pointer where we store the result of the
+ * lookup
+ * @task: the task to perform the atomic lock work for. This will
+ * be "current" except in the case of requeue pi.
+ * @exiting: Pointer to store the task pointer of the owner task
+ * which is in the middle of exiting
+ * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
+ *
+ * Return:
+ * - 0 - ready to wait;
+ * - 1 - acquired the lock;
+ * - <0 - error
+ *
+ * The hb->lock must be held by the caller.
+ *
+ * @exiting is only set when the return value is -EBUSY. If so, this holds
+ * a refcount on the exiting task on return and the caller needs to drop it
+ * after waiting for the exit to complete.
+ */
+int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
+ union futex_key *key,
+ struct futex_pi_state **ps,
+ struct task_struct *task,
+ struct task_struct **exiting,
+ int set_waiters)
+{
+ u32 uval, newval, vpid = task_pid_vnr(task);
+ struct futex_q *top_waiter;
+ int ret;
+
+ /*
+ * Read the user space value first so we can validate a few
+ * things before proceeding further.
+ */
+ if (futex_get_value_locked(&uval, uaddr))
+ return -EFAULT;
+
+ if (unlikely(should_fail_futex(true)))
+ return -EFAULT;
+
+ /*
+ * Detect deadlocks.
+ */
+ if ((unlikely((uval & FUTEX_TID_MASK) == vpid)))
+ return -EDEADLK;
+
+ if ((unlikely(should_fail_futex(true))))
+ return -EDEADLK;
+
+ /*
+ * Lookup existing state first. If it exists, try to attach to
+ * its pi_state.
+ */
+ top_waiter = futex_top_waiter(hb, key);
+ if (top_waiter)
+ return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);
+
+ /*
+ * No waiter and user TID is 0. We are here because the
+ * waiters or the owner died bit is set or called from
+ * requeue_cmp_pi or for whatever reason something took the
+ * syscall.
+ */
+ if (!(uval & FUTEX_TID_MASK)) {
+ /*
+ * We take over the futex. No other waiters and the user space
+ * TID is 0. We preserve the owner died bit.
+ */
+ newval = uval & FUTEX_OWNER_DIED;
+ newval |= vpid;
+
+ /* The futex requeue_pi code can enforce the waiters bit */
+ if (set_waiters)
+ newval |= FUTEX_WAITERS;
+
+ ret = lock_pi_update_atomic(uaddr, uval, newval);
+ if (ret)
+ return ret;
+
+ /*
+ * If the waiter bit was requested the caller also needs PI
+ * state attached to the new owner of the user space futex.
+ *
+ * @task is guaranteed to be alive and it cannot be exiting
+ * because it is either sleeping or waiting in
+ * futex_requeue_pi_wakeup_sync().
+ *
+ * No need to do the full attach_to_pi_owner() exercise
+ * because @task is known and valid.
+ */
+ if (set_waiters) {
+ raw_spin_lock_irq(&task->pi_lock);
+ __attach_to_pi_owner(task, key, ps);
+ raw_spin_unlock_irq(&task->pi_lock);
+ }
+ return 1;
+ }
+
+ /*
+ * First waiter. Set the waiters bit before attaching ourself to
+ * the owner. If owner tries to unlock, it will be forced into
+ * the kernel and blocked on hb->lock.
+ */
+ newval = uval | FUTEX_WAITERS;
+ ret = lock_pi_update_atomic(uaddr, uval, newval);
+ if (ret)
+ return ret;
+ /*
+ * If the update of the user space value succeeded, we try to
+ * attach to the owner. If that fails, no harm done, we only
+ * set the FUTEX_WAITERS bit in the user space variable.
+ */
+ return attach_to_pi_owner(uaddr, newval, key, ps, exiting);
+}
+
+/*
+ * Caller must hold a reference on @pi_state.
+ */
+static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state)
+{
+ struct rt_mutex_waiter *top_waiter;
+ struct task_struct *new_owner;
+ bool postunlock = false;
+ DEFINE_RT_WAKE_Q(wqh);
+ u32 curval, newval;
+ int ret = 0;
+
+ top_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex);
+ if (WARN_ON_ONCE(!top_waiter)) {
+ /*
+ * As per the comment in futex_unlock_pi() this should not happen.
+ *
+ * When this happens, give up our locks and try again, giving
+ * the futex_lock_pi() instance time to complete, either by
+ * waiting on the rtmutex or removing itself from the futex
+ * queue.
+ */
+ ret = -EAGAIN;
+ goto out_unlock;
+ }
+
+ new_owner = top_waiter->task;
+
+ /*
+ * We pass it to the next owner. The WAITERS bit is always kept
+ * enabled while there is PI state around. We cleanup the owner
+ * died bit, because we are the owner.
+ */
+ newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
+
+ if (unlikely(should_fail_futex(true))) {
+ ret = -EFAULT;
+ goto out_unlock;
+ }
+
+ ret = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
+ if (!ret && (curval != uval)) {
+ /*
+ * If a unconditional UNLOCK_PI operation (user space did not
+ * try the TID->0 transition) raced with a waiter setting the
+ * FUTEX_WAITERS flag between get_user() and locking the hash
+ * bucket lock, retry the operation.
+ */
+ if ((FUTEX_TID_MASK & curval) == uval)
+ ret = -EAGAIN;
+ else
+ ret = -EINVAL;
+ }
+
+ if (!ret) {
+ /*
+ * This is a point of no return; once we modified the uval
+ * there is no going back and subsequent operations must
+ * not fail.
+ */
+ pi_state_update_owner(pi_state, new_owner);
+ postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wqh);
+ }
+
+out_unlock:
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+
+ if (postunlock)
+ rt_mutex_postunlock(&wqh);
+
+ return ret;
+}
+
+static int __fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
+ struct task_struct *argowner)
+{
+ struct futex_pi_state *pi_state = q->pi_state;
+ struct task_struct *oldowner, *newowner;
+ u32 uval, curval, newval, newtid;
+ int err = 0;
+
+ oldowner = pi_state->owner;
+
+ /*
+ * We are here because either:
+ *
+ * - we stole the lock and pi_state->owner needs updating to reflect
+ * that (@argowner == current),
+ *
+ * or:
+ *
+ * - someone stole our lock and we need to fix things to point to the
+ * new owner (@argowner == NULL).
+ *
+ * Either way, we have to replace the TID in the user space variable.
+ * This must be atomic as we have to preserve the owner died bit here.
+ *
+ * Note: We write the user space value _before_ changing the pi_state
+ * because we can fault here. Imagine swapped out pages or a fork
+ * that marked all the anonymous memory readonly for cow.
+ *
+ * Modifying pi_state _before_ the user space value would leave the
+ * pi_state in an inconsistent state when we fault here, because we
+ * need to drop the locks to handle the fault. This might be observed
+ * in the PID checks when attaching to PI state .
+ */
+retry:
+ if (!argowner) {
+ if (oldowner != current) {
+ /*
+ * We raced against a concurrent self; things are
+ * already fixed up. Nothing to do.
+ */
+ return 0;
+ }
+
+ if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) {
+ /* We got the lock. pi_state is correct. Tell caller. */
+ return 1;
+ }
+
+ /*
+ * The trylock just failed, so either there is an owner or
+ * there is a higher priority waiter than this one.
+ */
+ newowner = rt_mutex_owner(&pi_state->pi_mutex);
+ /*
+ * If the higher priority waiter has not yet taken over the
+ * rtmutex then newowner is NULL. We can't return here with
+ * that state because it's inconsistent vs. the user space
+ * state. So drop the locks and try again. It's a valid
+ * situation and not any different from the other retry
+ * conditions.
+ */
+ if (unlikely(!newowner)) {
+ err = -EAGAIN;
+ goto handle_err;
+ }
+ } else {
+ WARN_ON_ONCE(argowner != current);
+ if (oldowner == current) {
+ /*
+ * We raced against a concurrent self; things are
+ * already fixed up. Nothing to do.
+ */
+ return 1;
+ }
+ newowner = argowner;
+ }
+
+ newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
+ /* Owner died? */
+ if (!pi_state->owner)
+ newtid |= FUTEX_OWNER_DIED;
+
+ err = futex_get_value_locked(&uval, uaddr);
+ if (err)
+ goto handle_err;
+
+ for (;;) {
+ newval = (uval & FUTEX_OWNER_DIED) | newtid;
+
+ err = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
+ if (err)
+ goto handle_err;
+
+ if (curval == uval)
+ break;
+ uval = curval;
+ }
+
+ /*
+ * We fixed up user space. Now we need to fix the pi_state
+ * itself.
+ */
+ pi_state_update_owner(pi_state, newowner);
+
+ return argowner == current;
+
+ /*
+ * In order to reschedule or handle a page fault, we need to drop the
+ * locks here. In the case of a fault, this gives the other task
+ * (either the highest priority waiter itself or the task which stole
+ * the rtmutex) the chance to try the fixup of the pi_state. So once we
+ * are back from handling the fault we need to check the pi_state after
+ * reacquiring the locks and before trying to do another fixup. When
+ * the fixup has been done already we simply return.
+ *
+ * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
+ * drop hb->lock since the caller owns the hb -> futex_q relation.
+ * Dropping the pi_mutex->wait_lock requires the state revalidate.
+ */
+handle_err:
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+ spin_unlock(q->lock_ptr);
+
+ switch (err) {
+ case -EFAULT:
+ err = fault_in_user_writeable(uaddr);
+ break;
+
+ case -EAGAIN:
+ cond_resched();
+ err = 0;
+ break;
+
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
+
+ spin_lock(q->lock_ptr);
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+
+ /*
+ * Check if someone else fixed it for us:
+ */
+ if (pi_state->owner != oldowner)
+ return argowner == current;
+
+ /* Retry if err was -EAGAIN or the fault in succeeded */
+ if (!err)
+ goto retry;
+
+ /*
+ * fault_in_user_writeable() failed so user state is immutable. At
+ * best we can make the kernel state consistent but user state will
+ * be most likely hosed and any subsequent unlock operation will be
+ * rejected due to PI futex rule [10].
+ *
+ * Ensure that the rtmutex owner is also the pi_state owner despite
+ * the user space value claiming something different. There is no
+ * point in unlocking the rtmutex if current is the owner as it
+ * would need to wait until the next waiter has taken the rtmutex
+ * to guarantee consistent state. Keep it simple. Userspace asked
+ * for this wreckaged state.
+ *
+ * The rtmutex has an owner - either current or some other
+ * task. See the EAGAIN loop above.
+ */
+ pi_state_update_owner(pi_state, rt_mutex_owner(&pi_state->pi_mutex));
+
+ return err;
+}
+
+static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
+ struct task_struct *argowner)
+{
+ struct futex_pi_state *pi_state = q->pi_state;
+ int ret;
+
+ lockdep_assert_held(q->lock_ptr);
+
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+ ret = __fixup_pi_state_owner(uaddr, q, argowner);
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+ return ret;
+}
+
+/**
+ * fixup_pi_owner() - Post lock pi_state and corner case management
+ * @uaddr: user address of the futex
+ * @q: futex_q (contains pi_state and access to the rt_mutex)
+ * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0)
+ *
+ * After attempting to lock an rt_mutex, this function is called to cleanup
+ * the pi_state owner as well as handle race conditions that may allow us to
+ * acquire the lock. Must be called with the hb lock held.
+ *
+ * Return:
+ * - 1 - success, lock taken;
+ * - 0 - success, lock not taken;
+ * - <0 - on error (-EFAULT)
+ */
+int fixup_pi_owner(u32 __user *uaddr, struct futex_q *q, int locked)
+{
+ if (locked) {
+ /*
+ * Got the lock. We might not be the anticipated owner if we
+ * did a lock-steal - fix up the PI-state in that case:
+ *
+ * Speculative pi_state->owner read (we don't hold wait_lock);
+ * since we own the lock pi_state->owner == current is the
+ * stable state, anything else needs more attention.
+ */
+ if (q->pi_state->owner != current)
+ return fixup_pi_state_owner(uaddr, q, current);
+ return 1;
+ }
+
+ /*
+ * If we didn't get the lock; check if anybody stole it from us. In
+ * that case, we need to fix up the uval to point to them instead of
+ * us, otherwise bad things happen. [10]
+ *
+ * Another speculative read; pi_state->owner == current is unstable
+ * but needs our attention.
+ */
+ if (q->pi_state->owner == current)
+ return fixup_pi_state_owner(uaddr, q, NULL);
+
+ /*
+ * Paranoia check. If we did not take the lock, then we should not be
+ * the owner of the rt_mutex. Warn and establish consistent state.
+ */
+ if (WARN_ON_ONCE(rt_mutex_owner(&q->pi_state->pi_mutex) == current))
+ return fixup_pi_state_owner(uaddr, q, current);
+
+ return 0;
+}
+
+/*
+ * Userspace tried a 0 -> TID atomic transition of the futex value
+ * and failed. The kernel side here does the whole locking operation:
+ * if there are waiters then it will block as a consequence of relying
+ * on rt-mutexes, it does PI, etc. (Due to races the kernel might see
+ * a 0 value of the futex too.).
+ *
+ * Also serves as futex trylock_pi()'ing, and due semantics.
+ */
+int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock)
+{
+ struct hrtimer_sleeper timeout, *to;
+ struct task_struct *exiting = NULL;
+ struct rt_mutex_waiter rt_waiter;
+ struct futex_hash_bucket *hb;
+ struct futex_q q = futex_q_init;
+ int res, ret;
+
+ if (!IS_ENABLED(CONFIG_FUTEX_PI))
+ return -ENOSYS;
+
+ if (refill_pi_state_cache())
+ return -ENOMEM;
+
+ to = futex_setup_timer(time, &timeout, flags, 0);
+
+retry:
+ ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, FUTEX_WRITE);
+ if (unlikely(ret != 0))
+ goto out;
+
+retry_private:
+ hb = futex_q_lock(&q);
+
+ ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current,
+ &exiting, 0);
+ if (unlikely(ret)) {
+ /*
+ * Atomic work succeeded and we got the lock,
+ * or failed. Either way, we do _not_ block.
+ */
+ switch (ret) {
+ case 1:
+ /* We got the lock. */
+ ret = 0;
+ goto out_unlock_put_key;
+ case -EFAULT:
+ goto uaddr_faulted;
+ case -EBUSY:
+ case -EAGAIN:
+ /*
+ * Two reasons for this:
+ * - EBUSY: Task is exiting and we just wait for the
+ * exit to complete.
+ * - EAGAIN: The user space value changed.
+ */
+ futex_q_unlock(hb);
+ /*
+ * Handle the case where the owner is in the middle of
+ * exiting. Wait for the exit to complete otherwise
+ * this task might loop forever, aka. live lock.
+ */
+ wait_for_owner_exiting(ret, exiting);
+ cond_resched();
+ goto retry;
+ default:
+ goto out_unlock_put_key;
+ }
+ }
+
+ WARN_ON(!q.pi_state);
+
+ /*
+ * Only actually queue now that the atomic ops are done:
+ */
+ __futex_queue(&q, hb);
+
+ if (trylock) {
+ ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
+ /* Fixup the trylock return value: */
+ ret = ret ? 0 : -EWOULDBLOCK;
+ goto no_block;
+ }
+
+ rt_mutex_init_waiter(&rt_waiter);
+
+ /*
+ * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not
+ * hold it while doing rt_mutex_start_proxy(), because then it will
+ * include hb->lock in the blocking chain, even through we'll not in
+ * fact hold it while blocking. This will lead it to report -EDEADLK
+ * and BUG when futex_unlock_pi() interleaves with this.
+ *
+ * Therefore acquire wait_lock while holding hb->lock, but drop the
+ * latter before calling __rt_mutex_start_proxy_lock(). This
+ * interleaves with futex_unlock_pi() -- which does a similar lock
+ * handoff -- such that the latter can observe the futex_q::pi_state
+ * before __rt_mutex_start_proxy_lock() is done.
+ */
+ raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
+ spin_unlock(q.lock_ptr);
+ /*
+ * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter
+ * such that futex_unlock_pi() is guaranteed to observe the waiter when
+ * it sees the futex_q::pi_state.
+ */
+ ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
+ raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);
+
+ if (ret) {
+ if (ret == 1)
+ ret = 0;
+ goto cleanup;
+ }
+
+ if (unlikely(to))
+ hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS);
+
+ ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
+
+cleanup:
+ spin_lock(q.lock_ptr);
+ /*
+ * If we failed to acquire the lock (deadlock/signal/timeout), we must
+ * first acquire the hb->lock before removing the lock from the
+ * rt_mutex waitqueue, such that we can keep the hb and rt_mutex wait
+ * lists consistent.
+ *
+ * In particular; it is important that futex_unlock_pi() can not
+ * observe this inconsistency.
+ */
+ if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
+ ret = 0;
+
+no_block:
+ /*
+ * Fixup the pi_state owner and possibly acquire the lock if we
+ * haven't already.
+ */
+ res = fixup_pi_owner(uaddr, &q, !ret);
+ /*
+ * If fixup_pi_owner() returned an error, propagate that. If it acquired
+ * the lock, clear our -ETIMEDOUT or -EINTR.
+ */
+ if (res)
+ ret = (res < 0) ? res : 0;
+
+ futex_unqueue_pi(&q);
+ spin_unlock(q.lock_ptr);
+ goto out;
+
+out_unlock_put_key:
+ futex_q_unlock(hb);
+
+out:
+ if (to) {
+ hrtimer_cancel(&to->timer);
+ destroy_hrtimer_on_stack(&to->timer);
+ }
+ return ret != -EINTR ? ret : -ERESTARTNOINTR;
+
+uaddr_faulted:
+ futex_q_unlock(hb);
+
+ ret = fault_in_user_writeable(uaddr);
+ if (ret)
+ goto out;
+
+ if (!(flags & FLAGS_SHARED))
+ goto retry_private;
+
+ goto retry;
+}
+
+/*
+ * Userspace attempted a TID -> 0 atomic transition, and failed.
+ * This is the in-kernel slowpath: we look up the PI state (if any),
+ * and do the rt-mutex unlock.
+ */
+int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
+{
+ u32 curval, uval, vpid = task_pid_vnr(current);
+ union futex_key key = FUTEX_KEY_INIT;
+ struct futex_hash_bucket *hb;
+ struct futex_q *top_waiter;
+ int ret;
+
+ if (!IS_ENABLED(CONFIG_FUTEX_PI))
+ return -ENOSYS;
+
+retry:
+ if (get_user(uval, uaddr))
+ return -EFAULT;
+ /*
+ * We release only a lock we actually own:
+ */
+ if ((uval & FUTEX_TID_MASK) != vpid)
+ return -EPERM;
+
+ ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_WRITE);
+ if (ret)
+ return ret;
+
+ hb = futex_hash(&key);
+ spin_lock(&hb->lock);
+
+ /*
+ * Check waiters first. We do not trust user space values at
+ * all and we at least want to know if user space fiddled
+ * with the futex value instead of blindly unlocking.
+ */
+ top_waiter = futex_top_waiter(hb, &key);
+ if (top_waiter) {
+ struct futex_pi_state *pi_state = top_waiter->pi_state;
+
+ ret = -EINVAL;
+ if (!pi_state)
+ goto out_unlock;
+
+ /*
+ * If current does not own the pi_state then the futex is
+ * inconsistent and user space fiddled with the futex value.
+ */
+ if (pi_state->owner != current)
+ goto out_unlock;
+
+ get_pi_state(pi_state);
+ /*
+ * By taking wait_lock while still holding hb->lock, we ensure
+ * there is no point where we hold neither; and therefore
+ * wake_futex_p() must observe a state consistent with what we
+ * observed.
+ *
+ * In particular; this forces __rt_mutex_start_proxy() to
+ * complete such that we're guaranteed to observe the
+ * rt_waiter. Also see the WARN in wake_futex_pi().
+ */
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+ spin_unlock(&hb->lock);
+
+ /* drops pi_state->pi_mutex.wait_lock */
+ ret = wake_futex_pi(uaddr, uval, pi_state);
+
+ put_pi_state(pi_state);
+
+ /*
+ * Success, we're done! No tricky corner cases.
+ */
+ if (!ret)
+ return ret;
+ /*
+ * The atomic access to the futex value generated a
+ * pagefault, so retry the user-access and the wakeup:
+ */
+ if (ret == -EFAULT)
+ goto pi_faulted;
+ /*
+ * A unconditional UNLOCK_PI op raced against a waiter
+ * setting the FUTEX_WAITERS bit. Try again.
+ */
+ if (ret == -EAGAIN)
+ goto pi_retry;
+ /*
+ * wake_futex_pi has detected invalid state. Tell user
+ * space.
+ */
+ return ret;
+ }
+
+ /*
+ * We have no kernel internal state, i.e. no waiters in the
+ * kernel. Waiters which are about to queue themselves are stuck
+ * on hb->lock. So we can safely ignore them. We do neither
+ * preserve the WAITERS bit not the OWNER_DIED one. We are the
+ * owner.
+ */
+ if ((ret = futex_cmpxchg_value_locked(&curval, uaddr, uval, 0))) {
+ spin_unlock(&hb->lock);
+ switch (ret) {
+ case -EFAULT:
+ goto pi_faulted;
+
+ case -EAGAIN:
+ goto pi_retry;
+
+ default:
+ WARN_ON_ONCE(1);
+ return ret;
+ }
+ }
+
+ /*
+ * If uval has changed, let user space handle it.
+ */
+ ret = (curval == uval) ? 0 : -EAGAIN;
+
+out_unlock:
+ spin_unlock(&hb->lock);
+ return ret;
+
+pi_retry:
+ cond_resched();
+ goto retry;
+
+pi_faulted:
+
+ ret = fault_in_user_writeable(uaddr);
+ if (!ret)
+ goto retry;
+
+ return ret;
+}
+
diff --git a/kernel/futex/requeue.c b/kernel/futex/requeue.c
new file mode 100644
index 000000000000..cba8b1a6a4cc
--- /dev/null
+++ b/kernel/futex/requeue.c
@@ -0,0 +1,897 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <linux/sched/signal.h>
+
+#include "futex.h"
+#include "../locking/rtmutex_common.h"
+
+/*
+ * On PREEMPT_RT, the hash bucket lock is a 'sleeping' spinlock with an
+ * underlying rtmutex. The task which is about to be requeued could have
+ * just woken up (timeout, signal). After the wake up the task has to
+ * acquire hash bucket lock, which is held by the requeue code. As a task
+ * can only be blocked on _ONE_ rtmutex at a time, the proxy lock blocking
+ * and the hash bucket lock blocking would collide and corrupt state.
+ *
+ * On !PREEMPT_RT this is not a problem and everything could be serialized
+ * on hash bucket lock, but aside of having the benefit of common code,
+ * this allows to avoid doing the requeue when the task is already on the
+ * way out and taking the hash bucket lock of the original uaddr1 when the
+ * requeue has been completed.
+ *
+ * The following state transitions are valid:
+ *
+ * On the waiter side:
+ * Q_REQUEUE_PI_NONE -> Q_REQUEUE_PI_IGNORE
+ * Q_REQUEUE_PI_IN_PROGRESS -> Q_REQUEUE_PI_WAIT
+ *
+ * On the requeue side:
+ * Q_REQUEUE_PI_NONE -> Q_REQUEUE_PI_INPROGRESS
+ * Q_REQUEUE_PI_IN_PROGRESS -> Q_REQUEUE_PI_DONE/LOCKED
+ * Q_REQUEUE_PI_IN_PROGRESS -> Q_REQUEUE_PI_NONE (requeue failed)
+ * Q_REQUEUE_PI_WAIT -> Q_REQUEUE_PI_DONE/LOCKED
+ * Q_REQUEUE_PI_WAIT -> Q_REQUEUE_PI_IGNORE (requeue failed)
+ *
+ * The requeue side ignores a waiter with state Q_REQUEUE_PI_IGNORE as this
+ * signals that the waiter is already on the way out. It also means that
+ * the waiter is still on the 'wait' futex, i.e. uaddr1.
+ *
+ * The waiter side signals early wakeup to the requeue side either through
+ * setting state to Q_REQUEUE_PI_IGNORE or to Q_REQUEUE_PI_WAIT depending
+ * on the current state. In case of Q_REQUEUE_PI_IGNORE it can immediately
+ * proceed to take the hash bucket lock of uaddr1. If it set state to WAIT,
+ * which means the wakeup is interleaving with a requeue in progress it has
+ * to wait for the requeue side to change the state. Either to DONE/LOCKED
+ * or to IGNORE. DONE/LOCKED means the waiter q is now on the uaddr2 futex
+ * and either blocked (DONE) or has acquired it (LOCKED). IGNORE is set by
+ * the requeue side when the requeue attempt failed via deadlock detection
+ * and therefore the waiter q is still on the uaddr1 futex.
+ */
+enum {
+ Q_REQUEUE_PI_NONE = 0,
+ Q_REQUEUE_PI_IGNORE,
+ Q_REQUEUE_PI_IN_PROGRESS,
+ Q_REQUEUE_PI_WAIT,
+ Q_REQUEUE_PI_DONE,
+ Q_REQUEUE_PI_LOCKED,
+};
+
+const struct futex_q futex_q_init = {
+ /* list gets initialized in futex_queue()*/
+ .key = FUTEX_KEY_INIT,
+ .bitset = FUTEX_BITSET_MATCH_ANY,
+ .requeue_state = ATOMIC_INIT(Q_REQUEUE_PI_NONE),
+};
+
+/**
+ * requeue_futex() - Requeue a futex_q from one hb to another
+ * @q: the futex_q to requeue
+ * @hb1: the source hash_bucket
+ * @hb2: the target hash_bucket
+ * @key2: the new key for the requeued futex_q
+ */
+static inline
+void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
+ struct futex_hash_bucket *hb2, union futex_key *key2)
+{
+
+ /*
+ * If key1 and key2 hash to the same bucket, no need to
+ * requeue.
+ */
+ if (likely(&hb1->chain != &hb2->chain)) {
+ plist_del(&q->list, &hb1->chain);
+ futex_hb_waiters_dec(hb1);
+ futex_hb_waiters_inc(hb2);
+ plist_add(&q->list, &hb2->chain);
+ q->lock_ptr = &hb2->lock;
+ }
+ q->key = *key2;
+}
+
+static inline bool futex_requeue_pi_prepare(struct futex_q *q,
+ struct futex_pi_state *pi_state)
+{
+ int old, new;
+
+ /*
+ * Set state to Q_REQUEUE_PI_IN_PROGRESS unless an early wakeup has
+ * already set Q_REQUEUE_PI_IGNORE to signal that requeue should
+ * ignore the waiter.
+ */
+ old = atomic_read_acquire(&q->requeue_state);
+ do {
+ if (old == Q_REQUEUE_PI_IGNORE)
+ return false;
+
+ /*
+ * futex_proxy_trylock_atomic() might have set it to
+ * IN_PROGRESS and a interleaved early wake to WAIT.
+ *
+ * It was considered to have an extra state for that
+ * trylock, but that would just add more conditionals
+ * all over the place for a dubious value.
+ */
+ if (old != Q_REQUEUE_PI_NONE)
+ break;
+
+ new = Q_REQUEUE_PI_IN_PROGRESS;
+ } while (!atomic_try_cmpxchg(&q->requeue_state, &old, new));
+
+ q->pi_state = pi_state;
+ return true;
+}
+
+static inline void futex_requeue_pi_complete(struct futex_q *q, int locked)
+{
+ int old, new;
+
+ old = atomic_read_acquire(&q->requeue_state);
+ do {
+ if (old == Q_REQUEUE_PI_IGNORE)
+ return;
+
+ if (locked >= 0) {
+ /* Requeue succeeded. Set DONE or LOCKED */
+ WARN_ON_ONCE(old != Q_REQUEUE_PI_IN_PROGRESS &&
+ old != Q_REQUEUE_PI_WAIT);
+ new = Q_REQUEUE_PI_DONE + locked;
+ } else if (old == Q_REQUEUE_PI_IN_PROGRESS) {
+ /* Deadlock, no early wakeup interleave */
+ new = Q_REQUEUE_PI_NONE;
+ } else {
+ /* Deadlock, early wakeup interleave. */
+ WARN_ON_ONCE(old != Q_REQUEUE_PI_WAIT);
+ new = Q_REQUEUE_PI_IGNORE;
+ }
+ } while (!atomic_try_cmpxchg(&q->requeue_state, &old, new));
+
+#ifdef CONFIG_PREEMPT_RT
+ /* If the waiter interleaved with the requeue let it know */
+ if (unlikely(old == Q_REQUEUE_PI_WAIT))
+ rcuwait_wake_up(&q->requeue_wait);
+#endif
+}
+
+static inline int futex_requeue_pi_wakeup_sync(struct futex_q *q)
+{
+ int old, new;
+
+ old = atomic_read_acquire(&q->requeue_state);
+ do {
+ /* Is requeue done already? */
+ if (old >= Q_REQUEUE_PI_DONE)
+ return old;
+
+ /*
+ * If not done, then tell the requeue code to either ignore
+ * the waiter or to wake it up once the requeue is done.
+ */
+ new = Q_REQUEUE_PI_WAIT;
+ if (old == Q_REQUEUE_PI_NONE)
+ new = Q_REQUEUE_PI_IGNORE;
+ } while (!atomic_try_cmpxchg(&q->requeue_state, &old, new));
+
+ /* If the requeue was in progress, wait for it to complete */
+ if (old == Q_REQUEUE_PI_IN_PROGRESS) {
+#ifdef CONFIG_PREEMPT_RT
+ rcuwait_wait_event(&q->requeue_wait,
+ atomic_read(&q->requeue_state) != Q_REQUEUE_PI_WAIT,
+ TASK_UNINTERRUPTIBLE);
+#else
+ (void)atomic_cond_read_relaxed(&q->requeue_state, VAL != Q_REQUEUE_PI_WAIT);
+#endif
+ }
+
+ /*
+ * Requeue is now either prohibited or complete. Reread state
+ * because during the wait above it might have changed. Nothing
+ * will modify q->requeue_state after this point.
+ */
+ return atomic_read(&q->requeue_state);
+}
+
+/**
+ * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue
+ * @q: the futex_q
+ * @key: the key of the requeue target futex
+ * @hb: the hash_bucket of the requeue target futex
+ *
+ * During futex_requeue, with requeue_pi=1, it is possible to acquire the
+ * target futex if it is uncontended or via a lock steal.
+ *
+ * 1) Set @q::key to the requeue target futex key so the waiter can detect
+ * the wakeup on the right futex.
+ *
+ * 2) Dequeue @q from the hash bucket.
+ *
+ * 3) Set @q::rt_waiter to NULL so the woken up task can detect atomic lock
+ * acquisition.
+ *
+ * 4) Set the q->lock_ptr to the requeue target hb->lock for the case that
+ * the waiter has to fixup the pi state.
+ *
+ * 5) Complete the requeue state so the waiter can make progress. After
+ * this point the waiter task can return from the syscall immediately in
+ * case that the pi state does not have to be fixed up.
+ *
+ * 6) Wake the waiter task.
+ *
+ * Must be called with both q->lock_ptr and hb->lock held.
+ */
+static inline
+void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
+ struct futex_hash_bucket *hb)
+{
+ q->key = *key;
+
+ __futex_unqueue(q);
+
+ WARN_ON(!q->rt_waiter);
+ q->rt_waiter = NULL;
+
+ q->lock_ptr = &hb->lock;
+
+ /* Signal locked state to the waiter */
+ futex_requeue_pi_complete(q, 1);
+ wake_up_state(q->task, TASK_NORMAL);
+}
+
+/**
+ * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter
+ * @pifutex: the user address of the to futex
+ * @hb1: the from futex hash bucket, must be locked by the caller
+ * @hb2: the to futex hash bucket, must be locked by the caller
+ * @key1: the from futex key
+ * @key2: the to futex key
+ * @ps: address to store the pi_state pointer
+ * @exiting: Pointer to store the task pointer of the owner task
+ * which is in the middle of exiting
+ * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
+ *
+ * Try and get the lock on behalf of the top waiter if we can do it atomically.
+ * Wake the top waiter if we succeed. If the caller specified set_waiters,
+ * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.
+ * hb1 and hb2 must be held by the caller.
+ *
+ * @exiting is only set when the return value is -EBUSY. If so, this holds
+ * a refcount on the exiting task on return and the caller needs to drop it
+ * after waiting for the exit to complete.
+ *
+ * Return:
+ * - 0 - failed to acquire the lock atomically;
+ * - >0 - acquired the lock, return value is vpid of the top_waiter
+ * - <0 - error
+ */
+static int
+futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1,
+ struct futex_hash_bucket *hb2, union futex_key *key1,
+ union futex_key *key2, struct futex_pi_state **ps,
+ struct task_struct **exiting, int set_waiters)
+{
+ struct futex_q *top_waiter = NULL;
+ u32 curval;
+ int ret;
+
+ if (futex_get_value_locked(&curval, pifutex))
+ return -EFAULT;
+
+ if (unlikely(should_fail_futex(true)))
+ return -EFAULT;
+
+ /*
+ * Find the top_waiter and determine if there are additional waiters.
+ * If the caller intends to requeue more than 1 waiter to pifutex,
+ * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now,
+ * as we have means to handle the possible fault. If not, don't set
+ * the bit unnecessarily as it will force the subsequent unlock to enter
+ * the kernel.
+ */
+ top_waiter = futex_top_waiter(hb1, key1);
+
+ /* There are no waiters, nothing for us to do. */
+ if (!top_waiter)
+ return 0;
+
+ /*
+ * Ensure that this is a waiter sitting in futex_wait_requeue_pi()
+ * and waiting on the 'waitqueue' futex which is always !PI.
+ */
+ if (!top_waiter->rt_waiter || top_waiter->pi_state)
+ return -EINVAL;
+
+ /* Ensure we requeue to the expected futex. */
+ if (!futex_match(top_waiter->requeue_pi_key, key2))
+ return -EINVAL;
+
+ /* Ensure that this does not race against an early wakeup */
+ if (!futex_requeue_pi_prepare(top_waiter, NULL))
+ return -EAGAIN;
+
+ /*
+ * Try to take the lock for top_waiter and set the FUTEX_WAITERS bit
+ * in the contended case or if @set_waiters is true.
+ *
+ * In the contended case PI state is attached to the lock owner. If
+ * the user space lock can be acquired then PI state is attached to
+ * the new owner (@top_waiter->task) when @set_waiters is true.
+ */
+ ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
+ exiting, set_waiters);
+ if (ret == 1) {
+ /*
+ * Lock was acquired in user space and PI state was
+ * attached to @top_waiter->task. That means state is fully
+ * consistent and the waiter can return to user space
+ * immediately after the wakeup.
+ */
+ requeue_pi_wake_futex(top_waiter, key2, hb2);
+ } else if (ret < 0) {
+ /* Rewind top_waiter::requeue_state */
+ futex_requeue_pi_complete(top_waiter, ret);
+ } else {
+ /*
+ * futex_lock_pi_atomic() did not acquire the user space
+ * futex, but managed to establish the proxy lock and pi
+ * state. top_waiter::requeue_state cannot be fixed up here
+ * because the waiter is not enqueued on the rtmutex
+ * yet. This is handled at the callsite depending on the
+ * result of rt_mutex_start_proxy_lock() which is
+ * guaranteed to be reached with this function returning 0.
+ */
+ }
+ return ret;
+}
+
+/**
+ * futex_requeue() - Requeue waiters from uaddr1 to uaddr2
+ * @uaddr1: source futex user address
+ * @flags: futex flags (FLAGS_SHARED, etc.)
+ * @uaddr2: target futex user address
+ * @nr_wake: number of waiters to wake (must be 1 for requeue_pi)
+ * @nr_requeue: number of waiters to requeue (0-INT_MAX)
+ * @cmpval: @uaddr1 expected value (or %NULL)
+ * @requeue_pi: if we are attempting to requeue from a non-pi futex to a
+ * pi futex (pi to pi requeue is not supported)
+ *
+ * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
+ * uaddr2 atomically on behalf of the top waiter.
+ *
+ * Return:
+ * - >=0 - on success, the number of tasks requeued or woken;
+ * - <0 - on error
+ */
+int futex_requeue(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
+ int nr_wake, int nr_requeue, u32 *cmpval, int requeue_pi)
+{
+ union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
+ int task_count = 0, ret;
+ struct futex_pi_state *pi_state = NULL;
+ struct futex_hash_bucket *hb1, *hb2;
+ struct futex_q *this, *next;
+ DEFINE_WAKE_Q(wake_q);
+
+ if (nr_wake < 0 || nr_requeue < 0)
+ return -EINVAL;
+
+ /*
+ * When PI not supported: return -ENOSYS if requeue_pi is true,
+ * consequently the compiler knows requeue_pi is always false past
+ * this point which will optimize away all the conditional code
+ * further down.
+ */
+ if (!IS_ENABLED(CONFIG_FUTEX_PI) && requeue_pi)
+ return -ENOSYS;
+
+ if (requeue_pi) {
+ /*
+ * Requeue PI only works on two distinct uaddrs. This
+ * check is only valid for private futexes. See below.
+ */
+ if (uaddr1 == uaddr2)
+ return -EINVAL;
+
+ /*
+ * futex_requeue() allows the caller to define the number
+ * of waiters to wake up via the @nr_wake argument. With
+ * REQUEUE_PI, waking up more than one waiter is creating
+ * more problems than it solves. Waking up a waiter makes
+ * only sense if the PI futex @uaddr2 is uncontended as
+ * this allows the requeue code to acquire the futex
+ * @uaddr2 before waking the waiter. The waiter can then
+ * return to user space without further action. A secondary
+ * wakeup would just make the futex_wait_requeue_pi()
+ * handling more complex, because that code would have to
+ * look up pi_state and do more or less all the handling
+ * which the requeue code has to do for the to be requeued
+ * waiters. So restrict the number of waiters to wake to
+ * one, and only wake it up when the PI futex is
+ * uncontended. Otherwise requeue it and let the unlock of
+ * the PI futex handle the wakeup.
+ *
+ * All REQUEUE_PI users, e.g. pthread_cond_signal() and
+ * pthread_cond_broadcast() must use nr_wake=1.
+ */
+ if (nr_wake != 1)
+ return -EINVAL;
+
+ /*
+ * requeue_pi requires a pi_state, try to allocate it now
+ * without any locks in case it fails.
+ */
+ if (refill_pi_state_cache())
+ return -ENOMEM;
+ }
+
+retry:
+ ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ);
+ if (unlikely(ret != 0))
+ return ret;
+ ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2,
+ requeue_pi ? FUTEX_WRITE : FUTEX_READ);
+ if (unlikely(ret != 0))
+ return ret;
+
+ /*
+ * The check above which compares uaddrs is not sufficient for
+ * shared futexes. We need to compare the keys:
+ */
+ if (requeue_pi && futex_match(&key1, &key2))
+ return -EINVAL;
+
+ hb1 = futex_hash(&key1);
+ hb2 = futex_hash(&key2);
+
+retry_private:
+ futex_hb_waiters_inc(hb2);
+ double_lock_hb(hb1, hb2);
+
+ if (likely(cmpval != NULL)) {
+ u32 curval;
+
+ ret = futex_get_value_locked(&curval, uaddr1);
+
+ if (unlikely(ret)) {
+ double_unlock_hb(hb1, hb2);
+ futex_hb_waiters_dec(hb2);
+
+ ret = get_user(curval, uaddr1);
+ if (ret)
+ return ret;
+
+ if (!(flags & FLAGS_SHARED))
+ goto retry_private;
+
+ goto retry;
+ }
+ if (curval != *cmpval) {
+ ret = -EAGAIN;
+ goto out_unlock;
+ }
+ }
+
+ if (requeue_pi) {
+ struct task_struct *exiting = NULL;
+
+ /*
+ * Attempt to acquire uaddr2 and wake the top waiter. If we
+ * intend to requeue waiters, force setting the FUTEX_WAITERS
+ * bit. We force this here where we are able to easily handle
+ * faults rather in the requeue loop below.
+ *
+ * Updates topwaiter::requeue_state if a top waiter exists.
+ */
+ ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1,
+ &key2, &pi_state,
+ &exiting, nr_requeue);
+
+ /*
+ * At this point the top_waiter has either taken uaddr2 or
+ * is waiting on it. In both cases pi_state has been
+ * established and an initial refcount on it. In case of an
+ * error there's nothing.
+ *
+ * The top waiter's requeue_state is up to date:
+ *
+ * - If the lock was acquired atomically (ret == 1), then
+ * the state is Q_REQUEUE_PI_LOCKED.
+ *
+ * The top waiter has been dequeued and woken up and can
+ * return to user space immediately. The kernel/user
+ * space state is consistent. In case that there must be
+ * more waiters requeued the WAITERS bit in the user
+ * space futex is set so the top waiter task has to go
+ * into the syscall slowpath to unlock the futex. This
+ * will block until this requeue operation has been
+ * completed and the hash bucket locks have been
+ * dropped.
+ *
+ * - If the trylock failed with an error (ret < 0) then
+ * the state is either Q_REQUEUE_PI_NONE, i.e. "nothing
+ * happened", or Q_REQUEUE_PI_IGNORE when there was an
+ * interleaved early wakeup.
+ *
+ * - If the trylock did not succeed (ret == 0) then the
+ * state is either Q_REQUEUE_PI_IN_PROGRESS or
+ * Q_REQUEUE_PI_WAIT if an early wakeup interleaved.
+ * This will be cleaned up in the loop below, which
+ * cannot fail because futex_proxy_trylock_atomic() did
+ * the same sanity checks for requeue_pi as the loop
+ * below does.
+ */
+ switch (ret) {
+ case 0:
+ /* We hold a reference on the pi state. */
+ break;
+
+ case 1:
+ /*
+ * futex_proxy_trylock_atomic() acquired the user space
+ * futex. Adjust task_count.
+ */
+ task_count++;
+ ret = 0;
+ break;
+
+ /*
+ * If the above failed, then pi_state is NULL and
+ * waiter::requeue_state is correct.
+ */
+ case -EFAULT:
+ double_unlock_hb(hb1, hb2);
+ futex_hb_waiters_dec(hb2);
+ ret = fault_in_user_writeable(uaddr2);
+ if (!ret)
+ goto retry;
+ return ret;
+ case -EBUSY:
+ case -EAGAIN:
+ /*
+ * Two reasons for this:
+ * - EBUSY: Owner is exiting and we just wait for the
+ * exit to complete.
+ * - EAGAIN: The user space value changed.
+ */
+ double_unlock_hb(hb1, hb2);
+ futex_hb_waiters_dec(hb2);
+ /*
+ * Handle the case where the owner is in the middle of
+ * exiting. Wait for the exit to complete otherwise
+ * this task might loop forever, aka. live lock.
+ */
+ wait_for_owner_exiting(ret, exiting);
+ cond_resched();
+ goto retry;
+ default:
+ goto out_unlock;
+ }
+ }
+
+ plist_for_each_entry_safe(this, next, &hb1->chain, list) {
+ if (task_count - nr_wake >= nr_requeue)
+ break;
+
+ if (!futex_match(&this->key, &key1))
+ continue;
+
+ /*
+ * FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI should always
+ * be paired with each other and no other futex ops.
+ *
+ * We should never be requeueing a futex_q with a pi_state,
+ * which is awaiting a futex_unlock_pi().
+ */
+ if ((requeue_pi && !this->rt_waiter) ||
+ (!requeue_pi && this->rt_waiter) ||
+ this->pi_state) {
+ ret = -EINVAL;
+ break;
+ }
+
+ /* Plain futexes just wake or requeue and are done */
+ if (!requeue_pi) {
+ if (++task_count <= nr_wake)
+ futex_wake_mark(&wake_q, this);
+ else
+ requeue_futex(this, hb1, hb2, &key2);
+ continue;
+ }
+
+ /* Ensure we requeue to the expected futex for requeue_pi. */
+ if (!futex_match(this->requeue_pi_key, &key2)) {
+ ret = -EINVAL;
+ break;
+ }
+
+ /*
+ * Requeue nr_requeue waiters and possibly one more in the case
+ * of requeue_pi if we couldn't acquire the lock atomically.
+ *
+ * Prepare the waiter to take the rt_mutex. Take a refcount
+ * on the pi_state and store the pointer in the futex_q
+ * object of the waiter.
+ */
+ get_pi_state(pi_state);
+
+ /* Don't requeue when the waiter is already on the way out. */
+ if (!futex_requeue_pi_prepare(this, pi_state)) {
+ /*
+ * Early woken waiter signaled that it is on the
+ * way out. Drop the pi_state reference and try the
+ * next waiter. @this->pi_state is still NULL.
+ */
+ put_pi_state(pi_state);
+ continue;
+ }
+
+ ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
+ this->rt_waiter,
+ this->task);
+
+ if (ret == 1) {
+ /*
+ * We got the lock. We do neither drop the refcount
+ * on pi_state nor clear this->pi_state because the
+ * waiter needs the pi_state for cleaning up the
+ * user space value. It will drop the refcount
+ * after doing so. this::requeue_state is updated
+ * in the wakeup as well.
+ */
+ requeue_pi_wake_futex(this, &key2, hb2);
+ task_count++;
+ } else if (!ret) {
+ /* Waiter is queued, move it to hb2 */
+ requeue_futex(this, hb1, hb2, &key2);
+ futex_requeue_pi_complete(this, 0);
+ task_count++;
+ } else {
+ /*
+ * rt_mutex_start_proxy_lock() detected a potential
+ * deadlock when we tried to queue that waiter.
+ * Drop the pi_state reference which we took above
+ * and remove the pointer to the state from the
+ * waiters futex_q object.
+ */
+ this->pi_state = NULL;
+ put_pi_state(pi_state);
+ futex_requeue_pi_complete(this, ret);
+ /*
+ * We stop queueing more waiters and let user space
+ * deal with the mess.
+ */
+ break;
+ }
+ }
+
+ /*
+ * We took an extra initial reference to the pi_state in
+ * futex_proxy_trylock_atomic(). We need to drop it here again.
+ */
+ put_pi_state(pi_state);
+
+out_unlock:
+ double_unlock_hb(hb1, hb2);
+ wake_up_q(&wake_q);
+ futex_hb_waiters_dec(hb2);
+ return ret ? ret : task_count;
+}
+
+/**
+ * handle_early_requeue_pi_wakeup() - Handle early wakeup on the initial futex
+ * @hb: the hash_bucket futex_q was original enqueued on
+ * @q: the futex_q woken while waiting to be requeued
+ * @timeout: the timeout associated with the wait (NULL if none)
+ *
+ * Determine the cause for the early wakeup.
+ *
+ * Return:
+ * -EWOULDBLOCK or -ETIMEDOUT or -ERESTARTNOINTR
+ */
+static inline
+int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
+ struct futex_q *q,
+ struct hrtimer_sleeper *timeout)
+{
+ int ret;
+
+ /*
+ * With the hb lock held, we avoid races while we process the wakeup.
+ * We only need to hold hb (and not hb2) to ensure atomicity as the
+ * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb.
+ * It can't be requeued from uaddr2 to something else since we don't
+ * support a PI aware source futex for requeue.
+ */
+ WARN_ON_ONCE(&hb->lock != q->lock_ptr);
+
+ /*
+ * We were woken prior to requeue by a timeout or a signal.
+ * Unqueue the futex_q and determine which it was.
+ */
+ plist_del(&q->list, &hb->chain);
+ futex_hb_waiters_dec(hb);
+
+ /* Handle spurious wakeups gracefully */
+ ret = -EWOULDBLOCK;
+ if (timeout && !timeout->task)
+ ret = -ETIMEDOUT;
+ else if (signal_pending(current))
+ ret = -ERESTARTNOINTR;
+ return ret;
+}
+
+/**
+ * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
+ * @uaddr: the futex we initially wait on (non-pi)
+ * @flags: futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be
+ * the same type, no requeueing from private to shared, etc.
+ * @val: the expected value of uaddr
+ * @abs_time: absolute timeout
+ * @bitset: 32 bit wakeup bitset set by userspace, defaults to all
+ * @uaddr2: the pi futex we will take prior to returning to user-space
+ *
+ * The caller will wait on uaddr and will be requeued by futex_requeue() to
+ * uaddr2 which must be PI aware and unique from uaddr. Normal wakeup will wake
+ * on uaddr2 and complete the acquisition of the rt_mutex prior to returning to
+ * userspace. This ensures the rt_mutex maintains an owner when it has waiters;
+ * without one, the pi logic would not know which task to boost/deboost, if
+ * there was a need to.
+ *
+ * We call schedule in futex_wait_queue() when we enqueue and return there
+ * via the following--
+ * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
+ * 2) wakeup on uaddr2 after a requeue
+ * 3) signal
+ * 4) timeout
+ *
+ * If 3, cleanup and return -ERESTARTNOINTR.
+ *
+ * If 2, we may then block on trying to take the rt_mutex and return via:
+ * 5) successful lock
+ * 6) signal
+ * 7) timeout
+ * 8) other lock acquisition failure
+ *
+ * If 6, return -EWOULDBLOCK (restarting the syscall would do the same).
+ *
+ * If 4 or 7, we cleanup and return with -ETIMEDOUT.
+ *
+ * Return:
+ * - 0 - On success;
+ * - <0 - On error
+ */
+int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
+ u32 val, ktime_t *abs_time, u32 bitset,
+ u32 __user *uaddr2)
+{
+ struct hrtimer_sleeper timeout, *to;
+ struct rt_mutex_waiter rt_waiter;
+ struct futex_hash_bucket *hb;
+ union futex_key key2 = FUTEX_KEY_INIT;
+ struct futex_q q = futex_q_init;
+ struct rt_mutex_base *pi_mutex;
+ int res, ret;
+
+ if (!IS_ENABLED(CONFIG_FUTEX_PI))
+ return -ENOSYS;
+
+ if (uaddr == uaddr2)
+ return -EINVAL;
+
+ if (!bitset)
+ return -EINVAL;
+
+ to = futex_setup_timer(abs_time, &timeout, flags,
+ current->timer_slack_ns);
+
+ /*
+ * The waiter is allocated on our stack, manipulated by the requeue
+ * code while we sleep on uaddr.
+ */
+ rt_mutex_init_waiter(&rt_waiter);
+
+ ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE);
+ if (unlikely(ret != 0))
+ goto out;
+
+ q.bitset = bitset;
+ q.rt_waiter = &rt_waiter;
+ q.requeue_pi_key = &key2;
+
+ /*
+ * Prepare to wait on uaddr. On success, it holds hb->lock and q
+ * is initialized.
+ */
+ ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
+ if (ret)
+ goto out;
+
+ /*
+ * The check above which compares uaddrs is not sufficient for
+ * shared futexes. We need to compare the keys:
+ */
+ if (futex_match(&q.key, &key2)) {
+ futex_q_unlock(hb);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* Queue the futex_q, drop the hb lock, wait for wakeup. */
+ futex_wait_queue(hb, &q, to);
+
+ switch (futex_requeue_pi_wakeup_sync(&q)) {
+ case Q_REQUEUE_PI_IGNORE:
+ /* The waiter is still on uaddr1 */
+ spin_lock(&hb->lock);
+ ret = handle_early_requeue_pi_wakeup(hb, &q, to);
+ spin_unlock(&hb->lock);
+ break;
+
+ case Q_REQUEUE_PI_LOCKED:
+ /* The requeue acquired the lock */
+ if (q.pi_state && (q.pi_state->owner != current)) {
+ spin_lock(q.lock_ptr);
+ ret = fixup_pi_owner(uaddr2, &q, true);
+ /*
+ * Drop the reference to the pi state which the
+ * requeue_pi() code acquired for us.
+ */
+ put_pi_state(q.pi_state);
+ spin_unlock(q.lock_ptr);
+ /*
+ * Adjust the return value. It's either -EFAULT or
+ * success (1) but the caller expects 0 for success.
+ */
+ ret = ret < 0 ? ret : 0;
+ }
+ break;
+
+ case Q_REQUEUE_PI_DONE:
+ /* Requeue completed. Current is 'pi_blocked_on' the rtmutex */
+ pi_mutex = &q.pi_state->pi_mutex;
+ ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter);
+
+ /* Current is not longer pi_blocked_on */
+ spin_lock(q.lock_ptr);
+ if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter))
+ ret = 0;
+
+ debug_rt_mutex_free_waiter(&rt_waiter);
+ /*
+ * Fixup the pi_state owner and possibly acquire the lock if we
+ * haven't already.
+ */
+ res = fixup_pi_owner(uaddr2, &q, !ret);
+ /*
+ * If fixup_pi_owner() returned an error, propagate that. If it
+ * acquired the lock, clear -ETIMEDOUT or -EINTR.
+ */
+ if (res)
+ ret = (res < 0) ? res : 0;
+
+ futex_unqueue_pi(&q);
+ spin_unlock(q.lock_ptr);
+
+ if (ret == -EINTR) {
+ /*
+ * We've already been requeued, but cannot restart
+ * by calling futex_lock_pi() directly. We could
+ * restart this syscall, but it would detect that
+ * the user space "val" changed and return
+ * -EWOULDBLOCK. Save the overhead of the restart
+ * and return -EWOULDBLOCK directly.
+ */
+ ret = -EWOULDBLOCK;
+ }
+ break;
+ default:
+ BUG();
+ }
+
+out:
+ if (to) {
+ hrtimer_cancel(&to->timer);
+ destroy_hrtimer_on_stack(&to->timer);
+ }
+ return ret;
+}
+
diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c
new file mode 100644
index 000000000000..086a22d1adb7
--- /dev/null
+++ b/kernel/futex/syscalls.c
@@ -0,0 +1,376 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <linux/compat.h>
+#include <linux/syscalls.h>
+#include <linux/time_namespace.h>
+
+#include "futex.h"
+
+/*
+ * Support for robust futexes: the kernel cleans up held futexes at
+ * thread exit time.
+ *
+ * Implementation: user-space maintains a per-thread list of locks it
+ * is holding. Upon do_exit(), the kernel carefully walks this list,
+ * and marks all locks that are owned by this thread with the
+ * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is
+ * always manipulated with the lock held, so the list is private and
+ * per-thread. Userspace also maintains a per-thread 'list_op_pending'
+ * field, to allow the kernel to clean up if the thread dies after
+ * acquiring the lock, but just before it could have added itself to
+ * the list. There can only be one such pending lock.
+ */
+
+/**
+ * sys_set_robust_list() - Set the robust-futex list head of a task
+ * @head: pointer to the list-head
+ * @len: length of the list-head, as userspace expects
+ */
+SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
+ size_t, len)
+{
+ /*
+ * The kernel knows only one size for now:
+ */
+ if (unlikely(len != sizeof(*head)))
+ return -EINVAL;
+
+ current->robust_list = head;
+
+ return 0;
+}
+
+/**
+ * sys_get_robust_list() - Get the robust-futex list head of a task
+ * @pid: pid of the process [zero for current task]
+ * @head_ptr: pointer to a list-head pointer, the kernel fills it in
+ * @len_ptr: pointer to a length field, the kernel fills in the header size
+ */
+SYSCALL_DEFINE3(get_robust_list, int, pid,
+ struct robust_list_head __user * __user *, head_ptr,
+ size_t __user *, len_ptr)
+{
+ struct robust_list_head __user *head;
+ unsigned long ret;
+ struct task_struct *p;
+
+ rcu_read_lock();
+
+ ret = -ESRCH;
+ if (!pid)
+ p = current;
+ else {
+ p = find_task_by_vpid(pid);
+ if (!p)
+ goto err_unlock;
+ }
+
+ ret = -EPERM;
+ if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
+ goto err_unlock;
+
+ head = p->robust_list;
+ rcu_read_unlock();
+
+ if (put_user(sizeof(*head), len_ptr))
+ return -EFAULT;
+ return put_user(head, head_ptr);
+
+err_unlock:
+ rcu_read_unlock();
+
+ return ret;
+}
+
+long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
+ u32 __user *uaddr2, u32 val2, u32 val3)
+{
+ int cmd = op & FUTEX_CMD_MASK;
+ unsigned int flags = 0;
+
+ if (!(op & FUTEX_PRIVATE_FLAG))
+ flags |= FLAGS_SHARED;
+
+ if (op & FUTEX_CLOCK_REALTIME) {
+ flags |= FLAGS_CLOCKRT;
+ if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI &&
+ cmd != FUTEX_LOCK_PI2)
+ return -ENOSYS;
+ }
+
+ switch (cmd) {
+ case FUTEX_WAIT:
+ val3 = FUTEX_BITSET_MATCH_ANY;
+ fallthrough;
+ case FUTEX_WAIT_BITSET:
+ return futex_wait(uaddr, flags, val, timeout, val3);
+ case FUTEX_WAKE:
+ val3 = FUTEX_BITSET_MATCH_ANY;
+ fallthrough;
+ case FUTEX_WAKE_BITSET:
+ return futex_wake(uaddr, flags, val, val3);
+ case FUTEX_REQUEUE:
+ return futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0);
+ case FUTEX_CMP_REQUEUE:
+ return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0);
+ case FUTEX_WAKE_OP:
+ return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
+ case FUTEX_LOCK_PI:
+ flags |= FLAGS_CLOCKRT;
+ fallthrough;
+ case FUTEX_LOCK_PI2:
+ return futex_lock_pi(uaddr, flags, timeout, 0);
+ case FUTEX_UNLOCK_PI:
+ return futex_unlock_pi(uaddr, flags);
+ case FUTEX_TRYLOCK_PI:
+ return futex_lock_pi(uaddr, flags, NULL, 1);
+ case FUTEX_WAIT_REQUEUE_PI:
+ val3 = FUTEX_BITSET_MATCH_ANY;
+ return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
+ uaddr2);
+ case FUTEX_CMP_REQUEUE_PI:
+ return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);
+ }
+ return -ENOSYS;
+}
+
+static __always_inline bool futex_cmd_has_timeout(u32 cmd)
+{
+ switch (cmd) {
+ case FUTEX_WAIT:
+ case FUTEX_LOCK_PI:
+ case FUTEX_LOCK_PI2:
+ case FUTEX_WAIT_BITSET:
+ case FUTEX_WAIT_REQUEUE_PI:
+ return true;
+ }
+ return false;
+}
+
+static __always_inline int
+futex_init_timeout(u32 cmd, u32 op, struct timespec64 *ts, ktime_t *t)
+{
+ if (!timespec64_valid(ts))
+ return -EINVAL;
+
+ *t = timespec64_to_ktime(*ts);
+ if (cmd == FUTEX_WAIT)
+ *t = ktime_add_safe(ktime_get(), *t);
+ else if (cmd != FUTEX_LOCK_PI && !(op & FUTEX_CLOCK_REALTIME))
+ *t = timens_ktime_to_host(CLOCK_MONOTONIC, *t);
+ return 0;
+}
+
+SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
+ const struct __kernel_timespec __user *, utime,
+ u32 __user *, uaddr2, u32, val3)
+{
+ int ret, cmd = op & FUTEX_CMD_MASK;
+ ktime_t t, *tp = NULL;
+ struct timespec64 ts;
+
+ if (utime && futex_cmd_has_timeout(cmd)) {
+ if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG))))
+ return -EFAULT;
+ if (get_timespec64(&ts, utime))
+ return -EFAULT;
+ ret = futex_init_timeout(cmd, op, &ts, &t);
+ if (ret)
+ return ret;
+ tp = &t;
+ }
+
+ return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3);
+}
+
+/* Mask of available flags for each futex in futex_waitv list */
+#define FUTEXV_WAITER_MASK (FUTEX_32 | FUTEX_PRIVATE_FLAG)
+
+/**
+ * futex_parse_waitv - Parse a waitv array from userspace
+ * @futexv: Kernel side list of waiters to be filled
+ * @uwaitv: Userspace list to be parsed
+ * @nr_futexes: Length of futexv
+ *
+ * Return: Error code on failure, 0 on success
+ */
+static int futex_parse_waitv(struct futex_vector *futexv,
+ struct futex_waitv __user *uwaitv,
+ unsigned int nr_futexes)
+{
+ struct futex_waitv aux;
+ unsigned int i;
+
+ for (i = 0; i < nr_futexes; i++) {
+ if (copy_from_user(&aux, &uwaitv[i], sizeof(aux)))
+ return -EFAULT;
+
+ if ((aux.flags & ~FUTEXV_WAITER_MASK) || aux.__reserved)
+ return -EINVAL;
+
+ if (!(aux.flags & FUTEX_32))
+ return -EINVAL;
+
+ futexv[i].w.flags = aux.flags;
+ futexv[i].w.val = aux.val;
+ futexv[i].w.uaddr = aux.uaddr;
+ futexv[i].q = futex_q_init;
+ }
+
+ return 0;
+}
+
+/**
+ * sys_futex_waitv - Wait on a list of futexes
+ * @waiters: List of futexes to wait on
+ * @nr_futexes: Length of futexv
+ * @flags: Flag for timeout (monotonic/realtime)
+ * @timeout: Optional absolute timeout.
+ * @clockid: Clock to be used for the timeout, realtime or monotonic.
+ *
+ * Given an array of `struct futex_waitv`, wait on each uaddr. The thread wakes
+ * if a futex_wake() is performed at any uaddr. The syscall returns immediately
+ * if any waiter has *uaddr != val. *timeout is an optional timeout value for
+ * the operation. Each waiter has individual flags. The `flags` argument for
+ * the syscall should be used solely for specifying the timeout as realtime, if
+ * needed. Flags for private futexes, sizes, etc. should be used on the
+ * individual flags of each waiter.
+ *
+ * Returns the array index of one of the woken futexes. No further information
+ * is provided: any number of other futexes may also have been woken by the
+ * same event, and if more than one futex was woken, the retrned index may
+ * refer to any one of them. (It is not necessaryily the futex with the
+ * smallest index, nor the one most recently woken, nor...)
+ */
+
+SYSCALL_DEFINE5(futex_waitv, struct futex_waitv __user *, waiters,
+ unsigned int, nr_futexes, unsigned int, flags,
+ struct __kernel_timespec __user *, timeout, clockid_t, clockid)
+{
+ struct hrtimer_sleeper to;
+ struct futex_vector *futexv;
+ struct timespec64 ts;
+ ktime_t time;
+ int ret;
+
+ /* This syscall supports no flags for now */
+ if (flags)
+ return -EINVAL;
+
+ if (!nr_futexes || nr_futexes > FUTEX_WAITV_MAX || !waiters)
+ return -EINVAL;
+
+ if (timeout) {
+ int flag_clkid = 0, flag_init = 0;
+
+ if (clockid == CLOCK_REALTIME) {
+ flag_clkid = FLAGS_CLOCKRT;
+ flag_init = FUTEX_CLOCK_REALTIME;
+ }
+
+ if (clockid != CLOCK_REALTIME && clockid != CLOCK_MONOTONIC)
+ return -EINVAL;
+
+ if (get_timespec64(&ts, timeout))
+ return -EFAULT;
+
+ /*
+ * Since there's no opcode for futex_waitv, use
+ * FUTEX_WAIT_BITSET that uses absolute timeout as well
+ */
+ ret = futex_init_timeout(FUTEX_WAIT_BITSET, flag_init, &ts, &time);
+ if (ret)
+ return ret;
+
+ futex_setup_timer(&time, &to, flag_clkid, 0);
+ }
+
+ futexv = kcalloc(nr_futexes, sizeof(*futexv), GFP_KERNEL);
+ if (!futexv)
+ return -ENOMEM;
+
+ ret = futex_parse_waitv(futexv, waiters, nr_futexes);
+ if (!ret)
+ ret = futex_wait_multiple(futexv, nr_futexes, timeout ? &to : NULL);
+
+ if (timeout) {
+ hrtimer_cancel(&to.timer);
+ destroy_hrtimer_on_stack(&to.timer);
+ }
+
+ kfree(futexv);
+ return ret;
+}
+
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE2(set_robust_list,
+ struct compat_robust_list_head __user *, head,
+ compat_size_t, len)
+{
+ if (unlikely(len != sizeof(*head)))
+ return -EINVAL;
+
+ current->compat_robust_list = head;
+
+ return 0;
+}
+
+COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid,
+ compat_uptr_t __user *, head_ptr,
+ compat_size_t __user *, len_ptr)
+{
+ struct compat_robust_list_head __user *head;
+ unsigned long ret;
+ struct task_struct *p;
+
+ rcu_read_lock();
+
+ ret = -ESRCH;
+ if (!pid)
+ p = current;
+ else {
+ p = find_task_by_vpid(pid);
+ if (!p)
+ goto err_unlock;
+ }
+
+ ret = -EPERM;
+ if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
+ goto err_unlock;
+
+ head = p->compat_robust_list;
+ rcu_read_unlock();
+
+ if (put_user(sizeof(*head), len_ptr))
+ return -EFAULT;
+ return put_user(ptr_to_compat(head), head_ptr);
+
+err_unlock:
+ rcu_read_unlock();
+
+ return ret;
+}
+#endif /* CONFIG_COMPAT */
+
+#ifdef CONFIG_COMPAT_32BIT_TIME
+SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val,
+ const struct old_timespec32 __user *, utime, u32 __user *, uaddr2,
+ u32, val3)
+{
+ int ret, cmd = op & FUTEX_CMD_MASK;
+ ktime_t t, *tp = NULL;
+ struct timespec64 ts;
+
+ if (utime && futex_cmd_has_timeout(cmd)) {
+ if (get_old_timespec32(&ts, utime))
+ return -EFAULT;
+ ret = futex_init_timeout(cmd, op, &ts, &t);
+ if (ret)
+ return ret;
+ tp = &t;
+ }
+
+ return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3);
+}
+#endif /* CONFIG_COMPAT_32BIT_TIME */
+
diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c
new file mode 100644
index 000000000000..4ce0923f1ce3
--- /dev/null
+++ b/kernel/futex/waitwake.c
@@ -0,0 +1,708 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <linux/sched/task.h>
+#include <linux/sched/signal.h>
+#include <linux/freezer.h>
+
+#include "futex.h"
+
+/*
+ * READ this before attempting to hack on futexes!
+ *
+ * Basic futex operation and ordering guarantees
+ * =============================================
+ *
+ * The waiter reads the futex value in user space and calls
+ * futex_wait(). This function computes the hash bucket and acquires
+ * the hash bucket lock. After that it reads the futex user space value
+ * again and verifies that the data has not changed. If it has not changed
+ * it enqueues itself into the hash bucket, releases the hash bucket lock
+ * and schedules.
+ *
+ * The waker side modifies the user space value of the futex and calls
+ * futex_wake(). This function computes the hash bucket and acquires the
+ * hash bucket lock. Then it looks for waiters on that futex in the hash
+ * bucket and wakes them.
+ *
+ * In futex wake up scenarios where no tasks are blocked on a futex, taking
+ * the hb spinlock can be avoided and simply return. In order for this
+ * optimization to work, ordering guarantees must exist so that the waiter
+ * being added to the list is acknowledged when the list is concurrently being
+ * checked by the waker, avoiding scenarios like the following:
+ *
+ * CPU 0 CPU 1
+ * val = *futex;
+ * sys_futex(WAIT, futex, val);
+ * futex_wait(futex, val);
+ * uval = *futex;
+ * *futex = newval;
+ * sys_futex(WAKE, futex);
+ * futex_wake(futex);
+ * if (queue_empty())
+ * return;
+ * if (uval == val)
+ * lock(hash_bucket(futex));
+ * queue();
+ * unlock(hash_bucket(futex));
+ * schedule();
+ *
+ * This would cause the waiter on CPU 0 to wait forever because it
+ * missed the transition of the user space value from val to newval
+ * and the waker did not find the waiter in the hash bucket queue.
+ *
+ * The correct serialization ensures that a waiter either observes
+ * the changed user space value before blocking or is woken by a
+ * concurrent waker:
+ *
+ * CPU 0 CPU 1
+ * val = *futex;
+ * sys_futex(WAIT, futex, val);
+ * futex_wait(futex, val);
+ *
+ * waiters++; (a)
+ * smp_mb(); (A) <-- paired with -.
+ * |
+ * lock(hash_bucket(futex)); |
+ * |
+ * uval = *futex; |
+ * | *futex = newval;
+ * | sys_futex(WAKE, futex);
+ * | futex_wake(futex);
+ * |
+ * `--------> smp_mb(); (B)
+ * if (uval == val)
+ * queue();
+ * unlock(hash_bucket(futex));
+ * schedule(); if (waiters)
+ * lock(hash_bucket(futex));
+ * else wake_waiters(futex);
+ * waiters--; (b) unlock(hash_bucket(futex));
+ *
+ * Where (A) orders the waiters increment and the futex value read through
+ * atomic operations (see futex_hb_waiters_inc) and where (B) orders the write
+ * to futex and the waiters read (see futex_hb_waiters_pending()).
+ *
+ * This yields the following case (where X:=waiters, Y:=futex):
+ *
+ * X = Y = 0
+ *
+ * w[X]=1 w[Y]=1
+ * MB MB
+ * r[Y]=y r[X]=x
+ *
+ * Which guarantees that x==0 && y==0 is impossible; which translates back into
+ * the guarantee that we cannot both miss the futex variable change and the
+ * enqueue.
+ *
+ * Note that a new waiter is accounted for in (a) even when it is possible that
+ * the wait call can return error, in which case we backtrack from it in (b).
+ * Refer to the comment in futex_q_lock().
+ *
+ * Similarly, in order to account for waiters being requeued on another
+ * address we always increment the waiters for the destination bucket before
+ * acquiring the lock. It then decrements them again after releasing it -
+ * the code that actually moves the futex(es) between hash buckets (requeue_futex)
+ * will do the additional required waiter count housekeeping. This is done for
+ * double_lock_hb() and double_unlock_hb(), respectively.
+ */
+
+/*
+ * The hash bucket lock must be held when this is called.
+ * Afterwards, the futex_q must not be accessed. Callers
+ * must ensure to later call wake_up_q() for the actual
+ * wakeups to occur.
+ */
+void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q)
+{
+ struct task_struct *p = q->task;
+
+ if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n"))
+ return;
+
+ get_task_struct(p);
+ __futex_unqueue(q);
+ /*
+ * The waiting task can free the futex_q as soon as q->lock_ptr = NULL
+ * is written, without taking any locks. This is possible in the event
+ * of a spurious wakeup, for example. A memory barrier is required here
+ * to prevent the following store to lock_ptr from getting ahead of the
+ * plist_del in __futex_unqueue().
+ */
+ smp_store_release(&q->lock_ptr, NULL);
+
+ /*
+ * Queue the task for later wakeup for after we've released
+ * the hb->lock.
+ */
+ wake_q_add_safe(wake_q, p);
+}
+
+/*
+ * Wake up waiters matching bitset queued on this futex (uaddr).
+ */
+int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
+{
+ struct futex_hash_bucket *hb;
+ struct futex_q *this, *next;
+ union futex_key key = FUTEX_KEY_INIT;
+ int ret;
+ DEFINE_WAKE_Q(wake_q);
+
+ if (!bitset)
+ return -EINVAL;
+
+ ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_READ);
+ if (unlikely(ret != 0))
+ return ret;
+
+ hb = futex_hash(&key);
+
+ /* Make sure we really have tasks to wakeup */
+ if (!futex_hb_waiters_pending(hb))
+ return ret;
+
+ spin_lock(&hb->lock);
+
+ plist_for_each_entry_safe(this, next, &hb->chain, list) {
+ if (futex_match (&this->key, &key)) {
+ if (this->pi_state || this->rt_waiter) {
+ ret = -EINVAL;
+ break;
+ }
+
+ /* Check if one of the bits is set in both bitsets */
+ if (!(this->bitset & bitset))
+ continue;
+
+ futex_wake_mark(&wake_q, this);
+ if (++ret >= nr_wake)
+ break;
+ }
+ }
+
+ spin_unlock(&hb->lock);
+ wake_up_q(&wake_q);
+ return ret;
+}
+
+static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr)
+{
+ unsigned int op = (encoded_op & 0x70000000) >> 28;
+ unsigned int cmp = (encoded_op & 0x0f000000) >> 24;
+ int oparg = sign_extend32((encoded_op & 0x00fff000) >> 12, 11);
+ int cmparg = sign_extend32(encoded_op & 0x00000fff, 11);
+ int oldval, ret;
+
+ if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) {
+ if (oparg < 0 || oparg > 31) {
+ char comm[sizeof(current->comm)];
+ /*
+ * kill this print and return -EINVAL when userspace
+ * is sane again
+ */
+ pr_info_ratelimited("futex_wake_op: %s tries to shift op by %d; fix this program\n",
+ get_task_comm(comm, current), oparg);
+ oparg &= 31;
+ }
+ oparg = 1 << oparg;
+ }
+
+ pagefault_disable();
+ ret = arch_futex_atomic_op_inuser(op, oparg, &oldval, uaddr);
+ pagefault_enable();
+ if (ret)
+ return ret;
+
+ switch (cmp) {
+ case FUTEX_OP_CMP_EQ:
+ return oldval == cmparg;
+ case FUTEX_OP_CMP_NE:
+ return oldval != cmparg;
+ case FUTEX_OP_CMP_LT:
+ return oldval < cmparg;
+ case FUTEX_OP_CMP_GE:
+ return oldval >= cmparg;
+ case FUTEX_OP_CMP_LE:
+ return oldval <= cmparg;
+ case FUTEX_OP_CMP_GT:
+ return oldval > cmparg;
+ default:
+ return -ENOSYS;
+ }
+}
+
+/*
+ * Wake up all waiters hashed on the physical page that is mapped
+ * to this virtual address:
+ */
+int futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
+ int nr_wake, int nr_wake2, int op)
+{
+ union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
+ struct futex_hash_bucket *hb1, *hb2;
+ struct futex_q *this, *next;
+ int ret, op_ret;
+ DEFINE_WAKE_Q(wake_q);
+
+retry:
+ ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ);
+ if (unlikely(ret != 0))
+ return ret;
+ ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE);
+ if (unlikely(ret != 0))
+ return ret;
+
+ hb1 = futex_hash(&key1);
+ hb2 = futex_hash(&key2);
+
+retry_private:
+ double_lock_hb(hb1, hb2);
+ op_ret = futex_atomic_op_inuser(op, uaddr2);
+ if (unlikely(op_ret < 0)) {
+ double_unlock_hb(hb1, hb2);
+
+ if (!IS_ENABLED(CONFIG_MMU) ||
+ unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) {
+ /*
+ * we don't get EFAULT from MMU faults if we don't have
+ * an MMU, but we might get them from range checking
+ */
+ ret = op_ret;
+ return ret;
+ }
+
+ if (op_ret == -EFAULT) {
+ ret = fault_in_user_writeable(uaddr2);
+ if (ret)
+ return ret;
+ }
+
+ cond_resched();
+ if (!(flags & FLAGS_SHARED))
+ goto retry_private;
+ goto retry;
+ }
+
+ plist_for_each_entry_safe(this, next, &hb1->chain, list) {
+ if (futex_match (&this->key, &key1)) {
+ if (this->pi_state || this->rt_waiter) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+ futex_wake_mark(&wake_q, this);
+ if (++ret >= nr_wake)
+ break;
+ }
+ }
+
+ if (op_ret > 0) {
+ op_ret = 0;
+ plist_for_each_entry_safe(this, next, &hb2->chain, list) {
+ if (futex_match (&this->key, &key2)) {
+ if (this->pi_state || this->rt_waiter) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+ futex_wake_mark(&wake_q, this);
+ if (++op_ret >= nr_wake2)
+ break;
+ }
+ }
+ ret += op_ret;
+ }
+
+out_unlock:
+ double_unlock_hb(hb1, hb2);
+ wake_up_q(&wake_q);
+ return ret;
+}
+
+static long futex_wait_restart(struct restart_block *restart);
+
+/**
+ * futex_wait_queue() - futex_queue() and wait for wakeup, timeout, or signal
+ * @hb: the futex hash bucket, must be locked by the caller
+ * @q: the futex_q to queue up on
+ * @timeout: the prepared hrtimer_sleeper, or null for no timeout
+ */
+void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q,
+ struct hrtimer_sleeper *timeout)
+{
+ /*
+ * The task state is guaranteed to be set before another task can
+ * wake it. set_current_state() is implemented using smp_store_mb() and
+ * futex_queue() calls spin_unlock() upon completion, both serializing
+ * access to the hash list and forcing another memory barrier.
+ */
+ set_current_state(TASK_INTERRUPTIBLE);
+ futex_queue(q, hb);
+
+ /* Arm the timer */
+ if (timeout)
+ hrtimer_sleeper_start_expires(timeout, HRTIMER_MODE_ABS);
+
+ /*
+ * If we have been removed from the hash list, then another task
+ * has tried to wake us, and we can skip the call to schedule().
+ */
+ if (likely(!plist_node_empty(&q->list))) {
+ /*
+ * If the timer has already expired, current will already be
+ * flagged for rescheduling. Only call schedule if there
+ * is no timeout, or if it has yet to expire.
+ */
+ if (!timeout || timeout->task)
+ freezable_schedule();
+ }
+ __set_current_state(TASK_RUNNING);
+}
+
+/**
+ * unqueue_multiple - Remove various futexes from their hash bucket
+ * @v: The list of futexes to unqueue
+ * @count: Number of futexes in the list
+ *
+ * Helper to unqueue a list of futexes. This can't fail.
+ *
+ * Return:
+ * - >=0 - Index of the last futex that was awoken;
+ * - -1 - No futex was awoken
+ */
+static int unqueue_multiple(struct futex_vector *v, int count)
+{
+ int ret = -1, i;
+
+ for (i = 0; i < count; i++) {
+ if (!futex_unqueue(&v[i].q))
+ ret = i;
+ }
+
+ return ret;
+}
+
+/**
+ * futex_wait_multiple_setup - Prepare to wait and enqueue multiple futexes
+ * @vs: The futex list to wait on
+ * @count: The size of the list
+ * @woken: Index of the last woken futex, if any. Used to notify the
+ * caller that it can return this index to userspace (return parameter)
+ *
+ * Prepare multiple futexes in a single step and enqueue them. This may fail if
+ * the futex list is invalid or if any futex was already awoken. On success the
+ * task is ready to interruptible sleep.
+ *
+ * Return:
+ * - 1 - One of the futexes was woken by another thread
+ * - 0 - Success
+ * - <0 - -EFAULT, -EWOULDBLOCK or -EINVAL
+ */
+static int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *woken)
+{
+ struct futex_hash_bucket *hb;
+ bool retry = false;
+ int ret, i;
+ u32 uval;
+
+ /*
+ * Enqueuing multiple futexes is tricky, because we need to enqueue
+ * each futex on the list before dealing with the next one to avoid
+ * deadlocking on the hash bucket. But, before enqueuing, we need to
+ * make sure that current->state is TASK_INTERRUPTIBLE, so we don't
+ * lose any wake events, which cannot be done before the get_futex_key
+ * of the next key, because it calls get_user_pages, which can sleep.
+ * Thus, we fetch the list of futexes keys in two steps, by first
+ * pinning all the memory keys in the futex key, and only then we read
+ * each key and queue the corresponding futex.
+ *
+ * Private futexes doesn't need to recalculate hash in retry, so skip
+ * get_futex_key() when retrying.
+ */
+retry:
+ for (i = 0; i < count; i++) {
+ if ((vs[i].w.flags & FUTEX_PRIVATE_FLAG) && retry)
+ continue;
+
+ ret = get_futex_key(u64_to_user_ptr(vs[i].w.uaddr),
+ !(vs[i].w.flags & FUTEX_PRIVATE_FLAG),
+ &vs[i].q.key, FUTEX_READ);
+
+ if (unlikely(ret))
+ return ret;
+ }
+
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ for (i = 0; i < count; i++) {
+ u32 __user *uaddr = (u32 __user *)(unsigned long)vs[i].w.uaddr;
+ struct futex_q *q = &vs[i].q;
+ u32 val = (u32)vs[i].w.val;
+
+ hb = futex_q_lock(q);
+ ret = futex_get_value_locked(&uval, uaddr);
+
+ if (!ret && uval == val) {
+ /*
+ * The bucket lock can't be held while dealing with the
+ * next futex. Queue each futex at this moment so hb can
+ * be unlocked.
+ */
+ futex_queue(q, hb);
+ continue;
+ }
+
+ futex_q_unlock(hb);
+ __set_current_state(TASK_RUNNING);
+
+ /*
+ * Even if something went wrong, if we find out that a futex
+ * was woken, we don't return error and return this index to
+ * userspace
+ */
+ *woken = unqueue_multiple(vs, i);
+ if (*woken >= 0)
+ return 1;
+
+ if (ret) {
+ /*
+ * If we need to handle a page fault, we need to do so
+ * without any lock and any enqueued futex (otherwise
+ * we could lose some wakeup). So we do it here, after
+ * undoing all the work done so far. In success, we
+ * retry all the work.
+ */
+ if (get_user(uval, uaddr))
+ return -EFAULT;
+
+ retry = true;
+ goto retry;
+ }
+
+ if (uval != val)
+ return -EWOULDBLOCK;
+ }
+
+ return 0;
+}
+
+/**
+ * futex_sleep_multiple - Check sleeping conditions and sleep
+ * @vs: List of futexes to wait for
+ * @count: Length of vs
+ * @to: Timeout
+ *
+ * Sleep if and only if the timeout hasn't expired and no futex on the list has
+ * been woken up.
+ */
+static void futex_sleep_multiple(struct futex_vector *vs, unsigned int count,
+ struct hrtimer_sleeper *to)
+{
+ if (to && !to->task)
+ return;
+
+ for (; count; count--, vs++) {
+ if (!READ_ONCE(vs->q.lock_ptr))
+ return;
+ }
+
+ freezable_schedule();
+}
+
+/**
+ * futex_wait_multiple - Prepare to wait on and enqueue several futexes
+ * @vs: The list of futexes to wait on
+ * @count: The number of objects
+ * @to: Timeout before giving up and returning to userspace
+ *
+ * Entry point for the FUTEX_WAIT_MULTIPLE futex operation, this function
+ * sleeps on a group of futexes and returns on the first futex that is
+ * wake, or after the timeout has elapsed.
+ *
+ * Return:
+ * - >=0 - Hint to the futex that was awoken
+ * - <0 - On error
+ */
+int futex_wait_multiple(struct futex_vector *vs, unsigned int count,
+ struct hrtimer_sleeper *to)
+{
+ int ret, hint = 0;
+
+ if (to)
+ hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS);
+
+ while (1) {
+ ret = futex_wait_multiple_setup(vs, count, &hint);
+ if (ret) {
+ if (ret > 0) {
+ /* A futex was woken during setup */
+ ret = hint;
+ }
+ return ret;
+ }
+
+ futex_sleep_multiple(vs, count, to);
+
+ __set_current_state(TASK_RUNNING);
+
+ ret = unqueue_multiple(vs, count);
+ if (ret >= 0)
+ return ret;
+
+ if (to && !to->task)
+ return -ETIMEDOUT;
+ else if (signal_pending(current))
+ return -ERESTARTSYS;
+ /*
+ * The final case is a spurious wakeup, for
+ * which just retry.
+ */
+ }
+}
+
+/**
+ * futex_wait_setup() - Prepare to wait on a futex
+ * @uaddr: the futex userspace address
+ * @val: the expected value
+ * @flags: futex flags (FLAGS_SHARED, etc.)
+ * @q: the associated futex_q
+ * @hb: storage for hash_bucket pointer to be returned to caller
+ *
+ * Setup the futex_q and locate the hash_bucket. Get the futex value and
+ * compare it with the expected value. Handle atomic faults internally.
+ * Return with the hb lock held on success, and unlocked on failure.
+ *
+ * Return:
+ * - 0 - uaddr contains val and hb has been locked;
+ * - <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked
+ */
+int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
+ struct futex_q *q, struct futex_hash_bucket **hb)
+{
+ u32 uval;
+ int ret;
+
+ /*
+ * Access the page AFTER the hash-bucket is locked.
+ * Order is important:
+ *
+ * Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val);
+ * Userspace waker: if (cond(var)) { var = new; futex_wake(&var); }
+ *
+ * The basic logical guarantee of a futex is that it blocks ONLY
+ * if cond(var) is known to be true at the time of blocking, for
+ * any cond. If we locked the hash-bucket after testing *uaddr, that
+ * would open a race condition where we could block indefinitely with
+ * cond(var) false, which would violate the guarantee.
+ *
+ * On the other hand, we insert q and release the hash-bucket only
+ * after testing *uaddr. This guarantees that futex_wait() will NOT
+ * absorb a wakeup if *uaddr does not match the desired values
+ * while the syscall executes.
+ */
+retry:
+ ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, FUTEX_READ);
+ if (unlikely(ret != 0))
+ return ret;
+
+retry_private:
+ *hb = futex_q_lock(q);
+
+ ret = futex_get_value_locked(&uval, uaddr);
+
+ if (ret) {
+ futex_q_unlock(*hb);
+
+ ret = get_user(uval, uaddr);
+ if (ret)
+ return ret;
+
+ if (!(flags & FLAGS_SHARED))
+ goto retry_private;
+
+ goto retry;
+ }
+
+ if (uval != val) {
+ futex_q_unlock(*hb);
+ ret = -EWOULDBLOCK;
+ }
+
+ return ret;
+}
+
+int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset)
+{
+ struct hrtimer_sleeper timeout, *to;
+ struct restart_block *restart;
+ struct futex_hash_bucket *hb;
+ struct futex_q q = futex_q_init;
+ int ret;
+
+ if (!bitset)
+ return -EINVAL;
+ q.bitset = bitset;
+
+ to = futex_setup_timer(abs_time, &timeout, flags,
+ current->timer_slack_ns);
+retry:
+ /*
+ * Prepare to wait on uaddr. On success, it holds hb->lock and q
+ * is initialized.
+ */
+ ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
+ if (ret)
+ goto out;
+
+ /* futex_queue and wait for wakeup, timeout, or a signal. */
+ futex_wait_queue(hb, &q, to);
+
+ /* If we were woken (and unqueued), we succeeded, whatever. */
+ ret = 0;
+ if (!futex_unqueue(&q))
+ goto out;
+ ret = -ETIMEDOUT;
+ if (to && !to->task)
+ goto out;
+
+ /*
+ * We expect signal_pending(current), but we might be the
+ * victim of a spurious wakeup as well.
+ */
+ if (!signal_pending(current))
+ goto retry;
+
+ ret = -ERESTARTSYS;
+ if (!abs_time)
+ goto out;
+
+ restart = &current->restart_block;
+ restart->futex.uaddr = uaddr;
+ restart->futex.val = val;
+ restart->futex.time = *abs_time;
+ restart->futex.bitset = bitset;
+ restart->futex.flags = flags | FLAGS_HAS_TIMEOUT;
+
+ ret = set_restart_fn(restart, futex_wait_restart);
+
+out:
+ if (to) {
+ hrtimer_cancel(&to->timer);
+ destroy_hrtimer_on_stack(&to->timer);
+ }
+ return ret;
+}
+
+static long futex_wait_restart(struct restart_block *restart)
+{
+ u32 __user *uaddr = restart->futex.uaddr;
+ ktime_t t, *tp = NULL;
+
+ if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
+ t = restart->futex.time;
+ tp = &t;
+ }
+ restart->fn = do_no_restart_syscall;
+
+ return (long)futex_wait(uaddr, restart->futex.flags,
+ restart->futex.val, tp, restart->futex.bitset);
+}
+
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index fbc54c2a7f23..10929eda9825 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -97,9 +97,6 @@ config GENERIC_MSI_IRQ_DOMAIN
config IRQ_MSI_IOMMU
bool
-config HANDLE_DOMAIN_IRQ
- bool
-
config IRQ_TIMINGS
bool
@@ -144,3 +141,10 @@ config GENERIC_IRQ_MULTI_HANDLER
bool
help
Allow to specify the low level IRQ handler at run time.
+
+# Cavium Octeon is the last system to use this deprecated option
+# Do not even think of enabling this on any new platform
+config DEPRECATED_IRQ_CPU_ONOFFLINE
+ bool
+ depends on CAVIUM_OCTEON_SOC
+ default CAVIUM_OCTEON_SOC
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index a98bcfc4be7b..c09324663088 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -575,8 +575,6 @@ EXPORT_SYMBOL_GPL(handle_simple_irq);
*/
void handle_untracked_irq(struct irq_desc *desc)
{
- unsigned int flags = 0;
-
raw_spin_lock(&desc->lock);
if (!irq_may_run(desc))
@@ -593,7 +591,7 @@ void handle_untracked_irq(struct irq_desc *desc)
irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS);
raw_spin_unlock(&desc->lock);
- __handle_irq_event_percpu(desc, &flags);
+ __handle_irq_event_percpu(desc);
raw_spin_lock(&desc->lock);
irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);
@@ -1122,6 +1120,7 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
}
EXPORT_SYMBOL_GPL(irq_modify_status);
+#ifdef CONFIG_DEPRECATED_IRQ_CPU_ONOFFLINE
/**
* irq_cpu_online - Invoke all irq_cpu_online functions.
*
@@ -1181,6 +1180,7 @@ void irq_cpu_offline(void)
raw_spin_unlock_irqrestore(&desc->lock, flags);
}
}
+#endif
#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index cc7cdd26e23e..f0862eb6b506 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -25,6 +25,7 @@ static DEFINE_RAW_SPINLOCK(gc_lock);
void irq_gc_noop(struct irq_data *d)
{
}
+EXPORT_SYMBOL_GPL(irq_gc_noop);
/**
* irq_gc_mask_disable_reg - Mask chip via disable register
@@ -44,6 +45,7 @@ void irq_gc_mask_disable_reg(struct irq_data *d)
*ct->mask_cache &= ~mask;
irq_gc_unlock(gc);
}
+EXPORT_SYMBOL_GPL(irq_gc_mask_disable_reg);
/**
* irq_gc_mask_set_bit - Mask chip via setting bit in mask register
@@ -103,6 +105,7 @@ void irq_gc_unmask_enable_reg(struct irq_data *d)
*ct->mask_cache |= mask;
irq_gc_unlock(gc);
}
+EXPORT_SYMBOL_GPL(irq_gc_unmask_enable_reg);
/**
* irq_gc_ack_set_bit - Ack pending interrupt via setting bit
@@ -448,7 +451,7 @@ static void irq_unmap_generic_chip(struct irq_domain *d, unsigned int virq)
}
-struct irq_domain_ops irq_generic_chip_ops = {
+const struct irq_domain_ops irq_generic_chip_ops = {
.map = irq_map_generic_chip,
.unmap = irq_unmap_generic_chip,
.xlate = irq_domain_xlate_onetwocell,
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 221d80c31e94..9489f93b3db3 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -14,6 +14,8 @@
#include <linux/interrupt.h>
#include <linux/kernel_stat.h>
+#include <asm/irq_regs.h>
+
#include <trace/events/irq.h>
#include "internals.h"
@@ -134,7 +136,7 @@ void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action)
wake_up_process(action->thread);
}
-irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags)
+irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc)
{
irqreturn_t retval = IRQ_NONE;
unsigned int irq = desc->irq_data.irq;
@@ -172,10 +174,6 @@ irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags
}
__irq_wake_thread(desc, action);
-
- fallthrough; /* to add to randomness */
- case IRQ_HANDLED:
- *flags |= action->flags;
break;
default:
@@ -191,11 +189,10 @@ irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags
irqreturn_t handle_irq_event_percpu(struct irq_desc *desc)
{
irqreturn_t retval;
- unsigned int flags = 0;
- retval = __handle_irq_event_percpu(desc, &flags);
+ retval = __handle_irq_event_percpu(desc);
- add_interrupt_randomness(desc->irq_data.irq, flags);
+ add_interrupt_randomness(desc->irq_data.irq);
if (!irq_settings_no_debug(desc))
note_interrupt(desc, retval);
@@ -226,4 +223,20 @@ int __init set_handle_irq(void (*handle_irq)(struct pt_regs *))
handle_arch_irq = handle_irq;
return 0;
}
+
+/**
+ * generic_handle_arch_irq - root irq handler for architectures which do no
+ * entry accounting themselves
+ * @regs: Register file coming from the low-level handling code
+ */
+asmlinkage void noinstr generic_handle_arch_irq(struct pt_regs *regs)
+{
+ struct pt_regs *old_regs;
+
+ irq_enter();
+ old_regs = set_irq_regs(regs);
+ handle_arch_irq(regs);
+ set_irq_regs(old_regs);
+ irq_exit();
+}
#endif
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 54363527feea..99cbdf55a8bd 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -103,7 +103,7 @@ extern int __irq_get_irqchip_state(struct irq_data *data,
extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
-irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags);
+irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc);
irqreturn_t handle_irq_event_percpu(struct irq_desc *desc);
irqreturn_t handle_irq_event(struct irq_desc *desc);
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 4e3c29bb603c..2267e6527db3 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -646,13 +646,16 @@ int handle_irq_desc(struct irq_desc *desc)
generic_handle_irq_desc(desc);
return 0;
}
-EXPORT_SYMBOL_GPL(handle_irq_desc);
/**
* generic_handle_irq - Invoke the handler for a particular irq
* @irq: The irq number to handle
*
- */
+ * Returns: 0 on success, or -EINVAL if conversion has failed
+ *
+ * This function must be called from an IRQ context with irq regs
+ * initialized.
+ */
int generic_handle_irq(unsigned int irq)
{
return handle_irq_desc(irq_to_desc(irq));
@@ -662,89 +665,39 @@ EXPORT_SYMBOL_GPL(generic_handle_irq);
#ifdef CONFIG_IRQ_DOMAIN
/**
* generic_handle_domain_irq - Invoke the handler for a HW irq belonging
- * to a domain, usually for a non-root interrupt
- * controller
+ * to a domain.
* @domain: The domain where to perform the lookup
* @hwirq: The HW irq number to convert to a logical one
*
* Returns: 0 on success, or -EINVAL if conversion has failed
*
+ * This function must be called from an IRQ context with irq regs
+ * initialized.
*/
int generic_handle_domain_irq(struct irq_domain *domain, unsigned int hwirq)
{
+ WARN_ON_ONCE(!in_irq());
return handle_irq_desc(irq_resolve_mapping(domain, hwirq));
}
EXPORT_SYMBOL_GPL(generic_handle_domain_irq);
-#ifdef CONFIG_HANDLE_DOMAIN_IRQ
/**
- * handle_domain_irq - Invoke the handler for a HW irq belonging to a domain,
- * usually for a root interrupt controller
+ * generic_handle_domain_nmi - Invoke the handler for a HW nmi belonging
+ * to a domain.
* @domain: The domain where to perform the lookup
* @hwirq: The HW irq number to convert to a logical one
- * @regs: Register file coming from the low-level handling code
*
* Returns: 0 on success, or -EINVAL if conversion has failed
- */
-int handle_domain_irq(struct irq_domain *domain,
- unsigned int hwirq, struct pt_regs *regs)
-{
- struct pt_regs *old_regs = set_irq_regs(regs);
- struct irq_desc *desc;
- int ret = 0;
-
- irq_enter();
-
- /* The irqdomain code provides boundary checks */
- desc = irq_resolve_mapping(domain, hwirq);
- if (likely(desc))
- handle_irq_desc(desc);
- else
- ret = -EINVAL;
-
- irq_exit();
- set_irq_regs(old_regs);
- return ret;
-}
-
-/**
- * handle_domain_nmi - Invoke the handler for a HW irq belonging to a domain
- * @domain: The domain where to perform the lookup
- * @hwirq: The HW irq number to convert to a logical one
- * @regs: Register file coming from the low-level handling code
*
- * This function must be called from an NMI context.
- *
- * Returns: 0 on success, or -EINVAL if conversion has failed
- */
-int handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq,
- struct pt_regs *regs)
+ * This function must be called from an NMI context with irq regs
+ * initialized.
+ **/
+int generic_handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq)
{
- struct pt_regs *old_regs = set_irq_regs(regs);
- struct irq_desc *desc;
- int ret = 0;
-
- /*
- * NMI context needs to be setup earlier in order to deal with tracing.
- */
- WARN_ON(!in_nmi());
-
- desc = irq_resolve_mapping(domain, hwirq);
-
- /*
- * ack_bad_irq is not NMI-safe, just report
- * an invalid interrupt.
- */
- if (likely(desc))
- handle_irq_desc(desc);
- else
- ret = -EINVAL;
-
- set_irq_regs(old_regs);
- return ret;
+ WARN_ON_ONCE(!in_nmi());
+ return handle_irq_desc(irq_resolve_mapping(domain, hwirq));
}
#endif
-#endif
/* Dynamic interrupt handling */
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 62be16135e7c..bf38c546aa25 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -136,7 +136,7 @@ EXPORT_SYMBOL_GPL(irq_domain_free_fwnode);
* Allocates and initializes an irq_domain structure.
* Returns pointer to IRQ domain, or NULL on failure.
*/
-struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size,
+struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, unsigned int size,
irq_hw_number_t hwirq_max, int direct_max,
const struct irq_domain_ops *ops,
void *host_data)
@@ -491,6 +491,7 @@ struct irq_domain *irq_get_default_host(void)
{
return irq_default_domain;
}
+EXPORT_SYMBOL_GPL(irq_get_default_host);
static bool irq_domain_is_nomap(struct irq_domain *domain)
{
@@ -743,9 +744,8 @@ static int irq_domain_translate(struct irq_domain *d,
return 0;
}
-static void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args,
- unsigned int count,
- struct irq_fwspec *fwspec)
+void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args,
+ unsigned int count, struct irq_fwspec *fwspec)
{
int i;
@@ -755,6 +755,7 @@ static void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args,
for (i = 0; i < count; i++)
fwspec->param[i] = args[i];
}
+EXPORT_SYMBOL_GPL(of_phandle_args_to_fwspec);
unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)
{
@@ -1501,6 +1502,7 @@ out_free_desc:
irq_free_descs(virq, nr_irqs);
return ret;
}
+EXPORT_SYMBOL_GPL(__irq_domain_alloc_irqs);
/* The irq_data was moved, fix the revmap to refer to the new location */
static void irq_domain_fix_revmap(struct irq_data *d)
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 27667e82ecc9..f23ffd30385b 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -486,7 +486,8 @@ int irq_force_affinity(unsigned int irq, const struct cpumask *cpumask)
}
EXPORT_SYMBOL_GPL(irq_force_affinity);
-int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
+int __irq_apply_affinity_hint(unsigned int irq, const struct cpumask *m,
+ bool setaffinity)
{
unsigned long flags;
struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
@@ -495,12 +496,11 @@ int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
return -EINVAL;
desc->affinity_hint = m;
irq_put_desc_unlock(desc, flags);
- /* set the initial affinity to prevent every interrupt being on CPU0 */
- if (m)
+ if (m && setaffinity)
__irq_set_affinity(irq, m, false);
return 0;
}
-EXPORT_SYMBOL_GPL(irq_set_affinity_hint);
+EXPORT_SYMBOL_GPL(__irq_apply_affinity_hint);
static void irq_affinity_notify(struct work_struct *work)
{
@@ -1259,6 +1259,8 @@ static int irq_thread(void *data)
irqreturn_t (*handler_fn)(struct irq_desc *desc,
struct irqaction *action);
+ sched_set_fifo(current);
+
if (force_irqthreads() && test_bit(IRQTF_FORCED_THREAD,
&action->thread_flags))
handler_fn = irq_forced_thread_fn;
@@ -1424,8 +1426,6 @@ setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary)
if (IS_ERR(t))
return PTR_ERR(t);
- sched_set_fifo(t);
-
/*
* We keep the reference to the task struct even if
* the thread dies to avoid that the interrupt code
@@ -2827,7 +2827,7 @@ EXPORT_SYMBOL_GPL(irq_get_irqchip_state);
* This call sets the internal irqchip state of an interrupt,
* depending on the value of @which.
*
- * This function should be called with preemption disabled if the
+ * This function should be called with migration disabled if the
* interrupt controller has per-cpu registers.
*/
int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 6a5ecee6e567..2bdfce5edafd 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -14,12 +14,15 @@
#include <linux/irqdomain.h>
#include <linux/msi.h>
#include <linux/slab.h>
+#include <linux/sysfs.h>
#include <linux/pci.h>
#include "internals.h"
+static inline int msi_sysfs_create_group(struct device *dev);
+
/**
- * alloc_msi_entry - Allocate an initialized msi_desc
+ * msi_alloc_desc - Allocate an initialized msi_desc
* @dev: Pointer to the device for which this is allocated
* @nvec: The number of vectors used in this entry
* @affinity: Optional pointer to an affinity mask array size of @nvec
@@ -29,34 +32,134 @@
*
* Return: pointer to allocated &msi_desc on success or %NULL on failure
*/
-struct msi_desc *alloc_msi_entry(struct device *dev, int nvec,
- const struct irq_affinity_desc *affinity)
+static struct msi_desc *msi_alloc_desc(struct device *dev, int nvec,
+ const struct irq_affinity_desc *affinity)
{
- struct msi_desc *desc;
+ struct msi_desc *desc = kzalloc(sizeof(*desc), GFP_KERNEL);
- desc = kzalloc(sizeof(*desc), GFP_KERNEL);
if (!desc)
return NULL;
- INIT_LIST_HEAD(&desc->list);
desc->dev = dev;
desc->nvec_used = nvec;
if (affinity) {
- desc->affinity = kmemdup(affinity,
- nvec * sizeof(*desc->affinity), GFP_KERNEL);
+ desc->affinity = kmemdup(affinity, nvec * sizeof(*desc->affinity), GFP_KERNEL);
if (!desc->affinity) {
kfree(desc);
return NULL;
}
}
-
return desc;
}
-void free_msi_entry(struct msi_desc *entry)
+static void msi_free_desc(struct msi_desc *desc)
{
- kfree(entry->affinity);
- kfree(entry);
+ kfree(desc->affinity);
+ kfree(desc);
+}
+
+static int msi_insert_desc(struct msi_device_data *md, struct msi_desc *desc, unsigned int index)
+{
+ int ret;
+
+ desc->msi_index = index;
+ ret = xa_insert(&md->__store, index, desc, GFP_KERNEL);
+ if (ret)
+ msi_free_desc(desc);
+ return ret;
+}
+
+/**
+ * msi_add_msi_desc - Allocate and initialize a MSI descriptor
+ * @dev: Pointer to the device for which the descriptor is allocated
+ * @init_desc: Pointer to an MSI descriptor to initialize the new descriptor
+ *
+ * Return: 0 on success or an appropriate failure code.
+ */
+int msi_add_msi_desc(struct device *dev, struct msi_desc *init_desc)
+{
+ struct msi_desc *desc;
+
+ lockdep_assert_held(&dev->msi.data->mutex);
+
+ desc = msi_alloc_desc(dev, init_desc->nvec_used, init_desc->affinity);
+ if (!desc)
+ return -ENOMEM;
+
+ /* Copy type specific data to the new descriptor. */
+ desc->pci = init_desc->pci;
+ return msi_insert_desc(dev->msi.data, desc, init_desc->msi_index);
+}
+
+/**
+ * msi_add_simple_msi_descs - Allocate and initialize MSI descriptors
+ * @dev: Pointer to the device for which the descriptors are allocated
+ * @index: Index for the first MSI descriptor
+ * @ndesc: Number of descriptors to allocate
+ *
+ * Return: 0 on success or an appropriate failure code.
+ */
+static int msi_add_simple_msi_descs(struct device *dev, unsigned int index, unsigned int ndesc)
+{
+ unsigned int idx, last = index + ndesc - 1;
+ struct msi_desc *desc;
+ int ret;
+
+ lockdep_assert_held(&dev->msi.data->mutex);
+
+ for (idx = index; idx <= last; idx++) {
+ desc = msi_alloc_desc(dev, 1, NULL);
+ if (!desc)
+ goto fail_mem;
+ ret = msi_insert_desc(dev->msi.data, desc, idx);
+ if (ret)
+ goto fail;
+ }
+ return 0;
+
+fail_mem:
+ ret = -ENOMEM;
+fail:
+ msi_free_msi_descs_range(dev, MSI_DESC_NOTASSOCIATED, index, last);
+ return ret;
+}
+
+static bool msi_desc_match(struct msi_desc *desc, enum msi_desc_filter filter)
+{
+ switch (filter) {
+ case MSI_DESC_ALL:
+ return true;
+ case MSI_DESC_NOTASSOCIATED:
+ return !desc->irq;
+ case MSI_DESC_ASSOCIATED:
+ return !!desc->irq;
+ }
+ WARN_ON_ONCE(1);
+ return false;
+}
+
+/**
+ * msi_free_msi_descs_range - Free MSI descriptors of a device
+ * @dev: Device to free the descriptors
+ * @filter: Descriptor state filter
+ * @first_index: Index to start freeing from
+ * @last_index: Last index to be freed
+ */
+void msi_free_msi_descs_range(struct device *dev, enum msi_desc_filter filter,
+ unsigned int first_index, unsigned int last_index)
+{
+ struct xarray *xa = &dev->msi.data->__store;
+ struct msi_desc *desc;
+ unsigned long idx;
+
+ lockdep_assert_held(&dev->msi.data->mutex);
+
+ xa_for_each_range(xa, idx, desc, first_index, last_index) {
+ if (msi_desc_match(desc, filter)) {
+ xa_erase(xa, idx);
+ msi_free_desc(desc);
+ }
+ }
}
void __get_cached_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
@@ -72,139 +175,290 @@ void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg)
}
EXPORT_SYMBOL_GPL(get_cached_msi_msg);
-static ssize_t msi_mode_show(struct device *dev, struct device_attribute *attr,
- char *buf)
+static void msi_device_data_release(struct device *dev, void *res)
{
- struct msi_desc *entry;
- bool is_msix = false;
- unsigned long irq;
- int retval;
+ struct msi_device_data *md = res;
- retval = kstrtoul(attr->attr.name, 10, &irq);
- if (retval)
- return retval;
+ WARN_ON_ONCE(!xa_empty(&md->__store));
+ xa_destroy(&md->__store);
+ dev->msi.data = NULL;
+}
- entry = irq_get_msi_desc(irq);
- if (!entry)
- return -ENODEV;
+/**
+ * msi_setup_device_data - Setup MSI device data
+ * @dev: Device for which MSI device data should be set up
+ *
+ * Return: 0 on success, appropriate error code otherwise
+ *
+ * This can be called more than once for @dev. If the MSI device data is
+ * already allocated the call succeeds. The allocated memory is
+ * automatically released when the device is destroyed.
+ */
+int msi_setup_device_data(struct device *dev)
+{
+ struct msi_device_data *md;
+ int ret;
- if (dev_is_pci(dev))
- is_msix = entry->msi_attrib.is_msix;
+ if (dev->msi.data)
+ return 0;
- return sysfs_emit(buf, "%s\n", is_msix ? "msix" : "msi");
+ md = devres_alloc(msi_device_data_release, sizeof(*md), GFP_KERNEL);
+ if (!md)
+ return -ENOMEM;
+
+ ret = msi_sysfs_create_group(dev);
+ if (ret) {
+ devres_free(md);
+ return ret;
+ }
+
+ xa_init(&md->__store);
+ mutex_init(&md->mutex);
+ dev->msi.data = md;
+ devres_add(dev, md);
+ return 0;
+}
+
+/**
+ * msi_lock_descs - Lock the MSI descriptor storage of a device
+ * @dev: Device to operate on
+ */
+void msi_lock_descs(struct device *dev)
+{
+ mutex_lock(&dev->msi.data->mutex);
+}
+EXPORT_SYMBOL_GPL(msi_lock_descs);
+
+/**
+ * msi_unlock_descs - Unlock the MSI descriptor storage of a device
+ * @dev: Device to operate on
+ */
+void msi_unlock_descs(struct device *dev)
+{
+ /* Invalidate the index wich was cached by the iterator */
+ dev->msi.data->__iter_idx = MSI_MAX_INDEX;
+ mutex_unlock(&dev->msi.data->mutex);
+}
+EXPORT_SYMBOL_GPL(msi_unlock_descs);
+
+static struct msi_desc *msi_find_desc(struct msi_device_data *md, enum msi_desc_filter filter)
+{
+ struct msi_desc *desc;
+
+ xa_for_each_start(&md->__store, md->__iter_idx, desc, md->__iter_idx) {
+ if (msi_desc_match(desc, filter))
+ return desc;
+ }
+ md->__iter_idx = MSI_MAX_INDEX;
+ return NULL;
}
/**
- * msi_populate_sysfs - Populate msi_irqs sysfs entries for devices
- * @dev: The device(PCI, platform etc) who will get sysfs entries
+ * msi_first_desc - Get the first MSI descriptor of a device
+ * @dev: Device to operate on
+ * @filter: Descriptor state filter
+ *
+ * Must be called with the MSI descriptor mutex held, i.e. msi_lock_descs()
+ * must be invoked before the call.
*
- * Return attribute_group ** so that specific bus MSI can save it to
- * somewhere during initilizing msi irqs. If devices has no MSI irq,
- * return NULL; if it fails to populate sysfs, return ERR_PTR
+ * Return: Pointer to the first MSI descriptor matching the search
+ * criteria, NULL if none found.
*/
-const struct attribute_group **msi_populate_sysfs(struct device *dev)
-{
- const struct attribute_group **msi_irq_groups;
- struct attribute **msi_attrs, *msi_attr;
- struct device_attribute *msi_dev_attr;
- struct attribute_group *msi_irq_group;
- struct msi_desc *entry;
- int ret = -ENOMEM;
- int num_msi = 0;
- int count = 0;
- int i;
+struct msi_desc *msi_first_desc(struct device *dev, enum msi_desc_filter filter)
+{
+ struct msi_device_data *md = dev->msi.data;
- /* Determine how many msi entries we have */
- for_each_msi_entry(entry, dev)
- num_msi += entry->nvec_used;
- if (!num_msi)
+ if (WARN_ON_ONCE(!md))
return NULL;
- /* Dynamically create the MSI attributes for the device */
- msi_attrs = kcalloc(num_msi + 1, sizeof(void *), GFP_KERNEL);
- if (!msi_attrs)
- return ERR_PTR(-ENOMEM);
-
- for_each_msi_entry(entry, dev) {
- for (i = 0; i < entry->nvec_used; i++) {
- msi_dev_attr = kzalloc(sizeof(*msi_dev_attr), GFP_KERNEL);
- if (!msi_dev_attr)
- goto error_attrs;
- msi_attrs[count] = &msi_dev_attr->attr;
-
- sysfs_attr_init(&msi_dev_attr->attr);
- msi_dev_attr->attr.name = kasprintf(GFP_KERNEL, "%d",
- entry->irq + i);
- if (!msi_dev_attr->attr.name)
- goto error_attrs;
- msi_dev_attr->attr.mode = 0444;
- msi_dev_attr->show = msi_mode_show;
- ++count;
+ lockdep_assert_held(&md->mutex);
+
+ md->__iter_idx = 0;
+ return msi_find_desc(md, filter);
+}
+EXPORT_SYMBOL_GPL(msi_first_desc);
+
+/**
+ * msi_next_desc - Get the next MSI descriptor of a device
+ * @dev: Device to operate on
+ *
+ * The first invocation of msi_next_desc() has to be preceeded by a
+ * successful invocation of __msi_first_desc(). Consecutive invocations are
+ * only valid if the previous one was successful. All these operations have
+ * to be done within the same MSI mutex held region.
+ *
+ * Return: Pointer to the next MSI descriptor matching the search
+ * criteria, NULL if none found.
+ */
+struct msi_desc *msi_next_desc(struct device *dev, enum msi_desc_filter filter)
+{
+ struct msi_device_data *md = dev->msi.data;
+
+ if (WARN_ON_ONCE(!md))
+ return NULL;
+
+ lockdep_assert_held(&md->mutex);
+
+ if (md->__iter_idx >= (unsigned long)MSI_MAX_INDEX)
+ return NULL;
+
+ md->__iter_idx++;
+ return msi_find_desc(md, filter);
+}
+EXPORT_SYMBOL_GPL(msi_next_desc);
+
+/**
+ * msi_get_virq - Return Linux interrupt number of a MSI interrupt
+ * @dev: Device to operate on
+ * @index: MSI interrupt index to look for (0-based)
+ *
+ * Return: The Linux interrupt number on success (> 0), 0 if not found
+ */
+unsigned int msi_get_virq(struct device *dev, unsigned int index)
+{
+ struct msi_desc *desc;
+ unsigned int ret = 0;
+ bool pcimsi;
+
+ if (!dev->msi.data)
+ return 0;
+
+ pcimsi = dev_is_pci(dev) ? to_pci_dev(dev)->msi_enabled : false;
+
+ msi_lock_descs(dev);
+ desc = xa_load(&dev->msi.data->__store, pcimsi ? 0 : index);
+ if (desc && desc->irq) {
+ /*
+ * PCI-MSI has only one descriptor for multiple interrupts.
+ * PCI-MSIX and platform MSI use a descriptor per
+ * interrupt.
+ */
+ if (pcimsi) {
+ if (index < desc->nvec_used)
+ ret = desc->irq + index;
+ } else {
+ ret = desc->irq;
}
}
+ msi_unlock_descs(dev);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(msi_get_virq);
- msi_irq_group = kzalloc(sizeof(*msi_irq_group), GFP_KERNEL);
- if (!msi_irq_group)
- goto error_attrs;
- msi_irq_group->name = "msi_irqs";
- msi_irq_group->attrs = msi_attrs;
+#ifdef CONFIG_SYSFS
+static struct attribute *msi_dev_attrs[] = {
+ NULL
+};
- msi_irq_groups = kcalloc(2, sizeof(void *), GFP_KERNEL);
- if (!msi_irq_groups)
- goto error_irq_group;
- msi_irq_groups[0] = msi_irq_group;
+static const struct attribute_group msi_irqs_group = {
+ .name = "msi_irqs",
+ .attrs = msi_dev_attrs,
+};
- ret = sysfs_create_groups(&dev->kobj, msi_irq_groups);
- if (ret)
- goto error_irq_groups;
-
- return msi_irq_groups;
-
-error_irq_groups:
- kfree(msi_irq_groups);
-error_irq_group:
- kfree(msi_irq_group);
-error_attrs:
- count = 0;
- msi_attr = msi_attrs[count];
- while (msi_attr) {
- msi_dev_attr = container_of(msi_attr, struct device_attribute, attr);
- kfree(msi_attr->name);
- kfree(msi_dev_attr);
- ++count;
- msi_attr = msi_attrs[count];
+static inline int msi_sysfs_create_group(struct device *dev)
+{
+ return devm_device_add_group(dev, &msi_irqs_group);
+}
+
+static ssize_t msi_mode_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ /* MSI vs. MSIX is per device not per interrupt */
+ bool is_msix = dev_is_pci(dev) ? to_pci_dev(dev)->msix_enabled : false;
+
+ return sysfs_emit(buf, "%s\n", is_msix ? "msix" : "msi");
+}
+
+static void msi_sysfs_remove_desc(struct device *dev, struct msi_desc *desc)
+{
+ struct device_attribute *attrs = desc->sysfs_attrs;
+ int i;
+
+ if (!attrs)
+ return;
+
+ desc->sysfs_attrs = NULL;
+ for (i = 0; i < desc->nvec_used; i++) {
+ if (attrs[i].show)
+ sysfs_remove_file_from_group(&dev->kobj, &attrs[i].attr, msi_irqs_group.name);
+ kfree(attrs[i].attr.name);
}
- kfree(msi_attrs);
- return ERR_PTR(ret);
+ kfree(attrs);
}
+static int msi_sysfs_populate_desc(struct device *dev, struct msi_desc *desc)
+{
+ struct device_attribute *attrs;
+ int ret, i;
+
+ attrs = kcalloc(desc->nvec_used, sizeof(*attrs), GFP_KERNEL);
+ if (!attrs)
+ return -ENOMEM;
+
+ desc->sysfs_attrs = attrs;
+ for (i = 0; i < desc->nvec_used; i++) {
+ sysfs_attr_init(&attrs[i].attr);
+ attrs[i].attr.name = kasprintf(GFP_KERNEL, "%d", desc->irq + i);
+ if (!attrs[i].attr.name) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+
+ attrs[i].attr.mode = 0444;
+ attrs[i].show = msi_mode_show;
+
+ ret = sysfs_add_file_to_group(&dev->kobj, &attrs[i].attr, msi_irqs_group.name);
+ if (ret) {
+ attrs[i].show = NULL;
+ goto fail;
+ }
+ }
+ return 0;
+
+fail:
+ msi_sysfs_remove_desc(dev, desc);
+ return ret;
+}
+
+#ifdef CONFIG_PCI_MSI_ARCH_FALLBACKS
/**
- * msi_destroy_sysfs - Destroy msi_irqs sysfs entries for devices
- * @dev: The device(PCI, platform etc) who will remove sysfs entries
- * @msi_irq_groups: attribute_group for device msi_irqs entries
+ * msi_device_populate_sysfs - Populate msi_irqs sysfs entries for a device
+ * @dev: The device (PCI, platform etc) which will get sysfs entries
*/
-void msi_destroy_sysfs(struct device *dev, const struct attribute_group **msi_irq_groups)
-{
- struct device_attribute *dev_attr;
- struct attribute **msi_attrs;
- int count = 0;
-
- if (msi_irq_groups) {
- sysfs_remove_groups(&dev->kobj, msi_irq_groups);
- msi_attrs = msi_irq_groups[0]->attrs;
- while (msi_attrs[count]) {
- dev_attr = container_of(msi_attrs[count],
- struct device_attribute, attr);
- kfree(dev_attr->attr.name);
- kfree(dev_attr);
- ++count;
- }
- kfree(msi_attrs);
- kfree(msi_irq_groups[0]);
- kfree(msi_irq_groups);
+int msi_device_populate_sysfs(struct device *dev)
+{
+ struct msi_desc *desc;
+ int ret;
+
+ msi_for_each_desc(desc, dev, MSI_DESC_ASSOCIATED) {
+ if (desc->sysfs_attrs)
+ continue;
+ ret = msi_sysfs_populate_desc(dev, desc);
+ if (ret)
+ return ret;
}
+ return 0;
}
+/**
+ * msi_device_destroy_sysfs - Destroy msi_irqs sysfs entries for a device
+ * @dev: The device (PCI, platform etc) for which to remove
+ * sysfs entries
+ */
+void msi_device_destroy_sysfs(struct device *dev)
+{
+ struct msi_desc *desc;
+
+ msi_for_each_desc(desc, dev, MSI_DESC_ALL)
+ msi_sysfs_remove_desc(dev, desc);
+}
+#endif /* CONFIG_PCI_MSI_ARCH_FALLBACK */
+#else /* CONFIG_SYSFS */
+static inline int msi_sysfs_create_group(struct device *dev) { return 0; }
+static inline int msi_sysfs_populate_desc(struct device *dev, struct msi_desc *desc) { return 0; }
+static inline void msi_sysfs_remove_desc(struct device *dev, struct msi_desc *desc) { }
+#endif /* !CONFIG_SYSFS */
+
#ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN
static inline void irq_chip_write_msi_msg(struct irq_data *data,
struct msi_msg *msg)
@@ -456,43 +710,38 @@ int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev,
}
int msi_domain_populate_irqs(struct irq_domain *domain, struct device *dev,
- int virq, int nvec, msi_alloc_info_t *arg)
+ int virq_base, int nvec, msi_alloc_info_t *arg)
{
struct msi_domain_info *info = domain->host_data;
struct msi_domain_ops *ops = info->ops;
struct msi_desc *desc;
- int ret = 0;
+ int ret, virq;
- for_each_msi_entry(desc, dev) {
- /* Don't even try the multi-MSI brain damage. */
- if (WARN_ON(!desc->irq || desc->nvec_used != 1)) {
- ret = -EINVAL;
- break;
- }
+ msi_lock_descs(dev);
+ ret = msi_add_simple_msi_descs(dev, virq_base, nvec);
+ if (ret)
+ goto unlock;
- if (!(desc->irq >= virq && desc->irq < (virq + nvec)))
- continue;
+ for (virq = virq_base; virq < virq_base + nvec; virq++) {
+ desc = xa_load(&dev->msi.data->__store, virq);
+ desc->irq = virq;
ops->set_desc(arg, desc);
- /* Assumes the domain mutex is held! */
- ret = irq_domain_alloc_irqs_hierarchy(domain, desc->irq, 1,
- arg);
+ ret = irq_domain_alloc_irqs_hierarchy(domain, virq, 1, arg);
if (ret)
- break;
+ goto fail;
- irq_set_msi_desc_off(desc->irq, 0, desc);
- }
-
- if (ret) {
- /* Mop up the damage */
- for_each_msi_entry(desc, dev) {
- if (!(desc->irq >= virq && desc->irq < (virq + nvec)))
- continue;
-
- irq_domain_free_irqs_common(domain, desc->irq, 1);
- }
+ irq_set_msi_desc(virq, desc);
}
+ msi_unlock_descs(dev);
+ return 0;
+fail:
+ for (--virq; virq >= virq_base; virq--)
+ irq_domain_free_irqs_common(domain, virq, 1);
+ msi_free_msi_descs_range(dev, MSI_DESC_ALL, virq_base, virq_base + nvec - 1);
+unlock:
+ msi_unlock_descs(dev);
return ret;
}
@@ -529,10 +778,61 @@ static bool msi_check_reservation_mode(struct irq_domain *domain,
/*
* Checking the first MSI descriptor is sufficient. MSIX supports
- * masking and MSI does so when the maskbit is set.
+ * masking and MSI does so when the can_mask attribute is set.
*/
- desc = first_msi_entry(dev);
- return desc->msi_attrib.is_msix || desc->msi_attrib.maskbit;
+ desc = msi_first_desc(dev, MSI_DESC_ALL);
+ return desc->pci.msi_attrib.is_msix || desc->pci.msi_attrib.can_mask;
+}
+
+static int msi_handle_pci_fail(struct irq_domain *domain, struct msi_desc *desc,
+ int allocated)
+{
+ switch(domain->bus_token) {
+ case DOMAIN_BUS_PCI_MSI:
+ case DOMAIN_BUS_VMD_MSI:
+ if (IS_ENABLED(CONFIG_PCI_MSI))
+ break;
+ fallthrough;
+ default:
+ return -ENOSPC;
+ }
+
+ /* Let a failed PCI multi MSI allocation retry */
+ if (desc->nvec_used > 1)
+ return 1;
+
+ /* If there was a successful allocation let the caller know */
+ return allocated ? allocated : -ENOSPC;
+}
+
+#define VIRQ_CAN_RESERVE 0x01
+#define VIRQ_ACTIVATE 0x02
+#define VIRQ_NOMASK_QUIRK 0x04
+
+static int msi_init_virq(struct irq_domain *domain, int virq, unsigned int vflags)
+{
+ struct irq_data *irqd = irq_domain_get_irq_data(domain, virq);
+ int ret;
+
+ if (!(vflags & VIRQ_CAN_RESERVE)) {
+ irqd_clr_can_reserve(irqd);
+ if (vflags & VIRQ_NOMASK_QUIRK)
+ irqd_set_msi_nomask_quirk(irqd);
+ }
+
+ if (!(vflags & VIRQ_ACTIVATE))
+ return 0;
+
+ ret = irq_domain_activate_irq(irqd, vflags & VIRQ_CAN_RESERVE);
+ if (ret)
+ return ret;
+ /*
+ * If the interrupt uses reservation mode, clear the activated bit
+ * so request_irq() will assign the final vector.
+ */
+ if (vflags & VIRQ_CAN_RESERVE)
+ irqd_clr_activated(irqd);
+ return 0;
}
int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
@@ -540,83 +840,103 @@ int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
{
struct msi_domain_info *info = domain->host_data;
struct msi_domain_ops *ops = info->ops;
- struct irq_data *irq_data;
- struct msi_desc *desc;
msi_alloc_info_t arg = { };
+ unsigned int vflags = 0;
+ struct msi_desc *desc;
+ int allocated = 0;
int i, ret, virq;
- bool can_reserve;
ret = msi_domain_prepare_irqs(domain, dev, nvec, &arg);
if (ret)
return ret;
- for_each_msi_entry(desc, dev) {
+ /*
+ * This flag is set by the PCI layer as we need to activate
+ * the MSI entries before the PCI layer enables MSI in the
+ * card. Otherwise the card latches a random msi message.
+ */
+ if (info->flags & MSI_FLAG_ACTIVATE_EARLY)
+ vflags |= VIRQ_ACTIVATE;
+
+ /*
+ * Interrupt can use a reserved vector and will not occupy
+ * a real device vector until the interrupt is requested.
+ */
+ if (msi_check_reservation_mode(domain, info, dev)) {
+ vflags |= VIRQ_CAN_RESERVE;
+ /*
+ * MSI affinity setting requires a special quirk (X86) when
+ * reservation mode is active.
+ */
+ if (domain->flags & IRQ_DOMAIN_MSI_NOMASK_QUIRK)
+ vflags |= VIRQ_NOMASK_QUIRK;
+ }
+
+ msi_for_each_desc(desc, dev, MSI_DESC_NOTASSOCIATED) {
ops->set_desc(&arg, desc);
virq = __irq_domain_alloc_irqs(domain, -1, desc->nvec_used,
dev_to_node(dev), &arg, false,
desc->affinity);
- if (virq < 0) {
- ret = -ENOSPC;
- if (ops->handle_error)
- ret = ops->handle_error(domain, desc, ret);
- if (ops->msi_finish)
- ops->msi_finish(&arg, ret);
- return ret;
- }
+ if (virq < 0)
+ return msi_handle_pci_fail(domain, desc, allocated);
for (i = 0; i < desc->nvec_used; i++) {
irq_set_msi_desc_off(virq, i, desc);
irq_debugfs_copy_devname(virq + i, dev);
+ ret = msi_init_virq(domain, virq + i, vflags);
+ if (ret)
+ return ret;
+ }
+ if (info->flags & MSI_FLAG_DEV_SYSFS) {
+ ret = msi_sysfs_populate_desc(dev, desc);
+ if (ret)
+ return ret;
}
+ allocated++;
}
+ return 0;
+}
- if (ops->msi_finish)
- ops->msi_finish(&arg, 0);
+static int msi_domain_add_simple_msi_descs(struct msi_domain_info *info,
+ struct device *dev,
+ unsigned int num_descs)
+{
+ if (!(info->flags & MSI_FLAG_ALLOC_SIMPLE_MSI_DESCS))
+ return 0;
- can_reserve = msi_check_reservation_mode(domain, info, dev);
+ return msi_add_simple_msi_descs(dev, 0, num_descs);
+}
- /*
- * This flag is set by the PCI layer as we need to activate
- * the MSI entries before the PCI layer enables MSI in the
- * card. Otherwise the card latches a random msi message.
- */
- if (!(info->flags & MSI_FLAG_ACTIVATE_EARLY))
- goto skip_activate;
-
- for_each_msi_vector(desc, i, dev) {
- if (desc->irq == i) {
- virq = desc->irq;
- dev_dbg(dev, "irq [%d-%d] for MSI\n",
- virq, virq + desc->nvec_used - 1);
- }
+/**
+ * msi_domain_alloc_irqs_descs_locked - Allocate interrupts from a MSI interrupt domain
+ * @domain: The domain to allocate from
+ * @dev: Pointer to device struct of the device for which the interrupts
+ * are allocated
+ * @nvec: The number of interrupts to allocate
+ *
+ * Must be invoked from within a msi_lock_descs() / msi_unlock_descs()
+ * pair. Use this for MSI irqdomains which implement their own vector
+ * allocation/free.
+ *
+ * Return: %0 on success or an error code.
+ */
+int msi_domain_alloc_irqs_descs_locked(struct irq_domain *domain, struct device *dev,
+ int nvec)
+{
+ struct msi_domain_info *info = domain->host_data;
+ struct msi_domain_ops *ops = info->ops;
+ int ret;
- irq_data = irq_domain_get_irq_data(domain, i);
- if (!can_reserve) {
- irqd_clr_can_reserve(irq_data);
- if (domain->flags & IRQ_DOMAIN_MSI_NOMASK_QUIRK)
- irqd_set_msi_nomask_quirk(irq_data);
- }
- ret = irq_domain_activate_irq(irq_data, can_reserve);
- if (ret)
- goto cleanup;
- }
+ lockdep_assert_held(&dev->msi.data->mutex);
-skip_activate:
- /*
- * If these interrupts use reservation mode, clear the activated bit
- * so request_irq() will assign the final vector.
- */
- if (can_reserve) {
- for_each_msi_vector(desc, i, dev) {
- irq_data = irq_domain_get_irq_data(domain, i);
- irqd_clr_activated(irq_data);
- }
- }
- return 0;
+ ret = msi_domain_add_simple_msi_descs(info, dev, nvec);
+ if (ret)
+ return ret;
-cleanup:
- msi_domain_free_irqs(domain, dev);
+ ret = ops->domain_alloc_irqs(domain, dev, nvec);
+ if (ret)
+ msi_domain_free_irqs_descs_locked(domain, dev);
return ret;
}
@@ -629,52 +949,78 @@ cleanup:
*
* Return: %0 on success or an error code.
*/
-int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
- int nvec)
+int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, int nvec)
{
- struct msi_domain_info *info = domain->host_data;
- struct msi_domain_ops *ops = info->ops;
+ int ret;
- return ops->domain_alloc_irqs(domain, dev, nvec);
+ msi_lock_descs(dev);
+ ret = msi_domain_alloc_irqs_descs_locked(domain, dev, nvec);
+ msi_unlock_descs(dev);
+ return ret;
}
void __msi_domain_free_irqs(struct irq_domain *domain, struct device *dev)
{
- struct irq_data *irq_data;
+ struct msi_domain_info *info = domain->host_data;
+ struct irq_data *irqd;
struct msi_desc *desc;
int i;
- for_each_msi_vector(desc, i, dev) {
- irq_data = irq_domain_get_irq_data(domain, i);
- if (irqd_is_activated(irq_data))
- irq_domain_deactivate_irq(irq_data);
- }
-
- for_each_msi_entry(desc, dev) {
- /*
- * We might have failed to allocate an MSI early
- * enough that there is no IRQ associated to this
- * entry. If that's the case, don't do anything.
- */
- if (desc->irq) {
- irq_domain_free_irqs(desc->irq, desc->nvec_used);
- desc->irq = 0;
+ /* Only handle MSI entries which have an interrupt associated */
+ msi_for_each_desc(desc, dev, MSI_DESC_ASSOCIATED) {
+ /* Make sure all interrupts are deactivated */
+ for (i = 0; i < desc->nvec_used; i++) {
+ irqd = irq_domain_get_irq_data(domain, desc->irq + i);
+ if (irqd && irqd_is_activated(irqd))
+ irq_domain_deactivate_irq(irqd);
}
+
+ irq_domain_free_irqs(desc->irq, desc->nvec_used);
+ if (info->flags & MSI_FLAG_DEV_SYSFS)
+ msi_sysfs_remove_desc(dev, desc);
+ desc->irq = 0;
}
}
+static void msi_domain_free_msi_descs(struct msi_domain_info *info,
+ struct device *dev)
+{
+ if (info->flags & MSI_FLAG_FREE_MSI_DESCS)
+ msi_free_msi_descs(dev);
+}
+
/**
- * msi_domain_free_irqs - Free interrupts from a MSI interrupt @domain associated to @dev
+ * msi_domain_free_irqs_descs_locked - Free interrupts from a MSI interrupt @domain associated to @dev
* @domain: The domain to managing the interrupts
* @dev: Pointer to device struct of the device for which the interrupts
* are free
+ *
+ * Must be invoked from within a msi_lock_descs() / msi_unlock_descs()
+ * pair. Use this for MSI irqdomains which implement their own vector
+ * allocation.
*/
-void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev)
+void msi_domain_free_irqs_descs_locked(struct irq_domain *domain, struct device *dev)
{
struct msi_domain_info *info = domain->host_data;
struct msi_domain_ops *ops = info->ops;
- return ops->domain_free_irqs(domain, dev);
+ lockdep_assert_held(&dev->msi.data->mutex);
+
+ ops->domain_free_irqs(domain, dev);
+ msi_domain_free_msi_descs(info, dev);
+}
+
+/**
+ * msi_domain_free_irqs - Free interrupts from a MSI interrupt @domain associated to @dev
+ * @domain: The domain to managing the interrupts
+ * @dev: Pointer to device struct of the device for which the interrupts
+ * are free
+ */
+void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev)
+{
+ msi_lock_descs(dev);
+ msi_domain_free_irqs_descs_locked(domain, dev);
+ msi_unlock_descs(dev);
}
/**
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index c481d8458325..02b2daf07441 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -447,6 +447,10 @@ MODULE_PARM_DESC(noirqdebug, "Disable irq lockup detection when true");
static int __init irqfixup_setup(char *str)
{
+ if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
+ pr_warn("irqfixup boot option not supported with PREEMPT_RT\n");
+ return 1;
+ }
irqfixup = 1;
printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n");
printk(KERN_WARNING "This may impact system performance.\n");
@@ -459,6 +463,10 @@ module_param(irqfixup, int, 0644);
static int __init irqpoll_setup(char *str)
{
+ if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
+ pr_warn("irqpoll boot option not supported with PREEMPT_RT\n");
+ return 1;
+ }
irqfixup = 2;
printk(KERN_WARNING "Misrouted IRQ fixup and polling support "
"enabled\n");
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index db8c248ebc8c..f7df715ec28e 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -18,11 +18,36 @@
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/smp.h>
+#include <linux/smpboot.h>
#include <asm/processor.h>
#include <linux/kasan.h>
static DEFINE_PER_CPU(struct llist_head, raised_list);
static DEFINE_PER_CPU(struct llist_head, lazy_list);
+static DEFINE_PER_CPU(struct task_struct *, irq_workd);
+
+static void wake_irq_workd(void)
+{
+ struct task_struct *tsk = __this_cpu_read(irq_workd);
+
+ if (!llist_empty(this_cpu_ptr(&lazy_list)) && tsk)
+ wake_up_process(tsk);
+}
+
+#ifdef CONFIG_SMP
+static void irq_work_wake(struct irq_work *entry)
+{
+ wake_irq_workd();
+}
+
+static DEFINE_PER_CPU(struct irq_work, irq_work_wakeup) =
+ IRQ_WORK_INIT_HARD(irq_work_wake);
+#endif
+
+static int irq_workd_should_run(unsigned int cpu)
+{
+ return !llist_empty(this_cpu_ptr(&lazy_list));
+}
/*
* Claim the entry so that no one else will poke at it.
@@ -52,15 +77,29 @@ void __weak arch_irq_work_raise(void)
/* Enqueue on current CPU, work must already be claimed and preempt disabled */
static void __irq_work_queue_local(struct irq_work *work)
{
+ struct llist_head *list;
+ bool rt_lazy_work = false;
+ bool lazy_work = false;
+ int work_flags;
+
+ work_flags = atomic_read(&work->node.a_flags);
+ if (work_flags & IRQ_WORK_LAZY)
+ lazy_work = true;
+ else if (IS_ENABLED(CONFIG_PREEMPT_RT) &&
+ !(work_flags & IRQ_WORK_HARD_IRQ))
+ rt_lazy_work = true;
+
+ if (lazy_work || rt_lazy_work)
+ list = this_cpu_ptr(&lazy_list);
+ else
+ list = this_cpu_ptr(&raised_list);
+
+ if (!llist_add(&work->node.llist, list))
+ return;
+
/* If the work is "lazy", handle it from next tick if any */
- if (atomic_read(&work->node.a_flags) & IRQ_WORK_LAZY) {
- if (llist_add(&work->node.llist, this_cpu_ptr(&lazy_list)) &&
- tick_nohz_tick_stopped())
- arch_irq_work_raise();
- } else {
- if (llist_add(&work->node.llist, this_cpu_ptr(&raised_list)))
- arch_irq_work_raise();
- }
+ if (!lazy_work || tick_nohz_tick_stopped())
+ arch_irq_work_raise();
}
/* Enqueue the irq work @work on the current CPU */
@@ -104,17 +143,34 @@ bool irq_work_queue_on(struct irq_work *work, int cpu)
if (cpu != smp_processor_id()) {
/* Arch remote IPI send/receive backend aren't NMI safe */
WARN_ON_ONCE(in_nmi());
+
+ /*
+ * On PREEMPT_RT the items which are not marked as
+ * IRQ_WORK_HARD_IRQ are added to the lazy list and a HARD work
+ * item is used on the remote CPU to wake the thread.
+ */
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) &&
+ !(atomic_read(&work->node.a_flags) & IRQ_WORK_HARD_IRQ)) {
+
+ if (!llist_add(&work->node.llist, &per_cpu(lazy_list, cpu)))
+ goto out;
+
+ work = &per_cpu(irq_work_wakeup, cpu);
+ if (!irq_work_claim(work))
+ goto out;
+ }
+
__smp_call_single_queue(cpu, &work->node.llist);
} else {
__irq_work_queue_local(work);
}
+out:
preempt_enable();
return true;
#endif /* CONFIG_SMP */
}
-
bool irq_work_needs_cpu(void)
{
struct llist_head *raised, *lazy;
@@ -160,6 +216,10 @@ void irq_work_single(void *arg)
* else claimed it meanwhile.
*/
(void)atomic_cmpxchg(&work->node.a_flags, flags, flags & ~IRQ_WORK_BUSY);
+
+ if ((IS_ENABLED(CONFIG_PREEMPT_RT) && !irq_work_is_hard(work)) ||
+ !arch_irq_work_has_interrupt())
+ rcuwait_wake_up(&work->irqwait);
}
static void irq_work_run_list(struct llist_head *list)
@@ -167,7 +227,12 @@ static void irq_work_run_list(struct llist_head *list)
struct irq_work *work, *tmp;
struct llist_node *llnode;
- BUG_ON(!irqs_disabled());
+ /*
+ * On PREEMPT_RT IRQ-work which is not marked as HARD will be processed
+ * in a per-CPU thread in preemptible context. Only the items which are
+ * marked as IRQ_WORK_HARD_IRQ will be processed in hardirq context.
+ */
+ BUG_ON(!irqs_disabled() && !IS_ENABLED(CONFIG_PREEMPT_RT));
if (llist_empty(list))
return;
@@ -184,7 +249,10 @@ static void irq_work_run_list(struct llist_head *list)
void irq_work_run(void)
{
irq_work_run_list(this_cpu_ptr(&raised_list));
- irq_work_run_list(this_cpu_ptr(&lazy_list));
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ irq_work_run_list(this_cpu_ptr(&lazy_list));
+ else
+ wake_irq_workd();
}
EXPORT_SYMBOL_GPL(irq_work_run);
@@ -194,7 +262,11 @@ void irq_work_tick(void)
if (!llist_empty(raised) && !arch_irq_work_has_interrupt())
irq_work_run_list(raised);
- irq_work_run_list(this_cpu_ptr(&lazy_list));
+
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ irq_work_run_list(this_cpu_ptr(&lazy_list));
+ else
+ wake_irq_workd();
}
/*
@@ -204,8 +276,42 @@ void irq_work_tick(void)
void irq_work_sync(struct irq_work *work)
{
lockdep_assert_irqs_enabled();
+ might_sleep();
+
+ if ((IS_ENABLED(CONFIG_PREEMPT_RT) && !irq_work_is_hard(work)) ||
+ !arch_irq_work_has_interrupt()) {
+ rcuwait_wait_event(&work->irqwait, !irq_work_is_busy(work),
+ TASK_UNINTERRUPTIBLE);
+ return;
+ }
while (irq_work_is_busy(work))
cpu_relax();
}
EXPORT_SYMBOL_GPL(irq_work_sync);
+
+static void run_irq_workd(unsigned int cpu)
+{
+ irq_work_run_list(this_cpu_ptr(&lazy_list));
+}
+
+static void irq_workd_setup(unsigned int cpu)
+{
+ sched_set_fifo_low(current);
+}
+
+static struct smp_hotplug_thread irqwork_threads = {
+ .store = &irq_workd,
+ .setup = irq_workd_setup,
+ .thread_should_run = irq_workd_should_run,
+ .thread_fn = run_irq_workd,
+ .thread_comm = "irq_work/%u",
+};
+
+static __init int irq_work_init_threads(void)
+{
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ BUG_ON(smpboot_register_percpu_thread(&irqwork_threads));
+ return 0;
+}
+early_initcall(irq_work_init_threads);
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 2a9afe484aec..951c93216fc4 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -164,26 +164,46 @@ static unsigned long kallsyms_sym_address(int idx)
return kallsyms_relative_base - 1 - kallsyms_offsets[idx];
}
-#if defined(CONFIG_CFI_CLANG) && defined(CONFIG_LTO_CLANG_THIN)
-/*
- * LLVM appends a hash to static function names when ThinLTO and CFI are
- * both enabled, i.e. foo() becomes foo$707af9a22804d33c81801f27dcfe489b.
- * This causes confusion and potentially breaks user space tools, so we
- * strip the suffix from expanded symbol names.
- */
-static inline bool cleanup_symbol_name(char *s)
+static bool cleanup_symbol_name(char *s)
{
char *res;
+ if (!IS_ENABLED(CONFIG_LTO_CLANG))
+ return false;
+
+ /*
+ * LLVM appends various suffixes for local functions and variables that
+ * must be promoted to global scope as part of LTO. This can break
+ * hooking of static functions with kprobes. '.' is not a valid
+ * character in an identifier in C. Suffixes observed:
+ * - foo.llvm.[0-9a-f]+
+ * - foo.[0-9a-f]+
+ * - foo.[0-9a-f]+.cfi_jt
+ */
+ res = strchr(s, '.');
+ if (res) {
+ *res = '\0';
+ return true;
+ }
+
+ if (!IS_ENABLED(CONFIG_CFI_CLANG) ||
+ !IS_ENABLED(CONFIG_LTO_CLANG_THIN) ||
+ CONFIG_CLANG_VERSION >= 130000)
+ return false;
+
+ /*
+ * Prior to LLVM 13, the following suffixes were observed when thinLTO
+ * and CFI are both enabled:
+ * - foo$[0-9]+
+ */
res = strrchr(s, '$');
- if (res)
+ if (res) {
*res = '\0';
+ return true;
+ }
- return res != NULL;
+ return false;
}
-#else
-static inline bool cleanup_symbol_name(char *s) { return false; }
-#endif
/* Lookup the address for this symbol. Returns 0 if not found. */
unsigned long kallsyms_lookup_name(const char *name)
diff --git a/kernel/kcov.c b/kernel/kcov.c
index 80bfe71bbe13..36ca640c4f8e 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -88,6 +88,7 @@ static struct list_head kcov_remote_areas = LIST_HEAD_INIT(kcov_remote_areas);
struct kcov_percpu_data {
void *irq_area;
+ local_lock_t lock;
unsigned int saved_mode;
unsigned int saved_size;
@@ -96,7 +97,9 @@ struct kcov_percpu_data {
int saved_sequence;
};
-static DEFINE_PER_CPU(struct kcov_percpu_data, kcov_percpu_data);
+static DEFINE_PER_CPU(struct kcov_percpu_data, kcov_percpu_data) = {
+ .lock = INIT_LOCAL_LOCK(lock),
+};
/* Must be called with kcov_remote_lock locked. */
static struct kcov_remote *kcov_remote_find(u64 handle)
@@ -824,7 +827,7 @@ void kcov_remote_start(u64 handle)
if (!in_task() && !in_serving_softirq())
return;
- local_irq_save(flags);
+ local_lock_irqsave(&kcov_percpu_data.lock, flags);
/*
* Check that kcov_remote_start() is not called twice in background
@@ -832,7 +835,7 @@ void kcov_remote_start(u64 handle)
*/
mode = READ_ONCE(t->kcov_mode);
if (WARN_ON(in_task() && kcov_mode_enabled(mode))) {
- local_irq_restore(flags);
+ local_unlock_irqrestore(&kcov_percpu_data.lock, flags);
return;
}
/*
@@ -841,14 +844,15 @@ void kcov_remote_start(u64 handle)
* happened while collecting coverage from a background thread.
*/
if (WARN_ON(in_serving_softirq() && t->kcov_softirq)) {
- local_irq_restore(flags);
+ local_unlock_irqrestore(&kcov_percpu_data.lock, flags);
return;
}
spin_lock(&kcov_remote_lock);
remote = kcov_remote_find(handle);
if (!remote) {
- spin_unlock_irqrestore(&kcov_remote_lock, flags);
+ spin_unlock(&kcov_remote_lock);
+ local_unlock_irqrestore(&kcov_percpu_data.lock, flags);
return;
}
kcov_debug("handle = %llx, context: %s\n", handle,
@@ -869,19 +873,19 @@ void kcov_remote_start(u64 handle)
size = CONFIG_KCOV_IRQ_AREA_SIZE;
area = this_cpu_ptr(&kcov_percpu_data)->irq_area;
}
- spin_unlock_irqrestore(&kcov_remote_lock, flags);
+ spin_unlock(&kcov_remote_lock);
/* Can only happen when in_task(). */
if (!area) {
+ local_unlock_irqrestore(&kcov_percpu_data.lock, flags);
area = vmalloc(size * sizeof(unsigned long));
if (!area) {
kcov_put(kcov);
return;
}
+ local_lock_irqsave(&kcov_percpu_data.lock, flags);
}
- local_irq_save(flags);
-
/* Reset coverage size. */
*(u64 *)area = 0;
@@ -891,7 +895,7 @@ void kcov_remote_start(u64 handle)
}
kcov_start(t, kcov, size, area, mode, sequence);
- local_irq_restore(flags);
+ local_unlock_irqrestore(&kcov_percpu_data.lock, flags);
}
EXPORT_SYMBOL(kcov_remote_start);
@@ -965,12 +969,12 @@ void kcov_remote_stop(void)
if (!in_task() && !in_serving_softirq())
return;
- local_irq_save(flags);
+ local_lock_irqsave(&kcov_percpu_data.lock, flags);
mode = READ_ONCE(t->kcov_mode);
barrier();
if (!kcov_mode_enabled(mode)) {
- local_irq_restore(flags);
+ local_unlock_irqrestore(&kcov_percpu_data.lock, flags);
return;
}
/*
@@ -978,12 +982,12 @@ void kcov_remote_stop(void)
* actually found the remote handle and started collecting coverage.
*/
if (in_serving_softirq() && !t->kcov_softirq) {
- local_irq_restore(flags);
+ local_unlock_irqrestore(&kcov_percpu_data.lock, flags);
return;
}
/* Make sure that kcov_softirq is only set when in softirq. */
if (WARN_ON(!in_serving_softirq() && t->kcov_softirq)) {
- local_irq_restore(flags);
+ local_unlock_irqrestore(&kcov_percpu_data.lock, flags);
return;
}
@@ -1013,7 +1017,7 @@ void kcov_remote_stop(void)
spin_unlock(&kcov_remote_lock);
}
- local_irq_restore(flags);
+ local_unlock_irqrestore(&kcov_percpu_data.lock, flags);
/* Get in kcov_remote_start(). */
kcov_put(kcov);
@@ -1034,8 +1038,8 @@ static int __init kcov_init(void)
int cpu;
for_each_possible_cpu(cpu) {
- void *area = vmalloc(CONFIG_KCOV_IRQ_AREA_SIZE *
- sizeof(unsigned long));
+ void *area = vmalloc_node(CONFIG_KCOV_IRQ_AREA_SIZE *
+ sizeof(unsigned long), cpu_to_node(cpu));
if (!area)
return -ENOMEM;
per_cpu_ptr(&kcov_percpu_data, cpu)->irq_area = area;
diff --git a/kernel/kcsan/Makefile b/kernel/kcsan/Makefile
index c2bb07f5bcc7..4f35d1bced6a 100644
--- a/kernel/kcsan/Makefile
+++ b/kernel/kcsan/Makefile
@@ -8,9 +8,12 @@ CFLAGS_REMOVE_debugfs.o = $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_report.o = $(CC_FLAGS_FTRACE)
CFLAGS_core.o := $(call cc-option,-fno-conserve-stack) \
+ $(call cc-option,-mno-outline-atomics) \
-fno-stack-protector -DDISABLE_BRANCH_PROFILING
obj-y := core.o debugfs.o report.o
+
+KCSAN_INSTRUMENT_BARRIERS_selftest.o := y
obj-$(CONFIG_KCSAN_SELFTEST) += selftest.o
CFLAGS_kcsan_test.o := $(CFLAGS_KCSAN) -g -fno-omit-frame-pointer
diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c
index 76e67d1e02d4..fe12dfe254ec 100644
--- a/kernel/kcsan/core.c
+++ b/kernel/kcsan/core.c
@@ -40,15 +40,17 @@ module_param_named(udelay_interrupt, kcsan_udelay_interrupt, uint, 0644);
module_param_named(skip_watch, kcsan_skip_watch, long, 0644);
module_param_named(interrupt_watcher, kcsan_interrupt_watcher, bool, 0444);
+#ifdef CONFIG_KCSAN_WEAK_MEMORY
+static bool kcsan_weak_memory = true;
+module_param_named(weak_memory, kcsan_weak_memory, bool, 0644);
+#else
+#define kcsan_weak_memory false
+#endif
+
bool kcsan_enabled;
/* Per-CPU kcsan_ctx for interrupts */
static DEFINE_PER_CPU(struct kcsan_ctx, kcsan_cpu_ctx) = {
- .disable_count = 0,
- .atomic_next = 0,
- .atomic_nest_count = 0,
- .in_flat_atomic = false,
- .access_mask = 0,
.scoped_accesses = {LIST_POISON1, NULL},
};
@@ -202,22 +204,29 @@ static __always_inline struct kcsan_ctx *get_ctx(void)
return in_task() ? &current->kcsan_ctx : raw_cpu_ptr(&kcsan_cpu_ctx);
}
+static __always_inline void
+check_access(const volatile void *ptr, size_t size, int type, unsigned long ip);
+
/* Check scoped accesses; never inline because this is a slow-path! */
static noinline void kcsan_check_scoped_accesses(void)
{
struct kcsan_ctx *ctx = get_ctx();
- struct list_head *prev_save = ctx->scoped_accesses.prev;
struct kcsan_scoped_access *scoped_access;
- ctx->scoped_accesses.prev = NULL; /* Avoid recursion. */
- list_for_each_entry(scoped_access, &ctx->scoped_accesses, list)
- __kcsan_check_access(scoped_access->ptr, scoped_access->size, scoped_access->type);
- ctx->scoped_accesses.prev = prev_save;
+ if (ctx->disable_scoped)
+ return;
+
+ ctx->disable_scoped++;
+ list_for_each_entry(scoped_access, &ctx->scoped_accesses, list) {
+ check_access(scoped_access->ptr, scoped_access->size,
+ scoped_access->type, scoped_access->ip);
+ }
+ ctx->disable_scoped--;
}
/* Rules for generic atomic accesses. Called from fast-path. */
static __always_inline bool
-is_atomic(const volatile void *ptr, size_t size, int type, struct kcsan_ctx *ctx)
+is_atomic(struct kcsan_ctx *ctx, const volatile void *ptr, size_t size, int type)
{
if (type & KCSAN_ACCESS_ATOMIC)
return true;
@@ -254,7 +263,7 @@ is_atomic(const volatile void *ptr, size_t size, int type, struct kcsan_ctx *ctx
}
static __always_inline bool
-should_watch(const volatile void *ptr, size_t size, int type, struct kcsan_ctx *ctx)
+should_watch(struct kcsan_ctx *ctx, const volatile void *ptr, size_t size, int type)
{
/*
* Never set up watchpoints when memory operations are atomic.
@@ -263,7 +272,7 @@ should_watch(const volatile void *ptr, size_t size, int type, struct kcsan_ctx *
* should not count towards skipped instructions, and (2) to actually
* decrement kcsan_atomic_next for consecutive instruction stream.
*/
- if (is_atomic(ptr, size, type, ctx))
+ if (is_atomic(ctx, ptr, size, type))
return false;
if (this_cpu_dec_return(kcsan_skip) >= 0)
@@ -320,6 +329,21 @@ static void delay_access(int type)
udelay(delay);
}
+/*
+ * Reads the instrumented memory for value change detection; value change
+ * detection is currently done for accesses up to a size of 8 bytes.
+ */
+static __always_inline u64 read_instrumented_memory(const volatile void *ptr, size_t size)
+{
+ switch (size) {
+ case 1: return READ_ONCE(*(const u8 *)ptr);
+ case 2: return READ_ONCE(*(const u16 *)ptr);
+ case 4: return READ_ONCE(*(const u32 *)ptr);
+ case 8: return READ_ONCE(*(const u64 *)ptr);
+ default: return 0; /* Ignore; we do not diff the values. */
+ }
+}
+
void kcsan_save_irqtrace(struct task_struct *task)
{
#ifdef CONFIG_TRACE_IRQFLAGS
@@ -334,6 +358,76 @@ void kcsan_restore_irqtrace(struct task_struct *task)
#endif
}
+static __always_inline int get_kcsan_stack_depth(void)
+{
+#ifdef CONFIG_KCSAN_WEAK_MEMORY
+ return current->kcsan_stack_depth;
+#else
+ BUILD_BUG();
+ return 0;
+#endif
+}
+
+static __always_inline void add_kcsan_stack_depth(int val)
+{
+#ifdef CONFIG_KCSAN_WEAK_MEMORY
+ current->kcsan_stack_depth += val;
+#else
+ BUILD_BUG();
+#endif
+}
+
+static __always_inline struct kcsan_scoped_access *get_reorder_access(struct kcsan_ctx *ctx)
+{
+#ifdef CONFIG_KCSAN_WEAK_MEMORY
+ return ctx->disable_scoped ? NULL : &ctx->reorder_access;
+#else
+ return NULL;
+#endif
+}
+
+static __always_inline bool
+find_reorder_access(struct kcsan_ctx *ctx, const volatile void *ptr, size_t size,
+ int type, unsigned long ip)
+{
+ struct kcsan_scoped_access *reorder_access = get_reorder_access(ctx);
+
+ if (!reorder_access)
+ return false;
+
+ /*
+ * Note: If accesses are repeated while reorder_access is identical,
+ * never matches the new access, because !(type & KCSAN_ACCESS_SCOPED).
+ */
+ return reorder_access->ptr == ptr && reorder_access->size == size &&
+ reorder_access->type == type && reorder_access->ip == ip;
+}
+
+static inline void
+set_reorder_access(struct kcsan_ctx *ctx, const volatile void *ptr, size_t size,
+ int type, unsigned long ip)
+{
+ struct kcsan_scoped_access *reorder_access = get_reorder_access(ctx);
+
+ if (!reorder_access || !kcsan_weak_memory)
+ return;
+
+ /*
+ * To avoid nested interrupts or scheduler (which share kcsan_ctx)
+ * reading an inconsistent reorder_access, ensure that the below has
+ * exclusive access to reorder_access by disallowing concurrent use.
+ */
+ ctx->disable_scoped++;
+ barrier();
+ reorder_access->ptr = ptr;
+ reorder_access->size = size;
+ reorder_access->type = type | KCSAN_ACCESS_SCOPED;
+ reorder_access->ip = ip;
+ reorder_access->stack_depth = get_kcsan_stack_depth();
+ barrier();
+ ctx->disable_scoped--;
+}
+
/*
* Pull everything together: check_access() below contains the performance
* critical operations; the fast-path (including check_access) functions should
@@ -350,6 +444,7 @@ void kcsan_restore_irqtrace(struct task_struct *task)
static noinline void kcsan_found_watchpoint(const volatile void *ptr,
size_t size,
int type,
+ unsigned long ip,
atomic_long_t *watchpoint,
long encoded_watchpoint)
{
@@ -371,8 +466,10 @@ static noinline void kcsan_found_watchpoint(const volatile void *ptr,
* The access_mask check relies on value-change comparison. To avoid
* reporting a race where e.g. the writer set up the watchpoint, but the
* reader has access_mask!=0, we have to ignore the found watchpoint.
+ *
+ * reorder_access is never created from an access with access_mask set.
*/
- if (ctx->access_mask)
+ if (ctx->access_mask && !find_reorder_access(ctx, ptr, size, type, ip))
return;
/*
@@ -396,7 +493,7 @@ static noinline void kcsan_found_watchpoint(const volatile void *ptr,
if (consumed) {
kcsan_save_irqtrace(current);
- kcsan_report_set_info(ptr, size, type, watchpoint - watchpoints);
+ kcsan_report_set_info(ptr, size, type, ip, watchpoint - watchpoints);
kcsan_restore_irqtrace(current);
} else {
/*
@@ -416,17 +513,19 @@ static noinline void kcsan_found_watchpoint(const volatile void *ptr,
}
static noinline void
-kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type)
+kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type, unsigned long ip)
{
const bool is_write = (type & KCSAN_ACCESS_WRITE) != 0;
const bool is_assert = (type & KCSAN_ACCESS_ASSERT) != 0;
atomic_long_t *watchpoint;
u64 old, new, diff;
- unsigned long access_mask;
enum kcsan_value_change value_change = KCSAN_VALUE_CHANGE_MAYBE;
+ bool interrupt_watcher = kcsan_interrupt_watcher;
unsigned long ua_flags = user_access_save();
struct kcsan_ctx *ctx = get_ctx();
+ unsigned long access_mask = ctx->access_mask;
unsigned long irq_flags = 0;
+ bool is_reorder_access;
/*
* Always reset kcsan_skip counter in slow-path to avoid underflow; see
@@ -450,12 +549,32 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type)
}
/*
+ * The local CPU cannot observe reordering of its own accesses, and
+ * therefore we need to take care of 2 cases to avoid false positives:
+ *
+ * 1. Races of the reordered access with interrupts. To avoid, if
+ * the current access is reorder_access, disable interrupts.
+ * 2. Avoid races of scoped accesses from nested interrupts (below).
+ */
+ is_reorder_access = find_reorder_access(ctx, ptr, size, type, ip);
+ if (is_reorder_access)
+ interrupt_watcher = false;
+ /*
+ * Avoid races of scoped accesses from nested interrupts (or scheduler).
+ * Assume setting up a watchpoint for a non-scoped (normal) access that
+ * also conflicts with a current scoped access. In a nested interrupt,
+ * which shares the context, it would check a conflicting scoped access.
+ * To avoid, disable scoped access checking.
+ */
+ ctx->disable_scoped++;
+
+ /*
* Save and restore the IRQ state trace touched by KCSAN, since KCSAN's
* runtime is entered for every memory access, and potentially useful
* information is lost if dirtied by KCSAN.
*/
kcsan_save_irqtrace(current);
- if (!kcsan_interrupt_watcher)
+ if (!interrupt_watcher)
local_irq_save(irq_flags);
watchpoint = insert_watchpoint((unsigned long)ptr, size, is_write);
@@ -476,23 +595,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type)
* Read the current value, to later check and infer a race if the data
* was modified via a non-instrumented access, e.g. from a device.
*/
- old = 0;
- switch (size) {
- case 1:
- old = READ_ONCE(*(const u8 *)ptr);
- break;
- case 2:
- old = READ_ONCE(*(const u16 *)ptr);
- break;
- case 4:
- old = READ_ONCE(*(const u32 *)ptr);
- break;
- case 8:
- old = READ_ONCE(*(const u64 *)ptr);
- break;
- default:
- break; /* ignore; we do not diff the values */
- }
+ old = is_reorder_access ? 0 : read_instrumented_memory(ptr, size);
/*
* Delay this thread, to increase probability of observing a racy
@@ -504,23 +607,16 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type)
* Re-read value, and check if it is as expected; if not, we infer a
* racy access.
*/
- access_mask = ctx->access_mask;
- new = 0;
- switch (size) {
- case 1:
- new = READ_ONCE(*(const u8 *)ptr);
- break;
- case 2:
- new = READ_ONCE(*(const u16 *)ptr);
- break;
- case 4:
- new = READ_ONCE(*(const u32 *)ptr);
- break;
- case 8:
- new = READ_ONCE(*(const u64 *)ptr);
- break;
- default:
- break; /* ignore; we do not diff the values */
+ if (!is_reorder_access) {
+ new = read_instrumented_memory(ptr, size);
+ } else {
+ /*
+ * Reordered accesses cannot be used for value change detection,
+ * because the memory location may no longer be accessible and
+ * could result in a fault.
+ */
+ new = 0;
+ access_mask = 0;
}
diff = old ^ new;
@@ -568,8 +664,8 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type)
if (is_assert && value_change == KCSAN_VALUE_CHANGE_TRUE)
atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_ASSERT_FAILURES]);
- kcsan_report_known_origin(ptr, size, type, value_change,
- watchpoint - watchpoints,
+ kcsan_report_known_origin(ptr, size, type, ip,
+ value_change, watchpoint - watchpoints,
old, new, access_mask);
} else if (value_change == KCSAN_VALUE_CHANGE_TRUE) {
/* Inferring a race, since the value should not have changed. */
@@ -578,8 +674,10 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type)
if (is_assert)
atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_ASSERT_FAILURES]);
- if (IS_ENABLED(CONFIG_KCSAN_REPORT_RACE_UNKNOWN_ORIGIN) || is_assert)
- kcsan_report_unknown_origin(ptr, size, type, old, new, access_mask);
+ if (IS_ENABLED(CONFIG_KCSAN_REPORT_RACE_UNKNOWN_ORIGIN) || is_assert) {
+ kcsan_report_unknown_origin(ptr, size, type, ip,
+ old, new, access_mask);
+ }
}
/*
@@ -588,18 +686,27 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type)
*/
remove_watchpoint(watchpoint);
atomic_long_dec(&kcsan_counters[KCSAN_COUNTER_USED_WATCHPOINTS]);
+
out_unlock:
- if (!kcsan_interrupt_watcher)
+ if (!interrupt_watcher)
local_irq_restore(irq_flags);
kcsan_restore_irqtrace(current);
+ ctx->disable_scoped--;
+
+ /*
+ * Reordered accesses cannot be used for value change detection,
+ * therefore never consider for reordering if access_mask is set.
+ * ASSERT_EXCLUSIVE are not real accesses, ignore them as well.
+ */
+ if (!access_mask && !is_assert)
+ set_reorder_access(ctx, ptr, size, type, ip);
out:
user_access_restore(ua_flags);
}
-static __always_inline void check_access(const volatile void *ptr, size_t size,
- int type)
+static __always_inline void
+check_access(const volatile void *ptr, size_t size, int type, unsigned long ip)
{
- const bool is_write = (type & KCSAN_ACCESS_WRITE) != 0;
atomic_long_t *watchpoint;
long encoded_watchpoint;
@@ -610,12 +717,14 @@ static __always_inline void check_access(const volatile void *ptr, size_t size,
if (unlikely(size == 0))
return;
+again:
/*
* Avoid user_access_save in fast-path: find_watchpoint is safe without
* user_access_save, as the address that ptr points to is only used to
* check if a watchpoint exists; ptr is never dereferenced.
*/
- watchpoint = find_watchpoint((unsigned long)ptr, size, !is_write,
+ watchpoint = find_watchpoint((unsigned long)ptr, size,
+ !(type & KCSAN_ACCESS_WRITE),
&encoded_watchpoint);
/*
* It is safe to check kcsan_is_enabled() after find_watchpoint in the
@@ -625,14 +734,46 @@ static __always_inline void check_access(const volatile void *ptr, size_t size,
*/
if (unlikely(watchpoint != NULL))
- kcsan_found_watchpoint(ptr, size, type, watchpoint,
- encoded_watchpoint);
+ kcsan_found_watchpoint(ptr, size, type, ip, watchpoint, encoded_watchpoint);
else {
struct kcsan_ctx *ctx = get_ctx(); /* Call only once in fast-path. */
- if (unlikely(should_watch(ptr, size, type, ctx)))
- kcsan_setup_watchpoint(ptr, size, type);
- else if (unlikely(ctx->scoped_accesses.prev))
+ if (unlikely(should_watch(ctx, ptr, size, type))) {
+ kcsan_setup_watchpoint(ptr, size, type, ip);
+ return;
+ }
+
+ if (!(type & KCSAN_ACCESS_SCOPED)) {
+ struct kcsan_scoped_access *reorder_access = get_reorder_access(ctx);
+
+ if (reorder_access) {
+ /*
+ * reorder_access check: simulates reordering of
+ * the access after subsequent operations.
+ */
+ ptr = reorder_access->ptr;
+ type = reorder_access->type;
+ ip = reorder_access->ip;
+ /*
+ * Upon a nested interrupt, this context's
+ * reorder_access can be modified (shared ctx).
+ * We know that upon return, reorder_access is
+ * always invalidated by setting size to 0 via
+ * __tsan_func_exit(). Therefore we must read
+ * and check size after the other fields.
+ */
+ barrier();
+ size = READ_ONCE(reorder_access->size);
+ if (size)
+ goto again;
+ }
+ }
+
+ /*
+ * Always checked last, right before returning from runtime;
+ * if reorder_access is valid, checked after it was checked.
+ */
+ if (unlikely(ctx->scoped_accesses.prev))
kcsan_check_scoped_accesses();
}
}
@@ -757,7 +898,7 @@ kcsan_begin_scoped_access(const volatile void *ptr, size_t size, int type,
{
struct kcsan_ctx *ctx = get_ctx();
- __kcsan_check_access(ptr, size, type);
+ check_access(ptr, size, type, _RET_IP_);
ctx->disable_count++; /* Disable KCSAN, in case list debugging is on. */
@@ -765,6 +906,7 @@ kcsan_begin_scoped_access(const volatile void *ptr, size_t size, int type,
sa->ptr = ptr;
sa->size = size;
sa->type = type;
+ sa->ip = _RET_IP_;
if (!ctx->scoped_accesses.prev) /* Lazy initialize list head. */
INIT_LIST_HEAD(&ctx->scoped_accesses);
@@ -796,16 +938,32 @@ void kcsan_end_scoped_access(struct kcsan_scoped_access *sa)
ctx->disable_count--;
- __kcsan_check_access(sa->ptr, sa->size, sa->type);
+ check_access(sa->ptr, sa->size, sa->type, sa->ip);
}
EXPORT_SYMBOL(kcsan_end_scoped_access);
void __kcsan_check_access(const volatile void *ptr, size_t size, int type)
{
- check_access(ptr, size, type);
+ check_access(ptr, size, type, _RET_IP_);
}
EXPORT_SYMBOL(__kcsan_check_access);
+#define DEFINE_MEMORY_BARRIER(name, order_before_cond) \
+ void __kcsan_##name(void) \
+ { \
+ struct kcsan_scoped_access *sa = get_reorder_access(get_ctx()); \
+ if (!sa) \
+ return; \
+ if (order_before_cond) \
+ sa->size = 0; \
+ } \
+ EXPORT_SYMBOL(__kcsan_##name)
+
+DEFINE_MEMORY_BARRIER(mb, true);
+DEFINE_MEMORY_BARRIER(wmb, sa->type & (KCSAN_ACCESS_WRITE | KCSAN_ACCESS_COMPOUND));
+DEFINE_MEMORY_BARRIER(rmb, !(sa->type & KCSAN_ACCESS_WRITE) || (sa->type & KCSAN_ACCESS_COMPOUND));
+DEFINE_MEMORY_BARRIER(release, true);
+
/*
* KCSAN uses the same instrumentation that is emitted by supported compilers
* for ThreadSanitizer (TSAN).
@@ -823,7 +981,7 @@ EXPORT_SYMBOL(__kcsan_check_access);
void __tsan_read##size(void *ptr); \
void __tsan_read##size(void *ptr) \
{ \
- check_access(ptr, size, 0); \
+ check_access(ptr, size, 0, _RET_IP_); \
} \
EXPORT_SYMBOL(__tsan_read##size); \
void __tsan_unaligned_read##size(void *ptr) \
@@ -832,7 +990,7 @@ EXPORT_SYMBOL(__kcsan_check_access);
void __tsan_write##size(void *ptr); \
void __tsan_write##size(void *ptr) \
{ \
- check_access(ptr, size, KCSAN_ACCESS_WRITE); \
+ check_access(ptr, size, KCSAN_ACCESS_WRITE, _RET_IP_); \
} \
EXPORT_SYMBOL(__tsan_write##size); \
void __tsan_unaligned_write##size(void *ptr) \
@@ -842,7 +1000,8 @@ EXPORT_SYMBOL(__kcsan_check_access);
void __tsan_read_write##size(void *ptr) \
{ \
check_access(ptr, size, \
- KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE); \
+ KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE, \
+ _RET_IP_); \
} \
EXPORT_SYMBOL(__tsan_read_write##size); \
void __tsan_unaligned_read_write##size(void *ptr) \
@@ -858,14 +1017,14 @@ DEFINE_TSAN_READ_WRITE(16);
void __tsan_read_range(void *ptr, size_t size);
void __tsan_read_range(void *ptr, size_t size)
{
- check_access(ptr, size, 0);
+ check_access(ptr, size, 0, _RET_IP_);
}
EXPORT_SYMBOL(__tsan_read_range);
void __tsan_write_range(void *ptr, size_t size);
void __tsan_write_range(void *ptr, size_t size)
{
- check_access(ptr, size, KCSAN_ACCESS_WRITE);
+ check_access(ptr, size, KCSAN_ACCESS_WRITE, _RET_IP_);
}
EXPORT_SYMBOL(__tsan_write_range);
@@ -886,7 +1045,8 @@ EXPORT_SYMBOL(__tsan_write_range);
IS_ALIGNED((unsigned long)ptr, size); \
if (IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS) && is_atomic) \
return; \
- check_access(ptr, size, is_atomic ? KCSAN_ACCESS_ATOMIC : 0); \
+ check_access(ptr, size, is_atomic ? KCSAN_ACCESS_ATOMIC : 0, \
+ _RET_IP_); \
} \
EXPORT_SYMBOL(__tsan_volatile_read##size); \
void __tsan_unaligned_volatile_read##size(void *ptr) \
@@ -901,7 +1061,8 @@ EXPORT_SYMBOL(__tsan_write_range);
return; \
check_access(ptr, size, \
KCSAN_ACCESS_WRITE | \
- (is_atomic ? KCSAN_ACCESS_ATOMIC : 0)); \
+ (is_atomic ? KCSAN_ACCESS_ATOMIC : 0), \
+ _RET_IP_); \
} \
EXPORT_SYMBOL(__tsan_volatile_write##size); \
void __tsan_unaligned_volatile_write##size(void *ptr) \
@@ -915,19 +1076,56 @@ DEFINE_TSAN_VOLATILE_READ_WRITE(8);
DEFINE_TSAN_VOLATILE_READ_WRITE(16);
/*
- * The below are not required by KCSAN, but can still be emitted by the
- * compiler.
+ * Function entry and exit are used to determine the validty of reorder_access.
+ * Reordering of the access ends at the end of the function scope where the
+ * access happened. This is done for two reasons:
+ *
+ * 1. Artificially limits the scope where missing barriers are detected.
+ * This minimizes false positives due to uninstrumented functions that
+ * contain the required barriers but were missed.
+ *
+ * 2. Simplifies generating the stack trace of the access.
*/
void __tsan_func_entry(void *call_pc);
-void __tsan_func_entry(void *call_pc)
+noinline void __tsan_func_entry(void *call_pc)
{
+ if (!IS_ENABLED(CONFIG_KCSAN_WEAK_MEMORY))
+ return;
+
+ add_kcsan_stack_depth(1);
}
EXPORT_SYMBOL(__tsan_func_entry);
+
void __tsan_func_exit(void);
-void __tsan_func_exit(void)
+noinline void __tsan_func_exit(void)
{
+ struct kcsan_scoped_access *reorder_access;
+
+ if (!IS_ENABLED(CONFIG_KCSAN_WEAK_MEMORY))
+ return;
+
+ reorder_access = get_reorder_access(get_ctx());
+ if (!reorder_access)
+ goto out;
+
+ if (get_kcsan_stack_depth() <= reorder_access->stack_depth) {
+ /*
+ * Access check to catch cases where write without a barrier
+ * (supposed release) was last access in function: because
+ * instrumentation is inserted before the real access, a data
+ * race due to the write giving up a c-s would only be caught if
+ * we do the conflicting access after.
+ */
+ check_access(reorder_access->ptr, reorder_access->size,
+ reorder_access->type, reorder_access->ip);
+ reorder_access->size = 0;
+ reorder_access->stack_depth = INT_MIN;
+ }
+out:
+ add_kcsan_stack_depth(-1);
}
EXPORT_SYMBOL(__tsan_func_exit);
+
void __tsan_init(void);
void __tsan_init(void)
{
@@ -950,12 +1148,21 @@ EXPORT_SYMBOL(__tsan_init);
* functions, whose job is to also execute the operation itself.
*/
+static __always_inline void kcsan_atomic_builtin_memorder(int memorder)
+{
+ if (memorder == __ATOMIC_RELEASE ||
+ memorder == __ATOMIC_SEQ_CST ||
+ memorder == __ATOMIC_ACQ_REL)
+ __kcsan_release();
+}
+
#define DEFINE_TSAN_ATOMIC_LOAD_STORE(bits) \
u##bits __tsan_atomic##bits##_load(const u##bits *ptr, int memorder); \
u##bits __tsan_atomic##bits##_load(const u##bits *ptr, int memorder) \
{ \
+ kcsan_atomic_builtin_memorder(memorder); \
if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \
- check_access(ptr, bits / BITS_PER_BYTE, KCSAN_ACCESS_ATOMIC); \
+ check_access(ptr, bits / BITS_PER_BYTE, KCSAN_ACCESS_ATOMIC, _RET_IP_); \
} \
return __atomic_load_n(ptr, memorder); \
} \
@@ -963,9 +1170,10 @@ EXPORT_SYMBOL(__tsan_init);
void __tsan_atomic##bits##_store(u##bits *ptr, u##bits v, int memorder); \
void __tsan_atomic##bits##_store(u##bits *ptr, u##bits v, int memorder) \
{ \
+ kcsan_atomic_builtin_memorder(memorder); \
if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \
check_access(ptr, bits / BITS_PER_BYTE, \
- KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC); \
+ KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC, _RET_IP_); \
} \
__atomic_store_n(ptr, v, memorder); \
} \
@@ -975,10 +1183,11 @@ EXPORT_SYMBOL(__tsan_init);
u##bits __tsan_atomic##bits##_##op(u##bits *ptr, u##bits v, int memorder); \
u##bits __tsan_atomic##bits##_##op(u##bits *ptr, u##bits v, int memorder) \
{ \
+ kcsan_atomic_builtin_memorder(memorder); \
if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \
check_access(ptr, bits / BITS_PER_BYTE, \
KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | \
- KCSAN_ACCESS_ATOMIC); \
+ KCSAN_ACCESS_ATOMIC, _RET_IP_); \
} \
return __atomic_##op##suffix(ptr, v, memorder); \
} \
@@ -1007,10 +1216,11 @@ EXPORT_SYMBOL(__tsan_init);
int __tsan_atomic##bits##_compare_exchange_##strength(u##bits *ptr, u##bits *exp, \
u##bits val, int mo, int fail_mo) \
{ \
+ kcsan_atomic_builtin_memorder(mo); \
if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \
check_access(ptr, bits / BITS_PER_BYTE, \
KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | \
- KCSAN_ACCESS_ATOMIC); \
+ KCSAN_ACCESS_ATOMIC, _RET_IP_); \
} \
return __atomic_compare_exchange_n(ptr, exp, val, weak, mo, fail_mo); \
} \
@@ -1022,10 +1232,11 @@ EXPORT_SYMBOL(__tsan_init);
u##bits __tsan_atomic##bits##_compare_exchange_val(u##bits *ptr, u##bits exp, u##bits val, \
int mo, int fail_mo) \
{ \
+ kcsan_atomic_builtin_memorder(mo); \
if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \
check_access(ptr, bits / BITS_PER_BYTE, \
KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | \
- KCSAN_ACCESS_ATOMIC); \
+ KCSAN_ACCESS_ATOMIC, _RET_IP_); \
} \
__atomic_compare_exchange_n(ptr, &exp, val, 0, mo, fail_mo); \
return exp; \
@@ -1053,10 +1264,47 @@ DEFINE_TSAN_ATOMIC_OPS(64);
void __tsan_atomic_thread_fence(int memorder);
void __tsan_atomic_thread_fence(int memorder)
{
+ kcsan_atomic_builtin_memorder(memorder);
__atomic_thread_fence(memorder);
}
EXPORT_SYMBOL(__tsan_atomic_thread_fence);
+/*
+ * In instrumented files, we emit instrumentation for barriers by mapping the
+ * kernel barriers to an __atomic_signal_fence(), which is interpreted specially
+ * and otherwise has no relation to a real __atomic_signal_fence(). No known
+ * kernel code uses __atomic_signal_fence().
+ *
+ * Since fsanitize=thread instrumentation handles __atomic_signal_fence(), which
+ * are turned into calls to __tsan_atomic_signal_fence(), such instrumentation
+ * can be disabled via the __no_kcsan function attribute (vs. an explicit call
+ * which could not). When __no_kcsan is requested, __atomic_signal_fence()
+ * generates no code.
+ *
+ * Note: The result of using __atomic_signal_fence() with KCSAN enabled is
+ * potentially limiting the compiler's ability to reorder operations; however,
+ * if barriers were instrumented with explicit calls (without LTO), the compiler
+ * couldn't optimize much anyway. The result of a hypothetical architecture
+ * using __atomic_signal_fence() in normal code would be KCSAN false negatives.
+ */
void __tsan_atomic_signal_fence(int memorder);
-void __tsan_atomic_signal_fence(int memorder) { }
+noinline void __tsan_atomic_signal_fence(int memorder)
+{
+ switch (memorder) {
+ case __KCSAN_BARRIER_TO_SIGNAL_FENCE_mb:
+ __kcsan_mb();
+ break;
+ case __KCSAN_BARRIER_TO_SIGNAL_FENCE_wmb:
+ __kcsan_wmb();
+ break;
+ case __KCSAN_BARRIER_TO_SIGNAL_FENCE_rmb:
+ __kcsan_rmb();
+ break;
+ case __KCSAN_BARRIER_TO_SIGNAL_FENCE_release:
+ __kcsan_release();
+ break;
+ default:
+ break;
+ }
+}
EXPORT_SYMBOL(__tsan_atomic_signal_fence);
diff --git a/kernel/kcsan/kcsan.h b/kernel/kcsan/kcsan.h
index f36e25c497ed..ae33c2a7f07e 100644
--- a/kernel/kcsan/kcsan.h
+++ b/kernel/kcsan/kcsan.h
@@ -121,7 +121,7 @@ enum kcsan_value_change {
* to be consumed by the reporting thread. No report is printed yet.
*/
void kcsan_report_set_info(const volatile void *ptr, size_t size, int access_type,
- int watchpoint_idx);
+ unsigned long ip, int watchpoint_idx);
/*
* The calling thread observed that the watchpoint it set up was hit and
@@ -129,14 +129,14 @@ void kcsan_report_set_info(const volatile void *ptr, size_t size, int access_typ
* thread.
*/
void kcsan_report_known_origin(const volatile void *ptr, size_t size, int access_type,
- enum kcsan_value_change value_change, int watchpoint_idx,
- u64 old, u64 new, u64 mask);
+ unsigned long ip, enum kcsan_value_change value_change,
+ int watchpoint_idx, u64 old, u64 new, u64 mask);
/*
* No other thread was observed to race with the access, but the data value
* before and after the stall differs. Reports a race of "unknown origin".
*/
void kcsan_report_unknown_origin(const volatile void *ptr, size_t size, int access_type,
- u64 old, u64 new, u64 mask);
+ unsigned long ip, u64 old, u64 new, u64 mask);
#endif /* _KERNEL_KCSAN_KCSAN_H */
diff --git a/kernel/kcsan/kcsan_test.c b/kernel/kcsan/kcsan_test.c
index dc55fd5a36fc..a36fca063a73 100644
--- a/kernel/kcsan/kcsan_test.c
+++ b/kernel/kcsan/kcsan_test.c
@@ -16,9 +16,12 @@
#define pr_fmt(fmt) "kcsan_test: " fmt
#include <kunit/test.h>
+#include <linux/atomic.h>
+#include <linux/bitops.h>
#include <linux/jiffies.h>
#include <linux/kcsan-checks.h>
#include <linux/kernel.h>
+#include <linux/mutex.h>
#include <linux/sched.h>
#include <linux/seqlock.h>
#include <linux/spinlock.h>
@@ -29,6 +32,11 @@
#include <linux/types.h>
#include <trace/events/printk.h>
+#define KCSAN_TEST_REQUIRES(test, cond) do { \
+ if (!(cond)) \
+ kunit_skip((test), "Test requires: " #cond); \
+} while (0)
+
#ifdef CONFIG_CC_HAS_TSAN_COMPOUND_READ_BEFORE_WRITE
#define __KCSAN_ACCESS_RW(alt) (KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE)
#else
@@ -146,7 +154,7 @@ struct expect_report {
/* Check observed report matches information in @r. */
__no_kcsan
-static bool report_matches(const struct expect_report *r)
+static bool __report_matches(const struct expect_report *r)
{
const bool is_assert = (r->access[0].type | r->access[1].type) & KCSAN_ACCESS_ASSERT;
bool ret = false;
@@ -205,10 +213,12 @@ static bool report_matches(const struct expect_report *r)
"read-write" :
"write") :
"read");
+ const bool is_atomic = (ty & KCSAN_ACCESS_ATOMIC);
+ const bool is_scoped = (ty & KCSAN_ACCESS_SCOPED);
const char *const access_type_aux =
- (ty & KCSAN_ACCESS_ATOMIC) ?
- " (marked)" :
- ((ty & KCSAN_ACCESS_SCOPED) ? " (scoped)" : "");
+ (is_atomic && is_scoped) ? " (marked, reordered)"
+ : (is_atomic ? " (marked)"
+ : (is_scoped ? " (reordered)" : ""));
if (i == 1) {
/* Access 2 */
@@ -246,6 +256,40 @@ out:
return ret;
}
+static __always_inline const struct expect_report *
+__report_set_scoped(struct expect_report *r, int accesses)
+{
+ BUILD_BUG_ON(accesses > 3);
+
+ if (accesses & 1)
+ r->access[0].type |= KCSAN_ACCESS_SCOPED;
+ else
+ r->access[0].type &= ~KCSAN_ACCESS_SCOPED;
+
+ if (accesses & 2)
+ r->access[1].type |= KCSAN_ACCESS_SCOPED;
+ else
+ r->access[1].type &= ~KCSAN_ACCESS_SCOPED;
+
+ return r;
+}
+
+__no_kcsan
+static bool report_matches_any_reordered(struct expect_report *r)
+{
+ return __report_matches(__report_set_scoped(r, 0)) ||
+ __report_matches(__report_set_scoped(r, 1)) ||
+ __report_matches(__report_set_scoped(r, 2)) ||
+ __report_matches(__report_set_scoped(r, 3));
+}
+
+#ifdef CONFIG_KCSAN_WEAK_MEMORY
+/* Due to reordering accesses, any access may appear as "(reordered)". */
+#define report_matches report_matches_any_reordered
+#else
+#define report_matches __report_matches
+#endif
+
/* ===== Test kernels ===== */
static long test_sink;
@@ -256,6 +300,8 @@ static struct {
long val[8];
} test_struct;
static DEFINE_SEQLOCK(test_seqlock);
+static DEFINE_SPINLOCK(test_spinlock);
+static DEFINE_MUTEX(test_mutex);
/*
* Helper to avoid compiler optimizing out reads, and to generate source values
@@ -264,6 +310,16 @@ static DEFINE_SEQLOCK(test_seqlock);
__no_kcsan
static noinline void sink_value(long v) { WRITE_ONCE(test_sink, v); }
+/*
+ * Generates a delay and some accesses that enter the runtime but do not produce
+ * data races.
+ */
+static noinline void test_delay(int iter)
+{
+ while (iter--)
+ sink_value(READ_ONCE(test_sink));
+}
+
static noinline void test_kernel_read(void) { sink_value(test_var); }
static noinline void test_kernel_write(void)
@@ -333,7 +389,10 @@ static noinline void test_kernel_assert_bits_nochange(void)
ASSERT_EXCLUSIVE_BITS(test_var, ~TEST_CHANGE_BITS);
}
-/* To check that scoped assertions do trigger anywhere in scope. */
+/*
+ * Scoped assertions do trigger anywhere in scope. However, the report should
+ * still only point at the start of the scope.
+ */
static noinline void test_enter_scope(void)
{
int x = 0;
@@ -422,19 +481,239 @@ static noinline void test_kernel_xor_1bit(void)
kcsan_nestable_atomic_end();
}
+#define TEST_KERNEL_LOCKED(name, acquire, release) \
+ static noinline void test_kernel_##name(void) \
+ { \
+ long *flag = &test_struct.val[0]; \
+ long v = 0; \
+ if (!(acquire)) \
+ return; \
+ while (v++ < 100) { \
+ test_var++; \
+ barrier(); \
+ } \
+ release; \
+ test_delay(10); \
+ }
+
+TEST_KERNEL_LOCKED(with_memorder,
+ cmpxchg_acquire(flag, 0, 1) == 0,
+ smp_store_release(flag, 0));
+TEST_KERNEL_LOCKED(wrong_memorder,
+ cmpxchg_relaxed(flag, 0, 1) == 0,
+ WRITE_ONCE(*flag, 0));
+TEST_KERNEL_LOCKED(atomic_builtin_with_memorder,
+ __atomic_compare_exchange_n(flag, &v, 1, 0, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED),
+ __atomic_store_n(flag, 0, __ATOMIC_RELEASE));
+TEST_KERNEL_LOCKED(atomic_builtin_wrong_memorder,
+ __atomic_compare_exchange_n(flag, &v, 1, 0, __ATOMIC_RELAXED, __ATOMIC_RELAXED),
+ __atomic_store_n(flag, 0, __ATOMIC_RELAXED));
+
/* ===== Test cases ===== */
+/*
+ * Tests that various barriers have the expected effect on internal state. Not
+ * exhaustive on atomic_t operations. Unlike the selftest, also checks for
+ * too-strict barrier instrumentation; these can be tolerated, because it does
+ * not cause false positives, but at least we should be aware of such cases.
+ */
+static void test_barrier_nothreads(struct kunit *test)
+{
+#ifdef CONFIG_KCSAN_WEAK_MEMORY
+ struct kcsan_scoped_access *reorder_access = &current->kcsan_ctx.reorder_access;
+#else
+ struct kcsan_scoped_access *reorder_access = NULL;
+#endif
+ arch_spinlock_t arch_spinlock = __ARCH_SPIN_LOCK_UNLOCKED;
+ atomic_t dummy;
+
+ KCSAN_TEST_REQUIRES(test, reorder_access != NULL);
+ KCSAN_TEST_REQUIRES(test, IS_ENABLED(CONFIG_SMP));
+
+#define __KCSAN_EXPECT_BARRIER(access_type, barrier, order_before, name) \
+ do { \
+ reorder_access->type = (access_type) | KCSAN_ACCESS_SCOPED; \
+ reorder_access->size = sizeof(test_var); \
+ barrier; \
+ KUNIT_EXPECT_EQ_MSG(test, reorder_access->size, \
+ order_before ? 0 : sizeof(test_var), \
+ "improperly instrumented type=(" #access_type "): " name); \
+ } while (0)
+#define KCSAN_EXPECT_READ_BARRIER(b, o) __KCSAN_EXPECT_BARRIER(0, b, o, #b)
+#define KCSAN_EXPECT_WRITE_BARRIER(b, o) __KCSAN_EXPECT_BARRIER(KCSAN_ACCESS_WRITE, b, o, #b)
+#define KCSAN_EXPECT_RW_BARRIER(b, o) __KCSAN_EXPECT_BARRIER(KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE, b, o, #b)
+
+ /*
+ * Lockdep initialization can strengthen certain locking operations due
+ * to calling into instrumented files; "warm up" our locks.
+ */
+ spin_lock(&test_spinlock);
+ spin_unlock(&test_spinlock);
+ mutex_lock(&test_mutex);
+ mutex_unlock(&test_mutex);
+
+ /* Force creating a valid entry in reorder_access first. */
+ test_var = 0;
+ while (test_var++ < 1000000 && reorder_access->size != sizeof(test_var))
+ __kcsan_check_read(&test_var, sizeof(test_var));
+ KUNIT_ASSERT_EQ(test, reorder_access->size, sizeof(test_var));
+
+ kcsan_nestable_atomic_begin(); /* No watchpoints in called functions. */
+
+ KCSAN_EXPECT_READ_BARRIER(mb(), true);
+ KCSAN_EXPECT_READ_BARRIER(wmb(), false);
+ KCSAN_EXPECT_READ_BARRIER(rmb(), true);
+ KCSAN_EXPECT_READ_BARRIER(smp_mb(), true);
+ KCSAN_EXPECT_READ_BARRIER(smp_wmb(), false);
+ KCSAN_EXPECT_READ_BARRIER(smp_rmb(), true);
+ KCSAN_EXPECT_READ_BARRIER(dma_wmb(), false);
+ KCSAN_EXPECT_READ_BARRIER(dma_rmb(), true);
+ KCSAN_EXPECT_READ_BARRIER(smp_mb__before_atomic(), true);
+ KCSAN_EXPECT_READ_BARRIER(smp_mb__after_atomic(), true);
+ KCSAN_EXPECT_READ_BARRIER(smp_mb__after_spinlock(), true);
+ KCSAN_EXPECT_READ_BARRIER(smp_store_mb(test_var, 0), true);
+ KCSAN_EXPECT_READ_BARRIER(smp_load_acquire(&test_var), false);
+ KCSAN_EXPECT_READ_BARRIER(smp_store_release(&test_var, 0), true);
+ KCSAN_EXPECT_READ_BARRIER(xchg(&test_var, 0), true);
+ KCSAN_EXPECT_READ_BARRIER(xchg_release(&test_var, 0), true);
+ KCSAN_EXPECT_READ_BARRIER(xchg_relaxed(&test_var, 0), false);
+ KCSAN_EXPECT_READ_BARRIER(cmpxchg(&test_var, 0, 0), true);
+ KCSAN_EXPECT_READ_BARRIER(cmpxchg_release(&test_var, 0, 0), true);
+ KCSAN_EXPECT_READ_BARRIER(cmpxchg_relaxed(&test_var, 0, 0), false);
+ KCSAN_EXPECT_READ_BARRIER(atomic_read(&dummy), false);
+ KCSAN_EXPECT_READ_BARRIER(atomic_read_acquire(&dummy), false);
+ KCSAN_EXPECT_READ_BARRIER(atomic_set(&dummy, 0), false);
+ KCSAN_EXPECT_READ_BARRIER(atomic_set_release(&dummy, 0), true);
+ KCSAN_EXPECT_READ_BARRIER(atomic_add(1, &dummy), false);
+ KCSAN_EXPECT_READ_BARRIER(atomic_add_return(1, &dummy), true);
+ KCSAN_EXPECT_READ_BARRIER(atomic_add_return_acquire(1, &dummy), false);
+ KCSAN_EXPECT_READ_BARRIER(atomic_add_return_release(1, &dummy), true);
+ KCSAN_EXPECT_READ_BARRIER(atomic_add_return_relaxed(1, &dummy), false);
+ KCSAN_EXPECT_READ_BARRIER(atomic_fetch_add(1, &dummy), true);
+ KCSAN_EXPECT_READ_BARRIER(atomic_fetch_add_acquire(1, &dummy), false);
+ KCSAN_EXPECT_READ_BARRIER(atomic_fetch_add_release(1, &dummy), true);
+ KCSAN_EXPECT_READ_BARRIER(atomic_fetch_add_relaxed(1, &dummy), false);
+ KCSAN_EXPECT_READ_BARRIER(test_and_set_bit(0, &test_var), true);
+ KCSAN_EXPECT_READ_BARRIER(test_and_clear_bit(0, &test_var), true);
+ KCSAN_EXPECT_READ_BARRIER(test_and_change_bit(0, &test_var), true);
+ KCSAN_EXPECT_READ_BARRIER(clear_bit_unlock(0, &test_var), true);
+ KCSAN_EXPECT_READ_BARRIER(__clear_bit_unlock(0, &test_var), true);
+ KCSAN_EXPECT_READ_BARRIER(arch_spin_lock(&arch_spinlock), false);
+ KCSAN_EXPECT_READ_BARRIER(arch_spin_unlock(&arch_spinlock), true);
+ KCSAN_EXPECT_READ_BARRIER(spin_lock(&test_spinlock), false);
+ KCSAN_EXPECT_READ_BARRIER(spin_unlock(&test_spinlock), true);
+ KCSAN_EXPECT_READ_BARRIER(mutex_lock(&test_mutex), false);
+ KCSAN_EXPECT_READ_BARRIER(mutex_unlock(&test_mutex), true);
+
+ KCSAN_EXPECT_WRITE_BARRIER(mb(), true);
+ KCSAN_EXPECT_WRITE_BARRIER(wmb(), true);
+ KCSAN_EXPECT_WRITE_BARRIER(rmb(), false);
+ KCSAN_EXPECT_WRITE_BARRIER(smp_mb(), true);
+ KCSAN_EXPECT_WRITE_BARRIER(smp_wmb(), true);
+ KCSAN_EXPECT_WRITE_BARRIER(smp_rmb(), false);
+ KCSAN_EXPECT_WRITE_BARRIER(dma_wmb(), true);
+ KCSAN_EXPECT_WRITE_BARRIER(dma_rmb(), false);
+ KCSAN_EXPECT_WRITE_BARRIER(smp_mb__before_atomic(), true);
+ KCSAN_EXPECT_WRITE_BARRIER(smp_mb__after_atomic(), true);
+ KCSAN_EXPECT_WRITE_BARRIER(smp_mb__after_spinlock(), true);
+ KCSAN_EXPECT_WRITE_BARRIER(smp_store_mb(test_var, 0), true);
+ KCSAN_EXPECT_WRITE_BARRIER(smp_load_acquire(&test_var), false);
+ KCSAN_EXPECT_WRITE_BARRIER(smp_store_release(&test_var, 0), true);
+ KCSAN_EXPECT_WRITE_BARRIER(xchg(&test_var, 0), true);
+ KCSAN_EXPECT_WRITE_BARRIER(xchg_release(&test_var, 0), true);
+ KCSAN_EXPECT_WRITE_BARRIER(xchg_relaxed(&test_var, 0), false);
+ KCSAN_EXPECT_WRITE_BARRIER(cmpxchg(&test_var, 0, 0), true);
+ KCSAN_EXPECT_WRITE_BARRIER(cmpxchg_release(&test_var, 0, 0), true);
+ KCSAN_EXPECT_WRITE_BARRIER(cmpxchg_relaxed(&test_var, 0, 0), false);
+ KCSAN_EXPECT_WRITE_BARRIER(atomic_read(&dummy), false);
+ KCSAN_EXPECT_WRITE_BARRIER(atomic_read_acquire(&dummy), false);
+ KCSAN_EXPECT_WRITE_BARRIER(atomic_set(&dummy, 0), false);
+ KCSAN_EXPECT_WRITE_BARRIER(atomic_set_release(&dummy, 0), true);
+ KCSAN_EXPECT_WRITE_BARRIER(atomic_add(1, &dummy), false);
+ KCSAN_EXPECT_WRITE_BARRIER(atomic_add_return(1, &dummy), true);
+ KCSAN_EXPECT_WRITE_BARRIER(atomic_add_return_acquire(1, &dummy), false);
+ KCSAN_EXPECT_WRITE_BARRIER(atomic_add_return_release(1, &dummy), true);
+ KCSAN_EXPECT_WRITE_BARRIER(atomic_add_return_relaxed(1, &dummy), false);
+ KCSAN_EXPECT_WRITE_BARRIER(atomic_fetch_add(1, &dummy), true);
+ KCSAN_EXPECT_WRITE_BARRIER(atomic_fetch_add_acquire(1, &dummy), false);
+ KCSAN_EXPECT_WRITE_BARRIER(atomic_fetch_add_release(1, &dummy), true);
+ KCSAN_EXPECT_WRITE_BARRIER(atomic_fetch_add_relaxed(1, &dummy), false);
+ KCSAN_EXPECT_WRITE_BARRIER(test_and_set_bit(0, &test_var), true);
+ KCSAN_EXPECT_WRITE_BARRIER(test_and_clear_bit(0, &test_var), true);
+ KCSAN_EXPECT_WRITE_BARRIER(test_and_change_bit(0, &test_var), true);
+ KCSAN_EXPECT_WRITE_BARRIER(clear_bit_unlock(0, &test_var), true);
+ KCSAN_EXPECT_WRITE_BARRIER(__clear_bit_unlock(0, &test_var), true);
+ KCSAN_EXPECT_WRITE_BARRIER(arch_spin_lock(&arch_spinlock), false);
+ KCSAN_EXPECT_WRITE_BARRIER(arch_spin_unlock(&arch_spinlock), true);
+ KCSAN_EXPECT_WRITE_BARRIER(spin_lock(&test_spinlock), false);
+ KCSAN_EXPECT_WRITE_BARRIER(spin_unlock(&test_spinlock), true);
+ KCSAN_EXPECT_WRITE_BARRIER(mutex_lock(&test_mutex), false);
+ KCSAN_EXPECT_WRITE_BARRIER(mutex_unlock(&test_mutex), true);
+
+ KCSAN_EXPECT_RW_BARRIER(mb(), true);
+ KCSAN_EXPECT_RW_BARRIER(wmb(), true);
+ KCSAN_EXPECT_RW_BARRIER(rmb(), true);
+ KCSAN_EXPECT_RW_BARRIER(smp_mb(), true);
+ KCSAN_EXPECT_RW_BARRIER(smp_wmb(), true);
+ KCSAN_EXPECT_RW_BARRIER(smp_rmb(), true);
+ KCSAN_EXPECT_RW_BARRIER(dma_wmb(), true);
+ KCSAN_EXPECT_RW_BARRIER(dma_rmb(), true);
+ KCSAN_EXPECT_RW_BARRIER(smp_mb__before_atomic(), true);
+ KCSAN_EXPECT_RW_BARRIER(smp_mb__after_atomic(), true);
+ KCSAN_EXPECT_RW_BARRIER(smp_mb__after_spinlock(), true);
+ KCSAN_EXPECT_RW_BARRIER(smp_store_mb(test_var, 0), true);
+ KCSAN_EXPECT_RW_BARRIER(smp_load_acquire(&test_var), false);
+ KCSAN_EXPECT_RW_BARRIER(smp_store_release(&test_var, 0), true);
+ KCSAN_EXPECT_RW_BARRIER(xchg(&test_var, 0), true);
+ KCSAN_EXPECT_RW_BARRIER(xchg_release(&test_var, 0), true);
+ KCSAN_EXPECT_RW_BARRIER(xchg_relaxed(&test_var, 0), false);
+ KCSAN_EXPECT_RW_BARRIER(cmpxchg(&test_var, 0, 0), true);
+ KCSAN_EXPECT_RW_BARRIER(cmpxchg_release(&test_var, 0, 0), true);
+ KCSAN_EXPECT_RW_BARRIER(cmpxchg_relaxed(&test_var, 0, 0), false);
+ KCSAN_EXPECT_RW_BARRIER(atomic_read(&dummy), false);
+ KCSAN_EXPECT_RW_BARRIER(atomic_read_acquire(&dummy), false);
+ KCSAN_EXPECT_RW_BARRIER(atomic_set(&dummy, 0), false);
+ KCSAN_EXPECT_RW_BARRIER(atomic_set_release(&dummy, 0), true);
+ KCSAN_EXPECT_RW_BARRIER(atomic_add(1, &dummy), false);
+ KCSAN_EXPECT_RW_BARRIER(atomic_add_return(1, &dummy), true);
+ KCSAN_EXPECT_RW_BARRIER(atomic_add_return_acquire(1, &dummy), false);
+ KCSAN_EXPECT_RW_BARRIER(atomic_add_return_release(1, &dummy), true);
+ KCSAN_EXPECT_RW_BARRIER(atomic_add_return_relaxed(1, &dummy), false);
+ KCSAN_EXPECT_RW_BARRIER(atomic_fetch_add(1, &dummy), true);
+ KCSAN_EXPECT_RW_BARRIER(atomic_fetch_add_acquire(1, &dummy), false);
+ KCSAN_EXPECT_RW_BARRIER(atomic_fetch_add_release(1, &dummy), true);
+ KCSAN_EXPECT_RW_BARRIER(atomic_fetch_add_relaxed(1, &dummy), false);
+ KCSAN_EXPECT_RW_BARRIER(test_and_set_bit(0, &test_var), true);
+ KCSAN_EXPECT_RW_BARRIER(test_and_clear_bit(0, &test_var), true);
+ KCSAN_EXPECT_RW_BARRIER(test_and_change_bit(0, &test_var), true);
+ KCSAN_EXPECT_RW_BARRIER(clear_bit_unlock(0, &test_var), true);
+ KCSAN_EXPECT_RW_BARRIER(__clear_bit_unlock(0, &test_var), true);
+ KCSAN_EXPECT_RW_BARRIER(arch_spin_lock(&arch_spinlock), false);
+ KCSAN_EXPECT_RW_BARRIER(arch_spin_unlock(&arch_spinlock), true);
+ KCSAN_EXPECT_RW_BARRIER(spin_lock(&test_spinlock), false);
+ KCSAN_EXPECT_RW_BARRIER(spin_unlock(&test_spinlock), true);
+ KCSAN_EXPECT_RW_BARRIER(mutex_lock(&test_mutex), false);
+ KCSAN_EXPECT_RW_BARRIER(mutex_unlock(&test_mutex), true);
+
+#ifdef clear_bit_unlock_is_negative_byte
+ KCSAN_EXPECT_READ_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var), true);
+ KCSAN_EXPECT_WRITE_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var), true);
+ KCSAN_EXPECT_RW_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var), true);
+#endif
+ kcsan_nestable_atomic_end();
+}
+
/* Simple test with normal data race. */
__no_kcsan
static void test_basic(struct kunit *test)
{
- const struct expect_report expect = {
+ struct expect_report expect = {
.access = {
{ test_kernel_write, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
{ test_kernel_read, &test_var, sizeof(test_var), 0 },
},
};
- static const struct expect_report never = {
+ struct expect_report never = {
.access = {
{ test_kernel_read, &test_var, sizeof(test_var), 0 },
{ test_kernel_read, &test_var, sizeof(test_var), 0 },
@@ -459,14 +738,14 @@ static void test_basic(struct kunit *test)
__no_kcsan
static void test_concurrent_races(struct kunit *test)
{
- const struct expect_report expect = {
+ struct expect_report expect = {
.access = {
/* NULL will match any address. */
{ test_kernel_rmw_array, NULL, 0, __KCSAN_ACCESS_RW(KCSAN_ACCESS_WRITE) },
{ test_kernel_rmw_array, NULL, 0, __KCSAN_ACCESS_RW(0) },
},
};
- static const struct expect_report never = {
+ struct expect_report never = {
.access = {
{ test_kernel_rmw_array, NULL, 0, 0 },
{ test_kernel_rmw_array, NULL, 0, 0 },
@@ -488,17 +767,24 @@ static void test_concurrent_races(struct kunit *test)
__no_kcsan
static void test_novalue_change(struct kunit *test)
{
- const struct expect_report expect = {
+ struct expect_report expect_rw = {
.access = {
{ test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
{ test_kernel_read, &test_var, sizeof(test_var), 0 },
},
};
+ struct expect_report expect_ww = {
+ .access = {
+ { test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
+ { test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
+ },
+ };
bool match_expect = false;
+ test_kernel_write_nochange(); /* Reset value. */
begin_test_checks(test_kernel_write_nochange, test_kernel_read);
do {
- match_expect = report_matches(&expect);
+ match_expect = report_matches(&expect_rw) || report_matches(&expect_ww);
} while (!end_test_checks(match_expect));
if (IS_ENABLED(CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY))
KUNIT_EXPECT_FALSE(test, match_expect);
@@ -513,17 +799,24 @@ static void test_novalue_change(struct kunit *test)
__no_kcsan
static void test_novalue_change_exception(struct kunit *test)
{
- const struct expect_report expect = {
+ struct expect_report expect_rw = {
.access = {
{ test_kernel_write_nochange_rcu, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
{ test_kernel_read, &test_var, sizeof(test_var), 0 },
},
};
+ struct expect_report expect_ww = {
+ .access = {
+ { test_kernel_write_nochange_rcu, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
+ { test_kernel_write_nochange_rcu, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
+ },
+ };
bool match_expect = false;
+ test_kernel_write_nochange_rcu(); /* Reset value. */
begin_test_checks(test_kernel_write_nochange_rcu, test_kernel_read);
do {
- match_expect = report_matches(&expect);
+ match_expect = report_matches(&expect_rw) || report_matches(&expect_ww);
} while (!end_test_checks(match_expect));
KUNIT_EXPECT_TRUE(test, match_expect);
}
@@ -532,7 +825,7 @@ static void test_novalue_change_exception(struct kunit *test)
__no_kcsan
static void test_unknown_origin(struct kunit *test)
{
- const struct expect_report expect = {
+ struct expect_report expect = {
.access = {
{ test_kernel_read, &test_var, sizeof(test_var), 0 },
{ NULL },
@@ -554,7 +847,7 @@ static void test_unknown_origin(struct kunit *test)
__no_kcsan
static void test_write_write_assume_atomic(struct kunit *test)
{
- const struct expect_report expect = {
+ struct expect_report expect = {
.access = {
{ test_kernel_write, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
{ test_kernel_write, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
@@ -580,7 +873,7 @@ static void test_write_write_assume_atomic(struct kunit *test)
__no_kcsan
static void test_write_write_struct(struct kunit *test)
{
- const struct expect_report expect = {
+ struct expect_report expect = {
.access = {
{ test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE },
{ test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE },
@@ -602,7 +895,7 @@ static void test_write_write_struct(struct kunit *test)
__no_kcsan
static void test_write_write_struct_part(struct kunit *test)
{
- const struct expect_report expect = {
+ struct expect_report expect = {
.access = {
{ test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE },
{ test_kernel_write_struct_part, &test_struct.val[3], sizeof(test_struct.val[3]), KCSAN_ACCESS_WRITE },
@@ -634,7 +927,7 @@ static void test_read_atomic_write_atomic(struct kunit *test)
__no_kcsan
static void test_read_plain_atomic_write(struct kunit *test)
{
- const struct expect_report expect = {
+ struct expect_report expect = {
.access = {
{ test_kernel_read, &test_var, sizeof(test_var), 0 },
{ test_kernel_write_atomic, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC },
@@ -642,8 +935,7 @@ static void test_read_plain_atomic_write(struct kunit *test)
};
bool match_expect = false;
- if (IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS))
- return;
+ KCSAN_TEST_REQUIRES(test, !IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS));
begin_test_checks(test_kernel_read, test_kernel_write_atomic);
do {
@@ -656,7 +948,7 @@ static void test_read_plain_atomic_write(struct kunit *test)
__no_kcsan
static void test_read_plain_atomic_rmw(struct kunit *test)
{
- const struct expect_report expect = {
+ struct expect_report expect = {
.access = {
{ test_kernel_read, &test_var, sizeof(test_var), 0 },
{ test_kernel_atomic_rmw, &test_var, sizeof(test_var),
@@ -665,8 +957,7 @@ static void test_read_plain_atomic_rmw(struct kunit *test)
};
bool match_expect = false;
- if (IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS))
- return;
+ KCSAN_TEST_REQUIRES(test, !IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS));
begin_test_checks(test_kernel_read, test_kernel_atomic_rmw);
do {
@@ -679,13 +970,13 @@ static void test_read_plain_atomic_rmw(struct kunit *test)
__no_kcsan
static void test_zero_size_access(struct kunit *test)
{
- const struct expect_report expect = {
+ struct expect_report expect = {
.access = {
{ test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE },
{ test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE },
},
};
- const struct expect_report never = {
+ struct expect_report never = {
.access = {
{ test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE },
{ test_kernel_read_struct_zero_size, &test_struct.val[3], 0, 0 },
@@ -719,7 +1010,7 @@ static void test_data_race(struct kunit *test)
__no_kcsan
static void test_assert_exclusive_writer(struct kunit *test)
{
- const struct expect_report expect = {
+ struct expect_report expect = {
.access = {
{ test_kernel_assert_writer, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT },
{ test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
@@ -737,7 +1028,7 @@ static void test_assert_exclusive_writer(struct kunit *test)
__no_kcsan
static void test_assert_exclusive_access(struct kunit *test)
{
- const struct expect_report expect = {
+ struct expect_report expect = {
.access = {
{ test_kernel_assert_access, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE },
{ test_kernel_read, &test_var, sizeof(test_var), 0 },
@@ -755,19 +1046,19 @@ static void test_assert_exclusive_access(struct kunit *test)
__no_kcsan
static void test_assert_exclusive_access_writer(struct kunit *test)
{
- const struct expect_report expect_access_writer = {
+ struct expect_report expect_access_writer = {
.access = {
{ test_kernel_assert_access, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE },
{ test_kernel_assert_writer, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT },
},
};
- const struct expect_report expect_access_access = {
+ struct expect_report expect_access_access = {
.access = {
{ test_kernel_assert_access, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE },
{ test_kernel_assert_access, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE },
},
};
- const struct expect_report never = {
+ struct expect_report never = {
.access = {
{ test_kernel_assert_writer, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT },
{ test_kernel_assert_writer, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT },
@@ -791,7 +1082,7 @@ static void test_assert_exclusive_access_writer(struct kunit *test)
__no_kcsan
static void test_assert_exclusive_bits_change(struct kunit *test)
{
- const struct expect_report expect = {
+ struct expect_report expect = {
.access = {
{ test_kernel_assert_bits_change, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT },
{ test_kernel_change_bits, &test_var, sizeof(test_var),
@@ -822,43 +1113,43 @@ static void test_assert_exclusive_bits_nochange(struct kunit *test)
__no_kcsan
static void test_assert_exclusive_writer_scoped(struct kunit *test)
{
- const struct expect_report expect_start = {
+ struct expect_report expect_start = {
.access = {
{ test_kernel_assert_writer_scoped, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_SCOPED },
{ test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
},
};
- const struct expect_report expect_anywhere = {
+ struct expect_report expect_inscope = {
.access = {
{ test_enter_scope, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_SCOPED },
{ test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
},
};
bool match_expect_start = false;
- bool match_expect_anywhere = false;
+ bool match_expect_inscope = false;
begin_test_checks(test_kernel_assert_writer_scoped, test_kernel_write_nochange);
do {
match_expect_start |= report_matches(&expect_start);
- match_expect_anywhere |= report_matches(&expect_anywhere);
- } while (!end_test_checks(match_expect_start && match_expect_anywhere));
+ match_expect_inscope |= report_matches(&expect_inscope);
+ } while (!end_test_checks(match_expect_inscope));
KUNIT_EXPECT_TRUE(test, match_expect_start);
- KUNIT_EXPECT_TRUE(test, match_expect_anywhere);
+ KUNIT_EXPECT_FALSE(test, match_expect_inscope);
}
__no_kcsan
static void test_assert_exclusive_access_scoped(struct kunit *test)
{
- const struct expect_report expect_start1 = {
+ struct expect_report expect_start1 = {
.access = {
{ test_kernel_assert_access_scoped, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_SCOPED },
{ test_kernel_read, &test_var, sizeof(test_var), 0 },
},
};
- const struct expect_report expect_start2 = {
+ struct expect_report expect_start2 = {
.access = { expect_start1.access[0], expect_start1.access[0] },
};
- const struct expect_report expect_inscope = {
+ struct expect_report expect_inscope = {
.access = {
{ test_enter_scope, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_SCOPED },
{ test_kernel_read, &test_var, sizeof(test_var), 0 },
@@ -872,9 +1163,9 @@ static void test_assert_exclusive_access_scoped(struct kunit *test)
do {
match_expect_start |= report_matches(&expect_start1) || report_matches(&expect_start2);
match_expect_inscope |= report_matches(&expect_inscope);
- } while (!end_test_checks(match_expect_start && match_expect_inscope));
+ } while (!end_test_checks(match_expect_inscope));
KUNIT_EXPECT_TRUE(test, match_expect_start);
- KUNIT_EXPECT_TRUE(test, match_expect_inscope);
+ KUNIT_EXPECT_FALSE(test, match_expect_inscope);
}
/*
@@ -963,7 +1254,7 @@ static void test_atomic_builtins(struct kunit *test)
__no_kcsan
static void test_1bit_value_change(struct kunit *test)
{
- const struct expect_report expect = {
+ struct expect_report expect = {
.access = {
{ test_kernel_read, &test_var, sizeof(test_var), 0 },
{ test_kernel_xor_1bit, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(KCSAN_ACCESS_WRITE) },
@@ -983,6 +1274,90 @@ static void test_1bit_value_change(struct kunit *test)
KUNIT_EXPECT_TRUE(test, match);
}
+__no_kcsan
+static void test_correct_barrier(struct kunit *test)
+{
+ struct expect_report expect = {
+ .access = {
+ { test_kernel_with_memorder, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(KCSAN_ACCESS_WRITE) },
+ { test_kernel_with_memorder, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(0) },
+ },
+ };
+ bool match_expect = false;
+
+ test_struct.val[0] = 0; /* init unlocked */
+ begin_test_checks(test_kernel_with_memorder, test_kernel_with_memorder);
+ do {
+ match_expect = report_matches_any_reordered(&expect);
+ } while (!end_test_checks(match_expect));
+ KUNIT_EXPECT_FALSE(test, match_expect);
+}
+
+__no_kcsan
+static void test_missing_barrier(struct kunit *test)
+{
+ struct expect_report expect = {
+ .access = {
+ { test_kernel_wrong_memorder, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(KCSAN_ACCESS_WRITE) },
+ { test_kernel_wrong_memorder, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(0) },
+ },
+ };
+ bool match_expect = false;
+
+ test_struct.val[0] = 0; /* init unlocked */
+ begin_test_checks(test_kernel_wrong_memorder, test_kernel_wrong_memorder);
+ do {
+ match_expect = report_matches_any_reordered(&expect);
+ } while (!end_test_checks(match_expect));
+ if (IS_ENABLED(CONFIG_KCSAN_WEAK_MEMORY))
+ KUNIT_EXPECT_TRUE(test, match_expect);
+ else
+ KUNIT_EXPECT_FALSE(test, match_expect);
+}
+
+__no_kcsan
+static void test_atomic_builtins_correct_barrier(struct kunit *test)
+{
+ struct expect_report expect = {
+ .access = {
+ { test_kernel_atomic_builtin_with_memorder, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(KCSAN_ACCESS_WRITE) },
+ { test_kernel_atomic_builtin_with_memorder, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(0) },
+ },
+ };
+ bool match_expect = false;
+
+ test_struct.val[0] = 0; /* init unlocked */
+ begin_test_checks(test_kernel_atomic_builtin_with_memorder,
+ test_kernel_atomic_builtin_with_memorder);
+ do {
+ match_expect = report_matches_any_reordered(&expect);
+ } while (!end_test_checks(match_expect));
+ KUNIT_EXPECT_FALSE(test, match_expect);
+}
+
+__no_kcsan
+static void test_atomic_builtins_missing_barrier(struct kunit *test)
+{
+ struct expect_report expect = {
+ .access = {
+ { test_kernel_atomic_builtin_wrong_memorder, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(KCSAN_ACCESS_WRITE) },
+ { test_kernel_atomic_builtin_wrong_memorder, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(0) },
+ },
+ };
+ bool match_expect = false;
+
+ test_struct.val[0] = 0; /* init unlocked */
+ begin_test_checks(test_kernel_atomic_builtin_wrong_memorder,
+ test_kernel_atomic_builtin_wrong_memorder);
+ do {
+ match_expect = report_matches_any_reordered(&expect);
+ } while (!end_test_checks(match_expect));
+ if (IS_ENABLED(CONFIG_KCSAN_WEAK_MEMORY))
+ KUNIT_EXPECT_TRUE(test, match_expect);
+ else
+ KUNIT_EXPECT_FALSE(test, match_expect);
+}
+
/*
* Generate thread counts for all test cases. Values generated are in interval
* [2, 5] followed by exponentially increasing thread counts from 8 to 32.
@@ -1032,6 +1407,7 @@ static const void *nthreads_gen_params(const void *prev, char *desc)
#define KCSAN_KUNIT_CASE(test_name) KUNIT_CASE_PARAM(test_name, nthreads_gen_params)
static struct kunit_case kcsan_test_cases[] = {
+ KUNIT_CASE(test_barrier_nothreads),
KCSAN_KUNIT_CASE(test_basic),
KCSAN_KUNIT_CASE(test_concurrent_races),
KCSAN_KUNIT_CASE(test_novalue_change),
@@ -1056,6 +1432,10 @@ static struct kunit_case kcsan_test_cases[] = {
KCSAN_KUNIT_CASE(test_seqlock_noreport),
KCSAN_KUNIT_CASE(test_atomic_builtins),
KCSAN_KUNIT_CASE(test_1bit_value_change),
+ KCSAN_KUNIT_CASE(test_correct_barrier),
+ KCSAN_KUNIT_CASE(test_missing_barrier),
+ KCSAN_KUNIT_CASE(test_atomic_builtins_correct_barrier),
+ KCSAN_KUNIT_CASE(test_atomic_builtins_missing_barrier),
{},
};
@@ -1120,6 +1500,9 @@ static int test_init(struct kunit *test)
observed.nlines = 0;
spin_unlock_irqrestore(&observed.lock, flags);
+ if (strstr(test->name, "nothreads"))
+ return 0;
+
if (!torture_init_begin((char *)test->name, 1))
return -EBUSY;
@@ -1162,6 +1545,9 @@ static void test_exit(struct kunit *test)
struct task_struct **stop_thread;
int i;
+ if (strstr(test->name, "nothreads"))
+ return;
+
if (torture_cleanup_begin())
return;
@@ -1224,7 +1610,7 @@ static void kcsan_test_exit(void)
tracepoint_synchronize_unregister();
}
-late_initcall(kcsan_test_init);
+late_initcall_sync(kcsan_test_init);
module_exit(kcsan_test_exit);
MODULE_LICENSE("GPL v2");
diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c
index 21137929d428..67794404042a 100644
--- a/kernel/kcsan/report.c
+++ b/kernel/kcsan/report.c
@@ -8,6 +8,7 @@
#include <linux/debug_locks.h>
#include <linux/delay.h>
#include <linux/jiffies.h>
+#include <linux/kallsyms.h>
#include <linux/kernel.h>
#include <linux/lockdep.h>
#include <linux/preempt.h>
@@ -31,6 +32,7 @@ struct access_info {
int access_type;
int task_pid;
int cpu_id;
+ unsigned long ip;
};
/*
@@ -213,9 +215,9 @@ static const char *get_access_type(int type)
if (type & KCSAN_ACCESS_ASSERT) {
if (type & KCSAN_ACCESS_SCOPED) {
if (type & KCSAN_ACCESS_WRITE)
- return "assert no accesses (scoped)";
+ return "assert no accesses (reordered)";
else
- return "assert no writes (scoped)";
+ return "assert no writes (reordered)";
} else {
if (type & KCSAN_ACCESS_WRITE)
return "assert no accesses";
@@ -238,13 +240,17 @@ static const char *get_access_type(int type)
case KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC:
return "read-write (marked)";
case KCSAN_ACCESS_SCOPED:
- return "read (scoped)";
+ return "read (reordered)";
case KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_ATOMIC:
- return "read (marked, scoped)";
+ return "read (marked, reordered)";
case KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_WRITE:
- return "write (scoped)";
+ return "write (reordered)";
case KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC:
- return "write (marked, scoped)";
+ return "write (marked, reordered)";
+ case KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE:
+ return "read-write (reordered)";
+ case KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC:
+ return "read-write (marked, reordered)";
default:
BUG();
}
@@ -300,6 +306,52 @@ static int get_stack_skipnr(const unsigned long stack_entries[], int num_entries
return skip;
}
+/*
+ * Skips to the first entry that matches the function of @ip, and then replaces
+ * that entry with @ip, returning the entries to skip with @replaced containing
+ * the replaced entry.
+ */
+static int
+replace_stack_entry(unsigned long stack_entries[], int num_entries, unsigned long ip,
+ unsigned long *replaced)
+{
+ unsigned long symbolsize, offset;
+ unsigned long target_func;
+ int skip;
+
+ if (kallsyms_lookup_size_offset(ip, &symbolsize, &offset))
+ target_func = ip - offset;
+ else
+ goto fallback;
+
+ for (skip = 0; skip < num_entries; ++skip) {
+ unsigned long func = stack_entries[skip];
+
+ if (!kallsyms_lookup_size_offset(func, &symbolsize, &offset))
+ goto fallback;
+ func -= offset;
+
+ if (func == target_func) {
+ *replaced = stack_entries[skip];
+ stack_entries[skip] = ip;
+ return skip;
+ }
+ }
+
+fallback:
+ /* Should not happen; the resulting stack trace is likely misleading. */
+ WARN_ONCE(1, "Cannot find frame for %pS in stack trace", (void *)ip);
+ return get_stack_skipnr(stack_entries, num_entries);
+}
+
+static int
+sanitize_stack_entries(unsigned long stack_entries[], int num_entries, unsigned long ip,
+ unsigned long *replaced)
+{
+ return ip ? replace_stack_entry(stack_entries, num_entries, ip, replaced) :
+ get_stack_skipnr(stack_entries, num_entries);
+}
+
/* Compares symbolized strings of addr1 and addr2. */
static int sym_strcmp(void *addr1, void *addr2)
{
@@ -312,6 +364,14 @@ static int sym_strcmp(void *addr1, void *addr2)
return strncmp(buf1, buf2, sizeof(buf1));
}
+static void
+print_stack_trace(unsigned long stack_entries[], int num_entries, unsigned long reordered_to)
+{
+ stack_trace_print(stack_entries, num_entries, 0);
+ if (reordered_to)
+ pr_err(" |\n +-> reordered to: %pS\n", (void *)reordered_to);
+}
+
static void print_verbose_info(struct task_struct *task)
{
if (!task)
@@ -327,13 +387,15 @@ static void print_verbose_info(struct task_struct *task)
static void print_report(enum kcsan_value_change value_change,
const struct access_info *ai,
- const struct other_info *other_info,
+ struct other_info *other_info,
u64 old, u64 new, u64 mask)
{
+ unsigned long reordered_to = 0;
unsigned long stack_entries[NUM_STACK_ENTRIES] = { 0 };
int num_stack_entries = stack_trace_save(stack_entries, NUM_STACK_ENTRIES, 1);
- int skipnr = get_stack_skipnr(stack_entries, num_stack_entries);
+ int skipnr = sanitize_stack_entries(stack_entries, num_stack_entries, ai->ip, &reordered_to);
unsigned long this_frame = stack_entries[skipnr];
+ unsigned long other_reordered_to = 0;
unsigned long other_frame = 0;
int other_skipnr = 0; /* silence uninit warnings */
@@ -344,8 +406,9 @@ static void print_report(enum kcsan_value_change value_change,
return;
if (other_info) {
- other_skipnr = get_stack_skipnr(other_info->stack_entries,
- other_info->num_stack_entries);
+ other_skipnr = sanitize_stack_entries(other_info->stack_entries,
+ other_info->num_stack_entries,
+ other_info->ai.ip, &other_reordered_to);
other_frame = other_info->stack_entries[other_skipnr];
/* @value_change is only known for the other thread */
@@ -385,10 +448,9 @@ static void print_report(enum kcsan_value_change value_change,
other_info->ai.cpu_id);
/* Print the other thread's stack trace. */
- stack_trace_print(other_info->stack_entries + other_skipnr,
+ print_stack_trace(other_info->stack_entries + other_skipnr,
other_info->num_stack_entries - other_skipnr,
- 0);
-
+ other_reordered_to);
if (IS_ENABLED(CONFIG_KCSAN_VERBOSE))
print_verbose_info(other_info->task);
@@ -402,9 +464,7 @@ static void print_report(enum kcsan_value_change value_change,
get_thread_desc(ai->task_pid), ai->cpu_id);
}
/* Print stack trace of this thread. */
- stack_trace_print(stack_entries + skipnr, num_stack_entries - skipnr,
- 0);
-
+ print_stack_trace(stack_entries + skipnr, num_stack_entries - skipnr, reordered_to);
if (IS_ENABLED(CONFIG_KCSAN_VERBOSE))
print_verbose_info(current);
@@ -576,21 +636,23 @@ discard:
}
static struct access_info prepare_access_info(const volatile void *ptr, size_t size,
- int access_type)
+ int access_type, unsigned long ip)
{
return (struct access_info) {
.ptr = ptr,
.size = size,
.access_type = access_type,
.task_pid = in_task() ? task_pid_nr(current) : -1,
- .cpu_id = raw_smp_processor_id()
+ .cpu_id = raw_smp_processor_id(),
+ /* Only replace stack entry with @ip if scoped access. */
+ .ip = (access_type & KCSAN_ACCESS_SCOPED) ? ip : 0,
};
}
void kcsan_report_set_info(const volatile void *ptr, size_t size, int access_type,
- int watchpoint_idx)
+ unsigned long ip, int watchpoint_idx)
{
- const struct access_info ai = prepare_access_info(ptr, size, access_type);
+ const struct access_info ai = prepare_access_info(ptr, size, access_type, ip);
unsigned long flags;
kcsan_disable_current();
@@ -603,10 +665,10 @@ void kcsan_report_set_info(const volatile void *ptr, size_t size, int access_typ
}
void kcsan_report_known_origin(const volatile void *ptr, size_t size, int access_type,
- enum kcsan_value_change value_change, int watchpoint_idx,
- u64 old, u64 new, u64 mask)
+ unsigned long ip, enum kcsan_value_change value_change,
+ int watchpoint_idx, u64 old, u64 new, u64 mask)
{
- const struct access_info ai = prepare_access_info(ptr, size, access_type);
+ const struct access_info ai = prepare_access_info(ptr, size, access_type, ip);
struct other_info *other_info = &other_infos[watchpoint_idx];
unsigned long flags = 0;
@@ -637,9 +699,9 @@ out:
}
void kcsan_report_unknown_origin(const volatile void *ptr, size_t size, int access_type,
- u64 old, u64 new, u64 mask)
+ unsigned long ip, u64 old, u64 new, u64 mask)
{
- const struct access_info ai = prepare_access_info(ptr, size, access_type);
+ const struct access_info ai = prepare_access_info(ptr, size, access_type, ip);
unsigned long flags;
kcsan_disable_current();
diff --git a/kernel/kcsan/selftest.c b/kernel/kcsan/selftest.c
index 7f29cb0f5e63..75712959c84e 100644
--- a/kernel/kcsan/selftest.c
+++ b/kernel/kcsan/selftest.c
@@ -7,10 +7,15 @@
#define pr_fmt(fmt) "kcsan: " fmt
+#include <linux/atomic.h>
+#include <linux/bitops.h>
#include <linux/init.h>
+#include <linux/kcsan-checks.h>
#include <linux/kernel.h>
#include <linux/printk.h>
#include <linux/random.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
#include <linux/types.h>
#include "encoding.h"
@@ -18,7 +23,7 @@
#define ITERS_PER_TEST 2000
/* Test requirements. */
-static bool test_requires(void)
+static bool __init test_requires(void)
{
/* random should be initialized for the below tests */
return prandom_u32() + prandom_u32() != 0;
@@ -28,14 +33,18 @@ static bool test_requires(void)
* Test watchpoint encode and decode: check that encoding some access's info,
* and then subsequent decode preserves the access's info.
*/
-static bool test_encode_decode(void)
+static bool __init test_encode_decode(void)
{
int i;
for (i = 0; i < ITERS_PER_TEST; ++i) {
size_t size = prandom_u32_max(MAX_ENCODABLE_SIZE) + 1;
bool is_write = !!prandom_u32_max(2);
+ unsigned long verif_masked_addr;
+ long encoded_watchpoint;
+ bool verif_is_write;
unsigned long addr;
+ size_t verif_size;
prandom_bytes(&addr, sizeof(addr));
if (addr < PAGE_SIZE)
@@ -44,53 +53,37 @@ static bool test_encode_decode(void)
if (WARN_ON(!check_encodable(addr, size)))
return false;
- /* Encode and decode */
- {
- const long encoded_watchpoint =
- encode_watchpoint(addr, size, is_write);
- unsigned long verif_masked_addr;
- size_t verif_size;
- bool verif_is_write;
-
- /* Check special watchpoints */
- if (WARN_ON(decode_watchpoint(
- INVALID_WATCHPOINT, &verif_masked_addr,
- &verif_size, &verif_is_write)))
- return false;
- if (WARN_ON(decode_watchpoint(
- CONSUMED_WATCHPOINT, &verif_masked_addr,
- &verif_size, &verif_is_write)))
- return false;
-
- /* Check decoding watchpoint returns same data */
- if (WARN_ON(!decode_watchpoint(
- encoded_watchpoint, &verif_masked_addr,
- &verif_size, &verif_is_write)))
- return false;
- if (WARN_ON(verif_masked_addr !=
- (addr & WATCHPOINT_ADDR_MASK)))
- goto fail;
- if (WARN_ON(verif_size != size))
- goto fail;
- if (WARN_ON(is_write != verif_is_write))
- goto fail;
-
- continue;
-fail:
- pr_err("%s fail: %s %zu bytes @ %lx -> encoded: %lx -> %s %zu bytes @ %lx\n",
- __func__, is_write ? "write" : "read", size,
- addr, encoded_watchpoint,
- verif_is_write ? "write" : "read", verif_size,
- verif_masked_addr);
+ encoded_watchpoint = encode_watchpoint(addr, size, is_write);
+
+ /* Check special watchpoints */
+ if (WARN_ON(decode_watchpoint(INVALID_WATCHPOINT, &verif_masked_addr, &verif_size, &verif_is_write)))
+ return false;
+ if (WARN_ON(decode_watchpoint(CONSUMED_WATCHPOINT, &verif_masked_addr, &verif_size, &verif_is_write)))
return false;
- }
+
+ /* Check decoding watchpoint returns same data */
+ if (WARN_ON(!decode_watchpoint(encoded_watchpoint, &verif_masked_addr, &verif_size, &verif_is_write)))
+ return false;
+ if (WARN_ON(verif_masked_addr != (addr & WATCHPOINT_ADDR_MASK)))
+ goto fail;
+ if (WARN_ON(verif_size != size))
+ goto fail;
+ if (WARN_ON(is_write != verif_is_write))
+ goto fail;
+
+ continue;
+fail:
+ pr_err("%s fail: %s %zu bytes @ %lx -> encoded: %lx -> %s %zu bytes @ %lx\n",
+ __func__, is_write ? "write" : "read", size, addr, encoded_watchpoint,
+ verif_is_write ? "write" : "read", verif_size, verif_masked_addr);
+ return false;
}
return true;
}
/* Test access matching function. */
-static bool test_matching_access(void)
+static bool __init test_matching_access(void)
{
if (WARN_ON(!matching_access(10, 1, 10, 1)))
return false;
@@ -115,6 +108,143 @@ static bool test_matching_access(void)
return true;
}
+/*
+ * Correct memory barrier instrumentation is critical to avoiding false
+ * positives: simple test to check at boot certain barriers are always properly
+ * instrumented. See kcsan_test for a more complete test.
+ */
+static DEFINE_SPINLOCK(test_spinlock);
+static bool __init test_barrier(void)
+{
+#ifdef CONFIG_KCSAN_WEAK_MEMORY
+ struct kcsan_scoped_access *reorder_access = &current->kcsan_ctx.reorder_access;
+#else
+ struct kcsan_scoped_access *reorder_access = NULL;
+#endif
+ bool ret = true;
+ arch_spinlock_t arch_spinlock = __ARCH_SPIN_LOCK_UNLOCKED;
+ atomic_t dummy;
+ long test_var;
+
+ if (!reorder_access || !IS_ENABLED(CONFIG_SMP))
+ return true;
+
+#define __KCSAN_CHECK_BARRIER(access_type, barrier, name) \
+ do { \
+ reorder_access->type = (access_type) | KCSAN_ACCESS_SCOPED; \
+ reorder_access->size = 1; \
+ barrier; \
+ if (reorder_access->size != 0) { \
+ pr_err("improperly instrumented type=(" #access_type "): " name "\n"); \
+ ret = false; \
+ } \
+ } while (0)
+#define KCSAN_CHECK_READ_BARRIER(b) __KCSAN_CHECK_BARRIER(0, b, #b)
+#define KCSAN_CHECK_WRITE_BARRIER(b) __KCSAN_CHECK_BARRIER(KCSAN_ACCESS_WRITE, b, #b)
+#define KCSAN_CHECK_RW_BARRIER(b) __KCSAN_CHECK_BARRIER(KCSAN_ACCESS_WRITE | KCSAN_ACCESS_COMPOUND, b, #b)
+
+ kcsan_nestable_atomic_begin(); /* No watchpoints in called functions. */
+
+ KCSAN_CHECK_READ_BARRIER(mb());
+ KCSAN_CHECK_READ_BARRIER(rmb());
+ KCSAN_CHECK_READ_BARRIER(smp_mb());
+ KCSAN_CHECK_READ_BARRIER(smp_rmb());
+ KCSAN_CHECK_READ_BARRIER(dma_rmb());
+ KCSAN_CHECK_READ_BARRIER(smp_mb__before_atomic());
+ KCSAN_CHECK_READ_BARRIER(smp_mb__after_atomic());
+ KCSAN_CHECK_READ_BARRIER(smp_mb__after_spinlock());
+ KCSAN_CHECK_READ_BARRIER(smp_store_mb(test_var, 0));
+ KCSAN_CHECK_READ_BARRIER(smp_store_release(&test_var, 0));
+ KCSAN_CHECK_READ_BARRIER(xchg(&test_var, 0));
+ KCSAN_CHECK_READ_BARRIER(xchg_release(&test_var, 0));
+ KCSAN_CHECK_READ_BARRIER(cmpxchg(&test_var, 0, 0));
+ KCSAN_CHECK_READ_BARRIER(cmpxchg_release(&test_var, 0, 0));
+ KCSAN_CHECK_READ_BARRIER(atomic_set_release(&dummy, 0));
+ KCSAN_CHECK_READ_BARRIER(atomic_add_return(1, &dummy));
+ KCSAN_CHECK_READ_BARRIER(atomic_add_return_release(1, &dummy));
+ KCSAN_CHECK_READ_BARRIER(atomic_fetch_add(1, &dummy));
+ KCSAN_CHECK_READ_BARRIER(atomic_fetch_add_release(1, &dummy));
+ KCSAN_CHECK_READ_BARRIER(test_and_set_bit(0, &test_var));
+ KCSAN_CHECK_READ_BARRIER(test_and_clear_bit(0, &test_var));
+ KCSAN_CHECK_READ_BARRIER(test_and_change_bit(0, &test_var));
+ KCSAN_CHECK_READ_BARRIER(clear_bit_unlock(0, &test_var));
+ KCSAN_CHECK_READ_BARRIER(__clear_bit_unlock(0, &test_var));
+ arch_spin_lock(&arch_spinlock);
+ KCSAN_CHECK_READ_BARRIER(arch_spin_unlock(&arch_spinlock));
+ spin_lock(&test_spinlock);
+ KCSAN_CHECK_READ_BARRIER(spin_unlock(&test_spinlock));
+
+ KCSAN_CHECK_WRITE_BARRIER(mb());
+ KCSAN_CHECK_WRITE_BARRIER(wmb());
+ KCSAN_CHECK_WRITE_BARRIER(smp_mb());
+ KCSAN_CHECK_WRITE_BARRIER(smp_wmb());
+ KCSAN_CHECK_WRITE_BARRIER(dma_wmb());
+ KCSAN_CHECK_WRITE_BARRIER(smp_mb__before_atomic());
+ KCSAN_CHECK_WRITE_BARRIER(smp_mb__after_atomic());
+ KCSAN_CHECK_WRITE_BARRIER(smp_mb__after_spinlock());
+ KCSAN_CHECK_WRITE_BARRIER(smp_store_mb(test_var, 0));
+ KCSAN_CHECK_WRITE_BARRIER(smp_store_release(&test_var, 0));
+ KCSAN_CHECK_WRITE_BARRIER(xchg(&test_var, 0));
+ KCSAN_CHECK_WRITE_BARRIER(xchg_release(&test_var, 0));
+ KCSAN_CHECK_WRITE_BARRIER(cmpxchg(&test_var, 0, 0));
+ KCSAN_CHECK_WRITE_BARRIER(cmpxchg_release(&test_var, 0, 0));
+ KCSAN_CHECK_WRITE_BARRIER(atomic_set_release(&dummy, 0));
+ KCSAN_CHECK_WRITE_BARRIER(atomic_add_return(1, &dummy));
+ KCSAN_CHECK_WRITE_BARRIER(atomic_add_return_release(1, &dummy));
+ KCSAN_CHECK_WRITE_BARRIER(atomic_fetch_add(1, &dummy));
+ KCSAN_CHECK_WRITE_BARRIER(atomic_fetch_add_release(1, &dummy));
+ KCSAN_CHECK_WRITE_BARRIER(test_and_set_bit(0, &test_var));
+ KCSAN_CHECK_WRITE_BARRIER(test_and_clear_bit(0, &test_var));
+ KCSAN_CHECK_WRITE_BARRIER(test_and_change_bit(0, &test_var));
+ KCSAN_CHECK_WRITE_BARRIER(clear_bit_unlock(0, &test_var));
+ KCSAN_CHECK_WRITE_BARRIER(__clear_bit_unlock(0, &test_var));
+ arch_spin_lock(&arch_spinlock);
+ KCSAN_CHECK_WRITE_BARRIER(arch_spin_unlock(&arch_spinlock));
+ spin_lock(&test_spinlock);
+ KCSAN_CHECK_WRITE_BARRIER(spin_unlock(&test_spinlock));
+
+ KCSAN_CHECK_RW_BARRIER(mb());
+ KCSAN_CHECK_RW_BARRIER(wmb());
+ KCSAN_CHECK_RW_BARRIER(rmb());
+ KCSAN_CHECK_RW_BARRIER(smp_mb());
+ KCSAN_CHECK_RW_BARRIER(smp_wmb());
+ KCSAN_CHECK_RW_BARRIER(smp_rmb());
+ KCSAN_CHECK_RW_BARRIER(dma_wmb());
+ KCSAN_CHECK_RW_BARRIER(dma_rmb());
+ KCSAN_CHECK_RW_BARRIER(smp_mb__before_atomic());
+ KCSAN_CHECK_RW_BARRIER(smp_mb__after_atomic());
+ KCSAN_CHECK_RW_BARRIER(smp_mb__after_spinlock());
+ KCSAN_CHECK_RW_BARRIER(smp_store_mb(test_var, 0));
+ KCSAN_CHECK_RW_BARRIER(smp_store_release(&test_var, 0));
+ KCSAN_CHECK_RW_BARRIER(xchg(&test_var, 0));
+ KCSAN_CHECK_RW_BARRIER(xchg_release(&test_var, 0));
+ KCSAN_CHECK_RW_BARRIER(cmpxchg(&test_var, 0, 0));
+ KCSAN_CHECK_RW_BARRIER(cmpxchg_release(&test_var, 0, 0));
+ KCSAN_CHECK_RW_BARRIER(atomic_set_release(&dummy, 0));
+ KCSAN_CHECK_RW_BARRIER(atomic_add_return(1, &dummy));
+ KCSAN_CHECK_RW_BARRIER(atomic_add_return_release(1, &dummy));
+ KCSAN_CHECK_RW_BARRIER(atomic_fetch_add(1, &dummy));
+ KCSAN_CHECK_RW_BARRIER(atomic_fetch_add_release(1, &dummy));
+ KCSAN_CHECK_RW_BARRIER(test_and_set_bit(0, &test_var));
+ KCSAN_CHECK_RW_BARRIER(test_and_clear_bit(0, &test_var));
+ KCSAN_CHECK_RW_BARRIER(test_and_change_bit(0, &test_var));
+ KCSAN_CHECK_RW_BARRIER(clear_bit_unlock(0, &test_var));
+ KCSAN_CHECK_RW_BARRIER(__clear_bit_unlock(0, &test_var));
+ arch_spin_lock(&arch_spinlock);
+ KCSAN_CHECK_RW_BARRIER(arch_spin_unlock(&arch_spinlock));
+ spin_lock(&test_spinlock);
+ KCSAN_CHECK_RW_BARRIER(spin_unlock(&test_spinlock));
+
+#ifdef clear_bit_unlock_is_negative_byte
+ KCSAN_CHECK_RW_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var));
+ KCSAN_CHECK_READ_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var));
+ KCSAN_CHECK_WRITE_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var));
+#endif
+ kcsan_nestable_atomic_end();
+
+ return ret;
+}
+
static int __init kcsan_selftest(void)
{
int passed = 0;
@@ -132,6 +262,7 @@ static int __init kcsan_selftest(void)
RUN_TEST(test_requires);
RUN_TEST(test_encode_decode);
RUN_TEST(test_matching_access);
+ RUN_TEST(test_barrier);
pr_info("selftest: %d/%d tests passed\n", passed, total);
if (passed != total)
diff --git a/kernel/kexec.c b/kernel/kexec.c
index c82c6c06f051..b5e40f069768 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -19,26 +19,9 @@
#include "kexec_internal.h"
-static int copy_user_segment_list(struct kimage *image,
- unsigned long nr_segments,
- struct kexec_segment __user *segments)
-{
- int ret;
- size_t segment_bytes;
-
- /* Read in the segments */
- image->nr_segments = nr_segments;
- segment_bytes = nr_segments * sizeof(*segments);
- ret = copy_from_user(image->segment, segments, segment_bytes);
- if (ret)
- ret = -EFAULT;
-
- return ret;
-}
-
static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
unsigned long nr_segments,
- struct kexec_segment __user *segments,
+ struct kexec_segment *segments,
unsigned long flags)
{
int ret;
@@ -58,10 +41,8 @@ static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
return -ENOMEM;
image->start = entry;
-
- ret = copy_user_segment_list(image, nr_segments, segments);
- if (ret)
- goto out_free_image;
+ image->nr_segments = nr_segments;
+ memcpy(image->segment, segments, nr_segments * sizeof(*segments));
if (kexec_on_panic) {
/* Enable special crash kernel control page alloc policy. */
@@ -104,12 +85,23 @@ out_free_image:
}
static int do_kexec_load(unsigned long entry, unsigned long nr_segments,
- struct kexec_segment __user *segments, unsigned long flags)
+ struct kexec_segment *segments, unsigned long flags)
{
struct kimage **dest_image, *image;
unsigned long i;
int ret;
+ /*
+ * Because we write directly to the reserved memory region when loading
+ * crash kernels we need a mutex here to prevent multiple crash kernels
+ * from attempting to load simultaneously, and to prevent a crash kernel
+ * from loading over the top of a in use crash kernel.
+ *
+ * KISS: always take the mutex.
+ */
+ if (!mutex_trylock(&kexec_mutex))
+ return -EBUSY;
+
if (flags & KEXEC_ON_CRASH) {
dest_image = &kexec_crash_image;
if (kexec_crash_image)
@@ -121,7 +113,8 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments,
if (nr_segments == 0) {
/* Uninstall image */
kimage_free(xchg(dest_image, NULL));
- return 0;
+ ret = 0;
+ goto out_unlock;
}
if (flags & KEXEC_ON_CRASH) {
/*
@@ -134,7 +127,7 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments,
ret = kimage_alloc_init(&image, entry, nr_segments, segments, flags);
if (ret)
- return ret;
+ goto out_unlock;
if (flags & KEXEC_PRESERVE_CONTEXT)
image->preserve_context = 1;
@@ -171,6 +164,8 @@ out:
arch_kexec_protect_crashkres();
kimage_free(image);
+out_unlock:
+ mutex_unlock(&kexec_mutex);
return ret;
}
@@ -236,7 +231,8 @@ static inline int kexec_load_check(unsigned long nr_segments,
SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
struct kexec_segment __user *, segments, unsigned long, flags)
{
- int result;
+ struct kexec_segment *ksegments;
+ unsigned long result;
result = kexec_load_check(nr_segments, flags);
if (result)
@@ -247,20 +243,12 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT))
return -EINVAL;
- /* Because we write directly to the reserved memory
- * region when loading crash kernels we need a mutex here to
- * prevent multiple crash kernels from attempting to load
- * simultaneously, and to prevent a crash kernel from loading
- * over the top of a in use crash kernel.
- *
- * KISS: always take the mutex.
- */
- if (!mutex_trylock(&kexec_mutex))
- return -EBUSY;
+ ksegments = memdup_user(segments, nr_segments * sizeof(ksegments[0]));
+ if (IS_ERR(ksegments))
+ return PTR_ERR(ksegments);
- result = do_kexec_load(entry, nr_segments, segments, flags);
-
- mutex_unlock(&kexec_mutex);
+ result = do_kexec_load(entry, nr_segments, ksegments, flags);
+ kfree(ksegments);
return result;
}
@@ -272,7 +260,7 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry,
compat_ulong_t, flags)
{
struct compat_kexec_segment in;
- struct kexec_segment out, __user *ksegments;
+ struct kexec_segment *ksegments;
unsigned long i, result;
result = kexec_load_check(nr_segments, flags);
@@ -285,37 +273,26 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry,
if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT)
return -EINVAL;
- ksegments = compat_alloc_user_space(nr_segments * sizeof(out));
+ ksegments = kmalloc_array(nr_segments, sizeof(ksegments[0]),
+ GFP_KERNEL);
+ if (!ksegments)
+ return -ENOMEM;
+
for (i = 0; i < nr_segments; i++) {
result = copy_from_user(&in, &segments[i], sizeof(in));
if (result)
- return -EFAULT;
+ goto fail;
- out.buf = compat_ptr(in.buf);
- out.bufsz = in.bufsz;
- out.mem = in.mem;
- out.memsz = in.memsz;
-
- result = copy_to_user(&ksegments[i], &out, sizeof(out));
- if (result)
- return -EFAULT;
+ ksegments[i].buf = compat_ptr(in.buf);
+ ksegments[i].bufsz = in.bufsz;
+ ksegments[i].mem = in.mem;
+ ksegments[i].memsz = in.memsz;
}
- /* Because we write directly to the reserved memory
- * region when loading crash kernels we need a mutex here to
- * prevent multiple crash kernels from attempting to load
- * simultaneously, and to prevent a crash kernel from loading
- * over the top of a in use crash kernel.
- *
- * KISS: always take the mutex.
- */
- if (!mutex_trylock(&kexec_mutex))
- return -EBUSY;
-
result = do_kexec_load(entry, nr_segments, ksegments, flags);
- mutex_unlock(&kexec_mutex);
-
+fail:
+ kfree(ksegments);
return result;
}
#endif
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 33400ff051a8..8347fc158d2b 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -556,6 +556,11 @@ static int kexec_walk_memblock(struct kexec_buf *kbuf,
if (kbuf->image->type == KEXEC_TYPE_CRASH)
return func(&crashk_res, kbuf);
+ /*
+ * Using MEMBLOCK_NONE will properly skip MEMBLOCK_DRIVER_MANAGED. See
+ * IORESOURCE_SYSRAM_DRIVER_MANAGED handling in
+ * locate_mem_hole_callback().
+ */
if (kbuf->top_down) {
for_each_free_mem_range_reverse(i, NUMA_NO_NODE, MEMBLOCK_NONE,
&mstart, &mend, NULL) {
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 790a573bbe00..21eccc961bba 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1,7 +1,6 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Kernel Probes (KProbes)
- * kernel/kprobes.c
*
* Copyright (C) IBM Corporation, 2002, 2004
*
@@ -18,6 +17,9 @@
* <jkenisto@us.ibm.com> and Prasanna S Panchamukhi
* <prasanna@in.ibm.com> added function-return probes.
*/
+
+#define pr_fmt(fmt) "kprobes: " fmt
+
#include <linux/kprobes.h>
#include <linux/hash.h>
#include <linux/init.h>
@@ -49,18 +51,18 @@
static int kprobes_initialized;
/* kprobe_table can be accessed by
- * - Normal hlist traversal and RCU add/del under kprobe_mutex is held.
+ * - Normal hlist traversal and RCU add/del under 'kprobe_mutex' is held.
* Or
* - RCU hlist traversal under disabling preempt (breakpoint handlers)
*/
static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
-/* NOTE: change this value only with kprobe_mutex held */
+/* NOTE: change this value only with 'kprobe_mutex' held */
static bool kprobes_all_disarmed;
-/* This protects kprobe_table and optimizing_list */
+/* This protects 'kprobe_table' and 'optimizing_list' */
static DEFINE_MUTEX(kprobe_mutex);
-static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
+static DEFINE_PER_CPU(struct kprobe *, kprobe_instance);
kprobe_opcode_t * __weak kprobe_lookup_name(const char *name,
unsigned int __unused)
@@ -68,12 +70,15 @@ kprobe_opcode_t * __weak kprobe_lookup_name(const char *name,
return ((kprobe_opcode_t *)(kallsyms_lookup_name(name)));
}
-/* Blacklist -- list of struct kprobe_blacklist_entry */
+/*
+ * Blacklist -- list of 'struct kprobe_blacklist_entry' to store info where
+ * kprobes can not probe.
+ */
static LIST_HEAD(kprobe_blacklist);
#ifdef __ARCH_WANT_KPROBES_INSN_SLOT
/*
- * kprobe->ainsn.insn points to the copy of the instruction to be
+ * 'kprobe::ainsn.insn' points to the copy of the instruction to be
* single-stepped. x86_64, POWER4 and above have no-exec support and
* stepping on the instruction on a vmalloced/kmalloced/data page
* is a recipe for disaster
@@ -104,6 +109,12 @@ enum kprobe_slot_state {
void __weak *alloc_insn_page(void)
{
+ /*
+ * Use module_alloc() so this page is within +/- 2GB of where the
+ * kernel image and loaded module images reside. This is required
+ * for most of the architectures.
+ * (e.g. x86-64 needs this to handle the %rip-relative fixups.)
+ */
return module_alloc(PAGE_SIZE);
}
@@ -139,6 +150,7 @@ kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c)
list_for_each_entry_rcu(kip, &c->pages, list) {
if (kip->nused < slots_per_page(c)) {
int i;
+
for (i = 0; i < slots_per_page(c); i++) {
if (kip->slot_used[i] == SLOT_CLEAN) {
kip->slot_used[i] = SLOT_USED;
@@ -164,11 +176,6 @@ kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c)
if (!kip)
goto out;
- /*
- * Use module_alloc so this page is within +/- 2GB of where the
- * kernel image and loaded module images reside. This is required
- * so x86_64 can correctly handle the %rip-relative fixups.
- */
kip->insns = c->alloc();
if (!kip->insns) {
kfree(kip);
@@ -191,8 +198,8 @@ out:
return slot;
}
-/* Return 1 if all garbages are collected, otherwise 0. */
-static int collect_one_slot(struct kprobe_insn_page *kip, int idx)
+/* Return true if all garbages are collected, otherwise false. */
+static bool collect_one_slot(struct kprobe_insn_page *kip, int idx)
{
kip->slot_used[idx] = SLOT_CLEAN;
kip->nused--;
@@ -216,9 +223,9 @@ static int collect_one_slot(struct kprobe_insn_page *kip, int idx)
kip->cache->free(kip->insns);
kfree(kip);
}
- return 1;
+ return true;
}
- return 0;
+ return false;
}
static int collect_garbage_slots(struct kprobe_insn_cache *c)
@@ -230,6 +237,7 @@ static int collect_garbage_slots(struct kprobe_insn_cache *c)
list_for_each_entry_safe(kip, next, &c->pages, list) {
int i;
+
if (kip->ngarbage == 0)
continue;
kip->ngarbage = 0; /* we will collect all garbages */
@@ -310,7 +318,7 @@ int kprobe_cache_get_kallsym(struct kprobe_insn_cache *c, unsigned int *symnum,
list_for_each_entry_rcu(kip, &c->pages, list) {
if ((*symnum)--)
continue;
- strlcpy(sym, c->sym, KSYM_NAME_LEN);
+ strscpy(sym, c->sym, KSYM_NAME_LEN);
*type = 't';
*value = (unsigned long)kip->insns;
ret = 0;
@@ -358,9 +366,9 @@ static inline void reset_kprobe_instance(void)
/*
* This routine is called either:
- * - under the kprobe_mutex - during kprobe_[un]register()
- * OR
- * - with preemption disabled - from arch/xxx/kernel/kprobes.c
+ * - under the 'kprobe_mutex' - during kprobe_[un]register().
+ * OR
+ * - with preemption disabled - from architecture specific code.
*/
struct kprobe *get_kprobe(void *addr)
{
@@ -380,22 +388,20 @@ NOKPROBE_SYMBOL(get_kprobe);
static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs);
-/* Return true if the kprobe is an aggregator */
-static inline int kprobe_aggrprobe(struct kprobe *p)
+/* Return true if 'p' is an aggregator */
+static inline bool kprobe_aggrprobe(struct kprobe *p)
{
return p->pre_handler == aggr_pre_handler;
}
-/* Return true(!0) if the kprobe is unused */
-static inline int kprobe_unused(struct kprobe *p)
+/* Return true if 'p' is unused */
+static inline bool kprobe_unused(struct kprobe *p)
{
return kprobe_aggrprobe(p) && kprobe_disabled(p) &&
list_empty(&p->list);
}
-/*
- * Keep all fields in the kprobe consistent
- */
+/* Keep all fields in the kprobe consistent. */
static inline void copy_kprobe(struct kprobe *ap, struct kprobe *p)
{
memcpy(&p->opcode, &ap->opcode, sizeof(kprobe_opcode_t));
@@ -403,11 +409,11 @@ static inline void copy_kprobe(struct kprobe *ap, struct kprobe *p)
}
#ifdef CONFIG_OPTPROBES
-/* NOTE: change this value only with kprobe_mutex held */
+/* NOTE: This is protected by 'kprobe_mutex'. */
static bool kprobes_allow_optimization;
/*
- * Call all pre_handler on the list, but ignores its return value.
+ * Call all 'kprobe::pre_handler' on the list, but ignores its return value.
* This must be called from arch-dep optimized caller.
*/
void opt_pre_handler(struct kprobe *p, struct pt_regs *regs)
@@ -435,7 +441,7 @@ static void free_aggr_kprobe(struct kprobe *p)
kfree(op);
}
-/* Return true(!0) if the kprobe is ready for optimization. */
+/* Return true if the kprobe is ready for optimization. */
static inline int kprobe_optready(struct kprobe *p)
{
struct optimized_kprobe *op;
@@ -448,8 +454,8 @@ static inline int kprobe_optready(struct kprobe *p)
return 0;
}
-/* Return true(!0) if the kprobe is disarmed. Note: p must be on hash list */
-static inline int kprobe_disarmed(struct kprobe *p)
+/* Return true if the kprobe is disarmed. Note: p must be on hash list */
+static inline bool kprobe_disarmed(struct kprobe *p)
{
struct optimized_kprobe *op;
@@ -462,32 +468,32 @@ static inline int kprobe_disarmed(struct kprobe *p)
return kprobe_disabled(p) && list_empty(&op->list);
}
-/* Return true(!0) if the probe is queued on (un)optimizing lists */
-static int kprobe_queued(struct kprobe *p)
+/* Return true if the probe is queued on (un)optimizing lists */
+static bool kprobe_queued(struct kprobe *p)
{
struct optimized_kprobe *op;
if (kprobe_aggrprobe(p)) {
op = container_of(p, struct optimized_kprobe, kp);
if (!list_empty(&op->list))
- return 1;
+ return true;
}
- return 0;
+ return false;
}
/*
* Return an optimized kprobe whose optimizing code replaces
- * instructions including addr (exclude breakpoint).
+ * instructions including 'addr' (exclude breakpoint).
*/
-static struct kprobe *get_optimized_kprobe(unsigned long addr)
+static struct kprobe *get_optimized_kprobe(kprobe_opcode_t *addr)
{
int i;
struct kprobe *p = NULL;
struct optimized_kprobe *op;
/* Don't check i == 0, since that is a breakpoint case. */
- for (i = 1; !p && i < MAX_OPTIMIZED_LENGTH; i++)
- p = get_kprobe((void *)(addr - i));
+ for (i = 1; !p && i < MAX_OPTIMIZED_LENGTH / sizeof(kprobe_opcode_t); i++)
+ p = get_kprobe(addr - i);
if (p && kprobe_optready(p)) {
op = container_of(p, struct optimized_kprobe, kp);
@@ -498,7 +504,7 @@ static struct kprobe *get_optimized_kprobe(unsigned long addr)
return NULL;
}
-/* Optimization staging list, protected by kprobe_mutex */
+/* Optimization staging list, protected by 'kprobe_mutex' */
static LIST_HEAD(optimizing_list);
static LIST_HEAD(unoptimizing_list);
static LIST_HEAD(freeing_list);
@@ -509,20 +515,20 @@ static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer);
/*
* Optimize (replace a breakpoint with a jump) kprobes listed on
- * optimizing_list.
+ * 'optimizing_list'.
*/
static void do_optimize_kprobes(void)
{
lockdep_assert_held(&text_mutex);
/*
- * The optimization/unoptimization refers online_cpus via
- * stop_machine() and cpu-hotplug modifies online_cpus.
- * And same time, text_mutex will be held in cpu-hotplug and here.
- * This combination can cause a deadlock (cpu-hotplug try to lock
- * text_mutex but stop_machine can not be done because online_cpus
- * has been changed)
- * To avoid this deadlock, caller must have locked cpu hotplug
- * for preventing cpu-hotplug outside of text_mutex locking.
+ * The optimization/unoptimization refers 'online_cpus' via
+ * stop_machine() and cpu-hotplug modifies the 'online_cpus'.
+ * And same time, 'text_mutex' will be held in cpu-hotplug and here.
+ * This combination can cause a deadlock (cpu-hotplug tries to lock
+ * 'text_mutex' but stop_machine() can not be done because
+ * the 'online_cpus' has been changed)
+ * To avoid this deadlock, caller must have locked cpu-hotplug
+ * for preventing cpu-hotplug outside of 'text_mutex' locking.
*/
lockdep_assert_cpus_held();
@@ -536,7 +542,7 @@ static void do_optimize_kprobes(void)
/*
* Unoptimize (replace a jump with a breakpoint and remove the breakpoint
- * if need) kprobes listed on unoptimizing_list.
+ * if need) kprobes listed on 'unoptimizing_list'.
*/
static void do_unoptimize_kprobes(void)
{
@@ -551,7 +557,7 @@ static void do_unoptimize_kprobes(void)
return;
arch_unoptimize_kprobes(&unoptimizing_list, &freeing_list);
- /* Loop free_list for disarming */
+ /* Loop on 'freeing_list' for disarming */
list_for_each_entry_safe(op, tmp, &freeing_list, list) {
/* Switching from detour code to origin */
op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
@@ -562,7 +568,7 @@ static void do_unoptimize_kprobes(void)
/*
* Remove unused probes from hash list. After waiting
* for synchronization, these probes are reclaimed.
- * (reclaiming is done by do_free_cleaned_kprobes.)
+ * (reclaiming is done by do_free_cleaned_kprobes().)
*/
hlist_del_rcu(&op->kp.hlist);
} else
@@ -570,7 +576,7 @@ static void do_unoptimize_kprobes(void)
}
}
-/* Reclaim all kprobes on the free_list */
+/* Reclaim all kprobes on the 'freeing_list' */
static void do_free_cleaned_kprobes(void)
{
struct optimized_kprobe *op, *tmp;
@@ -642,9 +648,9 @@ void wait_for_kprobe_optimizer(void)
while (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list)) {
mutex_unlock(&kprobe_mutex);
- /* this will also make optimizing_work execute immmediately */
+ /* This will also make 'optimizing_work' execute immmediately */
flush_delayed_work(&optimizing_work);
- /* @optimizing_work might not have been queued yet, relax */
+ /* 'optimizing_work' might not have been queued yet, relax */
cpu_relax();
mutex_lock(&kprobe_mutex);
@@ -675,7 +681,7 @@ static void optimize_kprobe(struct kprobe *p)
(kprobe_disabled(p) || kprobes_all_disarmed))
return;
- /* kprobes with post_handler can not be optimized */
+ /* kprobes with 'post_handler' can not be optimized */
if (p->post_handler)
return;
@@ -695,7 +701,10 @@ static void optimize_kprobe(struct kprobe *p)
}
op->kp.flags |= KPROBE_FLAG_OPTIMIZED;
- /* On unoptimizing/optimizing_list, op must have OPTIMIZED flag */
+ /*
+ * On the 'unoptimizing_list' and 'optimizing_list',
+ * 'op' must have OPTIMIZED flag
+ */
if (WARN_ON_ONCE(!list_empty(&op->list)))
return;
@@ -765,7 +774,7 @@ static int reuse_unused_kprobe(struct kprobe *ap)
WARN_ON_ONCE(list_empty(&op->list));
/* Enable the probe again */
ap->flags &= ~KPROBE_FLAG_DISABLED;
- /* Optimize it again (remove from op->list) */
+ /* Optimize it again. (remove from 'op->list') */
if (!kprobe_optready(ap))
return -EINVAL;
@@ -815,7 +824,7 @@ static void prepare_optimized_kprobe(struct kprobe *p)
__prepare_optimized_kprobe(op, p);
}
-/* Allocate new optimized_kprobe and try to prepare optimized instructions */
+/* Allocate new optimized_kprobe and try to prepare optimized instructions. */
static struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
{
struct optimized_kprobe *op;
@@ -834,19 +843,19 @@ static struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
static void init_aggr_kprobe(struct kprobe *ap, struct kprobe *p);
/*
- * Prepare an optimized_kprobe and optimize it
- * NOTE: p must be a normal registered kprobe
+ * Prepare an optimized_kprobe and optimize it.
+ * NOTE: 'p' must be a normal registered kprobe.
*/
static void try_to_optimize_kprobe(struct kprobe *p)
{
struct kprobe *ap;
struct optimized_kprobe *op;
- /* Impossible to optimize ftrace-based kprobe */
+ /* Impossible to optimize ftrace-based kprobe. */
if (kprobe_ftrace(p))
return;
- /* For preparing optimization, jump_label_text_reserved() is called */
+ /* For preparing optimization, jump_label_text_reserved() is called. */
cpus_read_lock();
jump_label_lock();
mutex_lock(&text_mutex);
@@ -857,14 +866,14 @@ static void try_to_optimize_kprobe(struct kprobe *p)
op = container_of(ap, struct optimized_kprobe, kp);
if (!arch_prepared_optinsn(&op->optinsn)) {
- /* If failed to setup optimizing, fallback to kprobe */
+ /* If failed to setup optimizing, fallback to kprobe. */
arch_remove_optimized_kprobe(op);
kfree(op);
goto out;
}
init_aggr_kprobe(ap, p);
- optimize_kprobe(ap); /* This just kicks optimizer thread */
+ optimize_kprobe(ap); /* This just kicks optimizer thread. */
out:
mutex_unlock(&text_mutex);
@@ -879,7 +888,7 @@ static void optimize_all_kprobes(void)
unsigned int i;
mutex_lock(&kprobe_mutex);
- /* If optimization is already allowed, just return */
+ /* If optimization is already allowed, just return. */
if (kprobes_allow_optimization)
goto out;
@@ -892,7 +901,7 @@ static void optimize_all_kprobes(void)
optimize_kprobe(p);
}
cpus_read_unlock();
- printk(KERN_INFO "Kprobes globally optimized\n");
+ pr_info("kprobe jump-optimization is enabled. All kprobes are optimized if possible.\n");
out:
mutex_unlock(&kprobe_mutex);
}
@@ -905,7 +914,7 @@ static void unoptimize_all_kprobes(void)
unsigned int i;
mutex_lock(&kprobe_mutex);
- /* If optimization is already prohibited, just return */
+ /* If optimization is already prohibited, just return. */
if (!kprobes_allow_optimization) {
mutex_unlock(&kprobe_mutex);
return;
@@ -923,9 +932,9 @@ static void unoptimize_all_kprobes(void)
cpus_read_unlock();
mutex_unlock(&kprobe_mutex);
- /* Wait for unoptimizing completion */
+ /* Wait for unoptimizing completion. */
wait_for_kprobe_optimizer();
- printk(KERN_INFO "Kprobes globally unoptimized\n");
+ pr_info("kprobe jump-optimization is disabled. All kprobes are based on software breakpoint.\n");
}
static DEFINE_MUTEX(kprobe_sysctl_mutex);
@@ -950,13 +959,15 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
}
#endif /* CONFIG_SYSCTL */
-/* Put a breakpoint for a probe. Must be called with text_mutex locked */
+/* Put a breakpoint for a probe. */
static void __arm_kprobe(struct kprobe *p)
{
struct kprobe *_p;
- /* Check collision with other optimized kprobes */
- _p = get_optimized_kprobe((unsigned long)p->addr);
+ lockdep_assert_held(&text_mutex);
+
+ /* Find the overlapping optimized kprobes. */
+ _p = get_optimized_kprobe(p->addr);
if (unlikely(_p))
/* Fallback to unoptimized kprobe */
unoptimize_kprobe(_p, true);
@@ -965,22 +976,29 @@ static void __arm_kprobe(struct kprobe *p)
optimize_kprobe(p); /* Try to optimize (add kprobe to a list) */
}
-/* Remove the breakpoint of a probe. Must be called with text_mutex locked */
+/* Remove the breakpoint of a probe. */
static void __disarm_kprobe(struct kprobe *p, bool reopt)
{
struct kprobe *_p;
+ lockdep_assert_held(&text_mutex);
+
/* Try to unoptimize */
unoptimize_kprobe(p, kprobes_all_disarmed);
if (!kprobe_queued(p)) {
arch_disarm_kprobe(p);
- /* If another kprobe was blocked, optimize it. */
- _p = get_optimized_kprobe((unsigned long)p->addr);
+ /* If another kprobe was blocked, re-optimize it. */
+ _p = get_optimized_kprobe(p->addr);
if (unlikely(_p) && reopt)
optimize_kprobe(_p);
}
- /* TODO: reoptimize others after unoptimized this probe */
+ /*
+ * TODO: Since unoptimization and real disarming will be done by
+ * the worker thread, we can not check whether another probe are
+ * unoptimized because of this probe here. It should be re-optimized
+ * by the worker thread.
+ */
}
#else /* !CONFIG_OPTPROBES */
@@ -1003,7 +1021,7 @@ static int reuse_unused_kprobe(struct kprobe *ap)
* unregistered.
* Thus there should be no chance to reuse unused kprobe.
*/
- printk(KERN_ERR "Error: There should be no unused kprobe here.\n");
+ WARN_ON_ONCE(1);
return -EINVAL;
}
@@ -1033,34 +1051,21 @@ static struct ftrace_ops kprobe_ipmodify_ops __read_mostly = {
static int kprobe_ipmodify_enabled;
static int kprobe_ftrace_enabled;
-/* Must ensure p->addr is really on ftrace */
-static int prepare_kprobe(struct kprobe *p)
-{
- if (!kprobe_ftrace(p))
- return arch_prepare_kprobe(p);
-
- return arch_prepare_kprobe_ftrace(p);
-}
-
-/* Caller must lock kprobe_mutex */
static int __arm_kprobe_ftrace(struct kprobe *p, struct ftrace_ops *ops,
int *cnt)
{
int ret = 0;
+ lockdep_assert_held(&kprobe_mutex);
+
ret = ftrace_set_filter_ip(ops, (unsigned long)p->addr, 0, 0);
- if (ret) {
- pr_debug("Failed to arm kprobe-ftrace at %pS (%d)\n",
- p->addr, ret);
+ if (WARN_ONCE(ret < 0, "Failed to arm kprobe-ftrace at %pS (error %d)\n", p->addr, ret))
return ret;
- }
if (*cnt == 0) {
ret = register_ftrace_function(ops);
- if (ret) {
- pr_debug("Failed to init kprobe-ftrace (%d)\n", ret);
+ if (WARN(ret < 0, "Failed to register kprobe-ftrace (error %d)\n", ret))
goto err_ftrace;
- }
}
(*cnt)++;
@@ -1084,22 +1089,23 @@ static int arm_kprobe_ftrace(struct kprobe *p)
ipmodify ? &kprobe_ipmodify_enabled : &kprobe_ftrace_enabled);
}
-/* Caller must lock kprobe_mutex */
static int __disarm_kprobe_ftrace(struct kprobe *p, struct ftrace_ops *ops,
int *cnt)
{
int ret = 0;
+ lockdep_assert_held(&kprobe_mutex);
+
if (*cnt == 1) {
ret = unregister_ftrace_function(ops);
- if (WARN(ret < 0, "Failed to unregister kprobe-ftrace (%d)\n", ret))
+ if (WARN(ret < 0, "Failed to unregister kprobe-ftrace (error %d)\n", ret))
return ret;
}
(*cnt)--;
ret = ftrace_set_filter_ip(ops, (unsigned long)p->addr, 1, 0);
- WARN_ONCE(ret < 0, "Failed to disarm kprobe-ftrace at %pS (%d)\n",
+ WARN_ONCE(ret < 0, "Failed to disarm kprobe-ftrace at %pS (error %d)\n",
p->addr, ret);
return ret;
}
@@ -1113,11 +1119,6 @@ static int disarm_kprobe_ftrace(struct kprobe *p)
ipmodify ? &kprobe_ipmodify_enabled : &kprobe_ftrace_enabled);
}
#else /* !CONFIG_KPROBES_ON_FTRACE */
-static inline int prepare_kprobe(struct kprobe *p)
-{
- return arch_prepare_kprobe(p);
-}
-
static inline int arm_kprobe_ftrace(struct kprobe *p)
{
return -ENODEV;
@@ -1129,7 +1130,15 @@ static inline int disarm_kprobe_ftrace(struct kprobe *p)
}
#endif
-/* Arm a kprobe with text_mutex */
+static int prepare_kprobe(struct kprobe *p)
+{
+ /* Must ensure p->addr is really on ftrace */
+ if (kprobe_ftrace(p))
+ return arch_prepare_kprobe_ftrace(p);
+
+ return arch_prepare_kprobe(p);
+}
+
static int arm_kprobe(struct kprobe *kp)
{
if (unlikely(kprobe_ftrace(kp)))
@@ -1144,7 +1153,6 @@ static int arm_kprobe(struct kprobe *kp)
return 0;
}
-/* Disarm a kprobe with text_mutex */
static int disarm_kprobe(struct kprobe *kp, bool reopt)
{
if (unlikely(kprobe_ftrace(kp)))
@@ -1194,17 +1202,17 @@ static void aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
}
NOKPROBE_SYMBOL(aggr_post_handler);
-/* Walks the list and increments nmissed count for multiprobe case */
+/* Walks the list and increments 'nmissed' if 'p' has child probes. */
void kprobes_inc_nmissed_count(struct kprobe *p)
{
struct kprobe *kp;
+
if (!kprobe_aggrprobe(p)) {
p->nmissed++;
} else {
list_for_each_entry_rcu(kp, &p->list, list)
kp->nmissed++;
}
- return;
}
NOKPROBE_SYMBOL(kprobes_inc_nmissed_count);
@@ -1222,9 +1230,9 @@ static void recycle_rp_inst(struct kretprobe_instance *ri)
{
struct kretprobe *rp = get_kretprobe(ri);
- if (likely(rp)) {
+ if (likely(rp))
freelist_add(&ri->freelist, &rp->freelist);
- } else
+ else
call_rcu(&ri->rcu, free_rp_inst_rcu);
}
NOKPROBE_SYMBOL(recycle_rp_inst);
@@ -1250,10 +1258,10 @@ void kprobe_busy_end(void)
}
/*
- * This function is called from finish_task_switch when task tk becomes dead,
- * so that we can recycle any function-return probe instances associated
- * with this task. These left over instances represent probed functions
- * that have been called but will never return.
+ * This function is called from delayed_put_task_struct() when a task is
+ * dead and cleaned up to recycle any kretprobe instances associated with
+ * this task. These left over instances represent probed functions that
+ * have been called but will never return.
*/
void kprobe_flush_task(struct task_struct *tk)
{
@@ -1299,7 +1307,7 @@ static inline void free_rp_inst(struct kretprobe *rp)
}
}
-/* Add the new probe to ap->list */
+/* Add the new probe to 'ap->list'. */
static int add_new_kprobe(struct kprobe *ap, struct kprobe *p)
{
if (p->post_handler)
@@ -1313,12 +1321,12 @@ static int add_new_kprobe(struct kprobe *ap, struct kprobe *p)
}
/*
- * Fill in the required fields of the "manager kprobe". Replace the
- * earlier kprobe in the hlist with the manager kprobe
+ * Fill in the required fields of the aggregator kprobe. Replace the
+ * earlier kprobe in the hlist with the aggregator kprobe.
*/
static void init_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
{
- /* Copy p's insn slot to ap */
+ /* Copy the insn slot of 'p' to 'ap'. */
copy_kprobe(p, ap);
flush_insn_slot(ap);
ap->addr = p->addr;
@@ -1336,8 +1344,7 @@ static void init_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
}
/*
- * This is the second or subsequent kprobe at the address - handle
- * the intricacies
+ * This registers the second or subsequent kprobe at the same address.
*/
static int register_aggr_kprobe(struct kprobe *orig_p, struct kprobe *p)
{
@@ -1351,7 +1358,7 @@ static int register_aggr_kprobe(struct kprobe *orig_p, struct kprobe *p)
mutex_lock(&text_mutex);
if (!kprobe_aggrprobe(orig_p)) {
- /* If orig_p is not an aggr_kprobe, create new aggr_kprobe. */
+ /* If 'orig_p' is not an 'aggr_kprobe', create new one. */
ap = alloc_aggr_kprobe(orig_p);
if (!ap) {
ret = -ENOMEM;
@@ -1376,8 +1383,8 @@ static int register_aggr_kprobe(struct kprobe *orig_p, struct kprobe *p)
if (ret)
/*
* Even if fail to allocate new slot, don't need to
- * free aggr_probe. It will be used next time, or
- * freed by unregister_kprobe.
+ * free the 'ap'. It will be used next time, or
+ * freed by unregister_kprobe().
*/
goto out;
@@ -1392,7 +1399,7 @@ static int register_aggr_kprobe(struct kprobe *orig_p, struct kprobe *p)
| KPROBE_FLAG_DISABLED;
}
- /* Copy ap's insn slot to p */
+ /* Copy the insn slot of 'p' to 'ap'. */
copy_kprobe(ap, p);
ret = add_new_kprobe(ap, p);
@@ -1418,7 +1425,7 @@ out:
bool __weak arch_within_kprobe_blacklist(unsigned long addr)
{
- /* The __kprobes marked functions and entry code must not be probed */
+ /* The '__kprobes' functions and entry code must not be probed. */
return addr >= (unsigned long)__kprobes_text_start &&
addr < (unsigned long)__kprobes_text_end;
}
@@ -1430,8 +1437,8 @@ static bool __within_kprobe_blacklist(unsigned long addr)
if (arch_within_kprobe_blacklist(addr))
return true;
/*
- * If there exists a kprobe_blacklist, verify and
- * fail any probe registration in the prohibited area
+ * If 'kprobe_blacklist' is defined, check the address and
+ * reject any probe registration in the prohibited area.
*/
list_for_each_entry(ent, &kprobe_blacklist, list) {
if (addr >= ent->start_addr && addr < ent->end_addr)
@@ -1461,7 +1468,7 @@ bool within_kprobe_blacklist(unsigned long addr)
}
/*
- * If we have a symbol_name argument, look it up and add the offset field
+ * If 'symbol_name' is specified, look it up and add the 'offset'
* to it. This way, we can specify a relative address to a symbol.
* This returns encoded errors if it fails to look up symbol or invalid
* combination of parameters.
@@ -1491,7 +1498,10 @@ static kprobe_opcode_t *kprobe_addr(struct kprobe *p)
return _kprobe_addr(p->addr, p->symbol_name, p->offset);
}
-/* Check passed kprobe is valid and return kprobe in kprobe_table. */
+/*
+ * Check the 'p' is valid and return the aggregator kprobe
+ * at the same address.
+ */
static struct kprobe *__get_valid_kprobe(struct kprobe *p)
{
struct kprobe *ap, *list_p;
@@ -1529,7 +1539,7 @@ static inline int warn_kprobe_rereg(struct kprobe *p)
return ret;
}
-int __weak arch_check_ftrace_location(struct kprobe *p)
+static int check_ftrace_location(struct kprobe *p)
{
unsigned long ftrace_addr;
@@ -1552,7 +1562,7 @@ static int check_kprobe_address_safe(struct kprobe *p,
{
int ret;
- ret = arch_check_ftrace_location(p);
+ ret = check_ftrace_location(p);
if (ret)
return ret;
jump_label_lock();
@@ -1568,7 +1578,7 @@ static int check_kprobe_address_safe(struct kprobe *p,
goto out;
}
- /* Check if are we probing a module */
+ /* Check if 'p' is probing a module. */
*probed_mod = __module_text_address((unsigned long) p->addr);
if (*probed_mod) {
/*
@@ -1581,7 +1591,7 @@ static int check_kprobe_address_safe(struct kprobe *p,
}
/*
- * If the module freed .init.text, we couldn't insert
+ * If the module freed '.init.text', we couldn't insert
* kprobes in there.
*/
if (within_module_init((unsigned long)p->addr, *probed_mod) &&
@@ -1628,7 +1638,7 @@ int register_kprobe(struct kprobe *p)
old_p = get_kprobe(p->addr);
if (old_p) {
- /* Since this may unoptimize old_p, locking text_mutex. */
+ /* Since this may unoptimize 'old_p', locking 'text_mutex'. */
ret = register_aggr_kprobe(old_p, p);
goto out;
}
@@ -1667,8 +1677,8 @@ out:
}
EXPORT_SYMBOL_GPL(register_kprobe);
-/* Check if all probes on the aggrprobe are disabled */
-static int aggr_kprobe_disabled(struct kprobe *ap)
+/* Check if all probes on the 'ap' are disabled. */
+static bool aggr_kprobe_disabled(struct kprobe *ap)
{
struct kprobe *kp;
@@ -1677,20 +1687,21 @@ static int aggr_kprobe_disabled(struct kprobe *ap)
list_for_each_entry(kp, &ap->list, list)
if (!kprobe_disabled(kp))
/*
- * There is an active probe on the list.
- * We can't disable this ap.
+ * Since there is an active probe on the list,
+ * we can't disable this 'ap'.
*/
- return 0;
+ return false;
- return 1;
+ return true;
}
-/* Disable one kprobe: Make sure called under kprobe_mutex is locked */
static struct kprobe *__disable_kprobe(struct kprobe *p)
{
struct kprobe *orig_p;
int ret;
+ lockdep_assert_held(&kprobe_mutex);
+
/* Get an original kprobe for return */
orig_p = __get_valid_kprobe(p);
if (unlikely(orig_p == NULL))
@@ -1704,7 +1715,7 @@ static struct kprobe *__disable_kprobe(struct kprobe *p)
/* Try to disarm and disable this/parent probe */
if (p == orig_p || aggr_kprobe_disabled(orig_p)) {
/*
- * If kprobes_all_disarmed is set, orig_p
+ * If 'kprobes_all_disarmed' is set, 'orig_p'
* should have already been disarmed, so
* skip unneed disarming process.
*/
@@ -1850,53 +1861,105 @@ static struct notifier_block kprobe_exceptions_nb = {
.priority = 0x7fffffff /* we need to be notified first */
};
-unsigned long __weak arch_deref_entry_point(void *entry)
+#ifdef CONFIG_KRETPROBES
+
+/* This assumes the 'tsk' is the current task or the is not running. */
+static kprobe_opcode_t *__kretprobe_find_ret_addr(struct task_struct *tsk,
+ struct llist_node **cur)
+{
+ struct kretprobe_instance *ri = NULL;
+ struct llist_node *node = *cur;
+
+ if (!node)
+ node = tsk->kretprobe_instances.first;
+ else
+ node = node->next;
+
+ while (node) {
+ ri = container_of(node, struct kretprobe_instance, llist);
+ if (ri->ret_addr != kretprobe_trampoline_addr()) {
+ *cur = node;
+ return ri->ret_addr;
+ }
+ node = node->next;
+ }
+ return NULL;
+}
+NOKPROBE_SYMBOL(__kretprobe_find_ret_addr);
+
+/**
+ * kretprobe_find_ret_addr -- Find correct return address modified by kretprobe
+ * @tsk: Target task
+ * @fp: A frame pointer
+ * @cur: a storage of the loop cursor llist_node pointer for next call
+ *
+ * Find the correct return address modified by a kretprobe on @tsk in unsigned
+ * long type. If it finds the return address, this returns that address value,
+ * or this returns 0.
+ * The @tsk must be 'current' or a task which is not running. @fp is a hint
+ * to get the currect return address - which is compared with the
+ * kretprobe_instance::fp field. The @cur is a loop cursor for searching the
+ * kretprobe return addresses on the @tsk. The '*@cur' should be NULL at the
+ * first call, but '@cur' itself must NOT NULL.
+ */
+unsigned long kretprobe_find_ret_addr(struct task_struct *tsk, void *fp,
+ struct llist_node **cur)
{
- return (unsigned long)entry;
+ struct kretprobe_instance *ri = NULL;
+ kprobe_opcode_t *ret;
+
+ if (WARN_ON_ONCE(!cur))
+ return 0;
+
+ do {
+ ret = __kretprobe_find_ret_addr(tsk, cur);
+ if (!ret)
+ break;
+ ri = container_of(*cur, struct kretprobe_instance, llist);
+ } while (ri->fp != fp);
+
+ return (unsigned long)ret;
}
+NOKPROBE_SYMBOL(kretprobe_find_ret_addr);
-#ifdef CONFIG_KRETPROBES
+void __weak arch_kretprobe_fixup_return(struct pt_regs *regs,
+ kprobe_opcode_t *correct_ret_addr)
+{
+ /*
+ * Do nothing by default. Please fill this to update the fake return
+ * address on the stack with the correct one on each arch if possible.
+ */
+}
unsigned long __kretprobe_trampoline_handler(struct pt_regs *regs,
- void *trampoline_address,
void *frame_pointer)
{
kprobe_opcode_t *correct_ret_addr = NULL;
struct kretprobe_instance *ri = NULL;
- struct llist_node *first, *node;
+ struct llist_node *first, *node = NULL;
struct kretprobe *rp;
- /* Find all nodes for this frame. */
- first = node = current->kretprobe_instances.first;
- while (node) {
- ri = container_of(node, struct kretprobe_instance, llist);
-
- BUG_ON(ri->fp != frame_pointer);
-
- if (ri->ret_addr != trampoline_address) {
- correct_ret_addr = ri->ret_addr;
- /*
- * This is the real return address. Any other
- * instances associated with this task are for
- * other calls deeper on the call stack
- */
- goto found;
- }
-
- node = node->next;
+ /* Find correct address and all nodes for this frame. */
+ correct_ret_addr = __kretprobe_find_ret_addr(current, &node);
+ if (!correct_ret_addr) {
+ pr_err("kretprobe: Return address not found, not execute handler. Maybe there is a bug in the kernel.\n");
+ BUG_ON(1);
}
- pr_err("Oops! Kretprobe fails to find correct return address.\n");
- BUG_ON(1);
-found:
- /* Unlink all nodes for this frame. */
- current->kretprobe_instances.first = node->next;
- node->next = NULL;
+ /*
+ * Set the return address as the instruction pointer, because if the
+ * user handler calls stack_trace_save_regs() with this 'regs',
+ * the stack trace will start from the instruction pointer.
+ */
+ instruction_pointer_set(regs, (unsigned long)correct_ret_addr);
- /* Run them.. */
+ /* Run the user handler of the nodes. */
+ first = current->kretprobe_instances.first;
while (first) {
ri = container_of(first, struct kretprobe_instance, llist);
- first = first->next;
+
+ if (WARN_ON_ONCE(ri->fp != frame_pointer))
+ break;
rp = get_kretprobe(ri);
if (rp && rp->handler) {
@@ -1907,6 +1970,23 @@ found:
rp->handler(ri, regs);
__this_cpu_write(current_kprobe, prev);
}
+ if (first == node)
+ break;
+
+ first = first->next;
+ }
+
+ arch_kretprobe_fixup_return(regs, correct_ret_addr);
+
+ /* Unlink all nodes for this frame. */
+ first = current->kretprobe_instances.first;
+ current->kretprobe_instances.first = node->next;
+ node->next = NULL;
+
+ /* Recycle free instances. */
+ while (first) {
+ ri = container_of(first, struct kretprobe_instance, llist);
+ first = first->next;
recycle_rp_inst(ri);
}
@@ -1991,7 +2071,7 @@ int register_kretprobe(struct kretprobe *rp)
if (ret)
return ret;
- /* If only rp->kp.addr is specified, check reregistering kprobes */
+ /* If only 'rp->kp.addr' is specified, check reregistering kprobes */
if (rp->kp.addr && warn_kprobe_rereg(&rp->kp))
return -EINVAL;
@@ -2006,6 +2086,9 @@ int register_kretprobe(struct kretprobe *rp)
}
}
+ if (rp->data_size > KRETPROBE_MAX_DATA_SIZE)
+ return -E2BIG;
+
rp->kp.pre_handler = pre_handler_kretprobe;
rp->kp.post_handler = NULL;
@@ -2096,13 +2179,13 @@ EXPORT_SYMBOL_GPL(unregister_kretprobes);
#else /* CONFIG_KRETPROBES */
int register_kretprobe(struct kretprobe *rp)
{
- return -ENOSYS;
+ return -EOPNOTSUPP;
}
EXPORT_SYMBOL_GPL(register_kretprobe);
int register_kretprobes(struct kretprobe **rps, int num)
{
- return -ENOSYS;
+ return -EOPNOTSUPP;
}
EXPORT_SYMBOL_GPL(register_kretprobes);
@@ -2151,7 +2234,7 @@ static void kill_kprobe(struct kprobe *p)
/*
* The module is going away. We should disarm the kprobe which
* is using ftrace, because ftrace framework is still available at
- * MODULE_STATE_GOING notification.
+ * 'MODULE_STATE_GOING' notification.
*/
if (kprobe_ftrace(p) && !kprobe_disabled(p) && !kprobes_all_disarmed)
disarm_kprobe_ftrace(p);
@@ -2214,8 +2297,7 @@ EXPORT_SYMBOL_GPL(enable_kprobe);
/* Caller must NOT call this in usual path. This is only for critical case */
void dump_kprobe(struct kprobe *kp)
{
- pr_err("Dumping kprobe:\n");
- pr_err("Name: %s\nOffset: %x\nAddress: %pS\n",
+ pr_err("Dump kprobe:\n.symbol_name = %s, .offset = %x, .addr = %pS\n",
kp->symbol_name, kp->offset, kp->addr);
}
NOKPROBE_SYMBOL(dump_kprobe);
@@ -2317,7 +2399,7 @@ static int __init populate_kprobe_blacklist(unsigned long *start,
int ret;
for (iter = start; iter < end; iter++) {
- entry = arch_deref_entry_point((void *)*iter);
+ entry = (unsigned long)dereference_symbol_descriptor((void *)*iter);
ret = kprobe_add_ksym_blacklist(entry);
if (ret == -EINVAL)
continue;
@@ -2325,13 +2407,13 @@ static int __init populate_kprobe_blacklist(unsigned long *start,
return ret;
}
- /* Symbols in __kprobes_text are blacklisted */
+ /* Symbols in '__kprobes_text' are blacklisted */
ret = kprobe_add_area_blacklist((unsigned long)__kprobes_text_start,
(unsigned long)__kprobes_text_end);
if (ret)
return ret;
- /* Symbols in noinstr section are blacklisted */
+ /* Symbols in 'noinstr' section are blacklisted */
ret = kprobe_add_area_blacklist((unsigned long)__noinstr_text_start,
(unsigned long)__noinstr_text_end);
@@ -2403,9 +2485,9 @@ static int kprobes_module_callback(struct notifier_block *nb,
return NOTIFY_DONE;
/*
- * When MODULE_STATE_GOING was notified, both of module .text and
- * .init.text sections would be freed. When MODULE_STATE_LIVE was
- * notified, only .init.text section would be freed. We need to
+ * When 'MODULE_STATE_GOING' was notified, both of module '.text' and
+ * '.init.text' sections would be freed. When 'MODULE_STATE_LIVE' was
+ * notified, only '.init.text' section would be freed. We need to
* disable kprobes which have been inserted in the sections.
*/
mutex_lock(&kprobe_mutex);
@@ -2422,9 +2504,9 @@ static int kprobes_module_callback(struct notifier_block *nb,
*
* Note, this will also move any optimized probes
* that are pending to be removed from their
- * corresponding lists to the freeing_list and
+ * corresponding lists to the 'freeing_list' and
* will not be touched by the delayed
- * kprobe_optimizer work handler.
+ * kprobe_optimizer() work handler.
*/
kill_kprobe(p);
}
@@ -2440,10 +2522,6 @@ static struct notifier_block kprobe_module_nb = {
.priority = 0
};
-/* Markers of _kprobe_blacklist section */
-extern unsigned long __start_kprobe_blacklist[];
-extern unsigned long __stop_kprobe_blacklist[];
-
void kprobe_free_init_mem(void)
{
void *start = (void *)(&__init_begin);
@@ -2454,7 +2532,7 @@ void kprobe_free_init_mem(void)
mutex_lock(&kprobe_mutex);
- /* Kill all kprobes on initmem */
+ /* Kill all kprobes on initmem because the target code has been freed. */
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
head = &kprobe_table[i];
hlist_for_each_entry(p, head, hlist) {
@@ -2477,10 +2555,8 @@ static int __init init_kprobes(void)
err = populate_kprobe_blacklist(__start_kprobe_blacklist,
__stop_kprobe_blacklist);
- if (err) {
- pr_err("kprobes: failed to populate blacklist: %d\n", err);
- pr_err("Please take care of using kprobes.\n");
- }
+ if (err)
+ pr_err("Failed to populate blacklist (error %d), kprobes not restricted, be careful using them!\n", err);
if (kretprobe_blacklist_size) {
/* lookup the function address from its name */
@@ -2488,7 +2564,7 @@ static int __init init_kprobes(void)
kretprobe_blacklist[i].addr =
kprobe_lookup_name(kretprobe_blacklist[i].name, 0);
if (!kretprobe_blacklist[i].addr)
- printk("kretprobe: lookup failed: %s\n",
+ pr_err("Failed to lookup symbol '%s' for kretprobe blacklist. Maybe the target function is removed or renamed.\n",
kretprobe_blacklist[i].name);
}
}
@@ -2497,7 +2573,7 @@ static int __init init_kprobes(void)
kprobes_all_disarmed = false;
#if defined(CONFIG_OPTPROBES) && defined(__ARCH_WANT_KPROBES_INSN_SLOT)
- /* Init kprobe_optinsn_slots for allocation */
+ /* Init 'kprobe_optinsn_slots' for allocation */
kprobe_optinsn_slots.insn_size = MAX_OPTINSN_SIZE;
#endif
@@ -2508,9 +2584,6 @@ static int __init init_kprobes(void)
err = register_module_notifier(&kprobe_module_nb);
kprobes_initialized = (err == 0);
-
- if (!err)
- init_test_probes();
return err;
}
early_initcall(init_kprobes);
@@ -2631,7 +2704,7 @@ static int kprobe_blacklist_seq_show(struct seq_file *m, void *v)
list_entry(v, struct kprobe_blacklist_entry, list);
/*
- * If /proc/kallsyms is not showing kernel address, we won't
+ * If '/proc/kallsyms' is not showing kernel address, we won't
* show them here either.
*/
if (!kallsyms_show_value(m->file->f_cred))
@@ -2692,7 +2765,7 @@ static int arm_all_kprobes(void)
}
if (errors)
- pr_warn("Kprobes globally enabled, but failed to arm %d out of %d probes\n",
+ pr_warn("Kprobes globally enabled, but failed to enable %d out of %d probes. Please check which kprobes are kept disabled via debugfs.\n",
errors, total);
else
pr_info("Kprobes globally enabled\n");
@@ -2735,7 +2808,7 @@ static int disarm_all_kprobes(void)
}
if (errors)
- pr_warn("Kprobes globally disabled, but failed to disarm %d out of %d probes\n",
+ pr_warn("Kprobes globally disabled, but failed to disable %d out of %d probes. Please check which kprobes are kept enabled via debugfs.\n",
errors, total);
else
pr_info("Kprobes globally disabled\n");
@@ -2770,30 +2843,14 @@ static ssize_t read_enabled_file_bool(struct file *file,
static ssize_t write_enabled_file_bool(struct file *file,
const char __user *user_buf, size_t count, loff_t *ppos)
{
- char buf[32];
- size_t buf_size;
- int ret = 0;
-
- buf_size = min(count, (sizeof(buf)-1));
- if (copy_from_user(buf, user_buf, buf_size))
- return -EFAULT;
+ bool enable;
+ int ret;
- buf[buf_size] = '\0';
- switch (buf[0]) {
- case 'y':
- case 'Y':
- case '1':
- ret = arm_all_kprobes();
- break;
- case 'n':
- case 'N':
- case '0':
- ret = disarm_all_kprobes();
- break;
- default:
- return -EINVAL;
- }
+ ret = kstrtobool_from_user(user_buf, count, &enable);
+ if (ret)
+ return ret;
+ ret = enable ? arm_all_kprobes() : disarm_all_kprobes();
if (ret)
return ret;
@@ -2809,13 +2866,12 @@ static const struct file_operations fops_kp = {
static int __init debugfs_kprobe_init(void)
{
struct dentry *dir;
- unsigned int value = 1;
dir = debugfs_create_dir("kprobes", NULL);
debugfs_create_file("list", 0400, dir, NULL, &kprobes_fops);
- debugfs_create_file("enabled", 0600, dir, &value, &fops_kp);
+ debugfs_create_file("enabled", 0600, dir, NULL, &fops_kp);
debugfs_create_file("blacklist", 0400, dir, NULL,
&kprobe_blacklist_fops);
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 5b37a8567168..4ed9e7bce9e8 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -270,6 +270,7 @@ EXPORT_SYMBOL_GPL(kthread_parkme);
static int kthread(void *_create)
{
+ static const struct sched_param param = { .sched_priority = 0 };
/* Copy data: it's on kthread's stack */
struct kthread_create_info *create = _create;
int (*threadfn)(void *data) = create->threadfn;
@@ -300,6 +301,13 @@ static int kthread(void *_create)
init_completion(&self->parked);
current->vfork_done = &self->exited;
+ /*
+ * The new thread inherited kthreadd's priority and CPU mask. Reset
+ * back to default in case they have been changed.
+ */
+ sched_setscheduler_nocheck(current, SCHED_NORMAL, &param);
+ set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_KTHREAD));
+
/* OK, tell user we're spawned, wait for stop or wakeup */
__set_current_state(TASK_UNINTERRUPTIBLE);
create->result = current;
@@ -397,7 +405,6 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
}
task = create->result;
if (!IS_ERR(task)) {
- static const struct sched_param param = { .sched_priority = 0 };
char name[TASK_COMM_LEN];
/*
@@ -406,13 +413,6 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
*/
vsnprintf(name, sizeof(name), namefmt, args);
set_task_comm(task, name);
- /*
- * root may have changed our (kthreadd's) priority or CPU mask.
- * The kernel thread should not inherit these properties.
- */
- sched_setscheduler_nocheck(task, SCHED_NORMAL, &param);
- set_cpus_allowed_ptr(task,
- housekeeping_cpumask(HK_FLAG_KTHREAD));
}
kfree(create);
return task;
@@ -433,7 +433,7 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
* If thread is going to be bound on a particular cpu, give its node
* in @node, to get NUMA affinity for kthread stack, or else give NUMA_NO_NODE.
* When woken, the thread will run @threadfn() with @data as its
- * argument. @threadfn() can either call do_exit() directly if it is a
+ * argument. @threadfn() can either return directly if it is a
* standalone thread for which no one will call kthread_stop(), or
* return when 'kthread_should_stop()' is true (which means
* kthread_stop() has been called). The return value should be zero
@@ -523,6 +523,7 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
to_kthread(p)->cpu = cpu;
return p;
}
+EXPORT_SYMBOL(kthread_create_on_cpu);
void kthread_set_per_cpu(struct task_struct *k, int cpu)
{
diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c
index e8029aea67f1..fe316c021d73 100644
--- a/kernel/livepatch/patch.c
+++ b/kernel/livepatch/patch.c
@@ -49,14 +49,15 @@ static void notrace klp_ftrace_handler(unsigned long ip,
ops = container_of(fops, struct klp_ops, fops);
+ /*
+ * The ftrace_test_recursion_trylock() will disable preemption,
+ * which is required for the variant of synchronize_rcu() that is
+ * used to allow patching functions where RCU is not watching.
+ * See klp_synchronize_transition() for more details.
+ */
bit = ftrace_test_recursion_trylock(ip, parent_ip);
if (WARN_ON_ONCE(bit < 0))
return;
- /*
- * A variant of synchronize_rcu() is used to allow patching functions
- * where RCU is not watching, see klp_synchronize_transition().
- */
- preempt_disable_notrace();
func = list_first_or_null_rcu(&ops->func_stack, struct klp_func,
stack_node);
@@ -120,7 +121,6 @@ static void notrace klp_ftrace_handler(unsigned long ip,
klp_arch_set_pc(fregs, (unsigned long)func->new_func);
unlock:
- preempt_enable_notrace();
ftrace_test_recursion_unlock(bit);
}
diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c
index 291b857a6e20..5683ac0d2566 100644
--- a/kernel/livepatch/transition.c
+++ b/kernel/livepatch/transition.c
@@ -13,7 +13,6 @@
#include "core.h"
#include "patch.h"
#include "transition.h"
-#include "../sched/sched.h"
#define MAX_STACK_ENTRIES 100
#define STACK_ERR_BUF_SIZE 128
@@ -240,7 +239,7 @@ static int klp_check_stack_func(struct klp_func *func, unsigned long *entries,
* Determine whether it's safe to transition the task to the target patch state
* by looking for any to-be-patched or to-be-unpatched functions on its stack.
*/
-static int klp_check_stack(struct task_struct *task, char *err_buf)
+static int klp_check_stack(struct task_struct *task, const char **oldname)
{
static unsigned long entries[MAX_STACK_ENTRIES];
struct klp_object *obj;
@@ -248,12 +247,8 @@ static int klp_check_stack(struct task_struct *task, char *err_buf)
int ret, nr_entries;
ret = stack_trace_save_tsk_reliable(task, entries, ARRAY_SIZE(entries));
- if (ret < 0) {
- snprintf(err_buf, STACK_ERR_BUF_SIZE,
- "%s: %s:%d has an unreliable stack\n",
- __func__, task->comm, task->pid);
- return ret;
- }
+ if (ret < 0)
+ return -EINVAL;
nr_entries = ret;
klp_for_each_object(klp_transition_patch, obj) {
@@ -262,11 +257,8 @@ static int klp_check_stack(struct task_struct *task, char *err_buf)
klp_for_each_func(obj, func) {
ret = klp_check_stack_func(func, entries, nr_entries);
if (ret) {
- snprintf(err_buf, STACK_ERR_BUF_SIZE,
- "%s: %s:%d is sleeping on function %s\n",
- __func__, task->comm, task->pid,
- func->old_name);
- return ret;
+ *oldname = func->old_name;
+ return -EADDRINUSE;
}
}
}
@@ -274,6 +266,22 @@ static int klp_check_stack(struct task_struct *task, char *err_buf)
return 0;
}
+static int klp_check_and_switch_task(struct task_struct *task, void *arg)
+{
+ int ret;
+
+ if (task_curr(task) && task != current)
+ return -EBUSY;
+
+ ret = klp_check_stack(task, arg);
+ if (ret)
+ return ret;
+
+ clear_tsk_thread_flag(task, TIF_PATCH_PENDING);
+ task->patch_state = klp_target_state;
+ return 0;
+}
+
/*
* Try to safely switch a task to the target patch state. If it's currently
* running, or it's sleeping on a to-be-patched or to-be-unpatched function, or
@@ -281,13 +289,8 @@ static int klp_check_stack(struct task_struct *task, char *err_buf)
*/
static bool klp_try_switch_task(struct task_struct *task)
{
- static char err_buf[STACK_ERR_BUF_SIZE];
- struct rq *rq;
- struct rq_flags flags;
+ const char *old_name;
int ret;
- bool success = false;
-
- err_buf[0] = '\0';
/* check if this task has already switched over */
if (task->patch_state == klp_target_state)
@@ -305,36 +308,31 @@ static bool klp_try_switch_task(struct task_struct *task)
* functions. If all goes well, switch the task to the target patch
* state.
*/
- rq = task_rq_lock(task, &flags);
+ ret = task_call_func(task, klp_check_and_switch_task, &old_name);
+ switch (ret) {
+ case 0: /* success */
+ break;
- if (task_running(rq, task) && task != current) {
- snprintf(err_buf, STACK_ERR_BUF_SIZE,
- "%s: %s:%d is running\n", __func__, task->comm,
- task->pid);
- goto done;
+ case -EBUSY: /* klp_check_and_switch_task() */
+ pr_debug("%s: %s:%d is running\n",
+ __func__, task->comm, task->pid);
+ break;
+ case -EINVAL: /* klp_check_and_switch_task() */
+ pr_debug("%s: %s:%d has an unreliable stack\n",
+ __func__, task->comm, task->pid);
+ break;
+ case -EADDRINUSE: /* klp_check_and_switch_task() */
+ pr_debug("%s: %s:%d is sleeping on function %s\n",
+ __func__, task->comm, task->pid, old_name);
+ break;
+
+ default:
+ pr_debug("%s: Unknown error code (%d) when trying to switch %s:%d\n",
+ __func__, ret, task->comm, task->pid);
+ break;
}
- ret = klp_check_stack(task, err_buf);
- if (ret)
- goto done;
-
- success = true;
-
- clear_tsk_thread_flag(task, TIF_PATCH_PENDING);
- task->patch_state = klp_target_state;
-
-done:
- task_rq_unlock(rq, task, &flags);
-
- /*
- * Due to console deadlock issues, pr_debug() can't be used while
- * holding the task rq lock. Instead we have to use a temporary buffer
- * and print the debug message after releasing the lock.
- */
- if (err_buf[0] != '\0')
- pr_debug("%s", err_buf);
-
- return success;
+ return !ret;
}
/*
@@ -415,8 +413,11 @@ void klp_try_complete_transition(void)
for_each_possible_cpu(cpu) {
task = idle_task(cpu);
if (cpu_online(cpu)) {
- if (!klp_try_switch_task(task))
+ if (!klp_try_switch_task(task)) {
complete = false;
+ /* Make idle task go through the main loop. */
+ wake_up_if_idle(cpu);
+ }
} else if (task->patch_state != klp_target_state) {
/* offline idle tasks can be switched immediately */
clear_tsk_thread_flag(task, TIF_PATCH_PENDING);
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index bf1c00c881e4..4a882f83aeb9 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -788,6 +788,21 @@ static int very_verbose(struct lock_class *class)
* Is this the address of a static object:
*/
#ifdef __KERNEL__
+/*
+ * Check if an address is part of freed initmem. After initmem is freed,
+ * memory can be allocated from it, and such allocations would then have
+ * addresses within the range [_stext, _end].
+ */
+#ifndef arch_is_kernel_initmem_freed
+static int arch_is_kernel_initmem_freed(unsigned long addr)
+{
+ if (system_state < SYSTEM_FREEING_INITMEM)
+ return 0;
+
+ return init_section_contains((void *)addr, 1);
+}
+#endif
+
static int static_obj(const void *obj)
{
unsigned long start = (unsigned long) &_stext,
@@ -803,9 +818,6 @@ static int static_obj(const void *obj)
if ((addr >= start) && (addr < end))
return 1;
- if (arch_is_kernel_data(addr))
- return 1;
-
/*
* in-kernel percpu var?
*/
@@ -888,7 +900,7 @@ look_up_lock_class(const struct lockdep_map *lock, unsigned int subclass)
if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
return NULL;
- hlist_for_each_entry_rcu(class, hash_head, hash_entry) {
+ hlist_for_each_entry_rcu_notrace(class, hash_head, hash_entry) {
if (class->key == key) {
/*
* Huh! same key, different name? Did someone trample
@@ -4671,7 +4683,7 @@ print_lock_invalid_wait_context(struct task_struct *curr,
/*
* Verify the wait_type context.
*
- * This check validates we takes locks in the right wait-type order; that is it
+ * This check validates we take locks in the right wait-type order; that is it
* ensures that we do not take mutexes inside spinlocks and do not attempt to
* acquire spinlocks inside raw_spinlocks and the sort.
*
@@ -5366,7 +5378,7 @@ int __lock_is_held(const struct lockdep_map *lock, int read)
struct held_lock *hlock = curr->held_locks + i;
if (match_held_lock(hlock, lock)) {
- if (read == -1 || hlock->read == read)
+ if (read == -1 || !!hlock->read == read)
return LOCK_STATE_HELD;
return LOCK_STATE_NOT_HELD;
@@ -5473,6 +5485,7 @@ static noinstr void check_flags(unsigned long flags)
}
}
+#ifndef CONFIG_PREEMPT_RT
/*
* We dont accurately track softirq state in e.g.
* hardirq contexts (such as on 4KSTACKS), so only
@@ -5487,6 +5500,7 @@ static noinstr void check_flags(unsigned long flags)
DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled);
}
}
+#endif
if (!debug_locks)
print_irqtrace_events(current);
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index 7c5a4a087cc7..9c2fb613a55d 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -1022,23 +1022,23 @@ static int __init lock_torture_init(void)
if (onoff_interval > 0) {
firsterr = torture_onoff_init(onoff_holdoff * HZ,
onoff_interval * HZ, NULL);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
}
if (shuffle_interval > 0) {
firsterr = torture_shuffle_init(shuffle_interval);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
}
if (shutdown_secs > 0) {
firsterr = torture_shutdown_init(shutdown_secs,
lock_torture_cleanup);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
}
if (stutter > 0) {
firsterr = torture_stutter_init(stutter, stutter);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
}
@@ -1047,7 +1047,7 @@ static int __init lock_torture_init(void)
sizeof(writer_tasks[0]),
GFP_KERNEL);
if (writer_tasks == NULL) {
- VERBOSE_TOROUT_ERRSTRING("writer_tasks: Out of memory");
+ TOROUT_ERRSTRING("writer_tasks: Out of memory");
firsterr = -ENOMEM;
goto unwind;
}
@@ -1058,7 +1058,7 @@ static int __init lock_torture_init(void)
sizeof(reader_tasks[0]),
GFP_KERNEL);
if (reader_tasks == NULL) {
- VERBOSE_TOROUT_ERRSTRING("reader_tasks: Out of memory");
+ TOROUT_ERRSTRING("reader_tasks: Out of memory");
kfree(writer_tasks);
writer_tasks = NULL;
firsterr = -ENOMEM;
@@ -1082,7 +1082,7 @@ static int __init lock_torture_init(void)
/* Create writer. */
firsterr = torture_create_kthread(lock_torture_writer, &cxt.lwsa[i],
writer_tasks[i]);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
create_reader:
@@ -1091,13 +1091,13 @@ static int __init lock_torture_init(void)
/* Create reader. */
firsterr = torture_create_kthread(lock_torture_reader, &cxt.lrsa[j],
reader_tasks[j]);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
}
if (stat_interval > 0) {
firsterr = torture_create_kthread(lock_torture_stats, NULL,
stats_task);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
}
torture_init_end();
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index d456579d0952..5e3585950ec8 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -94,6 +94,9 @@ static inline unsigned long __owner_flags(unsigned long owner)
return owner & MUTEX_FLAGS;
}
+/*
+ * Returns: __mutex_owner(lock) on failure or NULL on success.
+ */
static inline struct task_struct *__mutex_trylock_common(struct mutex *lock, bool handoff)
{
unsigned long owner, curr = (unsigned long)current;
@@ -348,21 +351,23 @@ bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner,
{
bool ret = true;
- rcu_read_lock();
+ lockdep_assert_preemption_disabled();
+
while (__mutex_owner(lock) == owner) {
/*
* Ensure we emit the owner->on_cpu, dereference _after_
- * checking lock->owner still matches owner. If that fails,
- * owner might point to freed memory. If it still matches,
- * the rcu_read_lock() ensures the memory stays valid.
+ * checking lock->owner still matches owner. And we already
+ * disabled preemption which is equal to the RCU read-side
+ * crital section in optimistic spinning code. Thus the
+ * task_strcut structure won't go away during the spinning
+ * period
*/
barrier();
/*
* Use vcpu_is_preempted to detect lock holder preemption issue.
*/
- if (!owner->on_cpu || need_resched() ||
- vcpu_is_preempted(task_cpu(owner))) {
+ if (!owner_on_cpu(owner) || need_resched()) {
ret = false;
break;
}
@@ -374,7 +379,6 @@ bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner,
cpu_relax();
}
- rcu_read_unlock();
return ret;
}
@@ -387,19 +391,19 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock)
struct task_struct *owner;
int retval = 1;
+ lockdep_assert_preemption_disabled();
+
if (need_resched())
return 0;
- rcu_read_lock();
- owner = __mutex_owner(lock);
-
/*
- * As lock holder preemption issue, we both skip spinning if task is not
- * on cpu or its cpu is preempted
+ * We already disabled preemption which is equal to the RCU read-side
+ * crital section in optimistic spinning code. Thus the task_strcut
+ * structure won't go away during the spinning period.
*/
+ owner = __mutex_owner(lock);
if (owner)
- retval = owner->on_cpu && !vcpu_is_preempted(task_cpu(owner));
- rcu_read_unlock();
+ retval = owner_on_cpu(owner);
/*
* If lock->owner is not set, the mutex has been released. Return true
@@ -736,6 +740,44 @@ __ww_mutex_lock(struct mutex *lock, unsigned int state, unsigned int subclass,
return __mutex_lock_common(lock, state, subclass, NULL, ip, ww_ctx, true);
}
+/**
+ * ww_mutex_trylock - tries to acquire the w/w mutex with optional acquire context
+ * @ww: mutex to lock
+ * @ww_ctx: optional w/w acquire context
+ *
+ * Trylocks a mutex with the optional acquire context; no deadlock detection is
+ * possible. Returns 1 if the mutex has been acquired successfully, 0 otherwise.
+ *
+ * Unlike ww_mutex_lock, no deadlock handling is performed. However, if a @ctx is
+ * specified, -EALREADY handling may happen in calls to ww_mutex_trylock.
+ *
+ * A mutex acquired with this function must be released with ww_mutex_unlock.
+ */
+int ww_mutex_trylock(struct ww_mutex *ww, struct ww_acquire_ctx *ww_ctx)
+{
+ if (!ww_ctx)
+ return mutex_trylock(&ww->base);
+
+ MUTEX_WARN_ON(ww->base.magic != &ww->base);
+
+ /*
+ * Reset the wounded flag after a kill. No other process can
+ * race and wound us here, since they can't have a valid owner
+ * pointer if we don't have any locks held.
+ */
+ if (ww_ctx->acquired == 0)
+ ww_ctx->wounded = 0;
+
+ if (__mutex_trylock(&ww->base)) {
+ ww_mutex_set_context_fastpath(ww, ww_ctx);
+ mutex_acquire_nest(&ww->base.dep_map, 0, 1, &ww_ctx->dep_map, _RET_IP_);
+ return 1;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(ww_mutex_trylock);
+
#ifdef CONFIG_DEBUG_LOCK_ALLOC
void __sched
mutex_lock_nested(struct mutex *lock, unsigned int subclass)
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 8eabdc79602b..8555c4efe97c 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -446,19 +446,26 @@ static __always_inline void rt_mutex_adjust_prio(struct task_struct *p)
}
/* RT mutex specific wake_q wrappers */
-static __always_inline void rt_mutex_wake_q_add(struct rt_wake_q_head *wqh,
- struct rt_mutex_waiter *w)
+static __always_inline void rt_mutex_wake_q_add_task(struct rt_wake_q_head *wqh,
+ struct task_struct *task,
+ unsigned int wake_state)
{
- if (IS_ENABLED(CONFIG_PREEMPT_RT) && w->wake_state != TASK_NORMAL) {
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && wake_state == TASK_RTLOCK_WAIT) {
if (IS_ENABLED(CONFIG_PROVE_LOCKING))
WARN_ON_ONCE(wqh->rtlock_task);
- get_task_struct(w->task);
- wqh->rtlock_task = w->task;
+ get_task_struct(task);
+ wqh->rtlock_task = task;
} else {
- wake_q_add(&wqh->head, w->task);
+ wake_q_add(&wqh->head, task);
}
}
+static __always_inline void rt_mutex_wake_q_add(struct rt_wake_q_head *wqh,
+ struct rt_mutex_waiter *w)
+{
+ rt_mutex_wake_q_add_task(wqh, w->task, w->wake_state);
+}
+
static __always_inline void rt_mutex_wake_up_q(struct rt_wake_q_head *wqh)
{
if (IS_ENABLED(CONFIG_PREEMPT_RT) && wqh->rtlock_task) {
@@ -753,7 +760,7 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task,
* other configuration and we fail to report; also, see
* lockdep.
*/
- if (IS_ENABLED(CONFIG_PREEMPT_RT) && orig_waiter->ww_ctx)
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && orig_waiter && orig_waiter->ww_ctx)
ret = 0;
raw_spin_unlock(&lock->wait_lock);
@@ -1096,8 +1103,11 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock,
* the other will detect the deadlock and return -EDEADLOCK,
* which is wrong, as the other waiter is not in a deadlock
* situation.
+ *
+ * Except for ww_mutex, in that case the chain walk must already deal
+ * with spurious cycles, see the comments at [3] and [6].
*/
- if (owner == task)
+ if (owner == task && !(build_ww_mutex() && ww_ctx))
return -EDEADLK;
raw_spin_lock(&task->pi_lock);
@@ -1372,9 +1382,8 @@ static bool rtmutex_spin_on_owner(struct rt_mutex_base *lock,
* for CONFIG_PREEMPT_RCU=y)
* - the VCPU on which owner runs is preempted
*/
- if (!owner->on_cpu || need_resched() ||
- rt_mutex_waiter_is_top_waiter(lock, waiter) ||
- vcpu_is_preempted(task_cpu(owner))) {
+ if (!owner_on_cpu(owner) || need_resched() ||
+ !rt_mutex_waiter_is_top_waiter(lock, waiter)) {
res = false;
break;
}
diff --git a/kernel/locking/rtmutex_api.c b/kernel/locking/rtmutex_api.c
index 5c9299aaabae..900220941caa 100644
--- a/kernel/locking/rtmutex_api.c
+++ b/kernel/locking/rtmutex_api.c
@@ -21,12 +21,13 @@ int max_lock_depth = 1024;
*/
static __always_inline int __rt_mutex_lock_common(struct rt_mutex *lock,
unsigned int state,
+ struct lockdep_map *nest_lock,
unsigned int subclass)
{
int ret;
might_sleep();
- mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
+ mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, _RET_IP_);
ret = __rt_mutex_lock(&lock->rtmutex, state);
if (ret)
mutex_release(&lock->dep_map, _RET_IP_);
@@ -48,10 +49,16 @@ EXPORT_SYMBOL(rt_mutex_base_init);
*/
void __sched rt_mutex_lock_nested(struct rt_mutex *lock, unsigned int subclass)
{
- __rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass);
+ __rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, NULL, subclass);
}
EXPORT_SYMBOL_GPL(rt_mutex_lock_nested);
+void __sched _rt_mutex_lock_nest_lock(struct rt_mutex *lock, struct lockdep_map *nest_lock)
+{
+ __rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, nest_lock, 0);
+}
+EXPORT_SYMBOL_GPL(_rt_mutex_lock_nest_lock);
+
#else /* !CONFIG_DEBUG_LOCK_ALLOC */
/**
@@ -61,7 +68,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_lock_nested);
*/
void __sched rt_mutex_lock(struct rt_mutex *lock)
{
- __rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0);
+ __rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, NULL, 0);
}
EXPORT_SYMBOL_GPL(rt_mutex_lock);
#endif
@@ -77,11 +84,26 @@ EXPORT_SYMBOL_GPL(rt_mutex_lock);
*/
int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock)
{
- return __rt_mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0);
+ return __rt_mutex_lock_common(lock, TASK_INTERRUPTIBLE, NULL, 0);
}
EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
/**
+ * rt_mutex_lock_killable - lock a rt_mutex killable
+ *
+ * @lock: the rt_mutex to be locked
+ *
+ * Returns:
+ * 0 on success
+ * -EINTR when interrupted by a signal
+ */
+int __sched rt_mutex_lock_killable(struct rt_mutex *lock)
+{
+ return __rt_mutex_lock_common(lock, TASK_KILLABLE, NULL, 0);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_lock_killable);
+
+/**
* rt_mutex_trylock - try to lock a rt_mutex
*
* @lock: the rt_mutex to be locked
diff --git a/kernel/locking/rwbase_rt.c b/kernel/locking/rwbase_rt.c
index 4ba15088e640..6fd3162e4098 100644
--- a/kernel/locking/rwbase_rt.c
+++ b/kernel/locking/rwbase_rt.c
@@ -41,6 +41,12 @@
* The risk of writer starvation is there, but the pathological use cases
* which trigger it are not necessarily the typical RT workloads.
*
+ * Fast-path orderings:
+ * The lock/unlock of readers can run in fast paths: lock and unlock are only
+ * atomic ops, and there is no inner lock to provide ACQUIRE and RELEASE
+ * semantics of rwbase_rt. Atomic ops should thus provide _acquire()
+ * and _release() (or stronger).
+ *
* Common code shared between RT rw_semaphore and rwlock
*/
@@ -53,7 +59,7 @@ static __always_inline int rwbase_read_trylock(struct rwbase_rt *rwb)
* set.
*/
for (r = atomic_read(&rwb->readers); r < 0;) {
- if (likely(atomic_try_cmpxchg(&rwb->readers, &r, r + 1)))
+ if (likely(atomic_try_cmpxchg_acquire(&rwb->readers, &r, r + 1)))
return 1;
}
return 0;
@@ -141,6 +147,7 @@ static void __sched __rwbase_read_unlock(struct rwbase_rt *rwb,
{
struct rt_mutex_base *rtm = &rwb->rtmutex;
struct task_struct *owner;
+ DEFINE_RT_WAKE_Q(wqh);
raw_spin_lock_irq(&rtm->wait_lock);
/*
@@ -151,9 +158,12 @@ static void __sched __rwbase_read_unlock(struct rwbase_rt *rwb,
*/
owner = rt_mutex_owner(rtm);
if (owner)
- wake_up_state(owner, state);
+ rt_mutex_wake_q_add_task(&wqh, owner, state);
+ /* Pairs with the preempt_enable in rt_mutex_wake_up_q() */
+ preempt_disable();
raw_spin_unlock_irq(&rtm->wait_lock);
+ rt_mutex_wake_up_q(&wqh);
}
static __always_inline void rwbase_read_unlock(struct rwbase_rt *rwb,
@@ -162,6 +172,8 @@ static __always_inline void rwbase_read_unlock(struct rwbase_rt *rwb,
/*
* rwb->readers can only hit 0 when a writer is waiting for the
* active readers to leave the critical section.
+ *
+ * dec_and_test() is fully ordered, provides RELEASE.
*/
if (unlikely(atomic_dec_and_test(&rwb->readers)))
__rwbase_read_unlock(rwb, state);
@@ -172,7 +184,11 @@ static inline void __rwbase_write_unlock(struct rwbase_rt *rwb, int bias,
{
struct rt_mutex_base *rtm = &rwb->rtmutex;
- atomic_add(READER_BIAS - bias, &rwb->readers);
+ /*
+ * _release() is needed in case that reader is in fast path, pairing
+ * with atomic_try_cmpxchg_acquire() in rwbase_read_trylock().
+ */
+ (void)atomic_add_return_release(READER_BIAS - bias, &rwb->readers);
raw_spin_unlock_irqrestore(&rtm->wait_lock, flags);
rwbase_rtmutex_unlock(rtm);
}
@@ -196,6 +212,23 @@ static inline void rwbase_write_downgrade(struct rwbase_rt *rwb)
__rwbase_write_unlock(rwb, WRITER_BIAS - 1, flags);
}
+static inline bool __rwbase_write_trylock(struct rwbase_rt *rwb)
+{
+ /* Can do without CAS because we're serialized by wait_lock. */
+ lockdep_assert_held(&rwb->rtmutex.wait_lock);
+
+ /*
+ * _acquire is needed in case the reader is in the fast path, pairing
+ * with rwbase_read_unlock(), provides ACQUIRE.
+ */
+ if (!atomic_read_acquire(&rwb->readers)) {
+ atomic_set(&rwb->readers, WRITER_BIAS);
+ return 1;
+ }
+
+ return 0;
+}
+
static int __sched rwbase_write_lock(struct rwbase_rt *rwb,
unsigned int state)
{
@@ -210,34 +243,30 @@ static int __sched rwbase_write_lock(struct rwbase_rt *rwb,
atomic_sub(READER_BIAS, &rwb->readers);
raw_spin_lock_irqsave(&rtm->wait_lock, flags);
- /*
- * set_current_state() for rw_semaphore
- * current_save_and_set_rtlock_wait_state() for rwlock
- */
- rwbase_set_and_save_current_state(state);
+ if (__rwbase_write_trylock(rwb))
+ goto out_unlock;
- /* Block until all readers have left the critical section. */
- for (; atomic_read(&rwb->readers);) {
+ rwbase_set_and_save_current_state(state);
+ for (;;) {
/* Optimized out for rwlocks */
if (rwbase_signal_pending_state(state, current)) {
- __set_current_state(TASK_RUNNING);
+ rwbase_restore_current_state();
__rwbase_write_unlock(rwb, 0, flags);
return -EINTR;
}
+
+ if (__rwbase_write_trylock(rwb))
+ break;
+
raw_spin_unlock_irqrestore(&rtm->wait_lock, flags);
+ rwbase_schedule();
+ raw_spin_lock_irqsave(&rtm->wait_lock, flags);
- /*
- * Schedule and wait for the readers to leave the critical
- * section. The last reader leaving it wakes the waiter.
- */
- if (atomic_read(&rwb->readers) != 0)
- rwbase_schedule();
set_current_state(state);
- raw_spin_lock_irqsave(&rtm->wait_lock, flags);
}
-
- atomic_set(&rwb->readers, WRITER_BIAS);
rwbase_restore_current_state();
+
+out_unlock:
raw_spin_unlock_irqrestore(&rtm->wait_lock, flags);
return 0;
}
@@ -253,8 +282,7 @@ static inline int rwbase_write_trylock(struct rwbase_rt *rwb)
atomic_sub(READER_BIAS, &rwb->readers);
raw_spin_lock_irqsave(&rtm->wait_lock, flags);
- if (!atomic_read(&rwb->readers)) {
- atomic_set(&rwb->readers, WRITER_BIAS);
+ if (__rwbase_write_trylock(rwb)) {
raw_spin_unlock_irqrestore(&rtm->wait_lock, flags);
return 1;
}
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 9215b4d6a9de..69aba4abe104 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -56,7 +56,6 @@
*
* A fast path reader optimistic lock stealing is supported when the rwsem
* is previously owned by a writer and the following conditions are met:
- * - OSQ is empty
* - rwsem is not currently writer owned
* - the handoff isn't set.
*/
@@ -106,9 +105,9 @@
* atomic_long_cmpxchg() will be used to obtain writer lock.
*
* There are three places where the lock handoff bit may be set or cleared.
- * 1) rwsem_mark_wake() for readers.
- * 2) rwsem_try_write_lock() for writers.
- * 3) Error path of rwsem_down_write_slowpath().
+ * 1) rwsem_mark_wake() for readers -- set, clear
+ * 2) rwsem_try_write_lock() for writers -- set, clear
+ * 3) rwsem_del_waiter() -- clear
*
* For all the above cases, wait_lock will be held. A writer must also
* be the first one in the wait_list to be eligible for setting the handoff
@@ -335,6 +334,9 @@ struct rwsem_waiter {
struct task_struct *task;
enum rwsem_waiter_type type;
unsigned long timeout;
+
+ /* Writer only, not initialized in reader */
+ bool handoff_set;
};
#define rwsem_first_waiter(sem) \
list_first_entry(&sem->wait_list, struct rwsem_waiter, list)
@@ -345,12 +347,6 @@ enum rwsem_wake_type {
RWSEM_WAKE_READ_OWNED /* Waker thread holds the read lock */
};
-enum writer_wait_state {
- WRITER_NOT_FIRST, /* Writer is not first in wait list */
- WRITER_FIRST, /* Writer is first in wait list */
- WRITER_HANDOFF /* Writer is first & handoff needed */
-};
-
/*
* The typical HZ value is either 250 or 1000. So set the minimum waiting
* time to at least 4ms or 1 jiffy (if it is higher than 4ms) in the wait
@@ -366,6 +362,31 @@ enum writer_wait_state {
*/
#define MAX_READERS_WAKEUP 0x100
+static inline void
+rwsem_add_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter)
+{
+ lockdep_assert_held(&sem->wait_lock);
+ list_add_tail(&waiter->list, &sem->wait_list);
+ /* caller will set RWSEM_FLAG_WAITERS */
+}
+
+/*
+ * Remove a waiter from the wait_list and clear flags.
+ *
+ * Both rwsem_mark_wake() and rwsem_try_write_lock() contain a full 'copy' of
+ * this function. Modify with care.
+ */
+static inline void
+rwsem_del_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter)
+{
+ lockdep_assert_held(&sem->wait_lock);
+ list_del(&waiter->list);
+ if (likely(!list_empty(&sem->wait_list)))
+ return;
+
+ atomic_long_andnot(RWSEM_FLAG_HANDOFF | RWSEM_FLAG_WAITERS, &sem->count);
+}
+
/*
* handle the lock release when processes blocked on it that can now run
* - if we come here from up_xxxx(), then the RWSEM_FLAG_WAITERS bit must
@@ -377,6 +398,8 @@ enum writer_wait_state {
* preferably when the wait_lock is released
* - woken process blocks are discarded from the list after having task zeroed
* - writers are only marked woken if downgrading is false
+ *
+ * Implies rwsem_del_waiter() for all woken readers.
*/
static void rwsem_mark_wake(struct rw_semaphore *sem,
enum rwsem_wake_type wake_type,
@@ -485,24 +508,31 @@ static void rwsem_mark_wake(struct rw_semaphore *sem,
/*
* Limit # of readers that can be woken up per wakeup call.
*/
- if (woken >= MAX_READERS_WAKEUP)
+ if (unlikely(woken >= MAX_READERS_WAKEUP))
break;
}
adjustment = woken * RWSEM_READER_BIAS - adjustment;
lockevent_cond_inc(rwsem_wake_reader, woken);
+
+ oldcount = atomic_long_read(&sem->count);
if (list_empty(&sem->wait_list)) {
- /* hit end of list above */
+ /*
+ * Combined with list_move_tail() above, this implies
+ * rwsem_del_waiter().
+ */
adjustment -= RWSEM_FLAG_WAITERS;
+ if (oldcount & RWSEM_FLAG_HANDOFF)
+ adjustment -= RWSEM_FLAG_HANDOFF;
+ } else if (woken) {
+ /*
+ * When we've woken a reader, we no longer need to force
+ * writers to give up the lock and we can clear HANDOFF.
+ */
+ if (oldcount & RWSEM_FLAG_HANDOFF)
+ adjustment -= RWSEM_FLAG_HANDOFF;
}
- /*
- * When we've woken a reader, we no longer need to force writers
- * to give up the lock and we can clear HANDOFF.
- */
- if (woken && (atomic_long_read(&sem->count) & RWSEM_FLAG_HANDOFF))
- adjustment -= RWSEM_FLAG_HANDOFF;
-
if (adjustment)
atomic_long_add(adjustment, &sem->count);
@@ -533,12 +563,12 @@ static void rwsem_mark_wake(struct rw_semaphore *sem,
* race conditions between checking the rwsem wait list and setting the
* sem->count accordingly.
*
- * If wstate is WRITER_HANDOFF, it will make sure that either the handoff
- * bit is set or the lock is acquired with handoff bit cleared.
+ * Implies rwsem_del_waiter() on success.
*/
static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
- enum writer_wait_state wstate)
+ struct rwsem_waiter *waiter)
{
+ bool first = rwsem_first_waiter(sem) == waiter;
long count, new;
lockdep_assert_held(&sem->wait_lock);
@@ -547,13 +577,19 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
do {
bool has_handoff = !!(count & RWSEM_FLAG_HANDOFF);
- if (has_handoff && wstate == WRITER_NOT_FIRST)
- return false;
+ if (has_handoff) {
+ if (!first)
+ return false;
+
+ /* First waiter inherits a previously set handoff bit */
+ waiter->handoff_set = true;
+ }
new = count;
if (count & RWSEM_LOCK_MASK) {
- if (has_handoff || (wstate != WRITER_HANDOFF))
+ if (has_handoff || (!rt_task(waiter->task) &&
+ !time_after(jiffies, waiter->timeout)))
return false;
new |= RWSEM_FLAG_HANDOFF;
@@ -570,13 +606,39 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
* We have either acquired the lock with handoff bit cleared or
* set the handoff bit.
*/
- if (new & RWSEM_FLAG_HANDOFF)
+ if (new & RWSEM_FLAG_HANDOFF) {
+ waiter->handoff_set = true;
+ lockevent_inc(rwsem_wlock_handoff);
return false;
+ }
+ /*
+ * Have rwsem_try_write_lock() fully imply rwsem_del_waiter() on
+ * success.
+ */
+ list_del(&waiter->list);
rwsem_set_owner(sem);
return true;
}
+/*
+ * The rwsem_spin_on_owner() function returns the following 4 values
+ * depending on the lock owner state.
+ * OWNER_NULL : owner is currently NULL
+ * OWNER_WRITER: when owner changes and is a writer
+ * OWNER_READER: when owner changes and the new owner may be a reader.
+ * OWNER_NONSPINNABLE:
+ * when optimistic spinning has to stop because either the
+ * owner stops running, is unknown, or its timeslice has
+ * been used up.
+ */
+enum owner_state {
+ OWNER_NULL = 1 << 0,
+ OWNER_WRITER = 1 << 1,
+ OWNER_READER = 1 << 2,
+ OWNER_NONSPINNABLE = 1 << 3,
+};
+
#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
/*
* Try to acquire write lock before the writer has been put on wait queue.
@@ -596,15 +658,6 @@ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
return false;
}
-static inline bool owner_on_cpu(struct task_struct *owner)
-{
- /*
- * As lock holder preemption issue, we both skip spinning if
- * task is not on cpu or its cpu is preempted
- */
- return owner->on_cpu && !vcpu_is_preempted(task_cpu(owner));
-}
-
static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
{
struct task_struct *owner;
@@ -617,7 +670,10 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
}
preempt_disable();
- rcu_read_lock();
+ /*
+ * Disable preemption is equal to the RCU read-side crital section,
+ * thus the task_strcut structure won't go away.
+ */
owner = rwsem_owner_flags(sem, &flags);
/*
* Don't check the read-owner as the entry may be stale.
@@ -625,30 +681,12 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
if ((flags & RWSEM_NONSPINNABLE) ||
(owner && !(flags & RWSEM_READER_OWNED) && !owner_on_cpu(owner)))
ret = false;
- rcu_read_unlock();
preempt_enable();
lockevent_cond_inc(rwsem_opt_fail, !ret);
return ret;
}
-/*
- * The rwsem_spin_on_owner() function returns the following 4 values
- * depending on the lock owner state.
- * OWNER_NULL : owner is currently NULL
- * OWNER_WRITER: when owner changes and is a writer
- * OWNER_READER: when owner changes and the new owner may be a reader.
- * OWNER_NONSPINNABLE:
- * when optimistic spinning has to stop because either the
- * owner stops running, is unknown, or its timeslice has
- * been used up.
- */
-enum owner_state {
- OWNER_NULL = 1 << 0,
- OWNER_WRITER = 1 << 1,
- OWNER_READER = 1 << 2,
- OWNER_NONSPINNABLE = 1 << 3,
-};
#define OWNER_SPINNABLE (OWNER_NULL | OWNER_WRITER | OWNER_READER)
static inline enum owner_state
@@ -670,12 +708,13 @@ rwsem_spin_on_owner(struct rw_semaphore *sem)
unsigned long flags, new_flags;
enum owner_state state;
+ lockdep_assert_preemption_disabled();
+
owner = rwsem_owner_flags(sem, &flags);
state = rwsem_owner_state(owner, flags);
if (state != OWNER_WRITER)
return state;
- rcu_read_lock();
for (;;) {
/*
* When a waiting writer set the handoff flag, it may spin
@@ -693,7 +732,9 @@ rwsem_spin_on_owner(struct rw_semaphore *sem)
* Ensure we emit the owner->on_cpu, dereference _after_
* checking sem->owner still matches owner, if that fails,
* owner might point to free()d memory, if it still matches,
- * the rcu_read_lock() ensures the memory stays valid.
+ * our spinning context already disabled preemption which is
+ * equal to RCU read-side crital section ensures the memory
+ * stays valid.
*/
barrier();
@@ -704,7 +745,6 @@ rwsem_spin_on_owner(struct rw_semaphore *sem)
cpu_relax();
}
- rcu_read_unlock();
return state;
}
@@ -878,12 +918,11 @@ static inline bool rwsem_optimistic_spin(struct rw_semaphore *sem)
static inline void clear_nonspinnable(struct rw_semaphore *sem) { }
-static inline int
+static inline enum owner_state
rwsem_spin_on_owner(struct rw_semaphore *sem)
{
- return 0;
+ return OWNER_NONSPINNABLE;
}
-#define OWNER_NULL 1
#endif
/*
@@ -953,7 +992,7 @@ queue:
}
adjustment += RWSEM_FLAG_WAITERS;
}
- list_add_tail(&waiter.list, &sem->wait_list);
+ rwsem_add_waiter(sem, &waiter);
/* we're now waiting on the lock, but no longer actively locking */
count = atomic_long_add_return(adjustment, &sem->count);
@@ -999,11 +1038,7 @@ queue:
return sem;
out_nolock:
- list_del(&waiter.list);
- if (list_empty(&sem->wait_list)) {
- atomic_long_andnot(RWSEM_FLAG_WAITERS|RWSEM_FLAG_HANDOFF,
- &sem->count);
- }
+ rwsem_del_waiter(sem, &waiter);
raw_spin_unlock_irq(&sem->wait_lock);
__set_current_state(TASK_RUNNING);
lockevent_inc(rwsem_rlock_fail);
@@ -1017,9 +1052,7 @@ static struct rw_semaphore *
rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
{
long count;
- enum writer_wait_state wstate;
struct rwsem_waiter waiter;
- struct rw_semaphore *ret = sem;
DEFINE_WAKE_Q(wake_q);
/* do optimistic spinning and steal lock if possible */
@@ -1035,16 +1068,13 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
waiter.task = current;
waiter.type = RWSEM_WAITING_FOR_WRITE;
waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT;
+ waiter.handoff_set = false;
raw_spin_lock_irq(&sem->wait_lock);
-
- /* account for this before adding a new element to the list */
- wstate = list_empty(&sem->wait_list) ? WRITER_FIRST : WRITER_NOT_FIRST;
-
- list_add_tail(&waiter.list, &sem->wait_list);
+ rwsem_add_waiter(sem, &waiter);
/* we're now waiting on the lock */
- if (wstate == WRITER_NOT_FIRST) {
+ if (rwsem_first_waiter(sem) != &waiter) {
count = atomic_long_read(&sem->count);
/*
@@ -1080,13 +1110,16 @@ wait:
/* wait until we successfully acquire the lock */
set_current_state(state);
for (;;) {
- if (rwsem_try_write_lock(sem, wstate)) {
+ if (rwsem_try_write_lock(sem, &waiter)) {
/* rwsem_try_write_lock() implies ACQUIRE on success */
break;
}
raw_spin_unlock_irq(&sem->wait_lock);
+ if (signal_pending_state(state, current))
+ goto out_nolock;
+
/*
* After setting the handoff bit and failing to acquire
* the lock, attempt to spin on owner to accelerate lock
@@ -1095,70 +1128,37 @@ wait:
* In this case, we attempt to acquire the lock again
* without sleeping.
*/
- if (wstate == WRITER_HANDOFF &&
- rwsem_spin_on_owner(sem) == OWNER_NULL)
- goto trylock_again;
-
- /* Block until there are no active lockers. */
- for (;;) {
- if (signal_pending_state(state, current))
- goto out_nolock;
+ if (waiter.handoff_set) {
+ enum owner_state owner_state;
- schedule();
- lockevent_inc(rwsem_sleep_writer);
- set_current_state(state);
- /*
- * If HANDOFF bit is set, unconditionally do
- * a trylock.
- */
- if (wstate == WRITER_HANDOFF)
- break;
+ preempt_disable();
+ owner_state = rwsem_spin_on_owner(sem);
+ preempt_enable();
- if ((wstate == WRITER_NOT_FIRST) &&
- (rwsem_first_waiter(sem) == &waiter))
- wstate = WRITER_FIRST;
-
- count = atomic_long_read(&sem->count);
- if (!(count & RWSEM_LOCK_MASK))
- break;
-
- /*
- * The setting of the handoff bit is deferred
- * until rwsem_try_write_lock() is called.
- */
- if ((wstate == WRITER_FIRST) && (rt_task(current) ||
- time_after(jiffies, waiter.timeout))) {
- wstate = WRITER_HANDOFF;
- lockevent_inc(rwsem_wlock_handoff);
- break;
- }
+ if (owner_state == OWNER_NULL)
+ goto trylock_again;
}
+
+ schedule();
+ lockevent_inc(rwsem_sleep_writer);
+ set_current_state(state);
trylock_again:
raw_spin_lock_irq(&sem->wait_lock);
}
__set_current_state(TASK_RUNNING);
- list_del(&waiter.list);
raw_spin_unlock_irq(&sem->wait_lock);
lockevent_inc(rwsem_wlock);
-
- return ret;
+ return sem;
out_nolock:
__set_current_state(TASK_RUNNING);
raw_spin_lock_irq(&sem->wait_lock);
- list_del(&waiter.list);
-
- if (unlikely(wstate == WRITER_HANDOFF))
- atomic_long_add(-RWSEM_FLAG_HANDOFF, &sem->count);
-
- if (list_empty(&sem->wait_list))
- atomic_long_andnot(RWSEM_FLAG_WAITERS, &sem->count);
- else
+ rwsem_del_waiter(sem, &waiter);
+ if (!list_empty(&sem->wait_list))
rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
raw_spin_unlock_irq(&sem->wait_lock);
wake_up_q(&wake_q);
lockevent_inc(rwsem_wlock_fail);
-
return ERR_PTR(-EINTR);
}
@@ -1239,17 +1239,14 @@ static inline int __down_read_trylock(struct rw_semaphore *sem)
DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
- /*
- * Optimize for the case when the rwsem is not locked at all.
- */
- tmp = RWSEM_UNLOCKED_VALUE;
- do {
+ tmp = atomic_long_read(&sem->count);
+ while (!(tmp & RWSEM_READ_FAILED_MASK)) {
if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,
- tmp + RWSEM_READER_BIAS)) {
+ tmp + RWSEM_READER_BIAS)) {
rwsem_set_reader_owned(sem);
return 1;
}
- } while (!(tmp & RWSEM_READ_FAILED_MASK));
+ }
return 0;
}
@@ -1376,15 +1373,17 @@ static inline void __downgrade_write(struct rw_semaphore *sem)
#include "rwbase_rt.c"
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-void __rwsem_init(struct rw_semaphore *sem, const char *name,
+void __init_rwsem(struct rw_semaphore *sem, const char *name,
struct lock_class_key *key)
{
+ init_rwbase_rt(&(sem)->rwbase);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
debug_check_no_locks_freed((void *)sem, sizeof(*sem));
lockdep_init_map_wait(&sem->dep_map, name, key, 0, LD_WAIT_SLEEP);
-}
-EXPORT_SYMBOL(__rwsem_init);
#endif
+}
+EXPORT_SYMBOL(__init_rwsem);
static inline void __down_read(struct rw_semaphore *sem)
{
diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c
index c5830cfa379a..b562f9289372 100644
--- a/kernel/locking/spinlock.c
+++ b/kernel/locking/spinlock.c
@@ -378,8 +378,7 @@ unsigned long __lockfunc _raw_spin_lock_irqsave_nested(raw_spinlock_t *lock,
local_irq_save(flags);
preempt_disable();
spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
- LOCK_CONTENDED_FLAGS(lock, do_raw_spin_trylock, do_raw_spin_lock,
- do_raw_spin_lock_flags, &flags);
+ LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);
return flags;
}
EXPORT_SYMBOL(_raw_spin_lock_irqsave_nested);
diff --git a/kernel/locking/spinlock_rt.c b/kernel/locking/spinlock_rt.c
index d2912e44d61f..9e396a09fe0f 100644
--- a/kernel/locking/spinlock_rt.c
+++ b/kernel/locking/spinlock_rt.c
@@ -24,6 +24,17 @@
#define RT_MUTEX_BUILD_SPINLOCKS
#include "rtmutex.c"
+/*
+ * __might_resched() skips the state check as rtlocks are state
+ * preserving. Take RCU nesting into account as spin/read/write_lock() can
+ * legitimately nest into an RCU read side critical section.
+ */
+#define RTLOCK_RESCHED_OFFSETS \
+ (rcu_preempt_depth() << MIGHT_RESCHED_RCU_SHIFT)
+
+#define rtlock_might_resched() \
+ __might_resched(__FILE__, __LINE__, RTLOCK_RESCHED_OFFSETS)
+
static __always_inline void rtlock_lock(struct rt_mutex_base *rtm)
{
if (unlikely(!rt_mutex_cmpxchg_acquire(rtm, NULL, current)))
@@ -32,7 +43,7 @@ static __always_inline void rtlock_lock(struct rt_mutex_base *rtm)
static __always_inline void __rt_spin_lock(spinlock_t *lock)
{
- ___might_sleep(__FILE__, __LINE__, 0);
+ rtlock_might_resched();
rtlock_lock(&lock->lock);
rcu_read_lock();
migrate_disable();
@@ -210,7 +221,7 @@ EXPORT_SYMBOL(rt_write_trylock);
void __sched rt_read_lock(rwlock_t *rwlock)
{
- ___might_sleep(__FILE__, __LINE__, 0);
+ rtlock_might_resched();
rwlock_acquire_read(&rwlock->dep_map, 0, 0, _RET_IP_);
rwbase_read_lock(&rwlock->rwbase, TASK_RTLOCK_WAIT);
rcu_read_lock();
@@ -220,7 +231,7 @@ EXPORT_SYMBOL(rt_read_lock);
void __sched rt_write_lock(rwlock_t *rwlock)
{
- ___might_sleep(__FILE__, __LINE__, 0);
+ rtlock_might_resched();
rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_);
rwbase_write_lock(&rwlock->rwbase, TASK_RTLOCK_WAIT);
rcu_read_lock();
@@ -246,12 +257,6 @@ void __sched rt_write_unlock(rwlock_t *rwlock)
}
EXPORT_SYMBOL(rt_write_unlock);
-int __sched rt_rwlock_is_contended(rwlock_t *rwlock)
-{
- return rw_base_is_contended(&rwlock->rwbase);
-}
-EXPORT_SYMBOL(rt_rwlock_is_contended);
-
#ifdef CONFIG_DEBUG_LOCK_ALLOC
void __rt_rwlock_init(rwlock_t *rwlock, const char *name,
struct lock_class_key *key)
diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c
index 3e82f449b4ff..353004155d65 100644
--- a/kernel/locking/test-ww_mutex.c
+++ b/kernel/locking/test-ww_mutex.c
@@ -16,6 +16,15 @@
static DEFINE_WD_CLASS(ww_class);
struct workqueue_struct *wq;
+#ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH
+#define ww_acquire_init_noinject(a, b) do { \
+ ww_acquire_init((a), (b)); \
+ (a)->deadlock_inject_countdown = ~0U; \
+ } while (0)
+#else
+#define ww_acquire_init_noinject(a, b) ww_acquire_init((a), (b))
+#endif
+
struct test_mutex {
struct work_struct work;
struct ww_mutex mutex;
@@ -36,7 +45,7 @@ static void test_mutex_work(struct work_struct *work)
wait_for_completion(&mtx->go);
if (mtx->flags & TEST_MTX_TRY) {
- while (!ww_mutex_trylock(&mtx->mutex))
+ while (!ww_mutex_trylock(&mtx->mutex, NULL))
cond_resched();
} else {
ww_mutex_lock(&mtx->mutex, NULL);
@@ -109,19 +118,39 @@ static int test_mutex(void)
return 0;
}
-static int test_aa(void)
+static int test_aa(bool trylock)
{
struct ww_mutex mutex;
struct ww_acquire_ctx ctx;
int ret;
+ const char *from = trylock ? "trylock" : "lock";
ww_mutex_init(&mutex, &ww_class);
ww_acquire_init(&ctx, &ww_class);
- ww_mutex_lock(&mutex, &ctx);
+ if (!trylock) {
+ ret = ww_mutex_lock(&mutex, &ctx);
+ if (ret) {
+ pr_err("%s: initial lock failed!\n", __func__);
+ goto out;
+ }
+ } else {
+ ret = !ww_mutex_trylock(&mutex, &ctx);
+ if (ret) {
+ pr_err("%s: initial trylock failed!\n", __func__);
+ goto out;
+ }
+ }
- if (ww_mutex_trylock(&mutex)) {
- pr_err("%s: trylocked itself!\n", __func__);
+ if (ww_mutex_trylock(&mutex, NULL)) {
+ pr_err("%s: trylocked itself without context from %s!\n", __func__, from);
+ ww_mutex_unlock(&mutex);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (ww_mutex_trylock(&mutex, &ctx)) {
+ pr_err("%s: trylocked itself with context from %s!\n", __func__, from);
ww_mutex_unlock(&mutex);
ret = -EINVAL;
goto out;
@@ -129,17 +158,17 @@ static int test_aa(void)
ret = ww_mutex_lock(&mutex, &ctx);
if (ret != -EALREADY) {
- pr_err("%s: missed deadlock for recursing, ret=%d\n",
- __func__, ret);
+ pr_err("%s: missed deadlock for recursing, ret=%d from %s\n",
+ __func__, ret, from);
if (!ret)
ww_mutex_unlock(&mutex);
ret = -EINVAL;
goto out;
}
+ ww_mutex_unlock(&mutex);
ret = 0;
out:
- ww_mutex_unlock(&mutex);
ww_acquire_fini(&ctx);
return ret;
}
@@ -150,7 +179,7 @@ struct test_abba {
struct ww_mutex b_mutex;
struct completion a_ready;
struct completion b_ready;
- bool resolve;
+ bool resolve, trylock;
int result;
};
@@ -160,8 +189,13 @@ static void test_abba_work(struct work_struct *work)
struct ww_acquire_ctx ctx;
int err;
- ww_acquire_init(&ctx, &ww_class);
- ww_mutex_lock(&abba->b_mutex, &ctx);
+ ww_acquire_init_noinject(&ctx, &ww_class);
+ if (!abba->trylock)
+ ww_mutex_lock(&abba->b_mutex, &ctx);
+ else
+ WARN_ON(!ww_mutex_trylock(&abba->b_mutex, &ctx));
+
+ WARN_ON(READ_ONCE(abba->b_mutex.ctx) != &ctx);
complete(&abba->b_ready);
wait_for_completion(&abba->a_ready);
@@ -181,7 +215,7 @@ static void test_abba_work(struct work_struct *work)
abba->result = err;
}
-static int test_abba(bool resolve)
+static int test_abba(bool trylock, bool resolve)
{
struct test_abba abba;
struct ww_acquire_ctx ctx;
@@ -192,12 +226,18 @@ static int test_abba(bool resolve)
INIT_WORK_ONSTACK(&abba.work, test_abba_work);
init_completion(&abba.a_ready);
init_completion(&abba.b_ready);
+ abba.trylock = trylock;
abba.resolve = resolve;
schedule_work(&abba.work);
- ww_acquire_init(&ctx, &ww_class);
- ww_mutex_lock(&abba.a_mutex, &ctx);
+ ww_acquire_init_noinject(&ctx, &ww_class);
+ if (!trylock)
+ ww_mutex_lock(&abba.a_mutex, &ctx);
+ else
+ WARN_ON(!ww_mutex_trylock(&abba.a_mutex, &ctx));
+
+ WARN_ON(READ_ONCE(abba.a_mutex.ctx) != &ctx);
complete(&abba.a_ready);
wait_for_completion(&abba.b_ready);
@@ -249,7 +289,7 @@ static void test_cycle_work(struct work_struct *work)
struct ww_acquire_ctx ctx;
int err, erra = 0;
- ww_acquire_init(&ctx, &ww_class);
+ ww_acquire_init_noinject(&ctx, &ww_class);
ww_mutex_lock(&cycle->a_mutex, &ctx);
complete(cycle->a_signal);
@@ -581,7 +621,9 @@ static int stress(int nlocks, int nthreads, unsigned int flags)
static int __init test_ww_mutex_init(void)
{
int ncpus = num_online_cpus();
- int ret;
+ int ret, i;
+
+ printk(KERN_INFO "Beginning ww mutex selftests\n");
wq = alloc_workqueue("test-ww_mutex", WQ_UNBOUND, 0);
if (!wq)
@@ -591,17 +633,19 @@ static int __init test_ww_mutex_init(void)
if (ret)
return ret;
- ret = test_aa();
+ ret = test_aa(false);
if (ret)
return ret;
- ret = test_abba(false);
+ ret = test_aa(true);
if (ret)
return ret;
- ret = test_abba(true);
- if (ret)
- return ret;
+ for (i = 0; i < 4; i++) {
+ ret = test_abba(i & 1, i & 2);
+ if (ret)
+ return ret;
+ }
ret = test_cycle(ncpus);
if (ret)
@@ -619,6 +663,7 @@ static int __init test_ww_mutex_init(void)
if (ret)
return ret;
+ printk(KERN_INFO "All ww mutex selftests passed\n");
return 0;
}
diff --git a/kernel/locking/ww_rt_mutex.c b/kernel/locking/ww_rt_mutex.c
index 3f1fff7d2780..d1473c624105 100644
--- a/kernel/locking/ww_rt_mutex.c
+++ b/kernel/locking/ww_rt_mutex.c
@@ -9,6 +9,31 @@
#define WW_RT
#include "rtmutex.c"
+int ww_mutex_trylock(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx)
+{
+ struct rt_mutex *rtm = &lock->base;
+
+ if (!ww_ctx)
+ return rt_mutex_trylock(rtm);
+
+ /*
+ * Reset the wounded flag after a kill. No other process can
+ * race and wound us here, since they can't have a valid owner
+ * pointer if we don't have any locks held.
+ */
+ if (ww_ctx->acquired == 0)
+ ww_ctx->wounded = 0;
+
+ if (__rt_mutex_trylock(&rtm->rtmutex)) {
+ ww_mutex_set_context_fastpath(lock, ww_ctx);
+ mutex_acquire_nest(&rtm->dep_map, 0, 1, &ww_ctx->dep_map, _RET_IP_);
+ return 1;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(ww_mutex_trylock);
+
static int __sched
__ww_rt_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx,
unsigned int state, unsigned long ip)
diff --git a/kernel/module.c b/kernel/module.c
index c96160f7f3f5..24fc305afe95 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2942,7 +2942,11 @@ static int module_sig_check(struct load_info *info, int flags)
static int validate_section_offset(struct load_info *info, Elf_Shdr *shdr)
{
+#if defined(CONFIG_64BIT)
+ unsigned long long secend;
+#else
unsigned long secend;
+#endif
/*
* Check for both overflow and offset/size being
@@ -2967,14 +2971,29 @@ static int elf_validity_check(struct load_info *info)
Elf_Shdr *shdr, *strhdr;
int err;
- if (info->len < sizeof(*(info->hdr)))
- return -ENOEXEC;
+ if (info->len < sizeof(*(info->hdr))) {
+ pr_err("Invalid ELF header len %lu\n", info->len);
+ goto no_exec;
+ }
- if (memcmp(info->hdr->e_ident, ELFMAG, SELFMAG) != 0
- || info->hdr->e_type != ET_REL
- || !elf_check_arch(info->hdr)
- || info->hdr->e_shentsize != sizeof(Elf_Shdr))
- return -ENOEXEC;
+ if (memcmp(info->hdr->e_ident, ELFMAG, SELFMAG) != 0) {
+ pr_err("Invalid ELF header magic: != %s\n", ELFMAG);
+ goto no_exec;
+ }
+ if (info->hdr->e_type != ET_REL) {
+ pr_err("Invalid ELF header type: %u != %u\n",
+ info->hdr->e_type, ET_REL);
+ goto no_exec;
+ }
+ if (!elf_check_arch(info->hdr)) {
+ pr_err("Invalid architecture in ELF header: %u\n",
+ info->hdr->e_machine);
+ goto no_exec;
+ }
+ if (info->hdr->e_shentsize != sizeof(Elf_Shdr)) {
+ pr_err("Invalid ELF section header size\n");
+ goto no_exec;
+ }
/*
* e_shnum is 16 bits, and sizeof(Elf_Shdr) is
@@ -2983,8 +3002,10 @@ static int elf_validity_check(struct load_info *info)
*/
if (info->hdr->e_shoff >= info->len
|| (info->hdr->e_shnum * sizeof(Elf_Shdr) >
- info->len - info->hdr->e_shoff))
- return -ENOEXEC;
+ info->len - info->hdr->e_shoff)) {
+ pr_err("Invalid ELF section header overflow\n");
+ goto no_exec;
+ }
info->sechdrs = (void *)info->hdr + info->hdr->e_shoff;
@@ -2992,13 +3013,19 @@ static int elf_validity_check(struct load_info *info)
* Verify if the section name table index is valid.
*/
if (info->hdr->e_shstrndx == SHN_UNDEF
- || info->hdr->e_shstrndx >= info->hdr->e_shnum)
- return -ENOEXEC;
+ || info->hdr->e_shstrndx >= info->hdr->e_shnum) {
+ pr_err("Invalid ELF section name index: %d || e_shstrndx (%d) >= e_shnum (%d)\n",
+ info->hdr->e_shstrndx, info->hdr->e_shstrndx,
+ info->hdr->e_shnum);
+ goto no_exec;
+ }
strhdr = &info->sechdrs[info->hdr->e_shstrndx];
err = validate_section_offset(info, strhdr);
- if (err < 0)
+ if (err < 0) {
+ pr_err("Invalid ELF section hdr(type %u)\n", strhdr->sh_type);
return err;
+ }
/*
* The section name table must be NUL-terminated, as required
@@ -3006,8 +3033,10 @@ static int elf_validity_check(struct load_info *info)
* strings in the section safe.
*/
info->secstrings = (void *)info->hdr + strhdr->sh_offset;
- if (info->secstrings[strhdr->sh_size - 1] != '\0')
- return -ENOEXEC;
+ if (info->secstrings[strhdr->sh_size - 1] != '\0') {
+ pr_err("ELF Spec violation: section name table isn't null terminated\n");
+ goto no_exec;
+ }
/*
* The code assumes that section 0 has a length of zero and
@@ -3015,8 +3044,11 @@ static int elf_validity_check(struct load_info *info)
*/
if (info->sechdrs[0].sh_type != SHT_NULL
|| info->sechdrs[0].sh_size != 0
- || info->sechdrs[0].sh_addr != 0)
- return -ENOEXEC;
+ || info->sechdrs[0].sh_addr != 0) {
+ pr_err("ELF Spec violation: section 0 type(%d)!=SH_NULL or non-zero len or addr\n",
+ info->sechdrs[0].sh_type);
+ goto no_exec;
+ }
for (i = 1; i < info->hdr->e_shnum; i++) {
shdr = &info->sechdrs[i];
@@ -3026,8 +3058,12 @@ static int elf_validity_check(struct load_info *info)
continue;
case SHT_SYMTAB:
if (shdr->sh_link == SHN_UNDEF
- || shdr->sh_link >= info->hdr->e_shnum)
- return -ENOEXEC;
+ || shdr->sh_link >= info->hdr->e_shnum) {
+ pr_err("Invalid ELF sh_link!=SHN_UNDEF(%d) or (sh_link(%d) >= hdr->e_shnum(%d)\n",
+ shdr->sh_link, shdr->sh_link,
+ info->hdr->e_shnum);
+ goto no_exec;
+ }
fallthrough;
default:
err = validate_section_offset(info, shdr);
@@ -3049,6 +3085,9 @@ static int elf_validity_check(struct load_info *info)
}
return 0;
+
+no_exec:
+ return -ENOEXEC;
}
#define COPY_CHUNK_SIZE (16*PAGE_SIZE)
@@ -3940,10 +3979,8 @@ static int load_module(struct load_info *info, const char __user *uargs,
* sections.
*/
err = elf_validity_check(info);
- if (err) {
- pr_err("Module has invalid ELF structures\n");
+ if (err)
goto free_copy;
- }
/*
* Everything checks out, so set up the section info
@@ -4491,8 +4528,10 @@ static void cfi_init(struct module *mod)
/* Fix init/exit functions to point to the CFI jump table */
if (init)
mod->init = *init;
+#ifdef CONFIG_MODULE_UNLOAD
if (exit)
mod->exit = *exit;
+#endif
cfi_module_add(mod, module_addr_min);
#endif
diff --git a/kernel/notifier.c b/kernel/notifier.c
index b8251dc0bc0f..ba005ebf4730 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -20,12 +20,13 @@ BLOCKING_NOTIFIER_HEAD(reboot_notifier_list);
*/
static int notifier_chain_register(struct notifier_block **nl,
- struct notifier_block *n)
+ struct notifier_block *n)
{
while ((*nl) != NULL) {
if (unlikely((*nl) == n)) {
- WARN(1, "double register detected");
- return 0;
+ WARN(1, "notifier callback %ps already registered",
+ n->notifier_call);
+ return -EEXIST;
}
if (n->priority > (*nl)->priority)
break;
@@ -134,7 +135,7 @@ static int notifier_call_chain_robust(struct notifier_block **nl,
*
* Adds a notifier to an atomic notifier chain.
*
- * Currently always returns zero.
+ * Returns 0 on success, %-EEXIST on error.
*/
int atomic_notifier_chain_register(struct atomic_notifier_head *nh,
struct notifier_block *n)
@@ -216,7 +217,7 @@ NOKPROBE_SYMBOL(atomic_notifier_call_chain);
* Adds a notifier to a blocking notifier chain.
* Must be called in process context.
*
- * Currently always returns zero.
+ * Returns 0 on success, %-EEXIST on error.
*/
int blocking_notifier_chain_register(struct blocking_notifier_head *nh,
struct notifier_block *n)
@@ -335,7 +336,7 @@ EXPORT_SYMBOL_GPL(blocking_notifier_call_chain);
* Adds a notifier to a raw notifier chain.
* All locking must be provided by the caller.
*
- * Currently always returns zero.
+ * Returns 0 on success, %-EEXIST on error.
*/
int raw_notifier_chain_register(struct raw_notifier_head *nh,
struct notifier_block *n)
@@ -406,7 +407,7 @@ EXPORT_SYMBOL_GPL(raw_notifier_call_chain);
* Adds a notifier to an SRCU notifier chain.
* Must be called in process context.
*
- * Currently always returns zero.
+ * Returns 0 on success, %-EEXIST on error.
*/
int srcu_notifier_chain_register(struct srcu_notifier_head *nh,
struct notifier_block *n)
diff --git a/kernel/params.c b/kernel/params.c
index 8299bd764e42..5b92310425c5 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -926,9 +926,9 @@ static const struct sysfs_ops module_sysfs_ops = {
.store = module_attr_store,
};
-static int uevent_filter(struct kset *kset, struct kobject *kobj)
+static int uevent_filter(struct kobject *kobj)
{
- struct kobj_type *ktype = get_ktype(kobj);
+ const struct kobj_type *ktype = get_ktype(kobj);
if (ktype == &module_ktype)
return 1;
diff --git a/kernel/pid.c b/kernel/pid.c
index efe87db44683..2fc0a16ec77b 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -540,6 +540,42 @@ struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags)
}
/**
+ * pidfd_get_task() - Get the task associated with a pidfd
+ *
+ * @pidfd: pidfd for which to get the task
+ * @flags: flags associated with this pidfd
+ *
+ * Return the task associated with @pidfd. The function takes a reference on
+ * the returned task. The caller is responsible for releasing that reference.
+ *
+ * Currently, the process identified by @pidfd is always a thread-group leader.
+ * This restriction currently exists for all aspects of pidfds including pidfd
+ * creation (CLONE_PIDFD cannot be used with CLONE_THREAD) and pidfd polling
+ * (only supports thread group leaders).
+ *
+ * Return: On success, the task_struct associated with the pidfd.
+ * On error, a negative errno number will be returned.
+ */
+struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags)
+{
+ unsigned int f_flags;
+ struct pid *pid;
+ struct task_struct *task;
+
+ pid = pidfd_get_pid(pidfd, &f_flags);
+ if (IS_ERR(pid))
+ return ERR_CAST(pid);
+
+ task = get_pid_task(pid, PIDTYPE_TGID);
+ put_pid(pid);
+ if (!task)
+ return ERR_PTR(-ESRCH);
+
+ *flags = f_flags;
+ return task;
+}
+
+/**
* pidfd_create() - Create a new pid file descriptor.
*
* @pid: struct pid that the pidfd will reference
diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
index a332ccd829e2..0153b0ca7b23 100644
--- a/kernel/power/energy_model.c
+++ b/kernel/power/energy_model.c
@@ -2,7 +2,7 @@
/*
* Energy Model of devices
*
- * Copyright (c) 2018-2020, Arm ltd.
+ * Copyright (c) 2018-2021, Arm ltd.
* Written by: Quentin Perret, Arm ltd.
* Improvements provided by: Lukasz Luba, Arm ltd.
*/
@@ -10,6 +10,7 @@
#define pr_fmt(fmt) "energy_model: " fmt
#include <linux/cpu.h>
+#include <linux/cpufreq.h>
#include <linux/cpumask.h>
#include <linux/debugfs.h>
#include <linux/energy_model.h>
@@ -42,6 +43,7 @@ static void em_debug_create_ps(struct em_perf_state *ps, struct dentry *pd)
debugfs_create_ulong("frequency", 0444, d, &ps->frequency);
debugfs_create_ulong("power", 0444, d, &ps->power);
debugfs_create_ulong("cost", 0444, d, &ps->cost);
+ debugfs_create_ulong("inefficient", 0444, d, &ps->flags);
}
static int em_debug_cpus_show(struct seq_file *s, void *unused)
@@ -55,7 +57,8 @@ DEFINE_SHOW_ATTRIBUTE(em_debug_cpus);
static int em_debug_units_show(struct seq_file *s, void *unused)
{
struct em_perf_domain *pd = s->private;
- char *units = pd->milliwatts ? "milliWatts" : "bogoWatts";
+ char *units = (pd->flags & EM_PERF_DOMAIN_MILLIWATTS) ?
+ "milliWatts" : "bogoWatts";
seq_printf(s, "%s\n", units);
@@ -63,6 +66,17 @@ static int em_debug_units_show(struct seq_file *s, void *unused)
}
DEFINE_SHOW_ATTRIBUTE(em_debug_units);
+static int em_debug_skip_inefficiencies_show(struct seq_file *s, void *unused)
+{
+ struct em_perf_domain *pd = s->private;
+ int enabled = (pd->flags & EM_PERF_DOMAIN_SKIP_INEFFICIENCIES) ? 1 : 0;
+
+ seq_printf(s, "%d\n", enabled);
+
+ return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(em_debug_skip_inefficiencies);
+
static void em_debug_create_pd(struct device *dev)
{
struct dentry *d;
@@ -76,6 +90,8 @@ static void em_debug_create_pd(struct device *dev)
&em_debug_cpus_fops);
debugfs_create_file("units", 0444, d, dev->em_pd, &em_debug_units_fops);
+ debugfs_create_file("skip-inefficiencies", 0444, d, dev->em_pd,
+ &em_debug_skip_inefficiencies_fops);
/* Create a sub-directory for each performance state */
for (i = 0; i < dev->em_pd->nr_perf_states; i++)
@@ -107,8 +123,7 @@ static void em_debug_remove_pd(struct device *dev) {}
static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd,
int nr_states, struct em_data_callback *cb)
{
- unsigned long opp_eff, prev_opp_eff = ULONG_MAX;
- unsigned long power, freq, prev_freq = 0;
+ unsigned long power, freq, prev_freq = 0, prev_cost = ULONG_MAX;
struct em_perf_state *table;
int i, ret;
u64 fmax;
@@ -153,27 +168,22 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd,
table[i].power = power;
table[i].frequency = prev_freq = freq;
-
- /*
- * The hertz/watts efficiency ratio should decrease as the
- * frequency grows on sane platforms. But this isn't always
- * true in practice so warn the user if a higher OPP is more
- * power efficient than a lower one.
- */
- opp_eff = freq / power;
- if (opp_eff >= prev_opp_eff)
- dev_dbg(dev, "EM: hertz/watts ratio non-monotonically decreasing: em_perf_state %d >= em_perf_state%d\n",
- i, i - 1);
- prev_opp_eff = opp_eff;
}
/* Compute the cost of each performance state. */
fmax = (u64) table[nr_states - 1].frequency;
- for (i = 0; i < nr_states; i++) {
+ for (i = nr_states - 1; i >= 0; i--) {
unsigned long power_res = em_scale_power(table[i].power);
table[i].cost = div64_u64(fmax * power_res,
table[i].frequency);
+ if (table[i].cost >= prev_cost) {
+ table[i].flags = EM_PERF_STATE_INEFFICIENT;
+ dev_dbg(dev, "EM: OPP:%lu is inefficient\n",
+ table[i].frequency);
+ } else {
+ prev_cost = table[i].cost;
+ }
}
pd->table = table;
@@ -222,6 +232,43 @@ static int em_create_pd(struct device *dev, int nr_states,
return 0;
}
+static void em_cpufreq_update_efficiencies(struct device *dev)
+{
+ struct em_perf_domain *pd = dev->em_pd;
+ struct em_perf_state *table;
+ struct cpufreq_policy *policy;
+ int found = 0;
+ int i;
+
+ if (!_is_cpu_device(dev) || !pd)
+ return;
+
+ policy = cpufreq_cpu_get(cpumask_first(em_span_cpus(pd)));
+ if (!policy) {
+ dev_warn(dev, "EM: Access to CPUFreq policy failed");
+ return;
+ }
+
+ table = pd->table;
+
+ for (i = 0; i < pd->nr_perf_states; i++) {
+ if (!(table[i].flags & EM_PERF_STATE_INEFFICIENT))
+ continue;
+
+ if (!cpufreq_table_set_inefficient(policy, table[i].frequency))
+ found++;
+ }
+
+ if (!found)
+ return;
+
+ /*
+ * Efficiencies have been installed in CPUFreq, inefficient frequencies
+ * will be skipped. The EM can do the same.
+ */
+ pd->flags |= EM_PERF_DOMAIN_SKIP_INEFFICIENCIES;
+}
+
/**
* em_pd_get() - Return the performance domain for a device
* @dev : Device to find the performance domain for
@@ -335,7 +382,10 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
if (ret)
goto unlock;
- dev->em_pd->milliwatts = milliwatts;
+ if (milliwatts)
+ dev->em_pd->flags |= EM_PERF_DOMAIN_MILLIWATTS;
+
+ em_cpufreq_update_efficiencies(dev);
em_debug_create_pd(dev);
dev_info(dev, "EM: created perf domain\n");
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 559acef3fddb..e6af502c2fd7 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -300,7 +300,7 @@ static int create_image(int platform_mode)
if (error || hibernation_test(TEST_PLATFORM))
goto Platform_finish;
- error = suspend_disable_secondary_cpus();
+ error = pm_sleep_disable_secondary_cpus();
if (error || hibernation_test(TEST_CPUS))
goto Enable_cpus;
@@ -342,7 +342,7 @@ static int create_image(int platform_mode)
local_irq_enable();
Enable_cpus:
- suspend_enable_secondary_cpus();
+ pm_sleep_enable_secondary_cpus();
/* Allow architectures to do nosmt-specific post-resume dances */
if (!in_suspend)
@@ -466,6 +466,8 @@ static int resume_target_kernel(bool platform_mode)
if (error)
goto Cleanup;
+ cpuidle_pause();
+
error = hibernate_resume_nonboot_cpu_disable();
if (error)
goto Enable_cpus;
@@ -509,7 +511,7 @@ static int resume_target_kernel(bool platform_mode)
local_irq_enable();
Enable_cpus:
- suspend_enable_secondary_cpus();
+ pm_sleep_enable_secondary_cpus();
Cleanup:
platform_restore_cleanup(platform_mode);
@@ -587,7 +589,7 @@ int hibernation_platform_enter(void)
if (error)
goto Platform_finish;
- error = suspend_disable_secondary_cpus();
+ error = pm_sleep_disable_secondary_cpus();
if (error)
goto Enable_cpus;
@@ -609,7 +611,7 @@ int hibernation_platform_enter(void)
local_irq_enable();
Enable_cpus:
- suspend_enable_secondary_cpus();
+ pm_sleep_enable_secondary_cpus();
Platform_finish:
hibernation_ops->finish();
@@ -691,7 +693,7 @@ static int load_image_and_restore(void)
goto Unlock;
error = swsusp_read(&flags);
- swsusp_close(FMODE_READ);
+ swsusp_close(FMODE_READ | FMODE_EXCL);
if (!error)
error = hibernation_restore(flags & SF_PLATFORM_MODE);
@@ -981,7 +983,7 @@ static int software_resume(void)
/* The snapshot device should not be opened while we're running */
if (!hibernate_acquire()) {
error = -EBUSY;
- swsusp_close(FMODE_READ);
+ swsusp_close(FMODE_READ | FMODE_EXCL);
goto Unlock;
}
@@ -1016,7 +1018,7 @@ static int software_resume(void)
pm_pr_dbg("Hibernation image not present or could not be loaded.\n");
return error;
Close_Finish:
- swsusp_close(FMODE_READ);
+ swsusp_close(FMODE_READ | FMODE_EXCL);
goto Finish;
}
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 778bf431ec02..b4f433943209 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -4,6 +4,8 @@
#include <linux/utsname.h>
#include <linux/freezer.h>
#include <linux/compiler.h>
+#include <linux/cpu.h>
+#include <linux/cpuidle.h>
struct swsusp_info {
struct new_utsname uts;
@@ -168,6 +170,7 @@ extern int swsusp_swap_in_use(void);
#define SF_PLATFORM_MODE 1
#define SF_NOCOMPRESS_MODE 2
#define SF_CRC32_MODE 4
+#define SF_HW_SIG 8
/* kernel/power/hibernate.c */
extern int swsusp_check(void);
@@ -310,3 +313,15 @@ extern int pm_wake_lock(const char *buf);
extern int pm_wake_unlock(const char *buf);
#endif /* !CONFIG_PM_WAKELOCKS */
+
+static inline int pm_sleep_disable_secondary_cpus(void)
+{
+ cpuidle_pause();
+ return suspend_disable_secondary_cpus();
+}
+
+static inline void pm_sleep_enable_secondary_cpus(void)
+{
+ suspend_enable_secondary_cpus();
+ cpuidle_resume();
+}
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 37401c99b7d7..b7e7798637b8 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -94,7 +94,7 @@ static int try_to_freeze_tasks(bool user_only)
todo - wq_busy, wq_busy);
if (wq_busy)
- show_workqueue_state();
+ show_all_workqueues();
if (!wakeup || pm_debug_messages_on) {
read_lock(&tasklist_lock);
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index eb75f394a059..80cc1f0f502b 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -97,7 +97,6 @@ static void s2idle_enter(void)
raw_spin_unlock_irq(&s2idle_lock);
cpus_read_lock();
- cpuidle_resume();
/* Push all the CPUs into the idle loop. */
wake_up_all_idle_cpus();
@@ -105,7 +104,6 @@ static void s2idle_enter(void)
swait_event_exclusive(s2idle_wait_head,
s2idle_state == S2IDLE_STATE_WAKE);
- cpuidle_pause();
cpus_read_unlock();
raw_spin_lock_irq(&s2idle_lock);
@@ -162,11 +160,13 @@ EXPORT_SYMBOL_GPL(s2idle_wake);
static bool valid_state(suspend_state_t state)
{
/*
- * PM_SUSPEND_STANDBY and PM_SUSPEND_MEM states need low level
- * support and need to be valid to the low level
- * implementation, no valid callback implies that none are valid.
+ * The PM_SUSPEND_STANDBY and PM_SUSPEND_MEM states require low-level
+ * support and need to be valid to the low-level implementation.
+ *
+ * No ->valid() or ->enter() callback implies that none are valid.
*/
- return suspend_ops && suspend_ops->valid && suspend_ops->valid(state);
+ return suspend_ops && suspend_ops->valid && suspend_ops->valid(state) &&
+ suspend_ops->enter;
}
void __init pm_states_init(void)
@@ -238,7 +238,7 @@ EXPORT_SYMBOL_GPL(suspend_valid_only_mem);
static bool sleep_state_supported(suspend_state_t state)
{
- return state == PM_SUSPEND_TO_IDLE || (suspend_ops && suspend_ops->enter);
+ return state == PM_SUSPEND_TO_IDLE || valid_state(state);
}
static int platform_suspend_prepare(suspend_state_t state)
@@ -422,7 +422,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
goto Platform_wake;
}
- error = suspend_disable_secondary_cpus();
+ error = pm_sleep_disable_secondary_cpus();
if (error || suspend_test(TEST_CPUS))
goto Enable_cpus;
@@ -452,7 +452,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
BUG_ON(irqs_disabled());
Enable_cpus:
- suspend_enable_secondary_cpus();
+ pm_sleep_enable_secondary_cpus();
Platform_wake:
platform_resume_noirq(state);
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 3cb89baebc79..ad10359030a4 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -36,6 +36,8 @@
#define HIBERNATE_SIG "S1SUSPEND"
+u32 swsusp_hardware_signature;
+
/*
* When reading an {un,}compressed image, we may restore pages in place,
* in which case some architectures need these pages cleaning before they
@@ -104,7 +106,8 @@ struct swap_map_handle {
struct swsusp_header {
char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int) -
- sizeof(u32)];
+ sizeof(u32) - sizeof(u32)];
+ u32 hw_sig;
u32 crc32;
sector_t image;
unsigned int flags; /* Flags to pass to the "boot" kernel */
@@ -299,7 +302,7 @@ static int hib_submit_io(int op, int op_flags, pgoff_t page_off, void *addr,
return error;
}
-static blk_status_t hib_wait_io(struct hib_bio_batch *hb)
+static int hib_wait_io(struct hib_bio_batch *hb)
{
/*
* We are relying on the behavior of blk_plug that a thread with
@@ -312,7 +315,6 @@ static blk_status_t hib_wait_io(struct hib_bio_batch *hb)
/*
* Saving part
*/
-
static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
{
int error;
@@ -324,6 +326,10 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10);
memcpy(swsusp_header->sig, HIBERNATE_SIG, 10);
swsusp_header->image = handle->first_sector;
+ if (swsusp_hardware_signature) {
+ swsusp_header->hw_sig = swsusp_hardware_signature;
+ flags |= SF_HW_SIG;
+ }
swsusp_header->flags = flags;
if (flags & SF_CRC32_MODE)
swsusp_header->crc32 = handle->crc32;
@@ -705,22 +711,19 @@ static int save_image_lzo(struct swap_map_handle *handle,
goto out_clean;
}
- data = vmalloc(array_size(nr_threads, sizeof(*data)));
+ data = vzalloc(array_size(nr_threads, sizeof(*data)));
if (!data) {
pr_err("Failed to allocate LZO data\n");
ret = -ENOMEM;
goto out_clean;
}
- for (thr = 0; thr < nr_threads; thr++)
- memset(&data[thr], 0, offsetof(struct cmp_data, go));
- crc = kmalloc(sizeof(*crc), GFP_KERNEL);
+ crc = kzalloc(sizeof(*crc), GFP_KERNEL);
if (!crc) {
pr_err("Failed to allocate crc\n");
ret = -ENOMEM;
goto out_clean;
}
- memset(crc, 0, offsetof(struct crc_data, go));
/*
* Start the compression threads.
@@ -1198,22 +1201,19 @@ static int load_image_lzo(struct swap_map_handle *handle,
goto out_clean;
}
- data = vmalloc(array_size(nr_threads, sizeof(*data)));
+ data = vzalloc(array_size(nr_threads, sizeof(*data)));
if (!data) {
pr_err("Failed to allocate LZO data\n");
ret = -ENOMEM;
goto out_clean;
}
- for (thr = 0; thr < nr_threads; thr++)
- memset(&data[thr], 0, offsetof(struct dec_data, go));
- crc = kmalloc(sizeof(*crc), GFP_KERNEL);
+ crc = kzalloc(sizeof(*crc), GFP_KERNEL);
if (!crc) {
pr_err("Failed to allocate crc\n");
ret = -ENOMEM;
goto out_clean;
}
- memset(crc, 0, offsetof(struct crc_data, go));
clean_pages_on_decompress = true;
@@ -1521,9 +1521,10 @@ end:
int swsusp_check(void)
{
int error;
+ void *holder;
hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device,
- FMODE_READ, NULL);
+ FMODE_READ | FMODE_EXCL, &holder);
if (!IS_ERR(hib_resume_bdev)) {
set_blocksize(hib_resume_bdev, PAGE_SIZE);
clear_page(swsusp_header);
@@ -1542,10 +1543,16 @@ int swsusp_check(void)
} else {
error = -EINVAL;
}
+ if (!error && swsusp_header->flags & SF_HW_SIG &&
+ swsusp_header->hw_sig != swsusp_hardware_signature) {
+ pr_info("Suspend image hardware signature mismatch (%08x now %08x); aborting resume.\n",
+ swsusp_header->hw_sig, swsusp_hardware_signature);
+ error = -EINVAL;
+ }
put:
if (error)
- blkdev_put(hib_resume_bdev, FMODE_READ);
+ blkdev_put(hib_resume_bdev, FMODE_READ | FMODE_EXCL);
else
pr_debug("Image signature found, resuming\n");
} else {
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 740723bb3885..ad241b4ff64c 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -177,7 +177,7 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf,
if (res <= 0)
goto unlock;
} else {
- res = PAGE_SIZE - pg_offp;
+ res = PAGE_SIZE;
}
if (!data_of(data->handle)) {
diff --git a/kernel/printk/index.c b/kernel/printk/index.c
index d3709408debe..c85be186a783 100644
--- a/kernel/printk/index.c
+++ b/kernel/printk/index.c
@@ -26,10 +26,9 @@ static struct pi_entry *pi_get_entry(const struct module *mod, loff_t pos)
if (mod) {
entries = mod->printk_index_start;
nr_entries = mod->printk_index_size;
- }
+ } else
#endif
-
- if (!mod) {
+ {
/* vmlinux, comes from linker symbols */
entries = __start_printk_index;
nr_entries = __stop_printk_index - __start_printk_index;
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 825277e1e742..155229f0cf0f 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -280,7 +280,6 @@ static struct console *exclusive_console;
static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES];
static int preferred_console = -1;
-static bool has_preferred_console;
int console_set_on_cmdline;
EXPORT_SYMBOL(console_set_on_cmdline);
@@ -847,7 +846,7 @@ static int devkmsg_open(struct inode *inode, struct file *file)
return err;
}
- user = kmalloc(sizeof(struct devkmsg_user), GFP_KERNEL);
+ user = kvmalloc(sizeof(struct devkmsg_user), GFP_KERNEL);
if (!user)
return -ENOMEM;
@@ -875,7 +874,7 @@ static int devkmsg_release(struct inode *inode, struct file *file)
ratelimit_state_exit(&user->rs);
mutex_destroy(&user->lock);
- kfree(user);
+ kvfree(user);
return 0;
}
@@ -1166,9 +1165,9 @@ void __init setup_log_buf(int early)
return;
err_free_descs:
- memblock_free(__pa(new_descs), new_descs_size);
+ memblock_free(new_descs, new_descs_size);
err_free_log_buf:
- memblock_free(__pa(new_log_buf), new_log_buf_len);
+ memblock_free(new_log_buf, new_log_buf_len);
}
static bool __read_mostly ignore_loglevel;
@@ -2066,6 +2065,7 @@ u16 printk_parse_prefix(const char *text, int *level,
return prefix_len;
}
+__printf(5, 0)
static u16 printk_sprint(char *text, u16 size, int facility,
enum printk_info_flags *flags, const char *fmt,
va_list args)
@@ -2860,7 +2860,8 @@ early_param("keep_bootcon", keep_bootcon_setup);
* Care need to be taken with consoles that are statically
* enabled such as netconsole
*/
-static int try_enable_new_console(struct console *newcon, bool user_specified)
+static int try_enable_preferred_console(struct console *newcon,
+ bool user_specified)
{
struct console_cmdline *c;
int i, err;
@@ -2890,10 +2891,8 @@ static int try_enable_new_console(struct console *newcon, bool user_specified)
return err;
}
newcon->flags |= CON_ENABLED;
- if (i == preferred_console) {
+ if (i == preferred_console)
newcon->flags |= CON_CONSDEV;
- has_preferred_console = true;
- }
return 0;
}
@@ -2908,6 +2907,21 @@ static int try_enable_new_console(struct console *newcon, bool user_specified)
return -ENOENT;
}
+/* Try to enable the console unconditionally */
+static void try_enable_default_console(struct console *newcon)
+{
+ if (newcon->index < 0)
+ newcon->index = 0;
+
+ if (newcon->setup && newcon->setup(newcon, NULL) != 0)
+ return;
+
+ newcon->flags |= CON_ENABLED;
+
+ if (newcon->device)
+ newcon->flags |= CON_CONSDEV;
+}
+
/*
* The console driver calls this routine during kernel initialization
* to register the console printing procedure with printk() and to
@@ -2929,59 +2943,56 @@ static int try_enable_new_console(struct console *newcon, bool user_specified)
*/
void register_console(struct console *newcon)
{
- struct console *bcon = NULL;
+ struct console *con;
+ bool bootcon_enabled = false;
+ bool realcon_enabled = false;
int err;
- for_each_console(bcon) {
- if (WARN(bcon == newcon, "console '%s%d' already registered\n",
- bcon->name, bcon->index))
+ for_each_console(con) {
+ if (WARN(con == newcon, "console '%s%d' already registered\n",
+ con->name, con->index))
return;
}
- /*
- * before we register a new CON_BOOT console, make sure we don't
- * already have a valid console
- */
- if (newcon->flags & CON_BOOT) {
- for_each_console(bcon) {
- if (!(bcon->flags & CON_BOOT)) {
- pr_info("Too late to register bootconsole %s%d\n",
- newcon->name, newcon->index);
- return;
- }
- }
+ for_each_console(con) {
+ if (con->flags & CON_BOOT)
+ bootcon_enabled = true;
+ else
+ realcon_enabled = true;
}
- if (console_drivers && console_drivers->flags & CON_BOOT)
- bcon = console_drivers;
-
- if (!has_preferred_console || bcon || !console_drivers)
- has_preferred_console = preferred_console >= 0;
+ /* Do not register boot consoles when there already is a real one. */
+ if (newcon->flags & CON_BOOT && realcon_enabled) {
+ pr_info("Too late to register bootconsole %s%d\n",
+ newcon->name, newcon->index);
+ return;
+ }
/*
- * See if we want to use this console driver. If we
- * didn't select a console we take the first one
- * that registers here.
+ * See if we want to enable this console driver by default.
+ *
+ * Nope when a console is preferred by the command line, device
+ * tree, or SPCR.
+ *
+ * The first real console with tty binding (driver) wins. More
+ * consoles might get enabled before the right one is found.
+ *
+ * Note that a console with tty binding will have CON_CONSDEV
+ * flag set and will be first in the list.
*/
- if (!has_preferred_console) {
- if (newcon->index < 0)
- newcon->index = 0;
- if (newcon->setup == NULL ||
- newcon->setup(newcon, NULL) == 0) {
- newcon->flags |= CON_ENABLED;
- if (newcon->device) {
- newcon->flags |= CON_CONSDEV;
- has_preferred_console = true;
- }
+ if (preferred_console < 0) {
+ if (!console_drivers || !console_drivers->device ||
+ console_drivers->flags & CON_BOOT) {
+ try_enable_default_console(newcon);
}
}
/* See if this console matches one we selected on the command line */
- err = try_enable_new_console(newcon, true);
+ err = try_enable_preferred_console(newcon, true);
/* If not, try to match against the platform default(s) */
if (err == -ENOENT)
- err = try_enable_new_console(newcon, false);
+ err = try_enable_preferred_console(newcon, false);
/* printk() messages are not printed to the Braille console. */
if (err || newcon->flags & CON_BRL)
@@ -2993,8 +3004,10 @@ void register_console(struct console *newcon)
* the real console are the same physical device, it's annoying to
* see the beginning boot messages twice
*/
- if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV))
+ if (bootcon_enabled &&
+ ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) {
newcon->flags &= ~CON_PRINTBUFFER;
+ }
/*
* Put this console in the list - keep the
@@ -3050,15 +3063,15 @@ void register_console(struct console *newcon)
pr_info("%sconsole [%s%d] enabled\n",
(newcon->flags & CON_BOOT) ? "boot" : "" ,
newcon->name, newcon->index);
- if (bcon &&
+ if (bootcon_enabled &&
((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV) &&
!keep_bootcon) {
/* We need to iterate through all boot consoles, to make
* sure we print everything out, before we unregister them.
*/
- for_each_console(bcon)
- if (bcon->flags & CON_BOOT)
- unregister_console(bcon);
+ for_each_console(con)
+ if (con->flags & CON_BOOT)
+ unregister_console(con);
}
}
EXPORT_SYMBOL(register_console);
@@ -3252,6 +3265,11 @@ void defer_console_output(void)
preempt_enable();
}
+void printk_trigger_flush(void)
+{
+ defer_console_output();
+}
+
int vprintk_deferred(const char *fmt, va_list args)
{
int r;
diff --git a/kernel/profile.c b/kernel/profile.c
index c2ebddb5e974..eb9c7f0f5ac5 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -41,7 +41,8 @@ struct profile_hit {
#define NR_PROFILE_GRP (NR_PROFILE_HIT/PROFILE_GRPSZ)
static atomic_t *prof_buffer;
-static unsigned long prof_len, prof_shift;
+static unsigned long prof_len;
+static unsigned short int prof_shift;
int prof_on __read_mostly;
EXPORT_SYMBOL_GPL(prof_on);
@@ -67,8 +68,8 @@ int profile_setup(char *str)
if (str[strlen(sleepstr)] == ',')
str += strlen(sleepstr) + 1;
if (get_option(&str, &par))
- prof_shift = par;
- pr_info("kernel sleep profiling enabled (shift: %ld)\n",
+ prof_shift = clamp(par, 0, BITS_PER_LONG - 1);
+ pr_info("kernel sleep profiling enabled (shift: %u)\n",
prof_shift);
#else
pr_warn("kernel sleep profiling requires CONFIG_SCHEDSTATS\n");
@@ -78,21 +79,21 @@ int profile_setup(char *str)
if (str[strlen(schedstr)] == ',')
str += strlen(schedstr) + 1;
if (get_option(&str, &par))
- prof_shift = par;
- pr_info("kernel schedule profiling enabled (shift: %ld)\n",
+ prof_shift = clamp(par, 0, BITS_PER_LONG - 1);
+ pr_info("kernel schedule profiling enabled (shift: %u)\n",
prof_shift);
} else if (!strncmp(str, kvmstr, strlen(kvmstr))) {
prof_on = KVM_PROFILING;
if (str[strlen(kvmstr)] == ',')
str += strlen(kvmstr) + 1;
if (get_option(&str, &par))
- prof_shift = par;
- pr_info("kernel KVM profiling enabled (shift: %ld)\n",
+ prof_shift = clamp(par, 0, BITS_PER_LONG - 1);
+ pr_info("kernel KVM profiling enabled (shift: %u)\n",
prof_shift);
} else if (get_option(&str, &par)) {
- prof_shift = par;
+ prof_shift = clamp(par, 0, BITS_PER_LONG - 1);
prof_on = CPU_PROFILING;
- pr_info("kernel profiling enabled (shift: %ld)\n",
+ pr_info("kernel profiling enabled (shift: %u)\n",
prof_shift);
}
return 1;
@@ -468,7 +469,7 @@ read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos)
unsigned long p = *ppos;
ssize_t read;
char *pnt;
- unsigned int sample_step = 1 << prof_shift;
+ unsigned long sample_step = 1UL << prof_shift;
profile_flip_buffers();
if (p >= (prof_len+1)*sizeof(unsigned int))
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
index 3128b7cf8e1f..bf8e341e75b4 100644
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -112,7 +112,7 @@ config RCU_STALL_COMMON
making these warnings mandatory for the tree variants.
config RCU_NEED_SEGCBLIST
- def_bool ( TREE_RCU || TREE_SRCU )
+ def_bool ( TREE_RCU || TREE_SRCU || TASKS_RCU_GENERIC )
config RCU_FANOUT
int "Tree-based hierarchical RCU fanout value"
@@ -169,24 +169,6 @@ config RCU_FANOUT_LEAF
Take the default if unsure.
-config RCU_FAST_NO_HZ
- bool "Accelerate last non-dyntick-idle CPU's grace periods"
- depends on NO_HZ_COMMON && SMP && RCU_EXPERT
- default n
- help
- This option permits CPUs to enter dynticks-idle state even if
- they have RCU callbacks queued, and prevents RCU from waking
- these CPUs up more than roughly once every four jiffies (by
- default, you can adjust this using the rcutree.rcu_idle_gp_delay
- parameter), thus improving energy efficiency. On the other
- hand, this option increases the duration of RCU grace periods,
- for example, slowing down synchronize_rcu().
-
- Say Y if energy efficiency is critically important, and you
- don't care about increased grace-period durations.
-
- Say N if you are unsure.
-
config RCU_BOOST
bool "Enable RCU priority boosting"
depends on (RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT) || PREEMPT_RT
diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
index aaa111237b60..81145c3ece25 100644
--- a/kernel/rcu/rcu_segcblist.c
+++ b/kernel/rcu/rcu_segcblist.c
@@ -261,16 +261,14 @@ void rcu_segcblist_disable(struct rcu_segcblist *rsclp)
}
/*
- * Mark the specified rcu_segcblist structure as offloaded.
+ * Mark the specified rcu_segcblist structure as offloaded (or not)
*/
void rcu_segcblist_offload(struct rcu_segcblist *rsclp, bool offload)
{
- if (offload) {
- rcu_segcblist_clear_flags(rsclp, SEGCBLIST_SOFTIRQ_ONLY);
- rcu_segcblist_set_flags(rsclp, SEGCBLIST_OFFLOADED);
- } else {
+ if (offload)
+ rcu_segcblist_set_flags(rsclp, SEGCBLIST_LOCKING | SEGCBLIST_OFFLOADED);
+ else
rcu_segcblist_clear_flags(rsclp, SEGCBLIST_OFFLOADED);
- }
}
/*
diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h
index 9a19328ff251..e373fbe44da5 100644
--- a/kernel/rcu/rcu_segcblist.h
+++ b/kernel/rcu/rcu_segcblist.h
@@ -80,11 +80,14 @@ static inline bool rcu_segcblist_is_enabled(struct rcu_segcblist *rsclp)
return rcu_segcblist_test_flags(rsclp, SEGCBLIST_ENABLED);
}
-/* Is the specified rcu_segcblist offloaded, or is SEGCBLIST_SOFTIRQ_ONLY set? */
+/*
+ * Is the specified rcu_segcblist NOCB offloaded (or in the middle of the
+ * [de]offloading process)?
+ */
static inline bool rcu_segcblist_is_offloaded(struct rcu_segcblist *rsclp)
{
if (IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
- !rcu_segcblist_test_flags(rsclp, SEGCBLIST_SOFTIRQ_ONLY))
+ rcu_segcblist_test_flags(rsclp, SEGCBLIST_LOCKING))
return true;
return false;
@@ -92,9 +95,8 @@ static inline bool rcu_segcblist_is_offloaded(struct rcu_segcblist *rsclp)
static inline bool rcu_segcblist_completely_offloaded(struct rcu_segcblist *rsclp)
{
- int flags = SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP | SEGCBLIST_OFFLOADED;
-
- if (IS_ENABLED(CONFIG_RCU_NOCB_CPU) && (rsclp->flags & flags) == flags)
+ if (IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
+ !rcu_segcblist_test_flags(rsclp, SEGCBLIST_RCU_CORE))
return true;
return false;
diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c
index 2cc34a22a506..5e4f1f83d38e 100644
--- a/kernel/rcu/rcuscale.c
+++ b/kernel/rcu/rcuscale.c
@@ -50,8 +50,8 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>");
pr_alert("%s" SCALE_FLAG " %s\n", scale_type, s)
#define VERBOSE_SCALEOUT_STRING(s) \
do { if (verbose) pr_alert("%s" SCALE_FLAG " %s\n", scale_type, s); } while (0)
-#define VERBOSE_SCALEOUT_ERRSTRING(s) \
- do { if (verbose) pr_alert("%s" SCALE_FLAG "!!! %s\n", scale_type, s); } while (0)
+#define SCALEOUT_ERRSTRING(s) \
+ pr_alert("%s" SCALE_FLAG "!!! %s\n", scale_type, s)
/*
* The intended use cases for the nreaders and nwriters module parameters
@@ -514,11 +514,11 @@ rcu_scale_cleanup(void)
* during the mid-boot phase, so have to wait till the end.
*/
if (rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp)
- VERBOSE_SCALEOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!");
+ SCALEOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!");
if (rcu_gp_is_normal() && gp_exp)
- VERBOSE_SCALEOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!");
+ SCALEOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!");
if (gp_exp && gp_async)
- VERBOSE_SCALEOUT_ERRSTRING("No expedited async GPs, so went with async!");
+ SCALEOUT_ERRSTRING("No expedited async GPs, so went with async!");
if (torture_cleanup_begin())
return;
@@ -758,7 +758,7 @@ kfree_scale_init(void)
init_waitqueue_head(&shutdown_wq);
firsterr = torture_create_kthread(kfree_scale_shutdown, NULL,
shutdown_task);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
schedule_timeout_uninterruptible(1);
}
@@ -775,7 +775,7 @@ kfree_scale_init(void)
for (i = 0; i < kfree_nrealthreads; i++) {
firsterr = torture_create_kthread(kfree_scale_thread, (void *)i,
kfree_reader_tasks[i]);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
}
@@ -838,21 +838,21 @@ rcu_scale_init(void)
init_waitqueue_head(&shutdown_wq);
firsterr = torture_create_kthread(rcu_scale_shutdown, NULL,
shutdown_task);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
schedule_timeout_uninterruptible(1);
}
reader_tasks = kcalloc(nrealreaders, sizeof(reader_tasks[0]),
GFP_KERNEL);
if (reader_tasks == NULL) {
- VERBOSE_SCALEOUT_ERRSTRING("out of memory");
+ SCALEOUT_ERRSTRING("out of memory");
firsterr = -ENOMEM;
goto unwind;
}
for (i = 0; i < nrealreaders; i++) {
firsterr = torture_create_kthread(rcu_scale_reader, (void *)i,
reader_tasks[i]);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
}
while (atomic_read(&n_rcu_scale_reader_started) < nrealreaders)
@@ -865,7 +865,7 @@ rcu_scale_init(void)
kcalloc(nrealwriters, sizeof(*writer_n_durations),
GFP_KERNEL);
if (!writer_tasks || !writer_durations || !writer_n_durations) {
- VERBOSE_SCALEOUT_ERRSTRING("out of memory");
+ SCALEOUT_ERRSTRING("out of memory");
firsterr = -ENOMEM;
goto unwind;
}
@@ -879,7 +879,7 @@ rcu_scale_init(void)
}
firsterr = torture_create_kthread(rcu_scale_writer, (void *)i,
writer_tasks[i]);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
}
torture_init_end();
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index ab4215266ebe..422f7e4cc08d 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -46,6 +46,7 @@
#include <linux/oom.h>
#include <linux/tick.h>
#include <linux/rcupdate_trace.h>
+#include <linux/nmi.h>
#include "rcu.h"
@@ -53,15 +54,18 @@ MODULE_LICENSE("GPL");
MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com> and Josh Triplett <josh@joshtriplett.org>");
/* Bits for ->extendables field, extendables param, and related definitions. */
-#define RCUTORTURE_RDR_SHIFT 8 /* Put SRCU index in upper bits. */
-#define RCUTORTURE_RDR_MASK ((1 << RCUTORTURE_RDR_SHIFT) - 1)
+#define RCUTORTURE_RDR_SHIFT_1 8 /* Put SRCU index in upper bits. */
+#define RCUTORTURE_RDR_MASK_1 (1 << RCUTORTURE_RDR_SHIFT_1)
+#define RCUTORTURE_RDR_SHIFT_2 9 /* Put SRCU index in upper bits. */
+#define RCUTORTURE_RDR_MASK_2 (1 << RCUTORTURE_RDR_SHIFT_2)
#define RCUTORTURE_RDR_BH 0x01 /* Extend readers by disabling bh. */
#define RCUTORTURE_RDR_IRQ 0x02 /* ... disabling interrupts. */
#define RCUTORTURE_RDR_PREEMPT 0x04 /* ... disabling preemption. */
#define RCUTORTURE_RDR_RBH 0x08 /* ... rcu_read_lock_bh(). */
#define RCUTORTURE_RDR_SCHED 0x10 /* ... rcu_read_lock_sched(). */
-#define RCUTORTURE_RDR_RCU 0x20 /* ... entering another RCU reader. */
-#define RCUTORTURE_RDR_NBITS 6 /* Number of bits defined above. */
+#define RCUTORTURE_RDR_RCU_1 0x20 /* ... entering another RCU reader. */
+#define RCUTORTURE_RDR_RCU_2 0x40 /* ... entering another RCU reader. */
+#define RCUTORTURE_RDR_NBITS 7 /* Number of bits defined above. */
#define RCUTORTURE_MAX_EXTEND \
(RCUTORTURE_RDR_BH | RCUTORTURE_RDR_IRQ | RCUTORTURE_RDR_PREEMPT | \
RCUTORTURE_RDR_RBH | RCUTORTURE_RDR_SCHED)
@@ -75,7 +79,7 @@ torture_param(int, fqs_duration, 0,
"Duration of fqs bursts (us), 0 to disable");
torture_param(int, fqs_holdoff, 0, "Holdoff time within fqs bursts (us)");
torture_param(int, fqs_stutter, 3, "Wait time between fqs bursts (s)");
-torture_param(bool, fwd_progress, 1, "Test grace-period forward progress");
+torture_param(int, fwd_progress, 1, "Test grace-period forward progress");
torture_param(int, fwd_progress_div, 4, "Fraction of CPU stall to wait");
torture_param(int, fwd_progress_holdoff, 60,
"Time between forward-progress tests (s)");
@@ -109,6 +113,8 @@ torture_param(int, shutdown_secs, 0, "Shutdown time (s), <= zero to disable.");
torture_param(int, stall_cpu, 0, "Stall duration (s), zero to disable.");
torture_param(int, stall_cpu_holdoff, 10,
"Time to wait before starting stall (s).");
+torture_param(bool, stall_no_softlockup, false,
+ "Avoid softlockup warning during cpu stall.");
torture_param(int, stall_cpu_irqsoff, 0, "Disable interrupts while stalling.");
torture_param(int, stall_cpu_block, 0, "Sleep while stalling.");
torture_param(int, stall_gp_kthread, 0,
@@ -140,7 +146,7 @@ static struct task_struct *stats_task;
static struct task_struct *fqs_task;
static struct task_struct *boost_tasks[NR_CPUS];
static struct task_struct *stall_task;
-static struct task_struct *fwd_prog_task;
+static struct task_struct **fwd_prog_tasks;
static struct task_struct **barrier_cbs_tasks;
static struct task_struct *barrier_task;
static struct task_struct *read_exit_task;
@@ -342,10 +348,12 @@ struct rcu_torture_ops {
void (*gp_kthread_dbg)(void);
bool (*check_boost_failed)(unsigned long gp_state, int *cpup);
int (*stall_dur)(void);
+ long cbflood_max;
int irq_capable;
int can_boost;
int extendables;
int slow_gps;
+ int no_pi_lock;
const char *name;
};
@@ -667,6 +675,7 @@ static struct rcu_torture_ops srcu_ops = {
.cb_barrier = srcu_torture_barrier,
.stats = srcu_torture_stats,
.irq_capable = 1,
+ .no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU),
.name = "srcu"
};
@@ -700,6 +709,7 @@ static struct rcu_torture_ops srcud_ops = {
.cb_barrier = srcu_torture_barrier,
.stats = srcu_torture_stats,
.irq_capable = 1,
+ .no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU),
.name = "srcud"
};
@@ -720,6 +730,7 @@ static struct rcu_torture_ops busted_srcud_ops = {
.cb_barrier = srcu_torture_barrier,
.stats = srcu_torture_stats,
.irq_capable = 1,
+ .no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU),
.extendables = RCUTORTURE_MAX_EXTEND,
.name = "busted_srcud"
};
@@ -831,6 +842,7 @@ static struct rcu_torture_ops tasks_rude_ops = {
.call = call_rcu_tasks_rude,
.cb_barrier = rcu_barrier_tasks_rude,
.gp_kthread_dbg = show_rcu_tasks_rude_gp_kthread,
+ .cbflood_max = 50000,
.fqs = NULL,
.stats = NULL,
.irq_capable = 1,
@@ -871,6 +883,7 @@ static struct rcu_torture_ops tasks_tracing_ops = {
.call = call_rcu_tasks_trace,
.cb_barrier = rcu_barrier_tasks_trace,
.gp_kthread_dbg = show_rcu_tasks_trace_gp_kthread,
+ .cbflood_max = 50000,
.fqs = NULL,
.stats = NULL,
.irq_capable = 1,
@@ -1420,46 +1433,64 @@ static void rcutorture_one_extend(int *readstate, int newstate,
struct rt_read_seg *rtrsp)
{
unsigned long flags;
- int idxnew = -1;
- int idxold = *readstate;
+ int idxnew1 = -1;
+ int idxnew2 = -1;
+ int idxold1 = *readstate;
+ int idxold2 = idxold1;
int statesnew = ~*readstate & newstate;
int statesold = *readstate & ~newstate;
- WARN_ON_ONCE(idxold < 0);
- WARN_ON_ONCE((idxold >> RCUTORTURE_RDR_SHIFT) > 1);
+ WARN_ON_ONCE(idxold2 < 0);
+ WARN_ON_ONCE((idxold2 >> RCUTORTURE_RDR_SHIFT_2) > 1);
rtrsp->rt_readstate = newstate;
/* First, put new protection in place to avoid critical-section gap. */
if (statesnew & RCUTORTURE_RDR_BH)
local_bh_disable();
+ if (statesnew & RCUTORTURE_RDR_RBH)
+ rcu_read_lock_bh();
if (statesnew & RCUTORTURE_RDR_IRQ)
local_irq_disable();
if (statesnew & RCUTORTURE_RDR_PREEMPT)
preempt_disable();
- if (statesnew & RCUTORTURE_RDR_RBH)
- rcu_read_lock_bh();
if (statesnew & RCUTORTURE_RDR_SCHED)
rcu_read_lock_sched();
- if (statesnew & RCUTORTURE_RDR_RCU)
- idxnew = cur_ops->readlock() << RCUTORTURE_RDR_SHIFT;
+ if (statesnew & RCUTORTURE_RDR_RCU_1)
+ idxnew1 = (cur_ops->readlock() & 0x1) << RCUTORTURE_RDR_SHIFT_1;
+ if (statesnew & RCUTORTURE_RDR_RCU_2)
+ idxnew2 = (cur_ops->readlock() & 0x1) << RCUTORTURE_RDR_SHIFT_2;
- /* Next, remove old protection, irq first due to bh conflict. */
+ /*
+ * Next, remove old protection, in decreasing order of strength
+ * to avoid unlock paths that aren't safe in the stronger
+ * context. Namely: BH can not be enabled with disabled interrupts.
+ * Additionally PREEMPT_RT requires that BH is enabled in preemptible
+ * context.
+ */
if (statesold & RCUTORTURE_RDR_IRQ)
local_irq_enable();
- if (statesold & RCUTORTURE_RDR_BH)
- local_bh_enable();
if (statesold & RCUTORTURE_RDR_PREEMPT)
preempt_enable();
- if (statesold & RCUTORTURE_RDR_RBH)
- rcu_read_unlock_bh();
if (statesold & RCUTORTURE_RDR_SCHED)
rcu_read_unlock_sched();
- if (statesold & RCUTORTURE_RDR_RCU) {
- bool lockit = !statesnew && !(torture_random(trsp) & 0xffff);
+ if (statesold & RCUTORTURE_RDR_BH)
+ local_bh_enable();
+ if (statesold & RCUTORTURE_RDR_RBH)
+ rcu_read_unlock_bh();
+ if (statesold & RCUTORTURE_RDR_RCU_2) {
+ cur_ops->readunlock((idxold2 >> RCUTORTURE_RDR_SHIFT_2) & 0x1);
+ WARN_ON_ONCE(idxnew2 != -1);
+ idxold2 = 0;
+ }
+ if (statesold & RCUTORTURE_RDR_RCU_1) {
+ bool lockit;
+ lockit = !cur_ops->no_pi_lock && !statesnew && !(torture_random(trsp) & 0xffff);
if (lockit)
raw_spin_lock_irqsave(&current->pi_lock, flags);
- cur_ops->readunlock(idxold >> RCUTORTURE_RDR_SHIFT);
+ cur_ops->readunlock((idxold1 >> RCUTORTURE_RDR_SHIFT_1) & 0x1);
+ WARN_ON_ONCE(idxnew1 != -1);
+ idxold1 = 0;
if (lockit)
raw_spin_unlock_irqrestore(&current->pi_lock, flags);
}
@@ -1469,13 +1500,19 @@ static void rcutorture_one_extend(int *readstate, int newstate,
cur_ops->read_delay(trsp, rtrsp);
/* Update the reader state. */
- if (idxnew == -1)
- idxnew = idxold & ~RCUTORTURE_RDR_MASK;
- WARN_ON_ONCE(idxnew < 0);
- WARN_ON_ONCE((idxnew >> RCUTORTURE_RDR_SHIFT) > 1);
- *readstate = idxnew | newstate;
- WARN_ON_ONCE((*readstate >> RCUTORTURE_RDR_SHIFT) < 0);
- WARN_ON_ONCE((*readstate >> RCUTORTURE_RDR_SHIFT) > 1);
+ if (idxnew1 == -1)
+ idxnew1 = idxold1 & RCUTORTURE_RDR_MASK_1;
+ WARN_ON_ONCE(idxnew1 < 0);
+ if (WARN_ON_ONCE((idxnew1 >> RCUTORTURE_RDR_SHIFT_1) > 1))
+ pr_info("Unexpected idxnew1 value of %#x\n", idxnew1);
+ if (idxnew2 == -1)
+ idxnew2 = idxold2 & RCUTORTURE_RDR_MASK_2;
+ WARN_ON_ONCE(idxnew2 < 0);
+ WARN_ON_ONCE((idxnew2 >> RCUTORTURE_RDR_SHIFT_2) > 1);
+ *readstate = idxnew1 | idxnew2 | newstate;
+ WARN_ON_ONCE(*readstate < 0);
+ if (WARN_ON_ONCE((*readstate >> RCUTORTURE_RDR_SHIFT_2) > 1))
+ pr_info("Unexpected idxnew2 value of %#x\n", idxnew2);
}
/* Return the biggest extendables mask given current RCU and boot parameters. */
@@ -1485,7 +1522,7 @@ static int rcutorture_extend_mask_max(void)
WARN_ON_ONCE(extendables & ~RCUTORTURE_MAX_EXTEND);
mask = extendables & RCUTORTURE_MAX_EXTEND & cur_ops->extendables;
- mask = mask | RCUTORTURE_RDR_RCU;
+ mask = mask | RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2;
return mask;
}
@@ -1496,19 +1533,45 @@ rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp)
int mask = rcutorture_extend_mask_max();
unsigned long randmask1 = torture_random(trsp) >> 8;
unsigned long randmask2 = randmask1 >> 3;
+ unsigned long preempts = RCUTORTURE_RDR_PREEMPT | RCUTORTURE_RDR_SCHED;
+ unsigned long preempts_irq = preempts | RCUTORTURE_RDR_IRQ;
+ unsigned long bhs = RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH;
- WARN_ON_ONCE(mask >> RCUTORTURE_RDR_SHIFT);
+ WARN_ON_ONCE(mask >> RCUTORTURE_RDR_SHIFT_1);
/* Mostly only one bit (need preemption!), sometimes lots of bits. */
if (!(randmask1 & 0x7))
mask = mask & randmask2;
else
mask = mask & (1 << (randmask2 % RCUTORTURE_RDR_NBITS));
- /* Can't enable bh w/irq disabled. */
- if ((mask & RCUTORTURE_RDR_IRQ) &&
- ((!(mask & RCUTORTURE_RDR_BH) && (oldmask & RCUTORTURE_RDR_BH)) ||
- (!(mask & RCUTORTURE_RDR_RBH) && (oldmask & RCUTORTURE_RDR_RBH))))
- mask |= RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH;
- return mask ?: RCUTORTURE_RDR_RCU;
+
+ // Can't have nested RCU reader without outer RCU reader.
+ if (!(mask & RCUTORTURE_RDR_RCU_1) && (mask & RCUTORTURE_RDR_RCU_2)) {
+ if (oldmask & RCUTORTURE_RDR_RCU_1)
+ mask &= ~RCUTORTURE_RDR_RCU_2;
+ else
+ mask |= RCUTORTURE_RDR_RCU_1;
+ }
+
+ /*
+ * Can't enable bh w/irq disabled.
+ */
+ if (mask & RCUTORTURE_RDR_IRQ)
+ mask |= oldmask & bhs;
+
+ /*
+ * Ideally these sequences would be detected in debug builds
+ * (regardless of RT), but until then don't stop testing
+ * them on non-RT.
+ */
+ if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
+ /* Can't modify BH in atomic context */
+ if (oldmask & preempts_irq)
+ mask &= ~bhs;
+ if ((oldmask | mask) & preempts_irq)
+ mask |= oldmask & bhs;
+ }
+
+ return mask ?: RCUTORTURE_RDR_RCU_1;
}
/*
@@ -1602,7 +1665,7 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid)
rcu_torture_writer_state,
cookie, cur_ops->get_gp_state());
rcutorture_one_extend(&readstate, 0, trsp, rtrsp);
- WARN_ON_ONCE(readstate & RCUTORTURE_RDR_MASK);
+ WARN_ON_ONCE(readstate);
// This next splat is expected behavior if leakpointer, especially
// for CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels.
WARN_ON_ONCE(leakpointer && READ_ONCE(p->rtort_pipe_count) > 1);
@@ -1968,9 +2031,8 @@ static int rcutorture_booster_init(unsigned int cpu)
mutex_lock(&boost_mutex);
rcu_torture_disable_rt_throttle();
VERBOSE_TOROUT_STRING("Creating rcu_torture_boost task");
- boost_tasks[cpu] = kthread_create_on_node(rcu_torture_boost, NULL,
- cpu_to_node(cpu),
- "rcu_torture_boost");
+ boost_tasks[cpu] = kthread_run_on_cpu(rcu_torture_boost, NULL,
+ cpu, "rcu_torture_boost_%u");
if (IS_ERR(boost_tasks[cpu])) {
retval = PTR_ERR(boost_tasks[cpu]);
VERBOSE_TOROUT_STRING("rcu_torture_boost task create failed");
@@ -1979,8 +2041,6 @@ static int rcutorture_booster_init(unsigned int cpu)
mutex_unlock(&boost_mutex);
return retval;
}
- kthread_bind(boost_tasks[cpu], cpu);
- wake_up_process(boost_tasks[cpu]);
mutex_unlock(&boost_mutex);
return 0;
}
@@ -2028,6 +2088,8 @@ static int rcu_torture_stall(void *args)
#else
schedule_timeout_uninterruptible(HZ);
#endif
+ } else if (stall_no_softlockup) {
+ touch_softlockup_watchdog();
}
if (stall_cpu_irqsoff)
local_irq_enable();
@@ -2099,10 +2161,13 @@ struct rcu_fwd {
unsigned long rcu_fwd_startat;
struct rcu_launder_hist n_launders_hist[N_LAUNDERS_HIST];
unsigned long rcu_launder_gp_seq_start;
+ int rcu_fwd_id;
};
static DEFINE_MUTEX(rcu_fwd_mutex);
static struct rcu_fwd *rcu_fwds;
+static unsigned long rcu_fwd_seq;
+static atomic_long_t rcu_fwd_max_cbs;
static bool rcu_fwd_emergency_stop;
static void rcu_torture_fwd_cb_hist(struct rcu_fwd *rfp)
@@ -2115,8 +2180,9 @@ static void rcu_torture_fwd_cb_hist(struct rcu_fwd *rfp)
for (i = ARRAY_SIZE(rfp->n_launders_hist) - 1; i > 0; i--)
if (rfp->n_launders_hist[i].n_launders > 0)
break;
- pr_alert("%s: Callback-invocation histogram (duration %lu jiffies):",
- __func__, jiffies - rfp->rcu_fwd_startat);
+ mutex_lock(&rcu_fwd_mutex); // Serialize histograms.
+ pr_alert("%s: Callback-invocation histogram %d (duration %lu jiffies):",
+ __func__, rfp->rcu_fwd_id, jiffies - rfp->rcu_fwd_startat);
gps_old = rfp->rcu_launder_gp_seq_start;
for (j = 0; j <= i; j++) {
gps = rfp->n_launders_hist[j].launder_gp_seq;
@@ -2127,6 +2193,7 @@ static void rcu_torture_fwd_cb_hist(struct rcu_fwd *rfp)
gps_old = gps;
}
pr_cont("\n");
+ mutex_unlock(&rcu_fwd_mutex);
}
/* Callback function for continuous-flood RCU callbacks. */
@@ -2252,7 +2319,8 @@ static void rcu_torture_fwd_prog_nr(struct rcu_fwd *rfp,
cver = READ_ONCE(rcu_torture_current_version) - cver;
gps = rcutorture_seq_diff(cur_ops->get_gp_seq(), gps);
WARN_ON(!cver && gps < 2);
- pr_alert("%s: Duration %ld cver %ld gps %ld\n", __func__, dur, cver, gps);
+ pr_alert("%s: %d Duration %ld cver %ld gps %ld\n", __func__,
+ rfp->rcu_fwd_id, dur, cver, gps);
}
if (selfpropcb) {
WRITE_ONCE(fcs.stop, 1);
@@ -2320,7 +2388,7 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp)
rfp->rcu_fwd_cb_head = rfcpn;
n_launders++;
n_launders_sa++;
- } else {
+ } else if (!cur_ops->cbflood_max || cur_ops->cbflood_max > n_max_cbs) {
rfcp = kmalloc(sizeof(*rfcp), GFP_KERNEL);
if (WARN_ON_ONCE(!rfcp)) {
schedule_timeout_interruptible(1);
@@ -2330,8 +2398,11 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp)
n_launders_sa = 0;
rfcp->rfc_gps = 0;
rfcp->rfc_rfp = rfp;
+ } else {
+ rfcp = NULL;
}
- cur_ops->call(&rfcp->rh, rcu_torture_fwd_cb_cr);
+ if (rfcp)
+ cur_ops->call(&rfcp->rh, rcu_torture_fwd_cb_cr);
rcu_torture_fwd_prog_cond_resched(n_launders + n_max_cbs);
if (tick_nohz_full_enabled()) {
local_irq_save(flags);
@@ -2355,6 +2426,7 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp)
n_launders + n_max_cbs - n_launders_cb_snap,
n_launders, n_launders_sa,
n_max_gps, n_max_cbs, cver, gps);
+ atomic_long_add(n_max_cbs, &rcu_fwd_max_cbs);
rcu_torture_fwd_cb_hist(rfp);
}
schedule_timeout_uninterruptible(HZ); /* Let CBs drain. */
@@ -2370,6 +2442,8 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp)
static int rcutorture_oom_notify(struct notifier_block *self,
unsigned long notused, void *nfreed)
{
+ int i;
+ long ncbs;
struct rcu_fwd *rfp;
mutex_lock(&rcu_fwd_mutex);
@@ -2380,18 +2454,26 @@ static int rcutorture_oom_notify(struct notifier_block *self,
}
WARN(1, "%s invoked upon OOM during forward-progress testing.\n",
__func__);
- rcu_torture_fwd_cb_hist(rfp);
- rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rfp->rcu_fwd_startat)) / 2);
+ for (i = 0; i < fwd_progress; i++) {
+ rcu_torture_fwd_cb_hist(&rfp[i]);
+ rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rfp[i].rcu_fwd_startat)) / 2);
+ }
WRITE_ONCE(rcu_fwd_emergency_stop, true);
smp_mb(); /* Emergency stop before free and wait to avoid hangs. */
- pr_info("%s: Freed %lu RCU callbacks.\n",
- __func__, rcu_torture_fwd_prog_cbfree(rfp));
+ ncbs = 0;
+ for (i = 0; i < fwd_progress; i++)
+ ncbs += rcu_torture_fwd_prog_cbfree(&rfp[i]);
+ pr_info("%s: Freed %lu RCU callbacks.\n", __func__, ncbs);
rcu_barrier();
- pr_info("%s: Freed %lu RCU callbacks.\n",
- __func__, rcu_torture_fwd_prog_cbfree(rfp));
+ ncbs = 0;
+ for (i = 0; i < fwd_progress; i++)
+ ncbs += rcu_torture_fwd_prog_cbfree(&rfp[i]);
+ pr_info("%s: Freed %lu RCU callbacks.\n", __func__, ncbs);
rcu_barrier();
- pr_info("%s: Freed %lu RCU callbacks.\n",
- __func__, rcu_torture_fwd_prog_cbfree(rfp));
+ ncbs = 0;
+ for (i = 0; i < fwd_progress; i++)
+ ncbs += rcu_torture_fwd_prog_cbfree(&rfp[i]);
+ pr_info("%s: Freed %lu RCU callbacks.\n", __func__, ncbs);
smp_mb(); /* Frees before return to avoid redoing OOM. */
(*(unsigned long *)nfreed)++; /* Forward progress CBs freed! */
pr_info("%s returning after OOM processing.\n", __func__);
@@ -2406,7 +2488,10 @@ static struct notifier_block rcutorture_oom_nb = {
/* Carry out grace-period forward-progress testing. */
static int rcu_torture_fwd_prog(void *args)
{
+ bool firsttime = true;
+ long max_cbs;
int oldnice = task_nice(current);
+ unsigned long oldseq = READ_ONCE(rcu_fwd_seq);
struct rcu_fwd *rfp = args;
int tested = 0;
int tested_tries = 0;
@@ -2416,21 +2501,38 @@ static int rcu_torture_fwd_prog(void *args)
if (!IS_ENABLED(CONFIG_SMP) || !IS_ENABLED(CONFIG_RCU_BOOST))
set_user_nice(current, MAX_NICE);
do {
- schedule_timeout_interruptible(fwd_progress_holdoff * HZ);
- WRITE_ONCE(rcu_fwd_emergency_stop, false);
- if (!IS_ENABLED(CONFIG_TINY_RCU) ||
- rcu_inkernel_boot_has_ended())
- rcu_torture_fwd_prog_nr(rfp, &tested, &tested_tries);
- if (rcu_inkernel_boot_has_ended())
+ if (!rfp->rcu_fwd_id) {
+ schedule_timeout_interruptible(fwd_progress_holdoff * HZ);
+ WRITE_ONCE(rcu_fwd_emergency_stop, false);
+ if (!firsttime) {
+ max_cbs = atomic_long_xchg(&rcu_fwd_max_cbs, 0);
+ pr_alert("%s n_max_cbs: %ld\n", __func__, max_cbs);
+ }
+ firsttime = false;
+ WRITE_ONCE(rcu_fwd_seq, rcu_fwd_seq + 1);
+ } else {
+ while (READ_ONCE(rcu_fwd_seq) == oldseq)
+ schedule_timeout_interruptible(1);
+ oldseq = READ_ONCE(rcu_fwd_seq);
+ }
+ pr_alert("%s: Starting forward-progress test %d\n", __func__, rfp->rcu_fwd_id);
+ if (rcu_inkernel_boot_has_ended() && torture_num_online_cpus() > rfp->rcu_fwd_id)
rcu_torture_fwd_prog_cr(rfp);
+ if ((cur_ops->stall_dur && cur_ops->stall_dur() > 0) &&
+ (!IS_ENABLED(CONFIG_TINY_RCU) ||
+ (rcu_inkernel_boot_has_ended() &&
+ torture_num_online_cpus() > rfp->rcu_fwd_id)))
+ rcu_torture_fwd_prog_nr(rfp, &tested, &tested_tries);
/* Avoid slow periods, better to test when busy. */
if (stutter_wait("rcu_torture_fwd_prog"))
sched_set_normal(current, oldnice);
} while (!torture_must_stop());
/* Short runs might not contain a valid forward-progress attempt. */
- WARN_ON(!tested && tested_tries >= 5);
- pr_alert("%s: tested %d tested_tries %d\n", __func__, tested, tested_tries);
+ if (!rfp->rcu_fwd_id) {
+ WARN_ON(!tested && tested_tries >= 5);
+ pr_alert("%s: tested %d tested_tries %d\n", __func__, tested, tested_tries);
+ }
torture_kthread_stopping("rcu_torture_fwd_prog");
return 0;
}
@@ -2438,18 +2540,29 @@ static int rcu_torture_fwd_prog(void *args)
/* If forward-progress checking is requested and feasible, spawn the thread. */
static int __init rcu_torture_fwd_prog_init(void)
{
+ int i;
+ int ret = 0;
struct rcu_fwd *rfp;
if (!fwd_progress)
return 0; /* Not requested, so don't do it. */
+ if (fwd_progress >= nr_cpu_ids) {
+ VERBOSE_TOROUT_STRING("rcu_torture_fwd_prog_init: Limiting fwd_progress to # CPUs.\n");
+ fwd_progress = nr_cpu_ids;
+ } else if (fwd_progress < 0) {
+ fwd_progress = nr_cpu_ids;
+ }
if ((!cur_ops->sync && !cur_ops->call) ||
- !cur_ops->stall_dur || cur_ops->stall_dur() <= 0 || cur_ops == &rcu_busted_ops) {
+ (!cur_ops->cbflood_max && (!cur_ops->stall_dur || cur_ops->stall_dur() <= 0)) ||
+ cur_ops == &rcu_busted_ops) {
VERBOSE_TOROUT_STRING("rcu_torture_fwd_prog_init: Disabled, unsupported by RCU flavor under test");
+ fwd_progress = 0;
return 0;
}
if (stall_cpu > 0) {
VERBOSE_TOROUT_STRING("rcu_torture_fwd_prog_init: Disabled, conflicts with CPU-stall testing");
- if (IS_MODULE(CONFIG_RCU_TORTURE_TESTS))
+ fwd_progress = 0;
+ if (IS_MODULE(CONFIG_RCU_TORTURE_TEST))
return -EINVAL; /* In module, can fail back to user. */
WARN_ON(1); /* Make sure rcutorture notices conflict. */
return 0;
@@ -2458,29 +2571,51 @@ static int __init rcu_torture_fwd_prog_init(void)
fwd_progress_holdoff = 1;
if (fwd_progress_div <= 0)
fwd_progress_div = 4;
- rfp = kzalloc(sizeof(*rfp), GFP_KERNEL);
- if (!rfp)
+ rfp = kcalloc(fwd_progress, sizeof(*rfp), GFP_KERNEL);
+ fwd_prog_tasks = kcalloc(fwd_progress, sizeof(*fwd_prog_tasks), GFP_KERNEL);
+ if (!rfp || !fwd_prog_tasks) {
+ kfree(rfp);
+ kfree(fwd_prog_tasks);
+ fwd_prog_tasks = NULL;
+ fwd_progress = 0;
return -ENOMEM;
- spin_lock_init(&rfp->rcu_fwd_lock);
- rfp->rcu_fwd_cb_tail = &rfp->rcu_fwd_cb_head;
+ }
+ for (i = 0; i < fwd_progress; i++) {
+ spin_lock_init(&rfp[i].rcu_fwd_lock);
+ rfp[i].rcu_fwd_cb_tail = &rfp[i].rcu_fwd_cb_head;
+ rfp[i].rcu_fwd_id = i;
+ }
mutex_lock(&rcu_fwd_mutex);
rcu_fwds = rfp;
mutex_unlock(&rcu_fwd_mutex);
register_oom_notifier(&rcutorture_oom_nb);
- return torture_create_kthread(rcu_torture_fwd_prog, rfp, fwd_prog_task);
+ for (i = 0; i < fwd_progress; i++) {
+ ret = torture_create_kthread(rcu_torture_fwd_prog, &rcu_fwds[i], fwd_prog_tasks[i]);
+ if (ret) {
+ fwd_progress = i;
+ return ret;
+ }
+ }
+ return 0;
}
static void rcu_torture_fwd_prog_cleanup(void)
{
+ int i;
struct rcu_fwd *rfp;
- torture_stop_kthread(rcu_torture_fwd_prog, fwd_prog_task);
- rfp = rcu_fwds;
+ if (!rcu_fwds || !fwd_prog_tasks)
+ return;
+ for (i = 0; i < fwd_progress; i++)
+ torture_stop_kthread(rcu_torture_fwd_prog, fwd_prog_tasks[i]);
+ unregister_oom_notifier(&rcutorture_oom_nb);
mutex_lock(&rcu_fwd_mutex);
+ rfp = rcu_fwds;
rcu_fwds = NULL;
mutex_unlock(&rcu_fwd_mutex);
- unregister_oom_notifier(&rcutorture_oom_nb);
kfree(rfp);
+ kfree(fwd_prog_tasks);
+ fwd_prog_tasks = NULL;
}
/* Callback function for RCU barrier testing. */
@@ -2717,7 +2852,7 @@ static int rcu_torture_read_exit(void *unused)
&trs, "%s",
"rcu_torture_read_exit_child");
if (IS_ERR(tsp)) {
- VERBOSE_TOROUT_ERRSTRING("out of memory");
+ TOROUT_ERRSTRING("out of memory");
errexit = true;
tsp = NULL;
break;
@@ -2741,7 +2876,7 @@ static int rcu_torture_read_exit(void *unused)
static int rcu_torture_read_exit_init(void)
{
if (read_exit_burst <= 0)
- return -EINVAL;
+ return 0;
init_waitqueue_head(&read_exit_wq);
read_exit_child_stop = false;
read_exit_child_stopped = false;
@@ -2819,7 +2954,7 @@ rcu_torture_cleanup(void)
rcutorture_seq_diff(gp_seq, start_gp_seq));
torture_stop_kthread(rcu_torture_stats, stats_task);
torture_stop_kthread(rcu_torture_fqs, fqs_task);
- if (rcu_torture_can_boost())
+ if (rcu_torture_can_boost() && rcutor_hp >= 0)
cpuhp_remove_state(rcutor_hp);
/*
@@ -3037,14 +3172,14 @@ rcu_torture_init(void)
rcu_torture_write_types();
firsterr = torture_create_kthread(rcu_torture_writer, NULL,
writer_task);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
if (nfakewriters > 0) {
fakewriter_tasks = kcalloc(nfakewriters,
sizeof(fakewriter_tasks[0]),
GFP_KERNEL);
if (fakewriter_tasks == NULL) {
- VERBOSE_TOROUT_ERRSTRING("out of memory");
+ TOROUT_ERRSTRING("out of memory");
firsterr = -ENOMEM;
goto unwind;
}
@@ -3052,7 +3187,7 @@ rcu_torture_init(void)
for (i = 0; i < nfakewriters; i++) {
firsterr = torture_create_kthread(rcu_torture_fakewriter,
NULL, fakewriter_tasks[i]);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
}
reader_tasks = kcalloc(nrealreaders, sizeof(reader_tasks[0]),
@@ -3060,7 +3195,7 @@ rcu_torture_init(void)
rcu_torture_reader_mbchk = kcalloc(nrealreaders, sizeof(*rcu_torture_reader_mbchk),
GFP_KERNEL);
if (!reader_tasks || !rcu_torture_reader_mbchk) {
- VERBOSE_TOROUT_ERRSTRING("out of memory");
+ TOROUT_ERRSTRING("out of memory");
firsterr = -ENOMEM;
goto unwind;
}
@@ -3068,7 +3203,7 @@ rcu_torture_init(void)
rcu_torture_reader_mbchk[i].rtc_chkrdr = -1;
firsterr = torture_create_kthread(rcu_torture_reader, (void *)i,
reader_tasks[i]);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
}
nrealnocbers = nocbs_nthreads;
@@ -3079,7 +3214,7 @@ rcu_torture_init(void)
if (nrealnocbers > 0) {
nocb_tasks = kcalloc(nrealnocbers, sizeof(nocb_tasks[0]), GFP_KERNEL);
if (nocb_tasks == NULL) {
- VERBOSE_TOROUT_ERRSTRING("out of memory");
+ TOROUT_ERRSTRING("out of memory");
firsterr = -ENOMEM;
goto unwind;
}
@@ -3088,18 +3223,18 @@ rcu_torture_init(void)
}
for (i = 0; i < nrealnocbers; i++) {
firsterr = torture_create_kthread(rcu_nocb_toggle, NULL, nocb_tasks[i]);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
}
if (stat_interval > 0) {
firsterr = torture_create_kthread(rcu_torture_stats, NULL,
stats_task);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
}
if (test_no_idle_hz && shuffle_interval > 0) {
firsterr = torture_shuffle_init(shuffle_interval * HZ);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
}
if (stutter < 0)
@@ -3109,7 +3244,7 @@ rcu_torture_init(void)
t = cur_ops->stall_dur ? cur_ops->stall_dur() : stutter * HZ;
firsterr = torture_stutter_init(stutter * HZ, t);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
}
if (fqs_duration < 0)
@@ -3118,7 +3253,7 @@ rcu_torture_init(void)
/* Create the fqs thread */
firsterr = torture_create_kthread(rcu_torture_fqs, NULL,
fqs_task);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
}
if (test_boost_interval < 1)
@@ -3132,9 +3267,9 @@ rcu_torture_init(void)
firsterr = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "RCU_TORTURE",
rcutorture_booster_init,
rcutorture_booster_cleanup);
- if (firsterr < 0)
- goto unwind;
rcutor_hp = firsterr;
+ if (torture_init_error(firsterr))
+ goto unwind;
// Testing RCU priority boosting requires rcutorture do
// some serious abuse. Counter this by running ksoftirqd
@@ -3153,23 +3288,23 @@ rcu_torture_init(void)
}
shutdown_jiffies = jiffies + shutdown_secs * HZ;
firsterr = torture_shutdown_init(shutdown_secs, rcu_torture_cleanup);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
firsterr = torture_onoff_init(onoff_holdoff * HZ, onoff_interval,
rcutorture_sync);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
firsterr = rcu_torture_stall_init();
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
firsterr = rcu_torture_fwd_prog_init();
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
firsterr = rcu_torture_barrier_init();
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
firsterr = rcu_torture_read_exit_init();
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
if (object_debug)
rcu_test_debug_objects();
diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c
index 66dc14cf5687..5489ff7f478e 100644
--- a/kernel/rcu/refscale.c
+++ b/kernel/rcu/refscale.c
@@ -44,7 +44,10 @@
pr_alert("%s" SCALE_FLAG s, scale_type, ## x)
#define VERBOSE_SCALEOUT(s, x...) \
- do { if (verbose) pr_alert("%s" SCALE_FLAG s, scale_type, ## x); } while (0)
+ do { \
+ if (verbose) \
+ pr_alert("%s" SCALE_FLAG s "\n", scale_type, ## x); \
+ } while (0)
static atomic_t verbose_batch_ctr;
@@ -54,12 +57,11 @@ do { \
(verbose_batched <= 0 || \
!(atomic_inc_return(&verbose_batch_ctr) % verbose_batched))) { \
schedule_timeout_uninterruptible(1); \
- pr_alert("%s" SCALE_FLAG s, scale_type, ## x); \
+ pr_alert("%s" SCALE_FLAG s "\n", scale_type, ## x); \
} \
} while (0)
-#define VERBOSE_SCALEOUT_ERRSTRING(s, x...) \
- do { if (verbose) pr_alert("%s" SCALE_FLAG "!!! " s, scale_type, ## x); } while (0)
+#define SCALEOUT_ERRSTRING(s, x...) pr_alert("%s" SCALE_FLAG "!!! " s "\n", scale_type, ## x)
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Joel Fernandes (Google) <joel@joelfernandes.org>");
@@ -604,7 +606,7 @@ static u64 process_durations(int n)
char *buf;
u64 sum = 0;
- buf = kmalloc(128 + nreaders * 32, GFP_KERNEL);
+ buf = kmalloc(800 + 64, GFP_KERNEL);
if (!buf)
return 0;
buf[0] = 0;
@@ -617,13 +619,15 @@ static u64 process_durations(int n)
if (i % 5 == 0)
strcat(buf, "\n");
+ if (strlen(buf) >= 800) {
+ pr_alert("%s", buf);
+ buf[0] = 0;
+ }
strcat(buf, buf1);
sum += rt->last_duration_ns;
}
- strcat(buf, "\n");
-
- SCALEOUT("%s\n", buf);
+ pr_alert("%s\n", buf);
kfree(buf);
return sum;
@@ -637,7 +641,6 @@ static u64 process_durations(int n)
// point all the timestamps are printed.
static int main_func(void *arg)
{
- bool errexit = false;
int exp, r;
char buf1[64];
char *buf;
@@ -648,10 +651,10 @@ static int main_func(void *arg)
VERBOSE_SCALEOUT("main_func task started");
result_avg = kzalloc(nruns * sizeof(*result_avg), GFP_KERNEL);
- buf = kzalloc(64 + nruns * 32, GFP_KERNEL);
+ buf = kzalloc(800 + 64, GFP_KERNEL);
if (!result_avg || !buf) {
- VERBOSE_SCALEOUT_ERRSTRING("out of memory");
- errexit = true;
+ SCALEOUT_ERRSTRING("out of memory");
+ goto oom_exit;
}
if (holdoff)
schedule_timeout_interruptible(holdoff * HZ);
@@ -663,8 +666,6 @@ static int main_func(void *arg)
// Start exp readers up per experiment
for (exp = 0; exp < nruns && !torture_must_stop(); exp++) {
- if (errexit)
- break;
if (torture_must_stop())
goto end;
@@ -698,26 +699,23 @@ static int main_func(void *arg)
// Print the average of all experiments
SCALEOUT("END OF TEST. Calculating average duration per loop (nanoseconds)...\n");
- if (!errexit) {
- buf[0] = 0;
- strcat(buf, "\n");
- strcat(buf, "Runs\tTime(ns)\n");
- }
-
+ pr_alert("Runs\tTime(ns)\n");
for (exp = 0; exp < nruns; exp++) {
u64 avg;
u32 rem;
- if (errexit)
- break;
avg = div_u64_rem(result_avg[exp], 1000, &rem);
sprintf(buf1, "%d\t%llu.%03u\n", exp + 1, avg, rem);
strcat(buf, buf1);
+ if (strlen(buf) >= 800) {
+ pr_alert("%s", buf);
+ buf[0] = 0;
+ }
}
- if (!errexit)
- SCALEOUT("%s", buf);
+ pr_alert("%s", buf);
+oom_exit:
// This will shutdown everything including us.
if (shutdown) {
shutdown_start = 1;
@@ -824,7 +822,7 @@ ref_scale_init(void)
init_waitqueue_head(&shutdown_wq);
firsterr = torture_create_kthread(ref_scale_shutdown, NULL,
shutdown_task);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
schedule_timeout_uninterruptible(1);
}
@@ -841,17 +839,17 @@ ref_scale_init(void)
reader_tasks = kcalloc(nreaders, sizeof(reader_tasks[0]),
GFP_KERNEL);
if (!reader_tasks) {
- VERBOSE_SCALEOUT_ERRSTRING("out of memory");
+ SCALEOUT_ERRSTRING("out of memory");
firsterr = -ENOMEM;
goto unwind;
}
- VERBOSE_SCALEOUT("Starting %d reader threads\n", nreaders);
+ VERBOSE_SCALEOUT("Starting %d reader threads", nreaders);
for (i = 0; i < nreaders; i++) {
firsterr = torture_create_kthread(ref_scale_reader, (void *)i,
reader_tasks[i].task);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
init_waitqueue_head(&(reader_tasks[i].wq));
@@ -860,7 +858,7 @@ ref_scale_init(void)
// Main Task
init_waitqueue_head(&main_wq);
firsterr = torture_create_kthread(main_func, NULL, main_task);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
torture_init_end();
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
index a0ba2ed49bc6..92c002d65482 100644
--- a/kernel/rcu/srcutiny.c
+++ b/kernel/rcu/srcutiny.c
@@ -99,7 +99,7 @@ void __srcu_read_unlock(struct srcu_struct *ssp, int idx)
int newval = READ_ONCE(ssp->srcu_lock_nesting[idx]) - 1;
WRITE_ONCE(ssp->srcu_lock_nesting[idx], newval);
- if (!newval && READ_ONCE(ssp->srcu_gp_waiting))
+ if (!newval && READ_ONCE(ssp->srcu_gp_waiting) && in_task())
swake_up_one(&ssp->srcu_wq);
}
EXPORT_SYMBOL_GPL(__srcu_read_unlock);
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 806160c44b17..84f1d91604cc 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -6,6 +6,7 @@
*/
#ifdef CONFIG_TASKS_RCU_GENERIC
+#include "rcu_segcblist.h"
////////////////////////////////////////////////////////////////////////
//
@@ -20,11 +21,33 @@ typedef void (*holdouts_func_t)(struct list_head *hop, bool ndrpt, bool *frptp);
typedef void (*postgp_func_t)(struct rcu_tasks *rtp);
/**
+ * struct rcu_tasks_percpu - Per-CPU component of definition for a Tasks-RCU-like mechanism.
+ * @cblist: Callback list.
+ * @lock: Lock protecting per-CPU callback list.
+ * @rtp_jiffies: Jiffies counter value for statistics.
+ * @rtp_n_lock_retries: Rough lock-contention statistic.
+ * @rtp_work: Work queue for invoking callbacks.
+ * @rtp_irq_work: IRQ work queue for deferred wakeups.
+ * @barrier_q_head: RCU callback for barrier operation.
+ * @cpu: CPU number corresponding to this entry.
+ * @rtpp: Pointer to the rcu_tasks structure.
+ */
+struct rcu_tasks_percpu {
+ struct rcu_segcblist cblist;
+ raw_spinlock_t __private lock;
+ unsigned long rtp_jiffies;
+ unsigned long rtp_n_lock_retries;
+ struct work_struct rtp_work;
+ struct irq_work rtp_irq_work;
+ struct rcu_head barrier_q_head;
+ int cpu;
+ struct rcu_tasks *rtpp;
+};
+
+/**
* struct rcu_tasks - Definition for a Tasks-RCU-like mechanism.
- * @cbs_head: Head of callback list.
- * @cbs_tail: Tail pointer for callback list.
* @cbs_wq: Wait queue allowing new callback to get kthread's attention.
- * @cbs_lock: Lock protecting callback list.
+ * @cbs_gbl_lock: Lock protecting callback list.
* @kthread_ptr: This flavor's grace-period/callback-invocation kthread.
* @gp_func: This flavor's grace-period-wait function.
* @gp_state: Grace period's most recent state transition (debugging).
@@ -32,7 +55,7 @@ typedef void (*postgp_func_t)(struct rcu_tasks *rtp);
* @init_fract: Initial backoff sleep interval.
* @gp_jiffies: Time of last @gp_state transition.
* @gp_start: Most recent grace-period start in jiffies.
- * @n_gps: Number of grace periods completed since boot.
+ * @tasks_gp_seq: Number of grace periods completed since boot.
* @n_ipis: Number of IPIs sent to encourage grace periods to end.
* @n_ipis_fails: Number of IPI-send failures.
* @pregp_func: This flavor's pre-grace-period function (optional).
@@ -41,20 +64,27 @@ typedef void (*postgp_func_t)(struct rcu_tasks *rtp);
* @holdouts_func: This flavor's holdout-list scan function (optional).
* @postgp_func: This flavor's post-grace-period function (optional).
* @call_func: This flavor's call_rcu()-equivalent function.
+ * @rtpcpu: This flavor's rcu_tasks_percpu structure.
+ * @percpu_enqueue_shift: Shift down CPU ID this much when enqueuing callbacks.
+ * @percpu_enqueue_lim: Number of per-CPU callback queues in use for enqueuing.
+ * @percpu_dequeue_lim: Number of per-CPU callback queues in use for dequeuing.
+ * @percpu_dequeue_gpseq: RCU grace-period number to propagate enqueue limit to dequeuers.
+ * @barrier_q_mutex: Serialize barrier operations.
+ * @barrier_q_count: Number of queues being waited on.
+ * @barrier_q_completion: Barrier wait/wakeup mechanism.
+ * @barrier_q_seq: Sequence number for barrier operations.
* @name: This flavor's textual name.
* @kname: This flavor's kthread name.
*/
struct rcu_tasks {
- struct rcu_head *cbs_head;
- struct rcu_head **cbs_tail;
struct wait_queue_head cbs_wq;
- raw_spinlock_t cbs_lock;
+ raw_spinlock_t cbs_gbl_lock;
int gp_state;
int gp_sleep;
int init_fract;
unsigned long gp_jiffies;
unsigned long gp_start;
- unsigned long n_gps;
+ unsigned long tasks_gp_seq;
unsigned long n_ipis;
unsigned long n_ipis_fails;
struct task_struct *kthread_ptr;
@@ -65,20 +95,40 @@ struct rcu_tasks {
holdouts_func_t holdouts_func;
postgp_func_t postgp_func;
call_rcu_func_t call_func;
+ struct rcu_tasks_percpu __percpu *rtpcpu;
+ int percpu_enqueue_shift;
+ int percpu_enqueue_lim;
+ int percpu_dequeue_lim;
+ unsigned long percpu_dequeue_gpseq;
+ struct mutex barrier_q_mutex;
+ atomic_t barrier_q_count;
+ struct completion barrier_q_completion;
+ unsigned long barrier_q_seq;
char *name;
char *kname;
};
-#define DEFINE_RCU_TASKS(rt_name, gp, call, n) \
-static struct rcu_tasks rt_name = \
-{ \
- .cbs_tail = &rt_name.cbs_head, \
- .cbs_wq = __WAIT_QUEUE_HEAD_INITIALIZER(rt_name.cbs_wq), \
- .cbs_lock = __RAW_SPIN_LOCK_UNLOCKED(rt_name.cbs_lock), \
- .gp_func = gp, \
- .call_func = call, \
- .name = n, \
- .kname = #rt_name, \
+static void call_rcu_tasks_iw_wakeup(struct irq_work *iwp);
+
+#define DEFINE_RCU_TASKS(rt_name, gp, call, n) \
+static DEFINE_PER_CPU(struct rcu_tasks_percpu, rt_name ## __percpu) = { \
+ .lock = __RAW_SPIN_LOCK_UNLOCKED(rt_name ## __percpu.cbs_pcpu_lock), \
+ .rtp_irq_work = IRQ_WORK_INIT(call_rcu_tasks_iw_wakeup), \
+}; \
+static struct rcu_tasks rt_name = \
+{ \
+ .cbs_wq = __WAIT_QUEUE_HEAD_INITIALIZER(rt_name.cbs_wq), \
+ .cbs_gbl_lock = __RAW_SPIN_LOCK_UNLOCKED(rt_name.cbs_gbl_lock), \
+ .gp_func = gp, \
+ .call_func = call, \
+ .rtpcpu = &rt_name ## __percpu, \
+ .name = n, \
+ .percpu_enqueue_shift = ilog2(CONFIG_NR_CPUS), \
+ .percpu_enqueue_lim = 1, \
+ .percpu_dequeue_lim = 1, \
+ .barrier_q_mutex = __MUTEX_INITIALIZER(rt_name.barrier_q_mutex), \
+ .barrier_q_seq = (0UL - 50UL) << RCU_SEQ_CTR_SHIFT, \
+ .kname = #rt_name, \
}
/* Track exiting tasks in order to allow them to be waited for. */
@@ -94,6 +144,15 @@ module_param(rcu_task_ipi_delay, int, 0644);
static int rcu_task_stall_timeout __read_mostly = RCU_TASK_STALL_TIMEOUT;
module_param(rcu_task_stall_timeout, int, 0644);
+static int rcu_task_enqueue_lim __read_mostly = -1;
+module_param(rcu_task_enqueue_lim, int, 0444);
+
+static bool rcu_task_cb_adjust;
+static int rcu_task_contend_lim __read_mostly = 100;
+module_param(rcu_task_contend_lim, int, 0444);
+static int rcu_task_collapse_lim __read_mostly = 10;
+module_param(rcu_task_collapse_lim, int, 0444);
+
/* RCU tasks grace-period state for debugging. */
#define RTGS_INIT 0
#define RTGS_WAIT_WAIT_CBS 1
@@ -128,6 +187,8 @@ static const char * const rcu_tasks_gp_state_names[] = {
//
// Generic code.
+static void rcu_tasks_invoke_cbs_wq(struct work_struct *wp);
+
/* Record grace-period phase and time. */
static void set_tasks_gp_state(struct rcu_tasks *rtp, int newstate)
{
@@ -148,23 +209,106 @@ static const char *tasks_gp_state_getname(struct rcu_tasks *rtp)
}
#endif /* #ifndef CONFIG_TINY_RCU */
+// Initialize per-CPU callback lists for the specified flavor of
+// Tasks RCU.
+static void cblist_init_generic(struct rcu_tasks *rtp)
+{
+ int cpu;
+ unsigned long flags;
+ int lim;
+
+ raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags);
+ if (rcu_task_enqueue_lim < 0) {
+ rcu_task_enqueue_lim = 1;
+ rcu_task_cb_adjust = true;
+ pr_info("%s: Setting adjustable number of callback queues.\n", __func__);
+ } else if (rcu_task_enqueue_lim == 0) {
+ rcu_task_enqueue_lim = 1;
+ }
+ lim = rcu_task_enqueue_lim;
+
+ if (lim > nr_cpu_ids)
+ lim = nr_cpu_ids;
+ WRITE_ONCE(rtp->percpu_enqueue_shift, ilog2(nr_cpu_ids / lim));
+ WRITE_ONCE(rtp->percpu_dequeue_lim, lim);
+ smp_store_release(&rtp->percpu_enqueue_lim, lim);
+ for_each_possible_cpu(cpu) {
+ struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu);
+
+ WARN_ON_ONCE(!rtpcp);
+ if (cpu)
+ raw_spin_lock_init(&ACCESS_PRIVATE(rtpcp, lock));
+ raw_spin_lock_rcu_node(rtpcp); // irqs already disabled.
+ if (rcu_segcblist_empty(&rtpcp->cblist))
+ rcu_segcblist_init(&rtpcp->cblist);
+ INIT_WORK(&rtpcp->rtp_work, rcu_tasks_invoke_cbs_wq);
+ rtpcp->cpu = cpu;
+ rtpcp->rtpp = rtp;
+ raw_spin_unlock_rcu_node(rtpcp); // irqs remain disabled.
+ }
+ raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags);
+ pr_info("%s: Setting shift to %d and lim to %d.\n", __func__, data_race(rtp->percpu_enqueue_shift), data_race(rtp->percpu_enqueue_lim));
+}
+
+// IRQ-work handler that does deferred wakeup for call_rcu_tasks_generic().
+static void call_rcu_tasks_iw_wakeup(struct irq_work *iwp)
+{
+ struct rcu_tasks *rtp;
+ struct rcu_tasks_percpu *rtpcp = container_of(iwp, struct rcu_tasks_percpu, rtp_irq_work);
+
+ rtp = rtpcp->rtpp;
+ wake_up(&rtp->cbs_wq);
+}
+
// Enqueue a callback for the specified flavor of Tasks RCU.
static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func,
struct rcu_tasks *rtp)
{
unsigned long flags;
+ unsigned long j;
+ bool needadjust = false;
bool needwake;
+ struct rcu_tasks_percpu *rtpcp;
rhp->next = NULL;
rhp->func = func;
- raw_spin_lock_irqsave(&rtp->cbs_lock, flags);
- needwake = !rtp->cbs_head;
- WRITE_ONCE(*rtp->cbs_tail, rhp);
- rtp->cbs_tail = &rhp->next;
- raw_spin_unlock_irqrestore(&rtp->cbs_lock, flags);
+ local_irq_save(flags);
+ rcu_read_lock();
+ rtpcp = per_cpu_ptr(rtp->rtpcpu,
+ smp_processor_id() >> READ_ONCE(rtp->percpu_enqueue_shift));
+ if (!raw_spin_trylock_rcu_node(rtpcp)) { // irqs already disabled.
+ raw_spin_lock_rcu_node(rtpcp); // irqs already disabled.
+ j = jiffies;
+ if (rtpcp->rtp_jiffies != j) {
+ rtpcp->rtp_jiffies = j;
+ rtpcp->rtp_n_lock_retries = 0;
+ }
+ if (rcu_task_cb_adjust && ++rtpcp->rtp_n_lock_retries > rcu_task_contend_lim &&
+ READ_ONCE(rtp->percpu_enqueue_lim) != nr_cpu_ids)
+ needadjust = true; // Defer adjustment to avoid deadlock.
+ }
+ if (!rcu_segcblist_is_enabled(&rtpcp->cblist)) {
+ raw_spin_unlock_rcu_node(rtpcp); // irqs remain disabled.
+ cblist_init_generic(rtp);
+ raw_spin_lock_rcu_node(rtpcp); // irqs already disabled.
+ }
+ needwake = rcu_segcblist_empty(&rtpcp->cblist);
+ rcu_segcblist_enqueue(&rtpcp->cblist, rhp);
+ raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
+ if (unlikely(needadjust)) {
+ raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags);
+ if (rtp->percpu_enqueue_lim != nr_cpu_ids) {
+ WRITE_ONCE(rtp->percpu_enqueue_shift, ilog2(nr_cpu_ids));
+ WRITE_ONCE(rtp->percpu_dequeue_lim, nr_cpu_ids);
+ smp_store_release(&rtp->percpu_enqueue_lim, nr_cpu_ids);
+ pr_info("Switching %s to per-CPU callback queuing.\n", rtp->name);
+ }
+ raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags);
+ }
+ rcu_read_unlock();
/* We can't create the thread unless interrupts are enabled. */
if (needwake && READ_ONCE(rtp->kthread_ptr))
- wake_up(&rtp->cbs_wq);
+ irq_work_queue(&rtpcp->rtp_irq_work);
}
// Wait for a grace period for the specified flavor of Tasks RCU.
@@ -178,12 +322,173 @@ static void synchronize_rcu_tasks_generic(struct rcu_tasks *rtp)
wait_rcu_gp(rtp->call_func);
}
+// RCU callback function for rcu_barrier_tasks_generic().
+static void rcu_barrier_tasks_generic_cb(struct rcu_head *rhp)
+{
+ struct rcu_tasks *rtp;
+ struct rcu_tasks_percpu *rtpcp;
+
+ rtpcp = container_of(rhp, struct rcu_tasks_percpu, barrier_q_head);
+ rtp = rtpcp->rtpp;
+ if (atomic_dec_and_test(&rtp->barrier_q_count))
+ complete(&rtp->barrier_q_completion);
+}
+
+// Wait for all in-flight callbacks for the specified RCU Tasks flavor.
+// Operates in a manner similar to rcu_barrier().
+static void rcu_barrier_tasks_generic(struct rcu_tasks *rtp)
+{
+ int cpu;
+ unsigned long flags;
+ struct rcu_tasks_percpu *rtpcp;
+ unsigned long s = rcu_seq_snap(&rtp->barrier_q_seq);
+
+ mutex_lock(&rtp->barrier_q_mutex);
+ if (rcu_seq_done(&rtp->barrier_q_seq, s)) {
+ smp_mb();
+ mutex_unlock(&rtp->barrier_q_mutex);
+ return;
+ }
+ rcu_seq_start(&rtp->barrier_q_seq);
+ init_completion(&rtp->barrier_q_completion);
+ atomic_set(&rtp->barrier_q_count, 2);
+ for_each_possible_cpu(cpu) {
+ if (cpu >= smp_load_acquire(&rtp->percpu_dequeue_lim))
+ break;
+ rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu);
+ rtpcp->barrier_q_head.func = rcu_barrier_tasks_generic_cb;
+ raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
+ if (rcu_segcblist_entrain(&rtpcp->cblist, &rtpcp->barrier_q_head))
+ atomic_inc(&rtp->barrier_q_count);
+ raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
+ }
+ if (atomic_sub_and_test(2, &rtp->barrier_q_count))
+ complete(&rtp->barrier_q_completion);
+ wait_for_completion(&rtp->barrier_q_completion);
+ rcu_seq_end(&rtp->barrier_q_seq);
+ mutex_unlock(&rtp->barrier_q_mutex);
+}
+
+// Advance callbacks and indicate whether either a grace period or
+// callback invocation is needed.
+static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp)
+{
+ int cpu;
+ unsigned long flags;
+ long n;
+ long ncbs = 0;
+ long ncbsnz = 0;
+ int needgpcb = 0;
+
+ for (cpu = 0; cpu < smp_load_acquire(&rtp->percpu_dequeue_lim); cpu++) {
+ struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu);
+
+ /* Advance and accelerate any new callbacks. */
+ if (!rcu_segcblist_n_cbs(&rtpcp->cblist))
+ continue;
+ raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
+ // Should we shrink down to a single callback queue?
+ n = rcu_segcblist_n_cbs(&rtpcp->cblist);
+ if (n) {
+ ncbs += n;
+ if (cpu > 0)
+ ncbsnz += n;
+ }
+ rcu_segcblist_advance(&rtpcp->cblist, rcu_seq_current(&rtp->tasks_gp_seq));
+ (void)rcu_segcblist_accelerate(&rtpcp->cblist, rcu_seq_snap(&rtp->tasks_gp_seq));
+ if (rcu_segcblist_pend_cbs(&rtpcp->cblist))
+ needgpcb |= 0x3;
+ if (!rcu_segcblist_empty(&rtpcp->cblist))
+ needgpcb |= 0x1;
+ raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
+ }
+
+ // Shrink down to a single callback queue if appropriate.
+ // This is done in two stages: (1) If there are no more than
+ // rcu_task_collapse_lim callbacks on CPU 0 and none on any other
+ // CPU, limit enqueueing to CPU 0. (2) After an RCU grace period,
+ // if there has not been an increase in callbacks, limit dequeuing
+ // to CPU 0. Note the matching RCU read-side critical section in
+ // call_rcu_tasks_generic().
+ if (rcu_task_cb_adjust && ncbs <= rcu_task_collapse_lim) {
+ raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags);
+ if (rtp->percpu_enqueue_lim > 1) {
+ WRITE_ONCE(rtp->percpu_enqueue_shift, ilog2(nr_cpu_ids));
+ smp_store_release(&rtp->percpu_enqueue_lim, 1);
+ rtp->percpu_dequeue_gpseq = get_state_synchronize_rcu();
+ pr_info("Starting switch %s to CPU-0 callback queuing.\n", rtp->name);
+ }
+ raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags);
+ }
+ if (rcu_task_cb_adjust && !ncbsnz &&
+ poll_state_synchronize_rcu(rtp->percpu_dequeue_gpseq)) {
+ raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags);
+ if (rtp->percpu_enqueue_lim < rtp->percpu_dequeue_lim) {
+ WRITE_ONCE(rtp->percpu_dequeue_lim, 1);
+ pr_info("Completing switch %s to CPU-0 callback queuing.\n", rtp->name);
+ }
+ raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags);
+ }
+
+ return needgpcb;
+}
+
+// Advance callbacks and invoke any that are ready.
+static void rcu_tasks_invoke_cbs(struct rcu_tasks *rtp, struct rcu_tasks_percpu *rtpcp)
+{
+ int cpu;
+ int cpunext;
+ unsigned long flags;
+ int len;
+ struct rcu_head *rhp;
+ struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl);
+ struct rcu_tasks_percpu *rtpcp_next;
+
+ cpu = rtpcp->cpu;
+ cpunext = cpu * 2 + 1;
+ if (cpunext < smp_load_acquire(&rtp->percpu_dequeue_lim)) {
+ rtpcp_next = per_cpu_ptr(rtp->rtpcpu, cpunext);
+ queue_work_on(cpunext, system_wq, &rtpcp_next->rtp_work);
+ cpunext++;
+ if (cpunext < smp_load_acquire(&rtp->percpu_dequeue_lim)) {
+ rtpcp_next = per_cpu_ptr(rtp->rtpcpu, cpunext);
+ queue_work_on(cpunext, system_wq, &rtpcp_next->rtp_work);
+ }
+ }
+
+ if (rcu_segcblist_empty(&rtpcp->cblist))
+ return;
+ raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
+ rcu_segcblist_advance(&rtpcp->cblist, rcu_seq_current(&rtp->tasks_gp_seq));
+ rcu_segcblist_extract_done_cbs(&rtpcp->cblist, &rcl);
+ raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
+ len = rcl.len;
+ for (rhp = rcu_cblist_dequeue(&rcl); rhp; rhp = rcu_cblist_dequeue(&rcl)) {
+ local_bh_disable();
+ rhp->func(rhp);
+ local_bh_enable();
+ cond_resched();
+ }
+ raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
+ rcu_segcblist_add_len(&rtpcp->cblist, -len);
+ (void)rcu_segcblist_accelerate(&rtpcp->cblist, rcu_seq_snap(&rtp->tasks_gp_seq));
+ raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
+}
+
+// Workqueue flood to advance callbacks and invoke any that are ready.
+static void rcu_tasks_invoke_cbs_wq(struct work_struct *wp)
+{
+ struct rcu_tasks *rtp;
+ struct rcu_tasks_percpu *rtpcp = container_of(wp, struct rcu_tasks_percpu, rtp_work);
+
+ rtp = rtpcp->rtpp;
+ rcu_tasks_invoke_cbs(rtp, rtpcp);
+}
+
/* RCU-tasks kthread that detects grace periods and invokes callbacks. */
static int __noreturn rcu_tasks_kthread(void *arg)
{
- unsigned long flags;
- struct rcu_head *list;
- struct rcu_head *next;
+ int needgpcb;
struct rcu_tasks *rtp = arg;
/* Run on housekeeping CPUs by default. Sysadm can move if desired. */
@@ -197,47 +502,26 @@ static int __noreturn rcu_tasks_kthread(void *arg)
* This loop is terminated by the system going down. ;-)
*/
for (;;) {
-
- /* Pick up any new callbacks. */
- raw_spin_lock_irqsave(&rtp->cbs_lock, flags);
- smp_mb__after_spinlock(); // Order updates vs. GP.
- list = rtp->cbs_head;
- rtp->cbs_head = NULL;
- rtp->cbs_tail = &rtp->cbs_head;
- raw_spin_unlock_irqrestore(&rtp->cbs_lock, flags);
+ set_tasks_gp_state(rtp, RTGS_WAIT_CBS);
/* If there were none, wait a bit and start over. */
- if (!list) {
- wait_event_interruptible(rtp->cbs_wq,
- READ_ONCE(rtp->cbs_head));
- if (!rtp->cbs_head) {
- WARN_ON(signal_pending(current));
- set_tasks_gp_state(rtp, RTGS_WAIT_WAIT_CBS);
- schedule_timeout_idle(HZ/10);
- }
- continue;
+ wait_event_idle(rtp->cbs_wq, (needgpcb = rcu_tasks_need_gpcb(rtp)));
+
+ if (needgpcb & 0x2) {
+ // Wait for one grace period.
+ set_tasks_gp_state(rtp, RTGS_WAIT_GP);
+ rtp->gp_start = jiffies;
+ rcu_seq_start(&rtp->tasks_gp_seq);
+ rtp->gp_func(rtp);
+ rcu_seq_end(&rtp->tasks_gp_seq);
}
- // Wait for one grace period.
- set_tasks_gp_state(rtp, RTGS_WAIT_GP);
- rtp->gp_start = jiffies;
- rtp->gp_func(rtp);
- rtp->n_gps++;
-
- /* Invoke the callbacks. */
+ /* Invoke callbacks. */
set_tasks_gp_state(rtp, RTGS_INVOKE_CBS);
- while (list) {
- next = list->next;
- local_bh_disable();
- list->func(list);
- local_bh_enable();
- list = next;
- cond_resched();
- }
+ rcu_tasks_invoke_cbs(rtp, per_cpu_ptr(rtp->rtpcpu, 0));
+
/* Paranoid sleep to keep this from entering a tight loop */
schedule_timeout_idle(rtp->gp_sleep);
-
- set_tasks_gp_state(rtp, RTGS_WAIT_CBS);
}
}
@@ -280,14 +564,15 @@ static void __init rcu_tasks_bootup_oddness(void)
/* Dump out rcutorture-relevant state common to all RCU-tasks flavors. */
static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s)
{
+ struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, 0); // for_each...
pr_info("%s: %s(%d) since %lu g:%lu i:%lu/%lu %c%c %s\n",
rtp->kname,
tasks_gp_state_getname(rtp), data_race(rtp->gp_state),
jiffies - data_race(rtp->gp_jiffies),
- data_race(rtp->n_gps),
+ data_race(rcu_seq_current(&rtp->tasks_gp_seq)),
data_race(rtp->n_ipis_fails), data_race(rtp->n_ipis),
".k"[!!data_race(rtp->kthread_ptr)],
- ".C"[!!data_race(rtp->cbs_head)],
+ ".C"[!data_race(rcu_segcblist_empty(&rtpcp->cblist))],
s);
}
#endif // #ifndef CONFIG_TINY_RCU
@@ -369,7 +654,7 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp)
////////////////////////////////////////////////////////////////////////
//
// Simple variant of RCU whose quiescent states are voluntary context
-// switch, cond_resched_rcu_qs(), user-space execution, and idle.
+// switch, cond_resched_tasks_rcu_qs(), user-space execution, and idle.
// As such, grace periods can take one good long time. There are no
// read-side primitives similar to rcu_read_lock() and rcu_read_unlock()
// because this implementation is intended to get the system into a safe
@@ -412,10 +697,10 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp)
// exit_tasks_rcu_finish() functions begin and end, respectively, the SRCU
// read-side critical sections waited for by rcu_tasks_postscan().
//
-// Pre-grace-period update-side code is ordered before the grace via the
-// ->cbs_lock and the smp_mb__after_spinlock(). Pre-grace-period read-side
-// code is ordered before the grace period via synchronize_rcu() call
-// in rcu_tasks_pregp_step() and by the scheduler's locks and interrupt
+// Pre-grace-period update-side code is ordered before the grace
+// via the raw_spin_lock.*rcu_node(). Pre-grace-period read-side code
+// is ordered before the grace period via synchronize_rcu() call in
+// rcu_tasks_pregp_step() and by the scheduler's locks and interrupt
// disabling.
/* Pre-grace-period preparation. */
@@ -540,7 +825,7 @@ DEFINE_RCU_TASKS(rcu_tasks, rcu_tasks_wait_gp, call_rcu_tasks, "RCU Tasks");
* period elapses, in other words after all currently executing RCU
* read-side critical sections have completed. call_rcu_tasks() assumes
* that the read-side critical sections end at a voluntary context
- * switch (not a preemption!), cond_resched_rcu_qs(), entry into idle,
+ * switch (not a preemption!), cond_resched_tasks_rcu_qs(), entry into idle,
* or transition to usermode execution. As such, there are no read-side
* primitives analogous to rcu_read_lock() and rcu_read_unlock() because
* this primitive is intended to determine that all tasks have passed
@@ -587,13 +872,13 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_tasks);
*/
void rcu_barrier_tasks(void)
{
- /* There is only one callback queue, so this is easy. ;-) */
- synchronize_rcu_tasks();
+ rcu_barrier_tasks_generic(&rcu_tasks);
}
EXPORT_SYMBOL_GPL(rcu_barrier_tasks);
static int __init rcu_spawn_tasks_kthread(void)
{
+ cblist_init_generic(&rcu_tasks);
rcu_tasks.gp_sleep = HZ / 10;
rcu_tasks.init_fract = HZ / 10;
rcu_tasks.pregp_func = rcu_tasks_pregp_step;
@@ -678,11 +963,11 @@ DEFINE_RCU_TASKS(rcu_tasks_rude, rcu_tasks_rude_wait_gp, call_rcu_tasks_rude,
* period elapses, in other words after all currently executing RCU
* read-side critical sections have completed. call_rcu_tasks_rude()
* assumes that the read-side critical sections end at context switch,
- * cond_resched_rcu_qs(), or transition to usermode execution. As such,
- * there are no read-side primitives analogous to rcu_read_lock() and
- * rcu_read_unlock() because this primitive is intended to determine
- * that all tasks have passed through a safe state, not so much for
- * data-structure synchronization.
+ * cond_resched_tasks_rcu_qs(), or transition to usermode execution (as
+ * usermode execution is schedulable). As such, there are no read-side
+ * primitives analogous to rcu_read_lock() and rcu_read_unlock() because
+ * this primitive is intended to determine that all tasks have passed
+ * through a safe state, not so much for data-structure synchronization.
*
* See the description of call_rcu() for more detailed information on
* memory ordering guarantees.
@@ -700,8 +985,8 @@ EXPORT_SYMBOL_GPL(call_rcu_tasks_rude);
* grace period has elapsed, in other words after all currently
* executing rcu-tasks read-side critical sections have elapsed. These
* read-side critical sections are delimited by calls to schedule(),
- * cond_resched_tasks_rcu_qs(), userspace execution, and (in theory,
- * anyway) cond_resched().
+ * cond_resched_tasks_rcu_qs(), userspace execution (which is a schedulable
+ * context), and (in theory, anyway) cond_resched().
*
* This is a very specialized primitive, intended only for a few uses in
* tracing and other situations requiring manipulation of function preambles
@@ -725,13 +1010,13 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_tasks_rude);
*/
void rcu_barrier_tasks_rude(void)
{
- /* There is only one callback queue, so this is easy. ;-) */
- synchronize_rcu_tasks_rude();
+ rcu_barrier_tasks_generic(&rcu_tasks_rude);
}
EXPORT_SYMBOL_GPL(rcu_barrier_tasks_rude);
static int __init rcu_spawn_tasks_rude_kthread(void)
{
+ cblist_init_generic(&rcu_tasks_rude);
rcu_tasks_rude.gp_sleep = HZ / 10;
rcu_spawn_tasks_kthread_generic(&rcu_tasks_rude);
return 0;
@@ -758,7 +1043,7 @@ EXPORT_SYMBOL_GPL(show_rcu_tasks_rude_gp_kthread);
// 2. Protects code in the idle loop, exception entry/exit, and
// CPU-hotplug code paths, similar to the capabilities of SRCU.
//
-// 3. Avoids expensive read-side instruction, having overhead similar
+// 3. Avoids expensive read-side instructions, having overhead similar
// to that of Preemptible RCU.
//
// There are of course downsides. The grace-period code can send IPIs to
@@ -848,7 +1133,7 @@ static void rcu_read_unlock_iw(struct irq_work *iwp)
static DEFINE_IRQ_WORK(rcu_tasks_trace_iw, rcu_read_unlock_iw);
/* If we are the last reader, wake up the grace-period kthread. */
-void rcu_read_unlock_trace_special(struct task_struct *t, int nesting)
+void rcu_read_unlock_trace_special(struct task_struct *t)
{
int nq = READ_ONCE(t->trc_reader_special.b.need_qs);
@@ -858,7 +1143,7 @@ void rcu_read_unlock_trace_special(struct task_struct *t, int nesting)
// Update .need_qs before ->trc_reader_nesting for irq/NMI handlers.
if (nq)
WRITE_ONCE(t->trc_reader_special.b.need_qs, false);
- WRITE_ONCE(t->trc_reader_nesting, nesting);
+ WRITE_ONCE(t->trc_reader_nesting, 0);
if (nq && atomic_dec_and_test(&trc_n_readers_need_end))
irq_work_queue(&rcu_tasks_trace_iw);
}
@@ -890,32 +1175,24 @@ static void trc_read_check_handler(void *t_in)
// If the task is no longer running on this CPU, leave.
if (unlikely(texp != t)) {
- if (WARN_ON_ONCE(atomic_dec_and_test(&trc_n_readers_need_end)))
- wake_up(&trc_wait);
goto reset_ipi; // Already on holdout list, so will check later.
}
// If the task is not in a read-side critical section, and
// if this is the last reader, awaken the grace-period kthread.
if (likely(!READ_ONCE(t->trc_reader_nesting))) {
- if (WARN_ON_ONCE(atomic_dec_and_test(&trc_n_readers_need_end)))
- wake_up(&trc_wait);
- // Mark as checked after decrement to avoid false
- // positives on the above WARN_ON_ONCE().
WRITE_ONCE(t->trc_reader_checked, true);
goto reset_ipi;
}
// If we are racing with an rcu_read_unlock_trace(), try again later.
- if (unlikely(READ_ONCE(t->trc_reader_nesting) < 0)) {
- if (WARN_ON_ONCE(atomic_dec_and_test(&trc_n_readers_need_end)))
- wake_up(&trc_wait);
+ if (unlikely(READ_ONCE(t->trc_reader_nesting) < 0))
goto reset_ipi;
- }
WRITE_ONCE(t->trc_reader_checked, true);
// Get here if the task is in a read-side critical section. Set
// its state so that it will awaken the grace-period kthread upon
// exit from that critical section.
+ atomic_inc(&trc_n_readers_need_end); // One more to wait on.
WARN_ON_ONCE(READ_ONCE(t->trc_reader_special.b.need_qs));
WRITE_ONCE(t->trc_reader_special.b.need_qs, true);
@@ -928,10 +1205,10 @@ reset_ipi:
}
/* Callback function for scheduler to check locked-down task. */
-static bool trc_inspect_reader(struct task_struct *t, void *arg)
+static int trc_inspect_reader(struct task_struct *t, void *arg)
{
int cpu = task_cpu(t);
- bool in_qs = false;
+ int nesting;
bool ofl = cpu_is_offline(cpu);
if (task_curr(t)) {
@@ -939,7 +1216,7 @@ static bool trc_inspect_reader(struct task_struct *t, void *arg)
// If no chance of heavyweight readers, do it the hard way.
if (!ofl && !IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB))
- return false;
+ return -EINVAL;
// If heavyweight readers are enabled on the remote task,
// we can inspect its state despite its currently running.
@@ -947,22 +1224,22 @@ static bool trc_inspect_reader(struct task_struct *t, void *arg)
n_heavy_reader_attempts++;
if (!ofl && // Check for "running" idle tasks on offline CPUs.
!rcu_dynticks_zero_in_eqs(cpu, &t->trc_reader_nesting))
- return false; // No quiescent state, do it the hard way.
+ return -EINVAL; // No quiescent state, do it the hard way.
n_heavy_reader_updates++;
if (ofl)
n_heavy_reader_ofl_updates++;
- in_qs = true;
+ nesting = 0;
} else {
// The task is not running, so C-language access is safe.
- in_qs = likely(!t->trc_reader_nesting);
+ nesting = t->trc_reader_nesting;
}
- // Mark as checked so that the grace-period kthread will
- // remove it from the holdout list.
- t->trc_reader_checked = true;
-
- if (in_qs)
- return true; // Already in quiescent state, done!!!
+ // If not exiting a read-side critical section, mark as checked
+ // so that the grace-period kthread will remove it from the
+ // holdout list.
+ t->trc_reader_checked = nesting >= 0;
+ if (nesting <= 0)
+ return nesting ? -EINVAL : 0; // If in QS, done, otherwise try again later.
// The task is in a read-side critical section, so set up its
// state so that it will awaken the grace-period kthread upon exit
@@ -970,7 +1247,7 @@ static bool trc_inspect_reader(struct task_struct *t, void *arg)
atomic_inc(&trc_n_readers_need_end); // One more to wait on.
WARN_ON_ONCE(READ_ONCE(t->trc_reader_special.b.need_qs));
WRITE_ONCE(t->trc_reader_special.b.need_qs, true);
- return true;
+ return 0;
}
/* Attempt to extract the state for the specified task. */
@@ -992,7 +1269,7 @@ static void trc_wait_for_one_reader(struct task_struct *t,
// Attempt to nail down the task for inspection.
get_task_struct(t);
- if (try_invoke_on_locked_down_task(t, trc_inspect_reader, NULL)) {
+ if (!task_call_func(t, trc_inspect_reader, NULL)) {
put_task_struct(t);
return;
}
@@ -1000,7 +1277,7 @@ static void trc_wait_for_one_reader(struct task_struct *t,
// If this task is not yet on the holdout list, then we are in
// an RCU read-side critical section. Otherwise, the invocation of
- // rcu_add_holdout() that added it to the list did the necessary
+ // trc_add_holdout() that added it to the list did the necessary
// get_task_struct(). Either way, the task cannot be freed out
// from under this code.
@@ -1015,21 +1292,17 @@ static void trc_wait_for_one_reader(struct task_struct *t,
if (per_cpu(trc_ipi_to_cpu, cpu) || t->trc_ipi_to_cpu >= 0)
return;
- atomic_inc(&trc_n_readers_need_end);
per_cpu(trc_ipi_to_cpu, cpu) = true;
t->trc_ipi_to_cpu = cpu;
rcu_tasks_trace.n_ipis++;
- if (smp_call_function_single(cpu,
- trc_read_check_handler, t, 0)) {
+ if (smp_call_function_single(cpu, trc_read_check_handler, t, 0)) {
// Just in case there is some other reason for
// failure than the target CPU being offline.
+ WARN_ONCE(1, "%s(): smp_call_function_single() failed for CPU: %d\n",
+ __func__, cpu);
rcu_tasks_trace.n_ipis_fails++;
per_cpu(trc_ipi_to_cpu, cpu) = false;
- t->trc_ipi_to_cpu = cpu;
- if (atomic_dec_and_test(&trc_n_readers_need_end)) {
- WARN_ON_ONCE(1);
- wake_up(&trc_wait);
- }
+ t->trc_ipi_to_cpu = -1;
}
}
}
@@ -1086,25 +1359,50 @@ static void rcu_tasks_trace_postscan(struct list_head *hop)
// Any tasks that exit after this point will set ->trc_reader_checked.
}
+/* Communicate task state back to the RCU tasks trace stall warning request. */
+struct trc_stall_chk_rdr {
+ int nesting;
+ int ipi_to_cpu;
+ u8 needqs;
+};
+
+static int trc_check_slow_task(struct task_struct *t, void *arg)
+{
+ struct trc_stall_chk_rdr *trc_rdrp = arg;
+
+ if (task_curr(t))
+ return false; // It is running, so decline to inspect it.
+ trc_rdrp->nesting = READ_ONCE(t->trc_reader_nesting);
+ trc_rdrp->ipi_to_cpu = READ_ONCE(t->trc_ipi_to_cpu);
+ trc_rdrp->needqs = READ_ONCE(t->trc_reader_special.b.need_qs);
+ return true;
+}
+
/* Show the state of a task stalling the current RCU tasks trace GP. */
static void show_stalled_task_trace(struct task_struct *t, bool *firstreport)
{
int cpu;
+ struct trc_stall_chk_rdr trc_rdr;
+ bool is_idle_tsk = is_idle_task(t);
if (*firstreport) {
pr_err("INFO: rcu_tasks_trace detected stalls on tasks:\n");
*firstreport = false;
}
- // FIXME: This should attempt to use try_invoke_on_nonrunning_task().
cpu = task_cpu(t);
- pr_alert("P%d: %c%c%c nesting: %d%c cpu: %d\n",
- t->pid,
- ".I"[READ_ONCE(t->trc_ipi_to_cpu) > 0],
- ".i"[is_idle_task(t)],
- ".N"[cpu > 0 && tick_nohz_full_cpu(cpu)],
- READ_ONCE(t->trc_reader_nesting),
- " N"[!!READ_ONCE(t->trc_reader_special.b.need_qs)],
- cpu);
+ if (!task_call_func(t, trc_check_slow_task, &trc_rdr))
+ pr_alert("P%d: %c\n",
+ t->pid,
+ ".i"[is_idle_tsk]);
+ else
+ pr_alert("P%d: %c%c%c nesting: %d%c cpu: %d\n",
+ t->pid,
+ ".I"[trc_rdr.ipi_to_cpu >= 0],
+ ".i"[is_idle_tsk],
+ ".N"[cpu >= 0 && tick_nohz_full_cpu(cpu)],
+ trc_rdr.nesting,
+ " N"[!!trc_rdr.needqs],
+ cpu);
sched_show_task(t);
}
@@ -1134,7 +1432,8 @@ static void check_all_holdout_tasks_trace(struct list_head *hop,
trc_wait_for_one_reader(t, hop);
// If check succeeded, remove this task from the list.
- if (READ_ONCE(t->trc_reader_checked))
+ if (smp_load_acquire(&t->trc_ipi_to_cpu) == -1 &&
+ READ_ONCE(t->trc_reader_checked))
trc_del_holdout(t);
else if (needreport)
show_stalled_task_trace(t, firstreport);
@@ -1144,20 +1443,34 @@ static void check_all_holdout_tasks_trace(struct list_head *hop,
cpus_read_unlock();
if (needreport) {
- if (firstreport)
+ if (*firstreport)
pr_err("INFO: rcu_tasks_trace detected stalls? (Late IPI?)\n");
show_stalled_ipi_trace();
}
}
+static void rcu_tasks_trace_empty_fn(void *unused)
+{
+}
+
/* Wait for grace period to complete and provide ordering. */
static void rcu_tasks_trace_postgp(struct rcu_tasks *rtp)
{
+ int cpu;
bool firstreport;
struct task_struct *g, *t;
LIST_HEAD(holdouts);
long ret;
+ // Wait for any lingering IPI handlers to complete. Note that
+ // if a CPU has gone offline or transitioned to userspace in the
+ // meantime, all IPI handlers should have been drained beforehand.
+ // Yes, this assumes that CPUs process IPIs in order. If that ever
+ // changes, there will need to be a recheck and/or timed wait.
+ for_each_online_cpu(cpu)
+ if (WARN_ON_ONCE(smp_load_acquire(per_cpu_ptr(&trc_ipi_to_cpu, cpu))))
+ smp_call_function_single(cpu, rcu_tasks_trace_empty_fn, NULL, 1);
+
// Remove the safety count.
smp_mb__before_atomic(); // Order vs. earlier atomics
atomic_dec(&trc_n_readers_need_end);
@@ -1200,7 +1513,7 @@ static void exit_tasks_rcu_finish_trace(struct task_struct *t)
WARN_ON_ONCE(READ_ONCE(t->trc_reader_nesting));
WRITE_ONCE(t->trc_reader_nesting, 0);
if (WARN_ON_ONCE(READ_ONCE(t->trc_reader_special.b.need_qs)))
- rcu_read_unlock_trace_special(t, 0);
+ rcu_read_unlock_trace_special(t);
}
/**
@@ -1208,15 +1521,11 @@ static void exit_tasks_rcu_finish_trace(struct task_struct *t)
* @rhp: structure to be used for queueing the RCU updates.
* @func: actual callback function to be invoked after the grace period
*
- * The callback function will be invoked some time after a full grace
- * period elapses, in other words after all currently executing RCU
- * read-side critical sections have completed. call_rcu_tasks_trace()
- * assumes that the read-side critical sections end at context switch,
- * cond_resched_rcu_qs(), or transition to usermode execution. As such,
- * there are no read-side primitives analogous to rcu_read_lock() and
- * rcu_read_unlock() because this primitive is intended to determine
- * that all tasks have passed through a safe state, not so much for
- * data-structure synchronization.
+ * The callback function will be invoked some time after a trace rcu-tasks
+ * grace period elapses, in other words after all currently executing
+ * trace rcu-tasks read-side critical sections have completed. These
+ * read-side critical sections are delimited by calls to rcu_read_lock_trace()
+ * and rcu_read_unlock_trace().
*
* See the description of call_rcu() for more detailed information on
* memory ordering guarantees.
@@ -1232,7 +1541,7 @@ EXPORT_SYMBOL_GPL(call_rcu_tasks_trace);
*
* Control will return to the caller some time after a trace rcu-tasks
* grace period has elapsed, in other words after all currently executing
- * rcu-tasks read-side critical sections have elapsed. These read-side
+ * trace rcu-tasks read-side critical sections have elapsed. These read-side
* critical sections are delimited by calls to rcu_read_lock_trace()
* and rcu_read_unlock_trace().
*
@@ -1259,13 +1568,13 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_tasks_trace);
*/
void rcu_barrier_tasks_trace(void)
{
- /* There is only one callback queue, so this is easy. ;-) */
- synchronize_rcu_tasks_trace();
+ rcu_barrier_tasks_generic(&rcu_tasks_trace);
}
EXPORT_SYMBOL_GPL(rcu_barrier_tasks_trace);
static int __init rcu_spawn_tasks_trace_kthread(void)
{
+ cblist_init_generic(&rcu_tasks_trace);
if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) {
rcu_tasks_trace.gp_sleep = HZ / 10;
rcu_tasks_trace.init_fract = HZ / 10;
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index bce848e50512..a4c25a6283b0 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -79,7 +79,7 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = {
.dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE,
.dynticks = ATOMIC_INIT(1),
#ifdef CONFIG_RCU_NOCB_CPU
- .cblist.flags = SEGCBLIST_SOFTIRQ_ONLY,
+ .cblist.flags = SEGCBLIST_RCU_CORE,
#endif
};
static struct rcu_state rcu_state = {
@@ -327,7 +327,7 @@ static void rcu_dynticks_eqs_online(void)
*/
static __always_inline bool rcu_dynticks_curr_cpu_in_eqs(void)
{
- return !(atomic_read(this_cpu_ptr(&rcu_data.dynticks)) & 0x1);
+ return !(arch_atomic_read(this_cpu_ptr(&rcu_data.dynticks)) & 0x1);
}
/*
@@ -624,7 +624,6 @@ static noinstr void rcu_eqs_enter(bool user)
instrumentation_begin();
trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, atomic_read(&rdp->dynticks));
WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
- rcu_prepare_for_idle();
rcu_preempt_deferred_qs(current);
// instrumentation for the noinstr rcu_dynticks_eqs_enter()
@@ -768,9 +767,6 @@ noinstr void rcu_nmi_exit(void)
trace_rcu_dyntick(TPS("Startirq"), rdp->dynticks_nmi_nesting, 0, atomic_read(&rdp->dynticks));
WRITE_ONCE(rdp->dynticks_nmi_nesting, 0); /* Avoid store tearing. */
- if (!in_nmi())
- rcu_prepare_for_idle();
-
// instrumentation for the noinstr rcu_dynticks_eqs_enter()
instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks));
instrumentation_end();
@@ -872,7 +868,6 @@ static void noinstr rcu_eqs_exit(bool user)
// instrumentation for the noinstr rcu_dynticks_eqs_exit()
instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks));
- rcu_cleanup_after_idle();
trace_rcu_dyntick(TPS("End"), rdp->dynticks_nesting, 1, atomic_read(&rdp->dynticks));
WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
WRITE_ONCE(rdp->dynticks_nesting, 1);
@@ -1014,12 +1009,6 @@ noinstr void rcu_nmi_enter(void)
rcu_dynticks_eqs_exit();
// ... but is watching here.
- if (!in_nmi()) {
- instrumentation_begin();
- rcu_cleanup_after_idle();
- instrumentation_end();
- }
-
instrumentation_begin();
// instrumentation for the noinstr rcu_dynticks_curr_cpu_in_eqs()
instrument_atomic_read(&rdp->dynticks, sizeof(rdp->dynticks));
@@ -1087,6 +1076,24 @@ void rcu_irq_enter_irqson(void)
}
/*
+ * Check to see if any future non-offloaded RCU-related work will need
+ * to be done by the current CPU, even if none need be done immediately,
+ * returning 1 if so. This function is part of the RCU implementation;
+ * it is -not- an exported member of the RCU API. This is used by
+ * the idle-entry code to figure out whether it is safe to disable the
+ * scheduler-clock interrupt.
+ *
+ * Just check whether or not this CPU has non-offloaded RCU callbacks
+ * queued.
+ */
+int rcu_needs_cpu(u64 basemono, u64 *nextevt)
+{
+ *nextevt = KTIME_MAX;
+ return !rcu_segcblist_empty(&this_cpu_ptr(&rcu_data)->cblist) &&
+ !rcu_rdp_is_offloaded(this_cpu_ptr(&rcu_data));
+}
+
+/*
* If any sort of urgency was applied to the current CPU (for example,
* the scheduler-clock interrupt was enabled on a nohz_full CPU) in order
* to get to a quiescent state, disable it.
@@ -1219,8 +1226,6 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp)
static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
{
unsigned long jtsq;
- bool *rnhqp;
- bool *ruqp;
struct rcu_node *rnp = rdp->mynode;
/*
@@ -1285,17 +1290,15 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
* is set way high.
*/
jtsq = READ_ONCE(jiffies_to_sched_qs);
- ruqp = per_cpu_ptr(&rcu_data.rcu_urgent_qs, rdp->cpu);
- rnhqp = per_cpu_ptr(&rcu_data.rcu_need_heavy_qs, rdp->cpu);
- if (!READ_ONCE(*rnhqp) &&
+ if (!READ_ONCE(rdp->rcu_need_heavy_qs) &&
(time_after(jiffies, rcu_state.gp_start + jtsq * 2) ||
time_after(jiffies, rcu_state.jiffies_resched) ||
rcu_state.cbovld)) {
- WRITE_ONCE(*rnhqp, true);
+ WRITE_ONCE(rdp->rcu_need_heavy_qs, true);
/* Store rcu_need_heavy_qs before rcu_urgent_qs. */
- smp_store_release(ruqp, true);
+ smp_store_release(&rdp->rcu_urgent_qs, true);
} else if (time_after(jiffies, rcu_state.gp_start + jtsq)) {
- WRITE_ONCE(*ruqp, true);
+ WRITE_ONCE(rdp->rcu_urgent_qs, true);
}
/*
@@ -1309,7 +1312,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
if (tick_nohz_full_cpu(rdp->cpu) &&
(time_after(jiffies, READ_ONCE(rdp->last_fqs_resched) + jtsq * 3) ||
rcu_state.cbovld)) {
- WRITE_ONCE(*ruqp, true);
+ WRITE_ONCE(rdp->rcu_urgent_qs, true);
resched_cpu(rdp->cpu);
WRITE_ONCE(rdp->last_fqs_resched, jiffies);
}
@@ -1471,7 +1474,7 @@ static void rcu_gp_kthread_wake(void)
{
struct task_struct *t = READ_ONCE(rcu_state.gp_kthread);
- if ((current == t && !in_irq() && !in_serving_softirq()) ||
+ if ((current == t && !in_hardirq() && !in_serving_softirq()) ||
!READ_ONCE(rcu_state.gp_flags) || !t)
return;
WRITE_ONCE(rcu_state.gp_wake_time, jiffies);
@@ -1594,10 +1597,11 @@ static void __maybe_unused rcu_advance_cbs_nowake(struct rcu_node *rnp,
struct rcu_data *rdp)
{
rcu_lockdep_assert_cblist_protected(rdp);
- if (!rcu_seq_state(rcu_seq_current(&rnp->gp_seq)) ||
- !raw_spin_trylock_rcu_node(rnp))
+ if (!rcu_seq_state(rcu_seq_current(&rnp->gp_seq)) || !raw_spin_trylock_rcu_node(rnp))
return;
- WARN_ON_ONCE(rcu_advance_cbs(rnp, rdp));
+ // The grace period cannot end while we hold the rcu_node lock.
+ if (rcu_seq_state(rcu_seq_current(&rnp->gp_seq)))
+ WARN_ON_ONCE(rcu_advance_cbs(rnp, rdp));
raw_spin_unlock_rcu_node(rnp);
}
@@ -1779,6 +1783,8 @@ static noinline_for_stack bool rcu_gp_init(void)
*/
WRITE_ONCE(rcu_state.gp_state, RCU_GP_ONOFF);
rcu_for_each_leaf_node(rnp) {
+ // Wait for CPU-hotplug operations that might have
+ // started before this grace period did.
smp_mb(); // Pair with barriers used when updating ->ofl_seq to odd values.
firstseq = READ_ONCE(rnp->ofl_seq);
if (firstseq & 0x1)
@@ -1907,7 +1913,7 @@ static void rcu_gp_fqs(bool first_time)
struct rcu_node *rnp = rcu_get_root();
WRITE_ONCE(rcu_state.gp_activity, jiffies);
- rcu_state.n_force_qs++;
+ WRITE_ONCE(rcu_state.n_force_qs, rcu_state.n_force_qs + 1);
if (first_time) {
/* Collect dyntick-idle snapshots. */
force_qs_rnp(dyntick_save_progress_counter);
@@ -2279,7 +2285,7 @@ rcu_report_qs_rdp(struct rcu_data *rdp)
unsigned long flags;
unsigned long mask;
bool needwake = false;
- const bool offloaded = rcu_rdp_is_offloaded(rdp);
+ bool needacc = false;
struct rcu_node *rnp;
WARN_ON_ONCE(rdp->cpu != smp_processor_id());
@@ -2306,15 +2312,30 @@ rcu_report_qs_rdp(struct rcu_data *rdp)
/*
* This GP can't end until cpu checks in, so all of our
* callbacks can be processed during the next GP.
+ *
+ * NOCB kthreads have their own way to deal with that...
*/
- if (!offloaded)
+ if (!rcu_rdp_is_offloaded(rdp)) {
needwake = rcu_accelerate_cbs(rnp, rdp);
+ } else if (!rcu_segcblist_completely_offloaded(&rdp->cblist)) {
+ /*
+ * ...but NOCB kthreads may miss or delay callbacks acceleration
+ * if in the middle of a (de-)offloading process.
+ */
+ needacc = true;
+ }
rcu_disable_urgency_upon_qs(rdp);
rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
/* ^^^ Released rnp->lock */
if (needwake)
rcu_gp_kthread_wake();
+
+ if (needacc) {
+ rcu_nocb_lock_irqsave(rdp, flags);
+ rcu_accelerate_cbs_unlocked(rnp, rdp);
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ }
}
}
@@ -2358,7 +2379,7 @@ rcu_check_quiescent_state(struct rcu_data *rdp)
int rcutree_dying_cpu(unsigned int cpu)
{
bool blkd;
- struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
struct rcu_node *rnp = rdp->mynode;
if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
@@ -2446,7 +2467,6 @@ static void rcu_do_batch(struct rcu_data *rdp)
int div;
bool __maybe_unused empty;
unsigned long flags;
- const bool offloaded = rcu_rdp_is_offloaded(rdp);
struct rcu_head *rhp;
struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl);
long bl, count = 0;
@@ -2464,18 +2484,17 @@ static void rcu_do_batch(struct rcu_data *rdp)
}
/*
- * Extract the list of ready callbacks, disabling to prevent
+ * Extract the list of ready callbacks, disabling IRQs to prevent
* races with call_rcu() from interrupt handlers. Leave the
* callback counts, as rcu_barrier() needs to be conservative.
*/
- local_irq_save(flags);
- rcu_nocb_lock(rdp);
+ rcu_nocb_lock_irqsave(rdp, flags);
WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
pending = rcu_segcblist_n_cbs(&rdp->cblist);
div = READ_ONCE(rcu_divisor);
div = div < 0 ? 7 : div > sizeof(long) * 8 - 2 ? sizeof(long) * 8 - 2 : div;
bl = max(rdp->blimit, pending >> div);
- if (unlikely(bl > 100)) {
+ if (in_serving_softirq() && unlikely(bl > 100)) {
long rrn = READ_ONCE(rcu_resched_ns);
rrn = rrn < NSEC_PER_MSEC ? NSEC_PER_MSEC : rrn > NSEC_PER_SEC ? NSEC_PER_SEC : rrn;
@@ -2484,7 +2503,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
trace_rcu_batch_start(rcu_state.name,
rcu_segcblist_n_cbs(&rdp->cblist), bl);
rcu_segcblist_extract_done_cbs(&rdp->cblist, &rcl);
- if (offloaded)
+ if (rcu_rdp_is_offloaded(rdp))
rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist);
trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCbDequeued"));
@@ -2512,18 +2531,21 @@ static void rcu_do_batch(struct rcu_data *rdp)
/*
* Stop only if limit reached and CPU has something to do.
*/
- if (count >= bl && !offloaded &&
- (need_resched() ||
- (!is_idle_task(current) && !rcu_is_callbacks_kthread())))
- break;
- if (unlikely(tlimit)) {
- /* only call local_clock() every 32 callbacks */
- if (likely((count & 31) || local_clock() < tlimit))
- continue;
- /* Exceeded the time limit, so leave. */
- break;
- }
- if (!in_serving_softirq()) {
+ if (in_serving_softirq()) {
+ if (count >= bl && (need_resched() || !is_idle_task(current)))
+ break;
+ /*
+ * Make sure we don't spend too much time here and deprive other
+ * softirq vectors of CPU cycles.
+ */
+ if (unlikely(tlimit)) {
+ /* only call local_clock() every 32 callbacks */
+ if (likely((count & 31) || local_clock() < tlimit))
+ continue;
+ /* Exceeded the time limit, so leave. */
+ break;
+ }
+ } else {
local_bh_enable();
lockdep_assert_irqs_enabled();
cond_resched_tasks_rcu_qs();
@@ -2532,8 +2554,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
}
}
- local_irq_save(flags);
- rcu_nocb_lock(rdp);
+ rcu_nocb_lock_irqsave(rdp, flags);
rdp->n_cbs_invoked += count;
trace_rcu_batch_end(rcu_state.name, count, !!rcl.head, need_resched(),
is_idle_task(current), rcu_is_callbacks_kthread());
@@ -2550,7 +2571,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
/* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */
if (count == 0 && rdp->qlen_last_fqs_check != 0) {
rdp->qlen_last_fqs_check = 0;
- rdp->n_force_qs_snap = rcu_state.n_force_qs;
+ rdp->n_force_qs_snap = READ_ONCE(rcu_state.n_force_qs);
} else if (count < rdp->qlen_last_fqs_check - qhimark)
rdp->qlen_last_fqs_check = count;
@@ -2567,9 +2588,6 @@ static void rcu_do_batch(struct rcu_data *rdp)
rcu_nocb_unlock_irqrestore(rdp, flags);
- /* Re-invoke RCU core processing if there are callbacks remaining. */
- if (!offloaded && rcu_segcblist_ready_cbs(&rdp->cblist))
- invoke_rcu_core();
tick_dep_clear_task(current, TICK_DEP_BIT_RCU);
}
@@ -2708,6 +2726,23 @@ static __latent_entropy void rcu_core(void)
unsigned long flags;
struct rcu_data *rdp = raw_cpu_ptr(&rcu_data);
struct rcu_node *rnp = rdp->mynode;
+ /*
+ * On RT rcu_core() can be preempted when IRQs aren't disabled.
+ * Therefore this function can race with concurrent NOCB (de-)offloading
+ * on this CPU and the below condition must be considered volatile.
+ * However if we race with:
+ *
+ * _ Offloading: In the worst case we accelerate or process callbacks
+ * concurrently with NOCB kthreads. We are guaranteed to
+ * call rcu_nocb_lock() if that happens.
+ *
+ * _ Deoffloading: In the worst case we miss callbacks acceleration or
+ * processing. This is fine because the early stage
+ * of deoffloading invokes rcu_core() after setting
+ * SEGCBLIST_RCU_CORE. So we guarantee that we'll process
+ * what could have been dismissed without the need to wait
+ * for the next rcu_pending() check in the next jiffy.
+ */
const bool do_batch = !rcu_segcblist_completely_offloaded(&rdp->cblist);
if (cpu_is_offline(smp_processor_id()))
@@ -2716,7 +2751,7 @@ static __latent_entropy void rcu_core(void)
WARN_ON_ONCE(!rdp->beenonline);
/* Report any deferred quiescent states if preemption enabled. */
- if (!(preempt_count() & PREEMPT_MASK)) {
+ if (IS_ENABLED(CONFIG_PREEMPT_COUNT) && (!(preempt_count() & PREEMPT_MASK))) {
rcu_preempt_deferred_qs(current);
} else if (rcu_preempt_need_deferred_qs(current)) {
set_tsk_need_resched(current);
@@ -2739,8 +2774,12 @@ static __latent_entropy void rcu_core(void)
/* If there are callbacks ready, invoke them. */
if (do_batch && rcu_segcblist_ready_cbs(&rdp->cblist) &&
- likely(READ_ONCE(rcu_scheduler_fully_active)))
+ likely(READ_ONCE(rcu_scheduler_fully_active))) {
rcu_do_batch(rdp);
+ /* Re-invoke RCU core processing if there are callbacks remaining. */
+ if (rcu_segcblist_ready_cbs(&rdp->cblist))
+ invoke_rcu_core();
+ }
/* Do any needed deferred wakeups of rcuo kthreads. */
do_nocb_deferred_wakeup(rdp);
@@ -2898,10 +2937,10 @@ static void __call_rcu_core(struct rcu_data *rdp, struct rcu_head *head,
} else {
/* Give the grace period a kick. */
rdp->blimit = DEFAULT_MAX_RCU_BLIMIT;
- if (rcu_state.n_force_qs == rdp->n_force_qs_snap &&
+ if (READ_ONCE(rcu_state.n_force_qs) == rdp->n_force_qs_snap &&
rcu_segcblist_first_pend_cb(&rdp->cblist) != head)
rcu_force_quiescent_state();
- rdp->n_force_qs_snap = rcu_state.n_force_qs;
+ rdp->n_force_qs_snap = READ_ONCE(rcu_state.n_force_qs);
rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist);
}
}
@@ -2984,7 +3023,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func)
head->func = func;
head->next = NULL;
local_irq_save(flags);
- kasan_record_aux_stack(head);
+ kasan_record_aux_stack_noalloc(head);
rdp = this_cpu_ptr(&rcu_data);
/* Add the callback to our list. */
@@ -3549,7 +3588,7 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
return;
}
- kasan_record_aux_stack(ptr);
+ kasan_record_aux_stack_noalloc(ptr);
success = add_ptr_to_bulk_krc_lock(&krcp, &flags, ptr, !head);
if (!success) {
run_page_cache_worker(krcp);
@@ -4128,10 +4167,9 @@ int rcutree_prepare_cpu(unsigned int cpu)
/* Set up local state, ensuring consistent view of global state. */
raw_spin_lock_irqsave_rcu_node(rnp, flags);
rdp->qlen_last_fqs_check = 0;
- rdp->n_force_qs_snap = rcu_state.n_force_qs;
+ rdp->n_force_qs_snap = READ_ONCE(rcu_state.n_force_qs);
rdp->blimit = blimit;
rdp->dynticks_nesting = 1; /* CPU not up, no tearing. */
- rcu_dynticks_eqs_online();
raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
/*
@@ -4251,6 +4289,7 @@ void rcu_cpu_starting(unsigned int cpu)
mask = rdp->grpmask;
WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1);
WARN_ON_ONCE(!(rnp->ofl_seq & 0x1));
+ rcu_dynticks_eqs_online();
smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier().
raw_spin_lock_irqsave_rcu_node(rnp, flags);
WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext | mask);
@@ -4296,9 +4335,7 @@ void rcu_report_dead(unsigned int cpu)
do_nocb_deferred_wakeup(rdp);
/* QS for any half-done expedited grace period. */
- preempt_disable();
- rcu_report_exp_rdp(this_cpu_ptr(&rcu_data));
- preempt_enable();
+ rcu_report_exp_rdp(rdp);
rcu_preempt_deferred_qs(current);
/* Remove outgoing CPU from mask in the leaf rcu_node structure. */
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 305cf6aeb408..486fc901bd08 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -157,7 +157,6 @@ struct rcu_data {
bool core_needs_qs; /* Core waits for quiescent state. */
bool beenonline; /* CPU online at least once. */
bool gpwrap; /* Possible ->gp_seq wrap. */
- bool exp_deferred_qs; /* This CPU awaiting a deferred QS? */
bool cpu_started; /* RCU watching this onlining CPU. */
struct rcu_node *mynode; /* This CPU's leaf of hierarchy */
unsigned long grpmask; /* Mask to apply to leaf qsmask. */
@@ -189,11 +188,6 @@ struct rcu_data {
bool rcu_urgent_qs; /* GP old need light quiescent state. */
bool rcu_forced_tick; /* Forced tick to provide QS. */
bool rcu_forced_tick_exp; /* ... provide QS to expedited GP. */
-#ifdef CONFIG_RCU_FAST_NO_HZ
- unsigned long last_accelerate; /* Last jiffy CBs were accelerated. */
- unsigned long last_advance_all; /* Last jiffy CBs were all advanced. */
- int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */
-#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
/* 4) rcu_barrier(), OOM callbacks, and expediting. */
struct rcu_head barrier_head;
@@ -227,8 +221,11 @@ struct rcu_data {
struct swait_queue_head nocb_gp_wq; /* For nocb kthreads to sleep on. */
bool nocb_cb_sleep; /* Is the nocb CB thread asleep? */
struct task_struct *nocb_cb_kthread;
- struct rcu_data *nocb_next_cb_rdp;
- /* Next rcu_data in wakeup chain. */
+ struct list_head nocb_head_rdp; /*
+ * Head of rcu_data list in wakeup chain,
+ * if rdp_gp.
+ */
+ struct list_head nocb_entry_rdp; /* rcu_data node in wakeup chain. */
/* The following fields are used by CB kthread, hence new cacheline. */
struct rcu_data *nocb_gp_rdp ____cacheline_internodealigned_in_smp;
@@ -419,8 +416,6 @@ static bool rcu_is_callbacks_kthread(void);
static void rcu_cpu_kthread_setup(unsigned int cpu);
static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp);
static void __init rcu_spawn_boost_kthreads(void);
-static void rcu_cleanup_after_idle(void);
-static void rcu_prepare_for_idle(void);
static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
static bool rcu_preempt_need_deferred_qs(struct task_struct *t);
static void rcu_preempt_deferred_qs(struct task_struct *t);
@@ -447,12 +442,16 @@ static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp,
static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp);
#ifdef CONFIG_RCU_NOCB_CPU
static void __init rcu_organize_nocb_kthreads(void);
-#define rcu_nocb_lock_irqsave(rdp, flags) \
-do { \
- if (!rcu_segcblist_is_offloaded(&(rdp)->cblist)) \
- local_irq_save(flags); \
- else \
- raw_spin_lock_irqsave(&(rdp)->nocb_lock, (flags)); \
+
+/*
+ * Disable IRQs before checking offloaded state so that local
+ * locking is safe against concurrent de-offloading.
+ */
+#define rcu_nocb_lock_irqsave(rdp, flags) \
+do { \
+ local_irq_save(flags); \
+ if (rcu_segcblist_is_offloaded(&(rdp)->cblist)) \
+ raw_spin_lock(&(rdp)->nocb_lock); \
} while (0)
#else /* #ifdef CONFIG_RCU_NOCB_CPU */
#define rcu_nocb_lock_irqsave(rdp, flags) local_irq_save(flags)
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 2796084ef85a..237a79989aba 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -255,7 +255,7 @@ static void rcu_report_exp_cpu_mult(struct rcu_node *rnp,
*/
static void rcu_report_exp_rdp(struct rcu_data *rdp)
{
- WRITE_ONCE(rdp->exp_deferred_qs, false);
+ WRITE_ONCE(rdp->cpu_no_qs.b.exp, false);
rcu_report_exp_cpu_mult(rdp->mynode, rdp->grpmask, true);
}
@@ -387,6 +387,7 @@ retry_ipi:
continue;
}
if (get_cpu() == cpu) {
+ mask_ofl_test |= mask;
put_cpu();
continue;
}
@@ -506,13 +507,15 @@ static void synchronize_rcu_expedited_wait(void)
if (rdp->rcu_forced_tick_exp)
continue;
rdp->rcu_forced_tick_exp = true;
- tick_dep_set_cpu(cpu, TICK_DEP_BIT_RCU_EXP);
+ preempt_disable();
+ if (cpu_online(cpu))
+ tick_dep_set_cpu(cpu, TICK_DEP_BIT_RCU_EXP);
+ preempt_enable();
}
}
j = READ_ONCE(jiffies_till_first_fqs);
if (synchronize_rcu_expedited_wait_once(j + HZ))
return;
- WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_RT));
}
for (;;) {
@@ -656,7 +659,7 @@ static void rcu_exp_handler(void *unused)
rcu_dynticks_curr_cpu_in_eqs()) {
rcu_report_exp_rdp(rdp);
} else {
- rdp->exp_deferred_qs = true;
+ WRITE_ONCE(rdp->cpu_no_qs.b.exp, true);
set_tsk_need_resched(t);
set_preempt_need_resched();
}
@@ -678,7 +681,7 @@ static void rcu_exp_handler(void *unused)
if (depth > 0) {
raw_spin_lock_irqsave_rcu_node(rnp, flags);
if (rnp->expmask & rdp->grpmask) {
- rdp->exp_deferred_qs = true;
+ WRITE_ONCE(rdp->cpu_no_qs.b.exp, true);
t->rcu_read_unlock_special.b.exp_hint = true;
}
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
@@ -760,7 +763,7 @@ static void sync_sched_exp_online_cleanup(int cpu)
my_cpu = get_cpu();
/* Quiescent state either not needed or already requested, leave. */
if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) ||
- __this_cpu_read(rcu_data.cpu_no_qs.b.exp)) {
+ READ_ONCE(rdp->cpu_no_qs.b.exp)) {
put_cpu();
return;
}
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index 8fdf44f8523f..eeafb546a7a0 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -60,16 +60,22 @@ static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp)
* Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters.
* If the list is invalid, a warning is emitted and all CPUs are offloaded.
*/
+
+static bool rcu_nocb_is_setup;
+
static int __init rcu_nocb_setup(char *str)
{
alloc_bootmem_cpumask_var(&rcu_nocb_mask);
- if (cpulist_parse(str, rcu_nocb_mask)) {
- pr_warn("rcu_nocbs= bad CPU range, all CPUs set\n");
- cpumask_setall(rcu_nocb_mask);
+ if (*str == '=') {
+ if (cpulist_parse(++str, rcu_nocb_mask)) {
+ pr_warn("rcu_nocbs= bad CPU range, all CPUs set\n");
+ cpumask_setall(rcu_nocb_mask);
+ }
}
+ rcu_nocb_is_setup = true;
return 1;
}
-__setup("rcu_nocbs=", rcu_nocb_setup);
+__setup("rcu_nocbs", rcu_nocb_setup);
static int __init parse_rcu_nocb_poll(char *arg)
{
@@ -549,7 +555,6 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
rcu_nocb_unlock_irqrestore(rdp, flags);
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot"));
}
- return;
}
/*
@@ -626,7 +631,21 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
* and the global grace-period kthread are awakened if needed.
*/
WARN_ON_ONCE(my_rdp->nocb_gp_rdp != my_rdp);
- for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_cb_rdp) {
+ /*
+ * An rcu_data structure is removed from the list after its
+ * CPU is de-offloaded and added to the list before that CPU is
+ * (re-)offloaded. If the following loop happens to be referencing
+ * that rcu_data structure during the time that the corresponding
+ * CPU is de-offloaded and then immediately re-offloaded, this
+ * loop's rdp pointer will be carried to the end of the list by
+ * the resulting pair of list operations. This can cause the loop
+ * to skip over some of the rcu_data structures that were supposed
+ * to have been scanned. Fortunately a new iteration through the
+ * entire loop is forced after a given CPU's rcu_data structure
+ * is added to the list, so the skipped-over rcu_data structures
+ * won't be ignored for long.
+ */
+ list_for_each_entry_rcu(rdp, &my_rdp->nocb_head_rdp, nocb_entry_rdp, 1) {
bool needwake_state = false;
if (!nocb_gp_enabled_cb(rdp))
@@ -767,6 +786,7 @@ static int rcu_nocb_gp_kthread(void *arg)
static inline bool nocb_cb_can_run(struct rcu_data *rdp)
{
u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_CB;
+
return rcu_segcblist_test_flags(&rdp->cblist, flags);
}
@@ -789,6 +809,18 @@ static void nocb_cb_wait(struct rcu_data *rdp)
bool can_sleep = true;
struct rcu_node *rnp = rdp->mynode;
+ do {
+ swait_event_interruptible_exclusive(rdp->nocb_cb_wq,
+ nocb_cb_wait_cond(rdp));
+
+ // VVV Ensure CB invocation follows _sleep test.
+ if (smp_load_acquire(&rdp->nocb_cb_sleep)) { // ^^^
+ WARN_ON(signal_pending(current));
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty"));
+ }
+ } while (!nocb_cb_can_run(rdp));
+
+
local_irq_save(flags);
rcu_momentary_dyntick_idle();
local_irq_restore(flags);
@@ -841,17 +873,6 @@ static void nocb_cb_wait(struct rcu_data *rdp)
if (needwake_state)
swake_up_one(&rdp->nocb_state_wq);
-
- do {
- swait_event_interruptible_exclusive(rdp->nocb_cb_wq,
- nocb_cb_wait_cond(rdp));
-
- // VVV Ensure CB invocation follows _sleep test.
- if (smp_load_acquire(&rdp->nocb_cb_sleep)) { // ^^^
- WARN_ON(signal_pending(current));
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty"));
- }
- } while (!nocb_cb_can_run(rdp));
}
/*
@@ -990,22 +1011,33 @@ static long rcu_nocb_rdp_deoffload(void *arg)
* will refuse to put anything into the bypass.
*/
WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies));
+ /*
+ * Start with invoking rcu_core() early. This way if the current thread
+ * happens to preempt an ongoing call to rcu_core() in the middle,
+ * leaving some work dismissed because rcu_core() still thinks the rdp is
+ * completely offloaded, we are guaranteed a nearby future instance of
+ * rcu_core() to catch up.
+ */
+ rcu_segcblist_set_flags(cblist, SEGCBLIST_RCU_CORE);
+ invoke_rcu_core();
ret = rdp_offload_toggle(rdp, false, flags);
swait_event_exclusive(rdp->nocb_state_wq,
!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB |
SEGCBLIST_KTHREAD_GP));
+ /* Stop nocb_gp_wait() from iterating over this structure. */
+ list_del_rcu(&rdp->nocb_entry_rdp);
/*
* Lock one last time to acquire latest callback updates from kthreads
* so we can later handle callbacks locally without locking.
*/
rcu_nocb_lock_irqsave(rdp, flags);
/*
- * Theoretically we could set SEGCBLIST_SOFTIRQ_ONLY after the nocb
+ * Theoretically we could clear SEGCBLIST_LOCKING after the nocb
* lock is released but how about being paranoid for once?
*/
- rcu_segcblist_set_flags(cblist, SEGCBLIST_SOFTIRQ_ONLY);
+ rcu_segcblist_clear_flags(cblist, SEGCBLIST_LOCKING);
/*
- * With SEGCBLIST_SOFTIRQ_ONLY, we can't use
+ * Without SEGCBLIST_LOCKING, we can't use
* rcu_nocb_unlock_irqrestore() anymore.
*/
raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
@@ -1057,15 +1089,26 @@ static long rcu_nocb_rdp_offload(void *arg)
return -EINVAL;
pr_info("Offloading %d\n", rdp->cpu);
+
+ /*
+ * Cause future nocb_gp_wait() invocations to iterate over
+ * structure, resetting ->nocb_gp_sleep and waking up the related
+ * "rcuog". Since nocb_gp_wait() in turn locks ->nocb_gp_lock
+ * before setting ->nocb_gp_sleep again, we are guaranteed to
+ * iterate this newly added structure before "rcuog" goes to
+ * sleep again.
+ */
+ list_add_tail_rcu(&rdp->nocb_entry_rdp, &rdp->nocb_gp_rdp->nocb_head_rdp);
+
/*
- * Can't use rcu_nocb_lock_irqsave() while we are in
- * SEGCBLIST_SOFTIRQ_ONLY mode.
+ * Can't use rcu_nocb_lock_irqsave() before SEGCBLIST_LOCKING
+ * is set.
*/
raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
/*
* We didn't take the nocb lock while working on the
- * rdp->cblist in SEGCBLIST_SOFTIRQ_ONLY mode.
+ * rdp->cblist with SEGCBLIST_LOCKING cleared (pure softirq/rcuc mode).
* Every modifications that have been done previously on
* rdp->cblist must be visible remotely by the nocb kthreads
* upon wake up after reading the cblist flags.
@@ -1084,6 +1127,14 @@ static long rcu_nocb_rdp_offload(void *arg)
rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB) &&
rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP));
+ /*
+ * All kthreads are ready to work, we can finally relieve rcu_core() and
+ * enable nocb bypass.
+ */
+ rcu_nocb_lock_irqsave(rdp, flags);
+ rcu_segcblist_clear_flags(cblist, SEGCBLIST_RCU_CORE);
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+
return ret;
}
@@ -1122,13 +1173,17 @@ void __init rcu_init_nohz(void)
need_rcu_nocb_mask = true;
#endif /* #if defined(CONFIG_NO_HZ_FULL) */
- if (!cpumask_available(rcu_nocb_mask) && need_rcu_nocb_mask) {
- if (!zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL)) {
- pr_info("rcu_nocb_mask allocation failed, callback offloading disabled.\n");
- return;
+ if (need_rcu_nocb_mask) {
+ if (!cpumask_available(rcu_nocb_mask)) {
+ if (!zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL)) {
+ pr_info("rcu_nocb_mask allocation failed, callback offloading disabled.\n");
+ return;
+ }
}
+ rcu_nocb_is_setup = true;
}
- if (!cpumask_available(rcu_nocb_mask))
+
+ if (!rcu_nocb_is_setup)
return;
#if defined(CONFIG_NO_HZ_FULL)
@@ -1154,8 +1209,8 @@ void __init rcu_init_nohz(void)
if (rcu_segcblist_empty(&rdp->cblist))
rcu_segcblist_init(&rdp->cblist);
rcu_segcblist_offload(&rdp->cblist, true);
- rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_CB);
- rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_GP);
+ rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP);
+ rcu_segcblist_clear_flags(&rdp->cblist, SEGCBLIST_RCU_CORE);
}
rcu_organize_nocb_kthreads();
}
@@ -1178,17 +1233,17 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
* rcuo CB kthread, spawn it. Additionally, if the rcuo GP kthread
* for this CPU's group has not yet been created, spawn it as well.
*/
-static void rcu_spawn_one_nocb_kthread(int cpu)
+static void rcu_spawn_cpu_nocb_kthread(int cpu)
{
struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
struct rcu_data *rdp_gp;
struct task_struct *t;
- /*
- * If this isn't a no-CBs CPU or if it already has an rcuo kthread,
- * then nothing to do.
- */
- if (!rcu_is_nocb_cpu(cpu) || rdp->nocb_cb_kthread)
+ if (!rcu_scheduler_fully_active || !rcu_nocb_is_setup)
+ return;
+
+ /* If there already is an rcuo kthread, then nothing to do. */
+ if (rdp->nocb_cb_kthread)
return;
/* If we didn't spawn the GP kthread first, reorganize! */
@@ -1211,16 +1266,6 @@ static void rcu_spawn_one_nocb_kthread(int cpu)
}
/*
- * If the specified CPU is a no-CBs CPU that does not already have its
- * rcuo kthread, spawn it.
- */
-static void rcu_spawn_cpu_nocb_kthread(int cpu)
-{
- if (rcu_scheduler_fully_active)
- rcu_spawn_one_nocb_kthread(cpu);
-}
-
-/*
* Once the scheduler is running, spawn rcuo kthreads for all online
* no-CBs CPUs. This assumes that the early_initcall()s happen before
* non-boot CPUs come online -- if this changes, we will need to add
@@ -1230,8 +1275,10 @@ static void __init rcu_spawn_nocb_kthreads(void)
{
int cpu;
- for_each_online_cpu(cpu)
- rcu_spawn_cpu_nocb_kthread(cpu);
+ if (rcu_nocb_is_setup) {
+ for_each_online_cpu(cpu)
+ rcu_spawn_cpu_nocb_kthread(cpu);
+ }
}
/* How many CB CPU IDs per GP kthread? Default of -1 for sqrt(nr_cpu_ids). */
@@ -1251,7 +1298,6 @@ static void __init rcu_organize_nocb_kthreads(void)
int nl = 0; /* Next GP kthread. */
struct rcu_data *rdp;
struct rcu_data *rdp_gp = NULL; /* Suppress misguided gcc warn. */
- struct rcu_data *rdp_prev = NULL;
if (!cpumask_available(rcu_nocb_mask))
return;
@@ -1265,14 +1311,14 @@ static void __init rcu_organize_nocb_kthreads(void)
* Should the corresponding CPU come online in the future, then
* we will spawn the needed set of rcu_nocb_kthread() kthreads.
*/
- for_each_cpu(cpu, rcu_nocb_mask) {
+ for_each_possible_cpu(cpu) {
rdp = per_cpu_ptr(&rcu_data, cpu);
if (rdp->cpu >= nl) {
/* New GP kthread, set up for CBs & next GP. */
gotnocbs = true;
nl = DIV_ROUND_UP(rdp->cpu + 1, ls) * ls;
- rdp->nocb_gp_rdp = rdp;
rdp_gp = rdp;
+ INIT_LIST_HEAD(&rdp->nocb_head_rdp);
if (dump_tree) {
if (!firsttime)
pr_cont("%s\n", gotnocbscbs
@@ -1285,12 +1331,12 @@ static void __init rcu_organize_nocb_kthreads(void)
} else {
/* Another CB kthread, link to previous GP kthread. */
gotnocbscbs = true;
- rdp->nocb_gp_rdp = rdp_gp;
- rdp_prev->nocb_next_cb_rdp = rdp;
if (dump_tree)
pr_cont(" %d", cpu);
}
- rdp_prev = rdp;
+ rdp->nocb_gp_rdp = rdp_gp;
+ if (cpumask_test_cpu(cpu, rcu_nocb_mask))
+ list_add_tail(&rdp->nocb_entry_rdp, &rdp_gp->nocb_head_rdp);
}
if (gotnocbs && dump_tree)
pr_cont("%s\n", gotnocbscbs ? "" : " (self only)");
@@ -1352,6 +1398,7 @@ static void show_rcu_nocb_state(struct rcu_data *rdp)
{
char bufw[20];
char bufr[20];
+ struct rcu_data *nocb_next_rdp;
struct rcu_segcblist *rsclp = &rdp->cblist;
bool waslocked;
bool wassleep;
@@ -1359,11 +1406,16 @@ static void show_rcu_nocb_state(struct rcu_data *rdp)
if (rdp->nocb_gp_rdp == rdp)
show_rcu_nocb_gp_state(rdp);
+ nocb_next_rdp = list_next_or_null_rcu(&rdp->nocb_gp_rdp->nocb_head_rdp,
+ &rdp->nocb_entry_rdp,
+ typeof(*rdp),
+ nocb_entry_rdp);
+
sprintf(bufw, "%ld", rsclp->gp_seq[RCU_WAIT_TAIL]);
sprintf(bufr, "%ld", rsclp->gp_seq[RCU_NEXT_READY_TAIL]);
pr_info(" CB %d^%d->%d %c%c%c%c%c%c F%ld L%ld C%d %c%c%s%c%s%c%c q%ld %c CPU %d%s\n",
rdp->cpu, rdp->nocb_gp_rdp->cpu,
- rdp->nocb_next_cb_rdp ? rdp->nocb_next_cb_rdp->cpu : -1,
+ nocb_next_rdp ? nocb_next_rdp->cpu : -1,
"kK"[!!rdp->nocb_cb_kthread],
"bB"[raw_spin_is_locked(&rdp->nocb_bypass_lock)],
"cC"[!!atomic_read(&rdp->nocb_lock_contended)],
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index d070059163d7..c5b45c2f68a1 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -16,7 +16,7 @@
static bool rcu_rdp_is_offloaded(struct rcu_data *rdp)
{
/*
- * In order to read the offloaded state of an rdp is a safe
+ * In order to read the offloaded state of an rdp in a safe
* and stable way and prevent from its value to be changed
* under us, we must either hold the barrier mutex, the cpu
* hotplug lock (read or write) or the nocb lock. Local
@@ -51,12 +51,10 @@ static void __init rcu_bootup_announce_oddness(void)
RCU_FANOUT);
if (rcu_fanout_exact)
pr_info("\tHierarchical RCU autobalancing is disabled.\n");
- if (IS_ENABLED(CONFIG_RCU_FAST_NO_HZ))
- pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n");
if (IS_ENABLED(CONFIG_PROVE_RCU))
pr_info("\tRCU lockdep checking is enabled.\n");
if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD))
- pr_info("\tRCU strict (and thus non-scalable) grace periods enabled.\n");
+ pr_info("\tRCU strict (and thus non-scalable) grace periods are enabled.\n");
if (RCU_NUM_LVLS >= 4)
pr_info("\tFour(or more)-level hierarchy is enabled.\n");
if (RCU_FANOUT_LEAF != 16)
@@ -88,13 +86,13 @@ static void __init rcu_bootup_announce_oddness(void)
if (rcu_kick_kthreads)
pr_info("\tKick kthreads if too-long grace period.\n");
if (IS_ENABLED(CONFIG_DEBUG_OBJECTS_RCU_HEAD))
- pr_info("\tRCU callback double-/use-after-free debug enabled.\n");
+ pr_info("\tRCU callback double-/use-after-free debug is enabled.\n");
if (gp_preinit_delay)
pr_info("\tRCU debug GP pre-init slowdown %d jiffies.\n", gp_preinit_delay);
if (gp_init_delay)
pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_init_delay);
if (gp_cleanup_delay)
- pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_cleanup_delay);
+ pr_info("\tRCU debug GP cleanup slowdown %d jiffies.\n", gp_cleanup_delay);
if (!use_softirq)
pr_info("\tRCU_SOFTIRQ processing moved to rcuc kthreads.\n");
if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG))
@@ -260,10 +258,10 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
* no need to check for a subsequent expedited GP. (Though we are
* still in a quiescent state in any case.)
*/
- if (blkd_state & RCU_EXP_BLKD && rdp->exp_deferred_qs)
+ if (blkd_state & RCU_EXP_BLKD && rdp->cpu_no_qs.b.exp)
rcu_report_exp_rdp(rdp);
else
- WARN_ON_ONCE(rdp->exp_deferred_qs);
+ WARN_ON_ONCE(rdp->cpu_no_qs.b.exp);
}
/*
@@ -277,12 +275,16 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
* current task, there might be any number of other tasks blocked while
* in an RCU read-side critical section.
*
+ * Unlike non-preemptible-RCU, quiescent state reports for expedited
+ * grace periods are handled separately via deferred quiescent states
+ * and context switch events.
+ *
* Callers to this function must disable preemption.
*/
static void rcu_qs(void)
{
RCU_LOCKDEP_WARN(preemptible(), "rcu_qs() invoked with preemption enabled!!!\n");
- if (__this_cpu_read(rcu_data.cpu_no_qs.s)) {
+ if (__this_cpu_read(rcu_data.cpu_no_qs.b.norm)) {
trace_rcu_grace_period(TPS("rcu_preempt"),
__this_cpu_read(rcu_data.gp_seq),
TPS("cpuqs"));
@@ -350,7 +352,7 @@ void rcu_note_context_switch(bool preempt)
* means that we continue to block the current grace period.
*/
rcu_qs();
- if (rdp->exp_deferred_qs)
+ if (rdp->cpu_no_qs.b.exp)
rcu_report_exp_rdp(rdp);
rcu_tasks_qs(current, preempt);
trace_rcu_utilization(TPS("End context switch"));
@@ -477,7 +479,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
*/
special = t->rcu_read_unlock_special;
rdp = this_cpu_ptr(&rcu_data);
- if (!special.s && !rdp->exp_deferred_qs) {
+ if (!special.s && !rdp->cpu_no_qs.b.exp) {
local_irq_restore(flags);
return;
}
@@ -497,7 +499,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
* tasks are handled when removing the task from the
* blocked-tasks list below.
*/
- if (rdp->exp_deferred_qs)
+ if (rdp->cpu_no_qs.b.exp)
rcu_report_exp_rdp(rdp);
/* Clean up if blocked during RCU read-side critical section. */
@@ -580,7 +582,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
*/
static bool rcu_preempt_need_deferred_qs(struct task_struct *t)
{
- return (__this_cpu_read(rcu_data.exp_deferred_qs) ||
+ return (__this_cpu_read(rcu_data.cpu_no_qs.b.exp) ||
READ_ONCE(t->rcu_read_unlock_special.s)) &&
rcu_preempt_depth() == 0;
}
@@ -642,7 +644,7 @@ static void rcu_read_unlock_special(struct task_struct *t)
(IS_ENABLED(CONFIG_RCU_BOOST) && irqs_were_disabled &&
t->rcu_blocked_node);
// Need to defer quiescent state until everything is enabled.
- if (use_softirq && (in_irq() || (expboost && !irqs_were_disabled))) {
+ if (use_softirq && (in_hardirq() || (expboost && !irqs_were_disabled))) {
// Using softirq, safe to awaken, and either the
// wakeup is free or there is either an expedited
// GP in flight or a potential need to deboost.
@@ -814,8 +816,7 @@ void rcu_read_unlock_strict(void)
{
struct rcu_data *rdp;
- if (!IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) ||
- irqs_disabled() || preempt_count() || !rcu_state.gp_kthread)
+ if (irqs_disabled() || preempt_count() || !rcu_state.gp_kthread)
return;
rdp = this_cpu_ptr(&rcu_data);
rcu_report_qs_rdp(rdp);
@@ -846,10 +847,8 @@ static void rcu_qs(void)
trace_rcu_grace_period(TPS("rcu_sched"),
__this_cpu_read(rcu_data.gp_seq), TPS("cpuqs"));
__this_cpu_write(rcu_data.cpu_no_qs.b.norm, false);
- if (!__this_cpu_read(rcu_data.cpu_no_qs.b.exp))
- return;
- __this_cpu_write(rcu_data.cpu_no_qs.b.exp, false);
- rcu_report_exp_rdp(this_cpu_ptr(&rcu_data));
+ if (__this_cpu_read(rcu_data.cpu_no_qs.b.exp))
+ rcu_report_exp_rdp(this_cpu_ptr(&rcu_data));
}
/*
@@ -926,7 +925,18 @@ static bool rcu_preempt_need_deferred_qs(struct task_struct *t)
{
return false;
}
-static void rcu_preempt_deferred_qs(struct task_struct *t) { }
+
+// Except that we do need to respond to a request by an expedited grace
+// period for a quiescent state from this CPU. Note that requests from
+// tasks are handled when removing the task from the blocked-tasks list
+// below.
+static void rcu_preempt_deferred_qs(struct task_struct *t)
+{
+ struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
+
+ if (rdp->cpu_no_qs.b.exp)
+ rcu_report_exp_rdp(rdp);
+}
/*
* Because there is no preemptible RCU, there can be no readers blocked,
@@ -1154,7 +1164,6 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
/*
* Create an RCU-boost kthread for the specified node if one does not
* already exist. We only create this kthread for preemptible RCU.
- * Returns zero if all is well, a negated errno otherwise.
*/
static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
{
@@ -1205,8 +1214,9 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
if ((mask & leaf_node_cpu_bit(rnp, cpu)) &&
cpu != outgoingcpu)
cpumask_set_cpu(cpu, cm);
+ cpumask_and(cm, cm, housekeeping_cpumask(HK_FLAG_RCU));
if (cpumask_weight(cm) == 0)
- cpumask_setall(cm);
+ cpumask_copy(cm, housekeeping_cpumask(HK_FLAG_RCU));
set_cpus_allowed_ptr(t, cm);
free_cpumask_var(cm);
}
@@ -1254,201 +1264,6 @@ static void __init rcu_spawn_boost_kthreads(void)
#endif /* #else #ifdef CONFIG_RCU_BOOST */
-#if !defined(CONFIG_RCU_FAST_NO_HZ)
-
-/*
- * Check to see if any future non-offloaded RCU-related work will need
- * to be done by the current CPU, even if none need be done immediately,
- * returning 1 if so. This function is part of the RCU implementation;
- * it is -not- an exported member of the RCU API.
- *
- * Because we not have RCU_FAST_NO_HZ, just check whether or not this
- * CPU has RCU callbacks queued.
- */
-int rcu_needs_cpu(u64 basemono, u64 *nextevt)
-{
- *nextevt = KTIME_MAX;
- return !rcu_segcblist_empty(&this_cpu_ptr(&rcu_data)->cblist) &&
- !rcu_rdp_is_offloaded(this_cpu_ptr(&rcu_data));
-}
-
-/*
- * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
- * after it.
- */
-static void rcu_cleanup_after_idle(void)
-{
-}
-
-/*
- * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=n,
- * is nothing.
- */
-static void rcu_prepare_for_idle(void)
-{
-}
-
-#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */
-
-/*
- * This code is invoked when a CPU goes idle, at which point we want
- * to have the CPU do everything required for RCU so that it can enter
- * the energy-efficient dyntick-idle mode.
- *
- * The following preprocessor symbol controls this:
- *
- * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted
- * to sleep in dyntick-idle mode with RCU callbacks pending. This
- * is sized to be roughly one RCU grace period. Those energy-efficiency
- * benchmarkers who might otherwise be tempted to set this to a large
- * number, be warned: Setting RCU_IDLE_GP_DELAY too high can hang your
- * system. And if you are -that- concerned about energy efficiency,
- * just power the system down and be done with it!
- *
- * The value below works well in practice. If future workloads require
- * adjustment, they can be converted into kernel config parameters, though
- * making the state machine smarter might be a better option.
- */
-#define RCU_IDLE_GP_DELAY 4 /* Roughly one grace period. */
-
-static int rcu_idle_gp_delay = RCU_IDLE_GP_DELAY;
-module_param(rcu_idle_gp_delay, int, 0644);
-
-/*
- * Try to advance callbacks on the current CPU, but only if it has been
- * awhile since the last time we did so. Afterwards, if there are any
- * callbacks ready for immediate invocation, return true.
- */
-static bool __maybe_unused rcu_try_advance_all_cbs(void)
-{
- bool cbs_ready = false;
- struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
- struct rcu_node *rnp;
-
- /* Exit early if we advanced recently. */
- if (jiffies == rdp->last_advance_all)
- return false;
- rdp->last_advance_all = jiffies;
-
- rnp = rdp->mynode;
-
- /*
- * Don't bother checking unless a grace period has
- * completed since we last checked and there are
- * callbacks not yet ready to invoke.
- */
- if ((rcu_seq_completed_gp(rdp->gp_seq,
- rcu_seq_current(&rnp->gp_seq)) ||
- unlikely(READ_ONCE(rdp->gpwrap))) &&
- rcu_segcblist_pend_cbs(&rdp->cblist))
- note_gp_changes(rdp);
-
- if (rcu_segcblist_ready_cbs(&rdp->cblist))
- cbs_ready = true;
- return cbs_ready;
-}
-
-/*
- * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready
- * to invoke. If the CPU has callbacks, try to advance them. Tell the
- * caller about what to set the timeout.
- *
- * The caller must have disabled interrupts.
- */
-int rcu_needs_cpu(u64 basemono, u64 *nextevt)
-{
- struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
- unsigned long dj;
-
- lockdep_assert_irqs_disabled();
-
- /* If no non-offloaded callbacks, RCU doesn't need the CPU. */
- if (rcu_segcblist_empty(&rdp->cblist) ||
- rcu_rdp_is_offloaded(rdp)) {
- *nextevt = KTIME_MAX;
- return 0;
- }
-
- /* Attempt to advance callbacks. */
- if (rcu_try_advance_all_cbs()) {
- /* Some ready to invoke, so initiate later invocation. */
- invoke_rcu_core();
- return 1;
- }
- rdp->last_accelerate = jiffies;
-
- /* Request timer and round. */
- dj = round_up(rcu_idle_gp_delay + jiffies, rcu_idle_gp_delay) - jiffies;
-
- *nextevt = basemono + dj * TICK_NSEC;
- return 0;
-}
-
-/*
- * Prepare a CPU for idle from an RCU perspective. The first major task is to
- * sense whether nohz mode has been enabled or disabled via sysfs. The second
- * major task is to accelerate (that is, assign grace-period numbers to) any
- * recently arrived callbacks.
- *
- * The caller must have disabled interrupts.
- */
-static void rcu_prepare_for_idle(void)
-{
- bool needwake;
- struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
- struct rcu_node *rnp;
- int tne;
-
- lockdep_assert_irqs_disabled();
- if (rcu_rdp_is_offloaded(rdp))
- return;
-
- /* Handle nohz enablement switches conservatively. */
- tne = READ_ONCE(tick_nohz_active);
- if (tne != rdp->tick_nohz_enabled_snap) {
- if (!rcu_segcblist_empty(&rdp->cblist))
- invoke_rcu_core(); /* force nohz to see update. */
- rdp->tick_nohz_enabled_snap = tne;
- return;
- }
- if (!tne)
- return;
-
- /*
- * If we have not yet accelerated this jiffy, accelerate all
- * callbacks on this CPU.
- */
- if (rdp->last_accelerate == jiffies)
- return;
- rdp->last_accelerate = jiffies;
- if (rcu_segcblist_pend_cbs(&rdp->cblist)) {
- rnp = rdp->mynode;
- raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
- needwake = rcu_accelerate_cbs(rnp, rdp);
- raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
- if (needwake)
- rcu_gp_kthread_wake();
- }
-}
-
-/*
- * Clean up for exit from idle. Attempt to advance callbacks based on
- * any grace periods that elapsed while the CPU was idle, and if any
- * callbacks are now ready to invoke, initiate invocation.
- */
-static void rcu_cleanup_after_idle(void)
-{
- struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
-
- lockdep_assert_irqs_disabled();
- if (rcu_rdp_is_offloaded(rdp))
- return;
- if (rcu_try_advance_all_cbs())
- invoke_rcu_core();
-}
-
-#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
-
/*
* Is this CPU a NO_HZ_FULL CPU that should ignore RCU so that the
* grace-period kthread will do force_quiescent_state() processing?
@@ -1456,7 +1271,7 @@ static void rcu_cleanup_after_idle(void)
* CPU unless the grace period has extended for too long.
*
* This code relies on the fact that all NO_HZ_FULL CPUs are also
- * CONFIG_RCU_NOCB_CPU CPUs.
+ * RCU_NOCB_CPU CPUs.
*/
static bool rcu_nohz_full_cpu(void)
{
@@ -1480,7 +1295,7 @@ static void rcu_bind_gp_kthread(void)
}
/* Record the current task on dyntick-idle entry. */
-static void noinstr rcu_dynticks_task_enter(void)
+static __always_inline void rcu_dynticks_task_enter(void)
{
#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
WRITE_ONCE(current->rcu_tasks_idle_cpu, smp_processor_id());
@@ -1488,7 +1303,7 @@ static void noinstr rcu_dynticks_task_enter(void)
}
/* Record no current task on dyntick-idle exit. */
-static void noinstr rcu_dynticks_task_exit(void)
+static __always_inline void rcu_dynticks_task_exit(void)
{
#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
WRITE_ONCE(current->rcu_tasks_idle_cpu, -1);
@@ -1496,7 +1311,7 @@ static void noinstr rcu_dynticks_task_exit(void)
}
/* Turn on heavyweight RCU tasks trace readers on idle/user entry. */
-static void rcu_dynticks_task_trace_enter(void)
+static __always_inline void rcu_dynticks_task_trace_enter(void)
{
#ifdef CONFIG_TASKS_TRACE_RCU
if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB))
@@ -1505,7 +1320,7 @@ static void rcu_dynticks_task_trace_enter(void)
}
/* Turn off heavyweight RCU tasks trace readers on idle/user exit. */
-static void rcu_dynticks_task_trace_exit(void)
+static __always_inline void rcu_dynticks_task_trace_exit(void)
{
#ifdef CONFIG_TASKS_TRACE_RCU
if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB))
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index 677ee3d8671b..21bebf7c9030 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -240,16 +240,16 @@ struct rcu_stall_chk_rdr {
* Report out the state of a not-running task that is stalling the
* current RCU grace period.
*/
-static bool check_slow_task(struct task_struct *t, void *arg)
+static int check_slow_task(struct task_struct *t, void *arg)
{
struct rcu_stall_chk_rdr *rscrp = arg;
if (task_curr(t))
- return false; // It is running, so decline to inspect it.
+ return -EBUSY; // It is running, so decline to inspect it.
rscrp->nesting = t->rcu_read_lock_nesting;
rscrp->rs = t->rcu_read_unlock_special;
rscrp->on_blkd_list = !list_empty(&t->rcu_node_entry);
- return true;
+ return 0;
}
/*
@@ -283,7 +283,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp, unsigned long flags)
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
while (i) {
t = ts[--i];
- if (!try_invoke_on_locked_down_task(t, check_slow_task, &rscr))
+ if (task_call_func(t, check_slow_task, &rscr))
pr_cont(" P%d", t->pid);
else
pr_cont(" P%d/%d:%c%c%c%c",
@@ -347,26 +347,6 @@ static void rcu_dump_cpu_stacks(void)
}
}
-#ifdef CONFIG_RCU_FAST_NO_HZ
-
-static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
-{
- struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
-
- sprintf(cp, "last_accelerate: %04lx/%04lx dyntick_enabled: %d",
- rdp->last_accelerate & 0xffff, jiffies & 0xffff,
- !!rdp->tick_nohz_enabled_snap);
-}
-
-#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */
-
-static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
-{
- *cp = '\0';
-}
-
-#endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */
-
static const char * const gp_state_names[] = {
[RCU_GP_IDLE] = "RCU_GP_IDLE",
[RCU_GP_WAIT_GPS] = "RCU_GP_WAIT_GPS",
@@ -408,13 +388,12 @@ static bool rcu_is_gp_kthread_starving(unsigned long *jp)
* of RCU grace periods that this CPU is ignorant of, for example, "1"
* if the CPU was aware of the previous grace period.
*
- * Also print out idle and (if CONFIG_RCU_FAST_NO_HZ) idle-entry info.
+ * Also print out idle info.
*/
static void print_cpu_stall_info(int cpu)
{
unsigned long delta;
bool falsepositive;
- char fast_no_hz[72];
struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
char *ticks_title;
unsigned long ticks_value;
@@ -432,11 +411,10 @@ static void print_cpu_stall_info(int cpu)
ticks_title = "ticks this GP";
ticks_value = rdp->ticks_this_gp;
}
- print_cpu_stall_fast_no_hz(fast_no_hz, cpu);
delta = rcu_seq_ctr(rdp->mynode->gp_seq - rdp->rcu_iw_gp_seq);
falsepositive = rcu_is_gp_kthread_starving(NULL) &&
rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp));
- pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%#lx softirq=%u/%u fqs=%ld %s%s\n",
+ pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%#lx softirq=%u/%u fqs=%ld %s\n",
cpu,
"O."[!!cpu_online(cpu)],
"o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)],
@@ -449,7 +427,6 @@ static void print_cpu_stall_info(int cpu)
rdp->dynticks_nesting, rdp->dynticks_nmi_nesting,
rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu),
data_race(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart,
- fast_no_hz,
falsepositive ? " (false positive?)" : "");
}
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index c21b38cc25e9..156892c22bb5 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -54,11 +54,11 @@
#define MODULE_PARAM_PREFIX "rcupdate."
#ifndef CONFIG_TINY_RCU
-module_param(rcu_expedited, int, 0);
-module_param(rcu_normal, int, 0);
+module_param(rcu_expedited, int, 0444);
+module_param(rcu_normal, int, 0444);
static int rcu_normal_after_boot = IS_ENABLED(CONFIG_PREEMPT_RT);
-#ifndef CONFIG_PREEMPT_RT
-module_param(rcu_normal_after_boot, int, 0);
+#if !defined(CONFIG_PREEMPT_RT) || defined(CONFIG_NO_HZ_FULL)
+module_param(rcu_normal_after_boot, int, 0444);
#endif
#endif /* #ifndef CONFIG_TINY_RCU */
@@ -247,7 +247,7 @@ struct lockdep_map rcu_lock_map = {
.name = "rcu_read_lock",
.key = &rcu_lock_key,
.wait_type_outer = LD_WAIT_FREE,
- .wait_type_inner = LD_WAIT_CONFIG, /* XXX PREEMPT_RCU ? */
+ .wait_type_inner = LD_WAIT_CONFIG, /* PREEMPT_RT implies PREEMPT_RCU */
};
EXPORT_SYMBOL_GPL(rcu_lock_map);
@@ -256,7 +256,7 @@ struct lockdep_map rcu_bh_lock_map = {
.name = "rcu_read_lock_bh",
.key = &rcu_bh_lock_key,
.wait_type_outer = LD_WAIT_FREE,
- .wait_type_inner = LD_WAIT_CONFIG, /* PREEMPT_LOCK also makes BH preemptible */
+ .wait_type_inner = LD_WAIT_CONFIG, /* PREEMPT_RT makes BH preemptible. */
};
EXPORT_SYMBOL_GPL(rcu_bh_lock_map);
diff --git a/kernel/reboot.c b/kernel/reboot.c
index f7440c0c7e43..6bcc5d6a6572 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -33,6 +33,7 @@ EXPORT_SYMBOL(cad_pid);
#define DEFAULT_REBOOT_MODE
#endif
enum reboot_mode reboot_mode DEFAULT_REBOOT_MODE;
+EXPORT_SYMBOL_GPL(reboot_mode);
enum reboot_mode panic_reboot_mode = REBOOT_UNDEFINED;
/*
@@ -359,7 +360,6 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
case LINUX_REBOOT_CMD_HALT:
kernel_halt();
do_exit(0);
- panic("cannot halt");
case LINUX_REBOOT_CMD_POWER_OFF:
kernel_power_off();
diff --git a/kernel/resource.c b/kernel/resource.c
index ca9f5198a01f..5ad3eba619ba 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -73,6 +73,18 @@ static struct resource *next_resource(struct resource *p)
return p->sibling;
}
+static struct resource *next_resource_skip_children(struct resource *p)
+{
+ while (!p->sibling && p->parent)
+ p = p->parent;
+ return p->sibling;
+}
+
+#define for_each_resource(_root, _p, _skip_children) \
+ for ((_p) = (_root)->child; (_p); \
+ (_p) = (_skip_children) ? next_resource_skip_children(_p) : \
+ next_resource(_p))
+
static void *r_next(struct seq_file *m, void *v, loff_t *pos)
{
struct resource *p = v;
@@ -1707,37 +1719,49 @@ static int strict_iomem_checks;
#endif
/*
- * check if an address is reserved in the iomem resource tree
- * returns true if reserved, false if not reserved.
+ * Check if an address is exclusive to the kernel and must not be mapped to
+ * user space, for example, via /dev/mem.
+ *
+ * Returns true if exclusive to the kernel, otherwise returns false.
*/
bool iomem_is_exclusive(u64 addr)
{
- struct resource *p = &iomem_resource;
- bool err = false;
- loff_t l;
+ const unsigned int exclusive_system_ram = IORESOURCE_SYSTEM_RAM |
+ IORESOURCE_EXCLUSIVE;
+ bool skip_children = false, err = false;
int size = PAGE_SIZE;
-
- if (!strict_iomem_checks)
- return false;
+ struct resource *p;
addr = addr & PAGE_MASK;
read_lock(&resource_lock);
- for (p = p->child; p ; p = r_next(NULL, p, &l)) {
- /*
- * We can probably skip the resources without
- * IORESOURCE_IO attribute?
- */
+ for_each_resource(&iomem_resource, p, skip_children) {
if (p->start >= addr + size)
break;
- if (p->end < addr)
+ if (p->end < addr) {
+ skip_children = true;
continue;
+ }
+ skip_children = false;
+
+ /*
+ * IORESOURCE_SYSTEM_RAM resources are exclusive if
+ * IORESOURCE_EXCLUSIVE is set, even if they
+ * are not busy and even if "iomem=relaxed" is set. The
+ * responsible driver dynamically adds/removes system RAM within
+ * such an area and uncontrolled access is dangerous.
+ */
+ if ((p->flags & exclusive_system_ram) == exclusive_system_ram) {
+ err = true;
+ break;
+ }
+
/*
* A resource is exclusive if IORESOURCE_EXCLUSIVE is set
* or CONFIG_IO_STRICT_DEVMEM is enabled and the
* resource is busy.
*/
- if ((p->flags & IORESOURCE_BUSY) == 0)
+ if (!strict_iomem_checks || !(p->flags & IORESOURCE_BUSY))
continue;
if (IS_ENABLED(CONFIG_IO_STRICT_DEVMEM)
|| p->flags & IORESOURCE_EXCLUSIVE) {
diff --git a/kernel/rseq.c b/kernel/rseq.c
index 35f7bd0fced0..6d45ac3dae7f 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -282,9 +282,17 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
if (unlikely(t->flags & PF_EXITING))
return;
- ret = rseq_ip_fixup(regs);
- if (unlikely(ret < 0))
- goto error;
+
+ /*
+ * regs is NULL if and only if the caller is in a syscall path. Skip
+ * fixup and leave rseq_cs as is so that rseq_sycall() will detect and
+ * kill a misbehaving userspace on debug kernels.
+ */
+ if (regs) {
+ ret = rseq_ip_fixup(regs);
+ if (unlikely(ret < 0))
+ goto error;
+ }
if (unlikely(rseq_update_cpu_id(t)))
goto error;
return;
diff --git a/kernel/scftorture.c b/kernel/scftorture.c
index 64a08288b1a6..dcb0410950e4 100644
--- a/kernel/scftorture.c
+++ b/kernel/scftorture.c
@@ -38,14 +38,10 @@
#define SCFTORT_STRING "scftorture"
#define SCFTORT_FLAG SCFTORT_STRING ": "
-#define SCFTORTOUT(s, x...) \
- pr_alert(SCFTORT_FLAG s, ## x)
-
#define VERBOSE_SCFTORTOUT(s, x...) \
- do { if (verbose) pr_alert(SCFTORT_FLAG s, ## x); } while (0)
+ do { if (verbose) pr_alert(SCFTORT_FLAG s "\n", ## x); } while (0)
-#define VERBOSE_SCFTORTOUT_ERRSTRING(s, x...) \
- do { if (verbose) pr_alert(SCFTORT_FLAG "!!! " s, ## x); } while (0)
+#define SCFTORTOUT_ERRSTRING(s, x...) pr_alert(SCFTORT_FLAG "!!! " s "\n", ## x)
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Paul E. McKenney <paulmck@kernel.org>");
@@ -341,6 +337,7 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra
cpu = torture_random(trsp) % nr_cpu_ids;
scfp->n_resched++;
resched_cpu(cpu);
+ this_cpu_inc(scf_invoked_count);
}
break;
case SCF_PRIM_SINGLE:
@@ -553,18 +550,18 @@ static int __init scf_torture_init(void)
scftorture_print_module_parms("Start of test");
- if (weight_resched == -1 &&
- weight_single == -1 && weight_single_rpc == -1 && weight_single_wait == -1 &&
- weight_many == -1 && weight_many_wait == -1 &&
- weight_all == -1 && weight_all_wait == -1) {
- weight_resched1 = 2 * nr_cpu_ids;
- weight_single1 = 2 * nr_cpu_ids;
- weight_single_rpc1 = 2 * nr_cpu_ids;
- weight_single_wait1 = 2 * nr_cpu_ids;
- weight_many1 = 2;
- weight_many_wait1 = 2;
- weight_all1 = 1;
- weight_all_wait1 = 1;
+ if (weight_resched <= 0 &&
+ weight_single <= 0 && weight_single_rpc <= 0 && weight_single_wait <= 0 &&
+ weight_many <= 0 && weight_many_wait <= 0 &&
+ weight_all <= 0 && weight_all_wait <= 0) {
+ weight_resched1 = weight_resched == 0 ? 0 : 2 * nr_cpu_ids;
+ weight_single1 = weight_single == 0 ? 0 : 2 * nr_cpu_ids;
+ weight_single_rpc1 = weight_single_rpc == 0 ? 0 : 2 * nr_cpu_ids;
+ weight_single_wait1 = weight_single_wait == 0 ? 0 : 2 * nr_cpu_ids;
+ weight_many1 = weight_many == 0 ? 0 : 2;
+ weight_many_wait1 = weight_many_wait == 0 ? 0 : 2;
+ weight_all1 = weight_all == 0 ? 0 : 1;
+ weight_all_wait1 = weight_all_wait == 0 ? 0 : 1;
} else {
if (weight_resched == -1)
weight_resched1 = 0;
@@ -583,17 +580,17 @@ static int __init scf_torture_init(void)
if (weight_all_wait == -1)
weight_all_wait1 = 0;
}
- if (weight_single1 == 0 && weight_single_rpc1 == 0 && weight_single_wait1 == 0 &&
- weight_many1 == 0 && weight_many_wait1 == 0 &&
+ if (weight_resched1 == 0 && weight_single1 == 0 && weight_single_rpc1 == 0 &&
+ weight_single_wait1 == 0 && weight_many1 == 0 && weight_many_wait1 == 0 &&
weight_all1 == 0 && weight_all_wait1 == 0) {
- VERBOSE_SCFTORTOUT_ERRSTRING("all zero weights makes no sense");
+ SCFTORTOUT_ERRSTRING("all zero weights makes no sense");
firsterr = -EINVAL;
goto unwind;
}
if (IS_BUILTIN(CONFIG_SCF_TORTURE_TEST))
scf_sel_add(weight_resched1, SCF_PRIM_RESCHED, false);
else if (weight_resched1)
- VERBOSE_SCFTORTOUT_ERRSTRING("built as module, weight_resched ignored");
+ SCFTORTOUT_ERRSTRING("built as module, weight_resched ignored");
scf_sel_add(weight_single1, SCF_PRIM_SINGLE, false);
scf_sel_add(weight_single_rpc1, SCF_PRIM_SINGLE_RPC, true);
scf_sel_add(weight_single_wait1, SCF_PRIM_SINGLE, true);
@@ -605,17 +602,17 @@ static int __init scf_torture_init(void)
if (onoff_interval > 0) {
firsterr = torture_onoff_init(onoff_holdoff * HZ, onoff_interval, NULL);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
}
if (shutdown_secs > 0) {
firsterr = torture_shutdown_init(shutdown_secs, scf_torture_cleanup);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
}
if (stutter > 0) {
firsterr = torture_stutter_init(stutter, stutter);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
}
@@ -624,24 +621,24 @@ static int __init scf_torture_init(void)
nthreads = num_online_cpus();
scf_stats_p = kcalloc(nthreads, sizeof(scf_stats_p[0]), GFP_KERNEL);
if (!scf_stats_p) {
- VERBOSE_SCFTORTOUT_ERRSTRING("out of memory");
+ SCFTORTOUT_ERRSTRING("out of memory");
firsterr = -ENOMEM;
goto unwind;
}
- VERBOSE_SCFTORTOUT("Starting %d smp_call_function() threads\n", nthreads);
+ VERBOSE_SCFTORTOUT("Starting %d smp_call_function() threads", nthreads);
atomic_set(&n_started, nthreads);
for (i = 0; i < nthreads; i++) {
scf_stats_p[i].cpu = i;
firsterr = torture_create_kthread(scftorture_invoker, (void *)&scf_stats_p[i],
scf_stats_p[i].task);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
}
if (stat_interval > 0) {
firsterr = torture_create_kthread(scf_torture_stats, NULL, scf_torture_stats_task);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
}
@@ -651,6 +648,10 @@ static int __init scf_torture_init(void)
unwind:
torture_init_end();
scf_torture_cleanup();
+ if (shutdown_secs) {
+ WARN_ON(!IS_MODULE(CONFIG_SCF_TORTURE_TEST));
+ kernel_power_off();
+ }
return firsterr;
}
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 978fcfca5871..c83b37af155b 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -3,15 +3,18 @@ ifdef CONFIG_FUNCTION_TRACER
CFLAGS_REMOVE_clock.o = $(CC_FLAGS_FTRACE)
endif
+# The compilers are complaining about unused variables inside an if(0) scope
+# block. This is daft, shut them up.
+ccflags-y += $(call cc-disable-warning, unused-but-set-variable)
+
# These files are disabled because they produce non-interesting flaky coverage
# that is not a function of syscall inputs. E.g. involuntary context switches.
KCOV_INSTRUMENT := n
-# There are numerous data races here, however, most of them are due to plain accesses.
-# This would make it even harder for syzbot to find reproducers, because these
-# bugs trigger without specific input. Disable by default, but should re-enable
-# eventually.
+# Disable KCSAN to avoid excessive noise and performance degradation. To avoid
+# false positives ensure barriers implied by sched functions are instrumented.
KCSAN_SANITIZE := n
+KCSAN_INSTRUMENT_BARRIERS := y
ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c
index 2067080bb235..8629b37d118e 100644
--- a/kernel/sched/autogroup.c
+++ b/kernel/sched/autogroup.c
@@ -31,7 +31,7 @@ static inline void autogroup_destroy(struct kref *kref)
ag->tg->rt_se = NULL;
ag->tg->rt_rq = NULL;
#endif
- sched_offline_group(ag->tg);
+ sched_release_group(ag->tg);
sched_destroy_group(ag->tg);
}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c4462c454ab9..83872f95a1ea 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -13,7 +13,7 @@
#include "sched.h"
#include <linux/nospec.h>
-
+#include <linux/blkdev.h>
#include <linux/kcov.h>
#include <linux/scs.h>
@@ -74,7 +74,11 @@ __read_mostly int sysctl_resched_latency_warn_once = 1;
* Number of tasks to iterate in a single balance run.
* Limited because this is done with IRQs disabled.
*/
+#ifdef CONFIG_PREEMPT_RT
+const_debug unsigned int sysctl_sched_nr_migrate = 8;
+#else
const_debug unsigned int sysctl_sched_nr_migrate = 32;
+#endif
/*
* period over which we measure -rt task CPU usage in us.
@@ -140,7 +144,7 @@ static inline bool __sched_core_less(struct task_struct *a, struct task_struct *
return false;
/* flip prio, so high prio is leftmost */
- if (prio_less(b, a, task_rq(a)->core->core_forceidle))
+ if (prio_less(b, a, !!task_rq(a)->core->core_forceidle_count))
return true;
return false;
@@ -177,15 +181,23 @@ void sched_core_enqueue(struct rq *rq, struct task_struct *p)
rb_add(&p->core_node, &rq->core_tree, rb_sched_core_less);
}
-void sched_core_dequeue(struct rq *rq, struct task_struct *p)
+void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags)
{
rq->core->core_task_seq++;
- if (!sched_core_enqueued(p))
- return;
+ if (sched_core_enqueued(p)) {
+ rb_erase(&p->core_node, &rq->core_tree);
+ RB_CLEAR_NODE(&p->core_node);
+ }
- rb_erase(&p->core_node, &rq->core_tree);
- RB_CLEAR_NODE(&p->core_node);
+ /*
+ * Migrating the last task off the cpu, with the cpu in forced idle
+ * state. Reschedule to create an accounting edge for forced idle,
+ * and re-examine whether the core is still in forced idle state.
+ */
+ if (!(flags & DEQUEUE_SAVE) && rq->nr_running == 1 &&
+ rq->core->core_forceidle_count && rq->curr == rq->idle)
+ resched_curr(rq);
}
/*
@@ -276,6 +288,8 @@ static void __sched_core_flip(bool enabled)
for_each_cpu(t, smt_mask)
cpu_rq(t)->core_enabled = enabled;
+ cpu_rq(cpu)->core->core_forceidle_start = 0;
+
sched_core_unlock(cpu, &flags);
cpumask_andnot(&sched_core_mask, &sched_core_mask, smt_mask);
@@ -360,7 +374,8 @@ void sched_core_put(void)
#else /* !CONFIG_SCHED_CORE */
static inline void sched_core_enqueue(struct rq *rq, struct task_struct *p) { }
-static inline void sched_core_dequeue(struct rq *rq, struct task_struct *p) { }
+static inline void
+sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { }
#endif /* CONFIG_SCHED_CORE */
@@ -1914,7 +1929,7 @@ static void __init init_uclamp_rq(struct rq *rq)
};
}
- rq->uclamp_flags = 0;
+ rq->uclamp_flags = UCLAMP_FLAG_IDLE;
}
static void __init init_uclamp(void)
@@ -1962,6 +1977,25 @@ bool sched_task_on_rq(struct task_struct *p)
return task_on_rq_queued(p);
}
+unsigned long get_wchan(struct task_struct *p)
+{
+ unsigned long ip = 0;
+ unsigned int state;
+
+ if (!p || p == current)
+ return 0;
+
+ /* Only get wchan if task is blocked and we can keep it that way. */
+ raw_spin_lock_irq(&p->pi_lock);
+ state = READ_ONCE(p->__state);
+ smp_rmb(); /* see try_to_wake_up() */
+ if (state != TASK_RUNNING && state != TASK_WAKING && !p->on_rq)
+ ip = __get_wchan(p);
+ raw_spin_unlock_irq(&p->pi_lock);
+
+ return ip;
+}
+
static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
{
if (!(flags & ENQUEUE_NOCLOCK))
@@ -1982,7 +2016,7 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
{
if (sched_core_enabled(rq))
- sched_core_dequeue(rq, p);
+ sched_core_dequeue(rq, p, flags);
if (!(flags & DEQUEUE_NOCLOCK))
update_rq_clock(rq);
@@ -2150,6 +2184,9 @@ void migrate_enable(void)
return;
}
+ if (WARN_ON_ONCE(!p->migration_disabled))
+ return;
+
/*
* Ensure stop_task runs either before or after this, and that
* __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule().
@@ -3251,7 +3288,7 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state
ktime_t to = NSEC_PER_SEC / HZ;
set_current_state(TASK_UNINTERRUPTIBLE);
- schedule_hrtimeout(&to, HRTIMER_MODE_REL);
+ schedule_hrtimeout(&to, HRTIMER_MODE_REL_HARD);
continue;
}
@@ -3489,11 +3526,11 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
#ifdef CONFIG_SMP
if (cpu == rq->cpu) {
__schedstat_inc(rq->ttwu_local);
- __schedstat_inc(p->se.statistics.nr_wakeups_local);
+ __schedstat_inc(p->stats.nr_wakeups_local);
} else {
struct sched_domain *sd;
- __schedstat_inc(p->se.statistics.nr_wakeups_remote);
+ __schedstat_inc(p->stats.nr_wakeups_remote);
rcu_read_lock();
for_each_domain(rq->cpu, sd) {
if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
@@ -3505,14 +3542,14 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
}
if (wake_flags & WF_MIGRATED)
- __schedstat_inc(p->se.statistics.nr_wakeups_migrate);
+ __schedstat_inc(p->stats.nr_wakeups_migrate);
#endif /* CONFIG_SMP */
__schedstat_inc(rq->ttwu_count);
- __schedstat_inc(p->se.statistics.nr_wakeups);
+ __schedstat_inc(p->stats.nr_wakeups);
if (wake_flags & WF_SYNC)
- __schedstat_inc(p->se.statistics.nr_wakeups_sync);
+ __schedstat_inc(p->stats.nr_wakeups_sync);
}
/*
@@ -3691,15 +3728,11 @@ void wake_up_if_idle(int cpu)
if (!is_idle_task(rcu_dereference(rq->curr)))
goto out;
- if (set_nr_if_polling(rq->idle)) {
- trace_sched_wake_idle_without_ipi(cpu);
- } else {
- rq_lock_irqsave(rq, &rf);
- if (is_idle_task(rq->curr))
- smp_send_reschedule(cpu);
- /* Else CPU is not idle, do nothing here: */
- rq_unlock_irqrestore(rq, &rf);
- }
+ rq_lock_irqsave(rq, &rf);
+ if (is_idle_task(rq->curr))
+ resched_curr(rq);
+ /* Else CPU is not idle, do nothing here: */
+ rq_unlock_irqrestore(rq, &rf);
out:
rcu_read_unlock();
@@ -3707,6 +3740,9 @@ out:
bool cpus_share_cache(int this_cpu, int that_cpu)
{
+ if (this_cpu == that_cpu)
+ return true;
+
return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
}
@@ -4106,46 +4142,61 @@ out:
}
/**
- * try_invoke_on_locked_down_task - Invoke a function on task in fixed state
+ * task_call_func - Invoke a function on task in fixed state
* @p: Process for which the function is to be invoked, can be @current.
* @func: Function to invoke.
* @arg: Argument to function.
*
- * If the specified task can be quickly locked into a definite state
- * (either sleeping or on a given runqueue), arrange to keep it in that
- * state while invoking @func(@arg). This function can use ->on_rq and
- * task_curr() to work out what the state is, if required. Given that
- * @func can be invoked with a runqueue lock held, it had better be quite
- * lightweight.
+ * Fix the task in it's current state by avoiding wakeups and or rq operations
+ * and call @func(@arg) on it. This function can use ->on_rq and task_curr()
+ * to work out what the state is, if required. Given that @func can be invoked
+ * with a runqueue lock held, it had better be quite lightweight.
*
* Returns:
- * @false if the task slipped out from under the locks.
- * @true if the task was locked onto a runqueue or is sleeping.
- * However, @func can override this by returning @false.
+ * Whatever @func returns
*/
-bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct task_struct *t, void *arg), void *arg)
+int task_call_func(struct task_struct *p, task_call_f func, void *arg)
{
+ struct rq *rq = NULL;
+ unsigned int state;
struct rq_flags rf;
- bool ret = false;
- struct rq *rq;
+ int ret;
raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
- if (p->on_rq) {
+
+ state = READ_ONCE(p->__state);
+
+ /*
+ * Ensure we load p->on_rq after p->__state, otherwise it would be
+ * possible to, falsely, observe p->on_rq == 0.
+ *
+ * See try_to_wake_up() for a longer comment.
+ */
+ smp_rmb();
+
+ /*
+ * Since pi->lock blocks try_to_wake_up(), we don't need rq->lock when
+ * the task is blocked. Make sure to check @state since ttwu() can drop
+ * locks at the end, see ttwu_queue_wakelist().
+ */
+ if (state == TASK_RUNNING || state == TASK_WAKING || p->on_rq)
rq = __task_rq_lock(p, &rf);
- if (task_rq(p) == rq)
- ret = func(p, arg);
+
+ /*
+ * At this point the task is pinned; either:
+ * - blocked and we're holding off wakeups (pi->lock)
+ * - woken, and we're holding off enqueue (rq->lock)
+ * - queued, and we're holding off schedule (rq->lock)
+ * - running, and we're holding off de-schedule (rq->lock)
+ *
+ * The called function (@func) can use: task_curr(), p->on_rq and
+ * p->__state to differentiate between these states.
+ */
+ ret = func(p, arg);
+
+ if (rq)
rq_unlock(rq, &rf);
- } else {
- switch (READ_ONCE(p->__state)) {
- case TASK_RUNNING:
- case TASK_WAKING:
- break;
- default:
- smp_rmb(); // See smp_rmb() comment in try_to_wake_up().
- if (!p->on_rq)
- ret = func(p, arg);
- }
- }
+
raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
return ret;
}
@@ -4196,7 +4247,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
#ifdef CONFIG_SCHEDSTATS
/* Even if schedstat is disabled, there should not be garbage */
- memset(&p->se.statistics, 0, sizeof(p->se.statistics));
+ memset(&p->stats, 0, sizeof(p->stats));
#endif
RB_CLEAR_NODE(&p->dl.rb_node);
@@ -4328,8 +4379,6 @@ int sysctl_schedstats(struct ctl_table *table, int write, void *buffer,
*/
int sched_fork(unsigned long clone_flags, struct task_struct *p)
{
- unsigned long flags;
-
__sched_fork(clone_flags, p);
/*
* We mark the process as NEW here. This guarantees that
@@ -4375,24 +4424,6 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
init_entity_runnable_average(&p->se);
- /*
- * The child is not yet in the pid-hash so no cgroup attach races,
- * and the cgroup is pinned to this child due to cgroup_fork()
- * is ran before sched_fork().
- *
- * Silence PROVE_RCU.
- */
- raw_spin_lock_irqsave(&p->pi_lock, flags);
- rseq_migrate(p);
- /*
- * We're setting the CPU for the first time, we don't migrate,
- * so use __set_task_cpu().
- */
- __set_task_cpu(p, smp_processor_id());
- if (p->sched_class->task_fork)
- p->sched_class->task_fork(p);
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
-
#ifdef CONFIG_SCHED_INFO
if (likely(sched_info_on()))
memset(&p->sched_info, 0, sizeof(p->sched_info));
@@ -4408,8 +4439,29 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
return 0;
}
-void sched_post_fork(struct task_struct *p)
+void sched_post_fork(struct task_struct *p, struct kernel_clone_args *kargs)
{
+ unsigned long flags;
+#ifdef CONFIG_CGROUP_SCHED
+ struct task_group *tg;
+#endif
+
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
+#ifdef CONFIG_CGROUP_SCHED
+ tg = container_of(kargs->cset->subsys[cpu_cgrp_id],
+ struct task_group, css);
+ p->sched_task_group = autogroup_task_group(p, tg);
+#endif
+ rseq_migrate(p);
+ /*
+ * We're setting the CPU for the first time, we don't migrate,
+ * so use __set_task_cpu().
+ */
+ __set_task_cpu(p, smp_processor_id());
+ if (p->sched_class->task_fork)
+ p->sched_class->task_fork(p);
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+
uclamp_post_fork(p);
}
@@ -4836,18 +4888,12 @@ static struct rq *finish_task_switch(struct task_struct *prev)
*/
if (mm) {
membarrier_mm_sync_core_before_usermode(mm);
- mmdrop(mm);
+ mmdrop_sched(mm);
}
if (unlikely(prev_state == TASK_DEAD)) {
if (prev->sched_class->task_dead)
prev->sched_class->task_dead(prev);
- /*
- * Remove function-return probe instances associated with this
- * task and put them back on the free list.
- */
- kprobe_flush_task(prev);
-
/* Task is done with its stack. */
put_task_stack(prev);
@@ -5212,6 +5258,7 @@ void scheduler_tick(void)
if (sched_feat(LATENCY_WARN))
resched_latency = cpu_resched_latency(rq);
calc_global_load_tick(rq);
+ sched_core_tick(rq);
rq_unlock(rq, &rf);
@@ -5580,8 +5627,7 @@ restart:
return p;
}
- /* The idle class should always have a runnable task: */
- BUG();
+ BUG(); /* The idle class should always have a runnable task. */
}
#ifdef CONFIG_SCHED_CORE
@@ -5603,54 +5649,18 @@ static inline bool cookie_match(struct task_struct *a, struct task_struct *b)
return a->core_cookie == b->core_cookie;
}
-// XXX fairness/fwd progress conditions
-/*
- * Returns
- * - NULL if there is no runnable task for this class.
- * - the highest priority task for this runqueue if it matches
- * rq->core->core_cookie or its priority is greater than max.
- * - Else returns idle_task.
- */
-static struct task_struct *
-pick_task(struct rq *rq, const struct sched_class *class, struct task_struct *max, bool in_fi)
+static inline struct task_struct *pick_task(struct rq *rq)
{
- struct task_struct *class_pick, *cookie_pick;
- unsigned long cookie = rq->core->core_cookie;
-
- class_pick = class->pick_task(rq);
- if (!class_pick)
- return NULL;
-
- if (!cookie) {
- /*
- * If class_pick is tagged, return it only if it has
- * higher priority than max.
- */
- if (max && class_pick->core_cookie &&
- prio_less(class_pick, max, in_fi))
- return idle_sched_class.pick_task(rq);
+ const struct sched_class *class;
+ struct task_struct *p;
- return class_pick;
+ for_each_class(class) {
+ p = class->pick_task(rq);
+ if (p)
+ return p;
}
- /*
- * If class_pick is idle or matches cookie, return early.
- */
- if (cookie_equals(class_pick, cookie))
- return class_pick;
-
- cookie_pick = sched_core_find(rq, cookie);
-
- /*
- * If class > max && class > cookie, it is the highest priority task on
- * the core (so far) and it must be selected, otherwise we must go with
- * the cookie pick in order to satisfy the constraint.
- */
- if (prio_less(cookie_pick, class_pick, in_fi) &&
- (!max || prio_less(max, class_pick, in_fi)))
- return class_pick;
-
- return cookie_pick;
+ BUG(); /* The idle class should always have a runnable task. */
}
extern void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi);
@@ -5658,11 +5668,13 @@ extern void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_f
static struct task_struct *
pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
- struct task_struct *next, *max = NULL;
- const struct sched_class *class;
+ struct task_struct *next, *p, *max = NULL;
const struct cpumask *smt_mask;
bool fi_before = false;
- int i, j, cpu, occ = 0;
+ bool core_clock_updated = (rq == rq->core);
+ unsigned long cookie;
+ int i, cpu, occ = 0;
+ struct rq *rq_i;
bool need_sync;
if (!sched_core_enabled(rq))
@@ -5712,10 +5724,18 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
/* reset state */
rq->core->core_cookie = 0UL;
- if (rq->core->core_forceidle) {
+ if (rq->core->core_forceidle_count) {
+ if (!core_clock_updated) {
+ update_rq_clock(rq->core);
+ core_clock_updated = true;
+ }
+ sched_core_account_forceidle(rq);
+ /* reset after accounting force idle */
+ rq->core->core_forceidle_start = 0;
+ rq->core->core_forceidle_count = 0;
+ rq->core->core_forceidle_occupation = 0;
need_sync = true;
fi_before = true;
- rq->core->core_forceidle = false;
}
/*
@@ -5735,12 +5755,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
* and there are no cookied tasks running on siblings.
*/
if (!need_sync) {
- for_each_class(class) {
- next = class->pick_task(rq);
- if (next)
- break;
- }
-
+ next = pick_task(rq);
if (!next->core_cookie) {
rq->core_pick = NULL;
/*
@@ -5753,79 +5768,65 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
}
}
- for_each_cpu(i, smt_mask) {
- struct rq *rq_i = cpu_rq(i);
-
- rq_i->core_pick = NULL;
+ /*
+ * For each thread: do the regular task pick and find the max prio task
+ * amongst them.
+ *
+ * Tie-break prio towards the current CPU
+ */
+ for_each_cpu_wrap(i, smt_mask, cpu) {
+ rq_i = cpu_rq(i);
- if (i != cpu)
+ /*
+ * Current cpu always has its clock updated on entrance to
+ * pick_next_task(). If the current cpu is not the core,
+ * the core may also have been updated above.
+ */
+ if (i != cpu && (rq_i != rq->core || !core_clock_updated))
update_rq_clock(rq_i);
+
+ p = rq_i->core_pick = pick_task(rq_i);
+ if (!max || prio_less(max, p, fi_before))
+ max = p;
}
+ cookie = rq->core->core_cookie = max->core_cookie;
+
/*
- * Try and select tasks for each sibling in descending sched_class
- * order.
+ * For each thread: try and find a runnable task that matches @max or
+ * force idle.
*/
- for_each_class(class) {
-again:
- for_each_cpu_wrap(i, smt_mask, cpu) {
- struct rq *rq_i = cpu_rq(i);
- struct task_struct *p;
-
- if (rq_i->core_pick)
- continue;
+ for_each_cpu(i, smt_mask) {
+ rq_i = cpu_rq(i);
+ p = rq_i->core_pick;
- /*
- * If this sibling doesn't yet have a suitable task to
- * run; ask for the most eligible task, given the
- * highest priority task already selected for this
- * core.
- */
- p = pick_task(rq_i, class, max, fi_before);
+ if (!cookie_equals(p, cookie)) {
+ p = NULL;
+ if (cookie)
+ p = sched_core_find(rq_i, cookie);
if (!p)
- continue;
+ p = idle_sched_class.pick_task(rq_i);
+ }
- if (!is_task_rq_idle(p))
- occ++;
+ rq_i->core_pick = p;
- rq_i->core_pick = p;
- if (rq_i->idle == p && rq_i->nr_running) {
- rq->core->core_forceidle = true;
+ if (p == rq_i->idle) {
+ if (rq_i->nr_running) {
+ rq->core->core_forceidle_count++;
if (!fi_before)
rq->core->core_forceidle_seq++;
}
-
- /*
- * If this new candidate is of higher priority than the
- * previous; and they're incompatible; we need to wipe
- * the slate and start over. pick_task makes sure that
- * p's priority is more than max if it doesn't match
- * max's cookie.
- *
- * NOTE: this is a linear max-filter and is thus bounded
- * in execution time.
- */
- if (!max || !cookie_match(max, p)) {
- struct task_struct *old_max = max;
-
- rq->core->core_cookie = p->core_cookie;
- max = p;
-
- if (old_max) {
- rq->core->core_forceidle = false;
- for_each_cpu(j, smt_mask) {
- if (j == i)
- continue;
-
- cpu_rq(j)->core_pick = NULL;
- }
- occ = 1;
- goto again;
- }
- }
+ } else {
+ occ++;
}
}
+ if (schedstat_enabled() && rq->core->core_forceidle_count) {
+ if (cookie)
+ rq->core->core_forceidle_start = rq_clock(rq->core);
+ rq->core->core_forceidle_occupation = occ;
+ }
+
rq->core->core_pick_seq = rq->core->core_task_seq;
next = rq->core_pick;
rq->core_sched_seq = rq->core->core_pick_seq;
@@ -5842,7 +5843,7 @@ again:
* non-matching user state.
*/
for_each_cpu(i, smt_mask) {
- struct rq *rq_i = cpu_rq(i);
+ rq_i = cpu_rq(i);
/*
* An online sibling might have gone offline before a task
@@ -5862,8 +5863,8 @@ again:
* 1 0 1
* 1 1 0
*/
- if (!(fi_before && rq->core->core_forceidle))
- task_vruntime_update(rq_i, rq_i->core_pick, rq->core->core_forceidle);
+ if (!(fi_before && rq->core->core_forceidle_count))
+ task_vruntime_update(rq_i, rq_i->core_pick, !!rq->core->core_forceidle_count);
rq_i->core_pick->core_occupation = occ;
@@ -6067,11 +6068,19 @@ static void sched_core_cpu_deactivate(unsigned int cpu)
goto unlock;
/* copy the shared state to the new leader */
- core_rq->core_task_seq = rq->core_task_seq;
- core_rq->core_pick_seq = rq->core_pick_seq;
- core_rq->core_cookie = rq->core_cookie;
- core_rq->core_forceidle = rq->core_forceidle;
- core_rq->core_forceidle_seq = rq->core_forceidle_seq;
+ core_rq->core_task_seq = rq->core_task_seq;
+ core_rq->core_pick_seq = rq->core_pick_seq;
+ core_rq->core_cookie = rq->core_cookie;
+ core_rq->core_forceidle_count = rq->core_forceidle_count;
+ core_rq->core_forceidle_seq = rq->core_forceidle_seq;
+ core_rq->core_forceidle_occupation = rq->core_forceidle_occupation;
+
+ /*
+ * Accounting edge for forced idle is handled in pick_next_task().
+ * Don't need another one here, since the hotplug thread shouldn't
+ * have a cookie.
+ */
+ core_rq->core_forceidle_start = 0;
/* install new leader */
for_each_cpu(t, smt_mask) {
@@ -6319,20 +6328,14 @@ static inline void sched_submit_work(struct task_struct *tsk)
task_flags = tsk->flags;
/*
- * If a worker went to sleep, notify and ask workqueue whether
- * it wants to wake up a task to maintain concurrency.
- * As this function is called inside the schedule() context,
- * we disable preemption to avoid it calling schedule() again
- * in the possible wakeup of a kworker and because wq_worker_sleeping()
- * requires it.
+ * If a worker goes to sleep, notify and ask workqueue whether it
+ * wants to wake up a task to maintain concurrency.
*/
if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
- preempt_disable();
if (task_flags & PF_WQ_WORKER)
wq_worker_sleeping(tsk);
else
io_wq_worker_sleeping(tsk);
- preempt_enable_no_resched();
}
if (tsk_is_pi_blocked(tsk))
@@ -6343,7 +6346,7 @@ static inline void sched_submit_work(struct task_struct *tsk)
* make sure to submit it to avoid deadlocks.
*/
if (blk_needs_flush_plug(tsk))
- blk_schedule_flush_plug(tsk);
+ blk_flush_plug(tsk->plug, true);
}
static void sched_update_worker(struct task_struct *tsk)
@@ -6586,12 +6589,13 @@ EXPORT_STATIC_CALL_TRAMP(preempt_schedule_notrace);
*/
enum {
- preempt_dynamic_none = 0,
+ preempt_dynamic_undefined = -1,
+ preempt_dynamic_none,
preempt_dynamic_voluntary,
preempt_dynamic_full,
};
-int preempt_dynamic_mode = preempt_dynamic_full;
+int preempt_dynamic_mode = preempt_dynamic_undefined;
int sched_dynamic_mode(const char *str)
{
@@ -6656,15 +6660,35 @@ static int __init setup_preempt_mode(char *str)
int mode = sched_dynamic_mode(str);
if (mode < 0) {
pr_warn("Dynamic Preempt: unsupported mode: %s\n", str);
- return 1;
+ return 0;
}
sched_dynamic_update(mode);
- return 0;
+ return 1;
}
__setup("preempt=", setup_preempt_mode);
-#endif /* CONFIG_PREEMPT_DYNAMIC */
+static void __init preempt_dynamic_init(void)
+{
+ if (preempt_dynamic_mode == preempt_dynamic_undefined) {
+ if (IS_ENABLED(CONFIG_PREEMPT_NONE)) {
+ sched_dynamic_update(preempt_dynamic_none);
+ } else if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY)) {
+ sched_dynamic_update(preempt_dynamic_voluntary);
+ } else {
+ /* Default static call setting, nothing to do */
+ WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT));
+ preempt_dynamic_mode = preempt_dynamic_full;
+ pr_info("Dynamic Preempt: full\n");
+ }
+ }
+}
+
+#else /* !CONFIG_PREEMPT_DYNAMIC */
+
+static inline void preempt_dynamic_init(void) { }
+
+#endif /* #ifdef CONFIG_PREEMPT_DYNAMIC */
/*
* This is the entry point to schedule() from kernel preemption
@@ -7145,7 +7169,7 @@ unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
unsigned long sched_cpu_util(int cpu, unsigned long max)
{
- return effective_cpu_util(cpu, cpu_util_cfs(cpu_rq(cpu)), max,
+ return effective_cpu_util(cpu, cpu_util_cfs(cpu), max,
ENERGY_UTIL, NULL);
}
#endif /* CONFIG_SMP */
@@ -8354,7 +8378,8 @@ int io_schedule_prepare(void)
int old_iowait = current->in_iowait;
current->in_iowait = 1;
- blk_schedule_flush_plug(current);
+ if (current->plug)
+ blk_flush_plug(current->plug, true);
return old_iowait;
}
@@ -8538,7 +8563,7 @@ void sched_show_task(struct task_struct *p)
rcu_read_unlock();
pr_cont(" stack:%5lu pid:%5d ppid:%6d flags:0x%08lx\n",
free, task_pid_nr(p), ppid,
- (unsigned long)task_thread_info(p)->flags);
+ read_task_thread_flags(p));
print_worker_info(KERN_INFO, p);
print_stop_info(KERN_INFO, p);
@@ -8637,9 +8662,6 @@ void __init init_idle(struct task_struct *idle, int cpu)
idle->flags |= PF_IDLE | PF_KTHREAD | PF_NO_SETAFFINITY;
kthread_set_per_cpu(idle, cpu);
- scs_task_reset(idle);
- kasan_unpoison_task_stack(idle);
-
#ifdef CONFIG_SMP
/*
* It's possible that init_idle() gets called multiple times on a task,
@@ -8836,7 +8858,6 @@ static void balance_push(struct rq *rq)
struct task_struct *push_task = rq->curr;
lockdep_assert_rq_held(rq);
- SCHED_WARN_ON(rq->cpu != smp_processor_id());
/*
* Ensure the thing is persistent until balance_push_set(.on = false);
@@ -8844,9 +8865,10 @@ static void balance_push(struct rq *rq)
rq->balance_callback = &balance_push_callback;
/*
- * Only active while going offline.
+ * Only active while going offline and when invoked on the outgoing
+ * CPU.
*/
- if (!cpu_dying(rq->cpu))
+ if (!cpu_dying(rq->cpu) || rq != this_rq())
return;
/*
@@ -9430,7 +9452,9 @@ void __init sched_init(void)
rq->core_pick = NULL;
rq->core_enabled = 0;
rq->core_tree = RB_ROOT;
- rq->core_forceidle = false;
+ rq->core_forceidle_count = 0;
+ rq->core_forceidle_occupation = 0;
+ rq->core_forceidle_start = 0;
rq->core_cookie = 0UL;
#endif
@@ -9464,18 +9488,14 @@ void __init sched_init(void)
init_uclamp();
+ preempt_dynamic_init();
+
scheduler_running = 1;
}
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
-static inline int preempt_count_equals(int preempt_offset)
-{
- int nested = preempt_count() + rcu_preempt_depth();
-
- return (nested == preempt_offset);
-}
-void __might_sleep(const char *file, int line, int preempt_offset)
+void __might_sleep(const char *file, int line)
{
unsigned int state = get_current_state();
/*
@@ -9489,11 +9509,32 @@ void __might_sleep(const char *file, int line, int preempt_offset)
(void *)current->task_state_change,
(void *)current->task_state_change);
- ___might_sleep(file, line, preempt_offset);
+ __might_resched(file, line, 0);
}
EXPORT_SYMBOL(__might_sleep);
-void ___might_sleep(const char *file, int line, int preempt_offset)
+static void print_preempt_disable_ip(int preempt_offset, unsigned long ip)
+{
+ if (!IS_ENABLED(CONFIG_DEBUG_PREEMPT))
+ return;
+
+ if (preempt_count() == preempt_offset)
+ return;
+
+ pr_err("Preemption disabled at:");
+ print_ip_sym(KERN_ERR, ip);
+}
+
+static inline bool resched_offsets_ok(unsigned int offsets)
+{
+ unsigned int nested = preempt_count();
+
+ nested += rcu_preempt_depth() << MIGHT_RESCHED_RCU_SHIFT;
+
+ return nested == offsets;
+}
+
+void __might_resched(const char *file, int line, unsigned int offsets)
{
/* Ratelimiting timestamp: */
static unsigned long prev_jiffy;
@@ -9503,7 +9544,7 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
/* WARN_ON_ONCE() by default, no rate limit required: */
rcu_sleep_check();
- if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
+ if ((resched_offsets_ok(offsets) && !irqs_disabled() &&
!is_idle_task(current) && !current->non_block_count) ||
system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING ||
oops_in_progress)
@@ -9516,29 +9557,33 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
/* Save this before calling printk(), since that will clobber it: */
preempt_disable_ip = get_preempt_disable_ip(current);
- printk(KERN_ERR
- "BUG: sleeping function called from invalid context at %s:%d\n",
- file, line);
- printk(KERN_ERR
- "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n",
- in_atomic(), irqs_disabled(), current->non_block_count,
- current->pid, current->comm);
+ pr_err("BUG: sleeping function called from invalid context at %s:%d\n",
+ file, line);
+ pr_err("in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n",
+ in_atomic(), irqs_disabled(), current->non_block_count,
+ current->pid, current->comm);
+ pr_err("preempt_count: %x, expected: %x\n", preempt_count(),
+ offsets & MIGHT_RESCHED_PREEMPT_MASK);
+
+ if (IS_ENABLED(CONFIG_PREEMPT_RCU)) {
+ pr_err("RCU nest depth: %d, expected: %u\n",
+ rcu_preempt_depth(), offsets >> MIGHT_RESCHED_RCU_SHIFT);
+ }
if (task_stack_end_corrupted(current))
- printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
+ pr_emerg("Thread overran stack, or stack corrupted\n");
debug_show_held_locks(current);
if (irqs_disabled())
print_irqtrace_events(current);
- if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
- && !preempt_count_equals(preempt_offset)) {
- pr_err("Preemption disabled at:");
- print_ip_sym(KERN_ERR, preempt_disable_ip);
- }
+
+ print_preempt_disable_ip(offsets & MIGHT_RESCHED_PREEMPT_MASK,
+ preempt_disable_ip);
+
dump_stack();
add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
}
-EXPORT_SYMBOL(___might_sleep);
+EXPORT_SYMBOL(__might_resched);
void __cant_sleep(const char *file, int line, int preempt_offset)
{
@@ -9619,9 +9664,9 @@ void normalize_rt_tasks(void)
continue;
p->se.exec_start = 0;
- schedstat_set(p->se.statistics.wait_start, 0);
- schedstat_set(p->se.statistics.sleep_start, 0);
- schedstat_set(p->se.statistics.block_start, 0);
+ schedstat_set(p->stats.wait_start, 0);
+ schedstat_set(p->stats.sleep_start, 0);
+ schedstat_set(p->stats.block_start, 0);
if (!dl_task(p) && !rt_task(p)) {
/*
@@ -9715,6 +9760,22 @@ static void sched_free_group(struct task_group *tg)
kmem_cache_free(task_group_cache, tg);
}
+static void sched_free_group_rcu(struct rcu_head *rcu)
+{
+ sched_free_group(container_of(rcu, struct task_group, rcu));
+}
+
+static void sched_unregister_group(struct task_group *tg)
+{
+ unregister_fair_sched_group(tg);
+ unregister_rt_sched_group(tg);
+ /*
+ * We have to wait for yet another RCU grace period to expire, as
+ * print_cfs_stats() might run concurrently.
+ */
+ call_rcu(&tg->rcu, sched_free_group_rcu);
+}
+
/* allocate runqueue etc for a new task group */
struct task_group *sched_create_group(struct task_group *parent)
{
@@ -9758,25 +9819,35 @@ void sched_online_group(struct task_group *tg, struct task_group *parent)
}
/* rcu callback to free various structures associated with a task group */
-static void sched_free_group_rcu(struct rcu_head *rhp)
+static void sched_unregister_group_rcu(struct rcu_head *rhp)
{
/* Now it should be safe to free those cfs_rqs: */
- sched_free_group(container_of(rhp, struct task_group, rcu));
+ sched_unregister_group(container_of(rhp, struct task_group, rcu));
}
void sched_destroy_group(struct task_group *tg)
{
/* Wait for possible concurrent references to cfs_rqs complete: */
- call_rcu(&tg->rcu, sched_free_group_rcu);
+ call_rcu(&tg->rcu, sched_unregister_group_rcu);
}
-void sched_offline_group(struct task_group *tg)
+void sched_release_group(struct task_group *tg)
{
unsigned long flags;
- /* End participation in shares distribution: */
- unregister_fair_sched_group(tg);
-
+ /*
+ * Unlink first, to avoid walk_tg_tree_from() from finding us (via
+ * sched_cfs_period_timer()).
+ *
+ * For this to be effective, we have to wait for all pending users of
+ * this task group to leave their RCU critical section to ensure no new
+ * user will see our dying task group any more. Specifically ensure
+ * that tg_unthrottle_up() won't add decayed cfs_rq's to it.
+ *
+ * We therefore defer calling unregister_fair_sched_group() to
+ * sched_unregister_group() which is guarantied to get called only after the
+ * current RCU grace period has expired.
+ */
spin_lock_irqsave(&task_group_lock, flags);
list_del_rcu(&tg->list);
list_del_rcu(&tg->siblings);
@@ -9895,7 +9966,7 @@ static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
{
struct task_group *tg = css_tg(css);
- sched_offline_group(tg);
+ sched_release_group(tg);
}
static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
@@ -9905,7 +9976,7 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
/*
* Relies on the RCU grace period between css_released() and this.
*/
- sched_free_group(tg);
+ sched_unregister_group(tg);
}
/*
@@ -10463,15 +10534,21 @@ static int cpu_cfs_stat_show(struct seq_file *sf, void *v)
seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time);
if (schedstat_enabled() && tg != &root_task_group) {
+ struct sched_statistics *stats;
u64 ws = 0;
int i;
- for_each_possible_cpu(i)
- ws += schedstat_val(tg->se[i]->statistics.wait_sum);
+ for_each_possible_cpu(i) {
+ stats = __schedstats_from_se(tg->se[i]);
+ ws += schedstat_val(stats->wait_sum);
+ }
seq_printf(sf, "wait_sum %llu\n", ws);
}
+ seq_printf(sf, "nr_bursts %d\n", cfs_b->nr_burst);
+ seq_printf(sf, "burst_time %llu\n", cfs_b->burst_time);
+
return 0;
}
#endif /* CONFIG_CFS_BANDWIDTH */
@@ -10587,16 +10664,20 @@ static int cpu_extra_stat_show(struct seq_file *sf,
{
struct task_group *tg = css_tg(css);
struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
- u64 throttled_usec;
+ u64 throttled_usec, burst_usec;
throttled_usec = cfs_b->throttled_time;
do_div(throttled_usec, NSEC_PER_USEC);
+ burst_usec = cfs_b->burst_time;
+ do_div(burst_usec, NSEC_PER_USEC);
seq_printf(sf, "nr_periods %d\n"
"nr_throttled %d\n"
- "throttled_usec %llu\n",
+ "throttled_usec %llu\n"
+ "nr_bursts %d\n"
+ "burst_usec %llu\n",
cfs_b->nr_periods, cfs_b->nr_throttled,
- throttled_usec);
+ throttled_usec, cfs_b->nr_burst, burst_usec);
}
#endif
return 0;
diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c
index 9a80e9a474c0..1fb45672ec85 100644
--- a/kernel/sched/core_sched.c
+++ b/kernel/sched/core_sched.c
@@ -11,7 +11,7 @@ struct sched_core_cookie {
refcount_t refcnt;
};
-unsigned long sched_core_alloc_cookie(void)
+static unsigned long sched_core_alloc_cookie(void)
{
struct sched_core_cookie *ck = kmalloc(sizeof(*ck), GFP_KERNEL);
if (!ck)
@@ -23,7 +23,7 @@ unsigned long sched_core_alloc_cookie(void)
return (unsigned long)ck;
}
-void sched_core_put_cookie(unsigned long cookie)
+static void sched_core_put_cookie(unsigned long cookie)
{
struct sched_core_cookie *ptr = (void *)cookie;
@@ -33,7 +33,7 @@ void sched_core_put_cookie(unsigned long cookie)
}
}
-unsigned long sched_core_get_cookie(unsigned long cookie)
+static unsigned long sched_core_get_cookie(unsigned long cookie)
{
struct sched_core_cookie *ptr = (void *)cookie;
@@ -53,7 +53,8 @@ unsigned long sched_core_get_cookie(unsigned long cookie)
*
* Returns: the old cookie
*/
-unsigned long sched_core_update_cookie(struct task_struct *p, unsigned long cookie)
+static unsigned long sched_core_update_cookie(struct task_struct *p,
+ unsigned long cookie)
{
unsigned long old_cookie;
struct rq_flags rf;
@@ -72,7 +73,7 @@ unsigned long sched_core_update_cookie(struct task_struct *p, unsigned long cook
enqueued = sched_core_enqueued(p);
if (enqueued)
- sched_core_dequeue(rq, p);
+ sched_core_dequeue(rq, p, DEQUEUE_SAVE);
old_cookie = p->core_cookie;
p->core_cookie = cookie;
@@ -84,6 +85,10 @@ unsigned long sched_core_update_cookie(struct task_struct *p, unsigned long cook
* If task is currently running, it may not be compatible anymore after
* the cookie change, so enter the scheduler on its CPU to schedule it
* away.
+ *
+ * Note that it is possible that as a result of this cookie change, the
+ * core has now entered/left forced idle state. Defer accounting to the
+ * next scheduling edge, rather than always forcing a reschedule here.
*/
if (task_running(rq, p))
resched_curr(rq);
@@ -134,6 +139,10 @@ int sched_core_share_pid(unsigned int cmd, pid_t pid, enum pid_type type,
if (!static_branch_likely(&sched_smt_present))
return -ENODEV;
+ BUILD_BUG_ON(PR_SCHED_CORE_SCOPE_THREAD != PIDTYPE_PID);
+ BUILD_BUG_ON(PR_SCHED_CORE_SCOPE_THREAD_GROUP != PIDTYPE_TGID);
+ BUILD_BUG_ON(PR_SCHED_CORE_SCOPE_PROCESS_GROUP != PIDTYPE_PGID);
+
if (type > PIDTYPE_PGID || cmd >= PR_SCHED_CORE_MAX || pid < 0 ||
(cmd != PR_SCHED_CORE_GET && uaddr))
return -EINVAL;
@@ -227,3 +236,63 @@ out:
return err;
}
+#ifdef CONFIG_SCHEDSTATS
+
+/* REQUIRES: rq->core's clock recently updated. */
+void __sched_core_account_forceidle(struct rq *rq)
+{
+ const struct cpumask *smt_mask = cpu_smt_mask(cpu_of(rq));
+ u64 delta, now = rq_clock(rq->core);
+ struct rq *rq_i;
+ struct task_struct *p;
+ int i;
+
+ lockdep_assert_rq_held(rq);
+
+ WARN_ON_ONCE(!rq->core->core_forceidle_count);
+
+ if (rq->core->core_forceidle_start == 0)
+ return;
+
+ delta = now - rq->core->core_forceidle_start;
+ if (unlikely((s64)delta <= 0))
+ return;
+
+ rq->core->core_forceidle_start = now;
+
+ if (WARN_ON_ONCE(!rq->core->core_forceidle_occupation)) {
+ /* can't be forced idle without a running task */
+ } else if (rq->core->core_forceidle_count > 1 ||
+ rq->core->core_forceidle_occupation > 1) {
+ /*
+ * For larger SMT configurations, we need to scale the charged
+ * forced idle amount since there can be more than one forced
+ * idle sibling and more than one running cookied task.
+ */
+ delta *= rq->core->core_forceidle_count;
+ delta = div_u64(delta, rq->core->core_forceidle_occupation);
+ }
+
+ for_each_cpu(i, smt_mask) {
+ rq_i = cpu_rq(i);
+ p = rq_i->core_pick ?: rq_i->curr;
+
+ if (!p->core_cookie)
+ continue;
+
+ __schedstat_add(p->stats.core_forceidle_sum, delta);
+ }
+}
+
+void __sched_core_tick(struct rq *rq)
+{
+ if (!rq->core->core_forceidle_count)
+ return;
+
+ if (rq != rq->core)
+ update_rq_clock(rq->core);
+
+ __sched_core_account_forceidle(rq);
+}
+
+#endif /* CONFIG_SCHEDSTATS */
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 893eece65bfd..3d06c5e4220d 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -21,15 +21,11 @@ static const char * const cpuacct_stat_desc[] = {
[CPUACCT_STAT_SYSTEM] = "system",
};
-struct cpuacct_usage {
- u64 usages[CPUACCT_STAT_NSTATS];
-};
-
/* track CPU usage of a group of tasks and its child groups */
struct cpuacct {
struct cgroup_subsys_state css;
/* cpuusage holds pointer to a u64-type object on every CPU */
- struct cpuacct_usage __percpu *cpuusage;
+ u64 __percpu *cpuusage;
struct kernel_cpustat __percpu *cpustat;
};
@@ -49,7 +45,7 @@ static inline struct cpuacct *parent_ca(struct cpuacct *ca)
return css_ca(ca->css.parent);
}
-static DEFINE_PER_CPU(struct cpuacct_usage, root_cpuacct_cpuusage);
+static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage);
static struct cpuacct root_cpuacct = {
.cpustat = &kernel_cpustat,
.cpuusage = &root_cpuacct_cpuusage,
@@ -68,7 +64,7 @@ cpuacct_css_alloc(struct cgroup_subsys_state *parent_css)
if (!ca)
goto out;
- ca->cpuusage = alloc_percpu(struct cpuacct_usage);
+ ca->cpuusage = alloc_percpu(u64);
if (!ca->cpuusage)
goto out_free_ca;
@@ -99,14 +95,16 @@ static void cpuacct_css_free(struct cgroup_subsys_state *css)
static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu,
enum cpuacct_stat_index index)
{
- struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
+ u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
+ u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat;
u64 data;
/*
* We allow index == CPUACCT_STAT_NSTATS here to read
* the sum of usages.
*/
- BUG_ON(index > CPUACCT_STAT_NSTATS);
+ if (WARN_ON_ONCE(index > CPUACCT_STAT_NSTATS))
+ return 0;
#ifndef CONFIG_64BIT
/*
@@ -115,14 +113,17 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu,
raw_spin_rq_lock_irq(cpu_rq(cpu));
#endif
- if (index == CPUACCT_STAT_NSTATS) {
- int i = 0;
-
- data = 0;
- for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
- data += cpuusage->usages[i];
- } else {
- data = cpuusage->usages[index];
+ switch (index) {
+ case CPUACCT_STAT_USER:
+ data = cpustat[CPUTIME_USER] + cpustat[CPUTIME_NICE];
+ break;
+ case CPUACCT_STAT_SYSTEM:
+ data = cpustat[CPUTIME_SYSTEM] + cpustat[CPUTIME_IRQ] +
+ cpustat[CPUTIME_SOFTIRQ];
+ break;
+ case CPUACCT_STAT_NSTATS:
+ data = *cpuusage;
+ break;
}
#ifndef CONFIG_64BIT
@@ -132,10 +133,14 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu,
return data;
}
-static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
+static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu)
{
- struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
- int i;
+ u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
+ u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat;
+
+ /* Don't allow to reset global kernel_cpustat */
+ if (ca == &root_cpuacct)
+ return;
#ifndef CONFIG_64BIT
/*
@@ -143,9 +148,10 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
*/
raw_spin_rq_lock_irq(cpu_rq(cpu));
#endif
-
- for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
- cpuusage->usages[i] = val;
+ *cpuusage = 0;
+ cpustat[CPUTIME_USER] = cpustat[CPUTIME_NICE] = 0;
+ cpustat[CPUTIME_SYSTEM] = cpustat[CPUTIME_IRQ] = 0;
+ cpustat[CPUTIME_SOFTIRQ] = 0;
#ifndef CONFIG_64BIT
raw_spin_rq_unlock_irq(cpu_rq(cpu));
@@ -196,7 +202,7 @@ static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft,
return -EINVAL;
for_each_possible_cpu(cpu)
- cpuacct_cpuusage_write(ca, cpu, 0);
+ cpuacct_cpuusage_write(ca, cpu);
return 0;
}
@@ -243,25 +249,10 @@ static int cpuacct_all_seq_show(struct seq_file *m, void *V)
seq_puts(m, "\n");
for_each_possible_cpu(cpu) {
- struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
-
seq_printf(m, "%d", cpu);
-
- for (index = 0; index < CPUACCT_STAT_NSTATS; index++) {
-#ifndef CONFIG_64BIT
- /*
- * Take rq->lock to make 64-bit read safe on 32-bit
- * platforms.
- */
- raw_spin_rq_lock_irq(cpu_rq(cpu));
-#endif
-
- seq_printf(m, " %llu", cpuusage->usages[index]);
-
-#ifndef CONFIG_64BIT
- raw_spin_rq_unlock_irq(cpu_rq(cpu));
-#endif
- }
+ for (index = 0; index < CPUACCT_STAT_NSTATS; index++)
+ seq_printf(m, " %llu",
+ cpuacct_cpuusage_read(ca, cpu, index));
seq_puts(m, "\n");
}
return 0;
@@ -270,25 +261,30 @@ static int cpuacct_all_seq_show(struct seq_file *m, void *V)
static int cpuacct_stats_show(struct seq_file *sf, void *v)
{
struct cpuacct *ca = css_ca(seq_css(sf));
- s64 val[CPUACCT_STAT_NSTATS];
+ struct task_cputime cputime;
+ u64 val[CPUACCT_STAT_NSTATS];
int cpu;
int stat;
- memset(val, 0, sizeof(val));
+ memset(&cputime, 0, sizeof(cputime));
for_each_possible_cpu(cpu) {
u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat;
- val[CPUACCT_STAT_USER] += cpustat[CPUTIME_USER];
- val[CPUACCT_STAT_USER] += cpustat[CPUTIME_NICE];
- val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SYSTEM];
- val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_IRQ];
- val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SOFTIRQ];
+ cputime.utime += cpustat[CPUTIME_USER];
+ cputime.utime += cpustat[CPUTIME_NICE];
+ cputime.stime += cpustat[CPUTIME_SYSTEM];
+ cputime.stime += cpustat[CPUTIME_IRQ];
+ cputime.stime += cpustat[CPUTIME_SOFTIRQ];
+
+ cputime.sum_exec_runtime += *per_cpu_ptr(ca->cpuusage, cpu);
}
+ cputime_adjust(&cputime, &seq_css(sf)->cgroup->prev_cputime,
+ &val[CPUACCT_STAT_USER], &val[CPUACCT_STAT_SYSTEM]);
+
for (stat = 0; stat < CPUACCT_STAT_NSTATS; stat++) {
- seq_printf(sf, "%s %lld\n",
- cpuacct_stat_desc[stat],
- (long long)nsec_to_clock_t(val[stat]));
+ seq_printf(sf, "%s %llu\n", cpuacct_stat_desc[stat],
+ nsec_to_clock_t(val[stat]));
}
return 0;
@@ -339,16 +335,11 @@ static struct cftype files[] = {
void cpuacct_charge(struct task_struct *tsk, u64 cputime)
{
struct cpuacct *ca;
- int index = CPUACCT_STAT_SYSTEM;
- struct pt_regs *regs = get_irq_regs() ? : task_pt_regs(tsk);
-
- if (regs && user_mode(regs))
- index = CPUACCT_STAT_USER;
rcu_read_lock();
for (ca = task_ca(tsk); ca; ca = parent_ca(ca))
- __this_cpu_add(ca->cpuusage->usages[index], cputime);
+ __this_cpu_add(*ca->cpuusage, cputime);
rcu_read_unlock();
}
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index e7af18857371..26778884d9ab 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -168,7 +168,7 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu)
sg_cpu->max = max;
sg_cpu->bw_dl = cpu_bw_dl(rq);
- sg_cpu->util = effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(rq), max,
+ sg_cpu->util = effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(sg_cpu->cpu), max,
FREQUENCY_UTIL, NULL);
}
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 872e481d5098..b7ec42732b28 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -148,10 +148,10 @@ void account_guest_time(struct task_struct *p, u64 cputime)
/* Add guest time to cpustat. */
if (task_nice(p) > 0) {
- cpustat[CPUTIME_NICE] += cputime;
+ task_group_account_field(p, CPUTIME_NICE, cputime);
cpustat[CPUTIME_GUEST_NICE] += cputime;
} else {
- cpustat[CPUTIME_USER] += cputime;
+ task_group_account_field(p, CPUTIME_USER, cputime);
cpustat[CPUTIME_GUEST] += cputime;
}
}
@@ -615,7 +615,8 @@ void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
.sum_exec_runtime = p->se.sum_exec_runtime,
};
- task_cputime(p, &cputime.utime, &cputime.stime);
+ if (task_cputime(p, &cputime.utime, &cputime.stime))
+ cputime.sum_exec_runtime = task_sched_runtime(p);
cputime_adjust(&cputime, &p->prev_cputime, ut, st);
}
EXPORT_SYMBOL_GPL(task_cputime_adjusted);
@@ -828,19 +829,21 @@ u64 task_gtime(struct task_struct *t)
* add up the pending nohz execution time since the last
* cputime snapshot.
*/
-void task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
+bool task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
{
struct vtime *vtime = &t->vtime;
unsigned int seq;
u64 delta;
+ int ret;
if (!vtime_accounting_enabled()) {
*utime = t->utime;
*stime = t->stime;
- return;
+ return false;
}
do {
+ ret = false;
seq = read_seqcount_begin(&vtime->seqcount);
*utime = t->utime;
@@ -850,6 +853,7 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
if (vtime->state < VTIME_SYS)
continue;
+ ret = true;
delta = vtime_delta(vtime);
/*
@@ -861,6 +865,8 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
else
*utime += vtime->utime + delta;
} while (read_seqcount_retry(&vtime->seqcount, seq));
+
+ return ret;
}
static int vtime_state_fetch(struct vtime *vtime, int cpu)
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index e94314633b39..d2c072b0ef01 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1265,8 +1265,10 @@ static void update_curr_dl(struct rq *rq)
return;
}
- schedstat_set(curr->se.statistics.exec_max,
- max(curr->se.statistics.exec_max, delta_exec));
+ schedstat_set(curr->stats.exec_max,
+ max(curr->stats.exec_max, delta_exec));
+
+ trace_sched_stat_runtime(curr, delta_exec, 0);
curr->se.sum_exec_runtime += delta_exec;
account_group_exec_runtime(curr, delta_exec);
@@ -1472,6 +1474,82 @@ static inline bool __dl_less(struct rb_node *a, const struct rb_node *b)
return dl_time_before(__node_2_dle(a)->deadline, __node_2_dle(b)->deadline);
}
+static inline struct sched_statistics *
+__schedstats_from_dl_se(struct sched_dl_entity *dl_se)
+{
+ return &dl_task_of(dl_se)->stats;
+}
+
+static inline void
+update_stats_wait_start_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se)
+{
+ struct sched_statistics *stats;
+
+ if (!schedstat_enabled())
+ return;
+
+ stats = __schedstats_from_dl_se(dl_se);
+ __update_stats_wait_start(rq_of_dl_rq(dl_rq), dl_task_of(dl_se), stats);
+}
+
+static inline void
+update_stats_wait_end_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se)
+{
+ struct sched_statistics *stats;
+
+ if (!schedstat_enabled())
+ return;
+
+ stats = __schedstats_from_dl_se(dl_se);
+ __update_stats_wait_end(rq_of_dl_rq(dl_rq), dl_task_of(dl_se), stats);
+}
+
+static inline void
+update_stats_enqueue_sleeper_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se)
+{
+ struct sched_statistics *stats;
+
+ if (!schedstat_enabled())
+ return;
+
+ stats = __schedstats_from_dl_se(dl_se);
+ __update_stats_enqueue_sleeper(rq_of_dl_rq(dl_rq), dl_task_of(dl_se), stats);
+}
+
+static inline void
+update_stats_enqueue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se,
+ int flags)
+{
+ if (!schedstat_enabled())
+ return;
+
+ if (flags & ENQUEUE_WAKEUP)
+ update_stats_enqueue_sleeper_dl(dl_rq, dl_se);
+}
+
+static inline void
+update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se,
+ int flags)
+{
+ struct task_struct *p = dl_task_of(dl_se);
+
+ if (!schedstat_enabled())
+ return;
+
+ if ((flags & DEQUEUE_SLEEP)) {
+ unsigned int state;
+
+ state = READ_ONCE(p->__state);
+ if (state & TASK_INTERRUPTIBLE)
+ __schedstat_set(p->stats.sleep_start,
+ rq_clock(rq_of_dl_rq(dl_rq)));
+
+ if (state & TASK_UNINTERRUPTIBLE)
+ __schedstat_set(p->stats.block_start,
+ rq_clock(rq_of_dl_rq(dl_rq)));
+ }
+}
+
static void __enqueue_dl_entity(struct sched_dl_entity *dl_se)
{
struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
@@ -1502,6 +1580,8 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags)
{
BUG_ON(on_dl_rq(dl_se));
+ update_stats_enqueue_dl(dl_rq_of_se(dl_se), dl_se, flags);
+
/*
* If this is a wakeup or a new instance, the scheduling
* parameters of the task might need updating. Otherwise,
@@ -1598,6 +1678,9 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
return;
}
+ check_schedstat_required();
+ update_stats_wait_start_dl(dl_rq_of_se(&p->dl), &p->dl);
+
enqueue_dl_entity(&p->dl, flags);
if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
@@ -1606,6 +1689,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
{
+ update_stats_dequeue_dl(&rq->dl, &p->dl, flags);
dequeue_dl_entity(&p->dl);
dequeue_pushable_dl_task(rq, p);
}
@@ -1825,7 +1909,12 @@ static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first)
{
+ struct sched_dl_entity *dl_se = &p->dl;
+ struct dl_rq *dl_rq = &rq->dl;
+
p->se.exec_start = rq_clock_task(rq);
+ if (on_dl_rq(&p->dl))
+ update_stats_wait_end_dl(dl_rq, dl_se);
/* You can't push away the running task */
dequeue_pushable_dl_task(rq, p);
@@ -1882,6 +1971,12 @@ static struct task_struct *pick_next_task_dl(struct rq *rq)
static void put_prev_task_dl(struct rq *rq, struct task_struct *p)
{
+ struct sched_dl_entity *dl_se = &p->dl;
+ struct dl_rq *dl_rq = &rq->dl;
+
+ if (on_dl_rq(&p->dl))
+ update_stats_wait_start_dl(dl_rq, dl_se);
+
update_curr_dl(rq);
update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1);
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 49716228efb4..aa29211de1bf 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -173,16 +173,22 @@ static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
char buf[16];
+ unsigned int scaling;
if (cnt > 15)
cnt = 15;
if (copy_from_user(&buf, ubuf, cnt))
return -EFAULT;
+ buf[cnt] = '\0';
- if (kstrtouint(buf, 10, &sysctl_sched_tunable_scaling))
+ if (kstrtouint(buf, 10, &scaling))
return -EINVAL;
+ if (scaling >= SCHED_TUNABLESCALING_END)
+ return -EINVAL;
+
+ sysctl_sched_tunable_scaling = scaling;
if (sched_update_scaling())
return -EINVAL;
@@ -305,6 +311,7 @@ static __init int sched_init_debug(void)
debugfs_create_u32("latency_ns", 0644, debugfs_sched, &sysctl_sched_latency);
debugfs_create_u32("min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_min_granularity);
+ debugfs_create_u32("idle_min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_idle_min_granularity);
debugfs_create_u32("wakeup_granularity_ns", 0644, debugfs_sched, &sysctl_sched_wakeup_granularity);
debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms);
@@ -442,9 +449,11 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
struct sched_entity *se = tg->se[cpu];
#define P(F) SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F)
-#define P_SCHEDSTAT(F) SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)schedstat_val(F))
+#define P_SCHEDSTAT(F) SEQ_printf(m, " .%-30s: %lld\n", \
+ #F, (long long)schedstat_val(stats->F))
#define PN(F) SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
-#define PN_SCHEDSTAT(F) SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(F)))
+#define PN_SCHEDSTAT(F) SEQ_printf(m, " .%-30s: %lld.%06ld\n", \
+ #F, SPLIT_NS((long long)schedstat_val(stats->F)))
if (!se)
return;
@@ -454,16 +463,19 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
PN(se->sum_exec_runtime);
if (schedstat_enabled()) {
- PN_SCHEDSTAT(se->statistics.wait_start);
- PN_SCHEDSTAT(se->statistics.sleep_start);
- PN_SCHEDSTAT(se->statistics.block_start);
- PN_SCHEDSTAT(se->statistics.sleep_max);
- PN_SCHEDSTAT(se->statistics.block_max);
- PN_SCHEDSTAT(se->statistics.exec_max);
- PN_SCHEDSTAT(se->statistics.slice_max);
- PN_SCHEDSTAT(se->statistics.wait_max);
- PN_SCHEDSTAT(se->statistics.wait_sum);
- P_SCHEDSTAT(se->statistics.wait_count);
+ struct sched_statistics *stats;
+ stats = __schedstats_from_se(se);
+
+ PN_SCHEDSTAT(wait_start);
+ PN_SCHEDSTAT(sleep_start);
+ PN_SCHEDSTAT(block_start);
+ PN_SCHEDSTAT(sleep_max);
+ PN_SCHEDSTAT(block_max);
+ PN_SCHEDSTAT(exec_max);
+ PN_SCHEDSTAT(slice_max);
+ PN_SCHEDSTAT(wait_max);
+ PN_SCHEDSTAT(wait_sum);
+ P_SCHEDSTAT(wait_count);
}
P(se->load.weight);
@@ -529,10 +541,11 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
(long long)(p->nvcsw + p->nivcsw),
p->prio);
- SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
- SPLIT_NS(schedstat_val_or_zero(p->se.statistics.wait_sum)),
+ SEQ_printf(m, "%9lld.%06ld %9lld.%06ld %9lld.%06ld %9lld.%06ld",
+ SPLIT_NS(schedstat_val_or_zero(p->stats.wait_sum)),
SPLIT_NS(p->se.sum_exec_runtime),
- SPLIT_NS(schedstat_val_or_zero(p->se.statistics.sum_sleep_runtime)));
+ SPLIT_NS(schedstat_val_or_zero(p->stats.sum_sleep_runtime)),
+ SPLIT_NS(schedstat_val_or_zero(p->stats.sum_block_runtime)));
#ifdef CONFIG_NUMA_BALANCING
SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p));
@@ -608,6 +621,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
cfs_rq->nr_spread_over);
SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
SEQ_printf(m, " .%-30s: %d\n", "h_nr_running", cfs_rq->h_nr_running);
+ SEQ_printf(m, " .%-30s: %d\n", "idle_nr_running",
+ cfs_rq->idle_nr_running);
SEQ_printf(m, " .%-30s: %d\n", "idle_h_nr_running",
cfs_rq->idle_h_nr_running);
SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
@@ -804,6 +819,7 @@ static void sched_debug_header(struct seq_file *m)
SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
PN(sysctl_sched_latency);
PN(sysctl_sched_min_granularity);
+ PN(sysctl_sched_idle_min_granularity);
PN(sysctl_sched_wakeup_granularity);
P(sysctl_sched_child_runs_first);
P(sysctl_sched_features);
@@ -948,8 +964,8 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
"---------------------------------------------------------"
"----------\n");
-#define P_SCHEDSTAT(F) __PS(#F, schedstat_val(p->F))
-#define PN_SCHEDSTAT(F) __PSN(#F, schedstat_val(p->F))
+#define P_SCHEDSTAT(F) __PS(#F, schedstat_val(p->stats.F))
+#define PN_SCHEDSTAT(F) __PSN(#F, schedstat_val(p->stats.F))
PN(se.exec_start);
PN(se.vruntime);
@@ -962,33 +978,34 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
if (schedstat_enabled()) {
u64 avg_atom, avg_per_cpu;
- PN_SCHEDSTAT(se.statistics.sum_sleep_runtime);
- PN_SCHEDSTAT(se.statistics.wait_start);
- PN_SCHEDSTAT(se.statistics.sleep_start);
- PN_SCHEDSTAT(se.statistics.block_start);
- PN_SCHEDSTAT(se.statistics.sleep_max);
- PN_SCHEDSTAT(se.statistics.block_max);
- PN_SCHEDSTAT(se.statistics.exec_max);
- PN_SCHEDSTAT(se.statistics.slice_max);
- PN_SCHEDSTAT(se.statistics.wait_max);
- PN_SCHEDSTAT(se.statistics.wait_sum);
- P_SCHEDSTAT(se.statistics.wait_count);
- PN_SCHEDSTAT(se.statistics.iowait_sum);
- P_SCHEDSTAT(se.statistics.iowait_count);
- P_SCHEDSTAT(se.statistics.nr_migrations_cold);
- P_SCHEDSTAT(se.statistics.nr_failed_migrations_affine);
- P_SCHEDSTAT(se.statistics.nr_failed_migrations_running);
- P_SCHEDSTAT(se.statistics.nr_failed_migrations_hot);
- P_SCHEDSTAT(se.statistics.nr_forced_migrations);
- P_SCHEDSTAT(se.statistics.nr_wakeups);
- P_SCHEDSTAT(se.statistics.nr_wakeups_sync);
- P_SCHEDSTAT(se.statistics.nr_wakeups_migrate);
- P_SCHEDSTAT(se.statistics.nr_wakeups_local);
- P_SCHEDSTAT(se.statistics.nr_wakeups_remote);
- P_SCHEDSTAT(se.statistics.nr_wakeups_affine);
- P_SCHEDSTAT(se.statistics.nr_wakeups_affine_attempts);
- P_SCHEDSTAT(se.statistics.nr_wakeups_passive);
- P_SCHEDSTAT(se.statistics.nr_wakeups_idle);
+ PN_SCHEDSTAT(sum_sleep_runtime);
+ PN_SCHEDSTAT(sum_block_runtime);
+ PN_SCHEDSTAT(wait_start);
+ PN_SCHEDSTAT(sleep_start);
+ PN_SCHEDSTAT(block_start);
+ PN_SCHEDSTAT(sleep_max);
+ PN_SCHEDSTAT(block_max);
+ PN_SCHEDSTAT(exec_max);
+ PN_SCHEDSTAT(slice_max);
+ PN_SCHEDSTAT(wait_max);
+ PN_SCHEDSTAT(wait_sum);
+ P_SCHEDSTAT(wait_count);
+ PN_SCHEDSTAT(iowait_sum);
+ P_SCHEDSTAT(iowait_count);
+ P_SCHEDSTAT(nr_migrations_cold);
+ P_SCHEDSTAT(nr_failed_migrations_affine);
+ P_SCHEDSTAT(nr_failed_migrations_running);
+ P_SCHEDSTAT(nr_failed_migrations_hot);
+ P_SCHEDSTAT(nr_forced_migrations);
+ P_SCHEDSTAT(nr_wakeups);
+ P_SCHEDSTAT(nr_wakeups_sync);
+ P_SCHEDSTAT(nr_wakeups_migrate);
+ P_SCHEDSTAT(nr_wakeups_local);
+ P_SCHEDSTAT(nr_wakeups_remote);
+ P_SCHEDSTAT(nr_wakeups_affine);
+ P_SCHEDSTAT(nr_wakeups_affine_attempts);
+ P_SCHEDSTAT(nr_wakeups_passive);
+ P_SCHEDSTAT(nr_wakeups_idle);
avg_atom = p->se.sum_exec_runtime;
if (nr_switches)
@@ -1006,6 +1023,10 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
__PN(avg_atom);
__PN(avg_per_cpu);
+
+#ifdef CONFIG_SCHED_CORE
+ PN_SCHEDSTAT(core_forceidle_sum);
+#endif
}
__P(nr_switches);
@@ -1054,7 +1075,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
void proc_sched_set_task(struct task_struct *p)
{
#ifdef CONFIG_SCHEDSTATS
- memset(&p->se.statistics, 0, sizeof(p->se.statistics));
+ memset(&p->stats, 0, sizeof(p->stats));
#endif
}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ff69f245b939..095b0aa378df 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -60,6 +60,14 @@ unsigned int sysctl_sched_min_granularity = 750000ULL;
static unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
/*
+ * Minimal preemption granularity for CPU-bound SCHED_IDLE tasks.
+ * Applies only when SCHED_IDLE tasks compete with normal tasks.
+ *
+ * (default: 0.75 msec)
+ */
+unsigned int sysctl_sched_idle_min_granularity = 750000ULL;
+
+/*
* This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity
*/
static unsigned int sched_nr_latency = 8;
@@ -665,6 +673,8 @@ static u64 __sched_period(unsigned long nr_running)
return sysctl_sched_latency;
}
+static bool sched_idle_cfs_rq(struct cfs_rq *cfs_rq);
+
/*
* We calculate the wall-time slice from the period by taking a part
* proportional to the weight.
@@ -674,6 +684,8 @@ static u64 __sched_period(unsigned long nr_running)
static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
unsigned int nr_running = cfs_rq->nr_running;
+ struct sched_entity *init_se = se;
+ unsigned int min_gran;
u64 slice;
if (sched_feat(ALT_PERIOD))
@@ -684,12 +696,13 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
for_each_sched_entity(se) {
struct load_weight *load;
struct load_weight lw;
+ struct cfs_rq *qcfs_rq;
- cfs_rq = cfs_rq_of(se);
- load = &cfs_rq->load;
+ qcfs_rq = cfs_rq_of(se);
+ load = &qcfs_rq->load;
if (unlikely(!se->on_rq)) {
- lw = cfs_rq->load;
+ lw = qcfs_rq->load;
update_load_add(&lw, se->load.weight);
load = &lw;
@@ -697,8 +710,14 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
slice = __calc_delta(slice, se->load.weight, load);
}
- if (sched_feat(BASE_SLICE))
- slice = max(slice, (u64)sysctl_sched_min_granularity);
+ if (sched_feat(BASE_SLICE)) {
+ if (se_is_idle(init_se) && !sched_idle_cfs_rq(cfs_rq))
+ min_gran = sysctl_sched_idle_min_granularity;
+ else
+ min_gran = sysctl_sched_min_granularity;
+
+ slice = max_t(u64, slice, min_gran);
+ }
return slice;
}
@@ -837,8 +856,13 @@ static void update_curr(struct cfs_rq *cfs_rq)
curr->exec_start = now;
- schedstat_set(curr->statistics.exec_max,
- max(delta_exec, curr->statistics.exec_max));
+ if (schedstat_enabled()) {
+ struct sched_statistics *stats;
+
+ stats = __schedstats_from_se(curr);
+ __schedstat_set(stats->exec_max,
+ max(delta_exec, stats->exec_max));
+ }
curr->sum_exec_runtime += delta_exec;
schedstat_add(cfs_rq->exec_clock, delta_exec);
@@ -863,137 +887,70 @@ static void update_curr_fair(struct rq *rq)
}
static inline void
-update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
+update_stats_wait_start_fair(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- u64 wait_start, prev_wait_start;
+ struct sched_statistics *stats;
+ struct task_struct *p = NULL;
if (!schedstat_enabled())
return;
- wait_start = rq_clock(rq_of(cfs_rq));
- prev_wait_start = schedstat_val(se->statistics.wait_start);
+ stats = __schedstats_from_se(se);
- if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
- likely(wait_start > prev_wait_start))
- wait_start -= prev_wait_start;
+ if (entity_is_task(se))
+ p = task_of(se);
- __schedstat_set(se->statistics.wait_start, wait_start);
+ __update_stats_wait_start(rq_of(cfs_rq), p, stats);
}
static inline void
-update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
+update_stats_wait_end_fair(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- struct task_struct *p;
- u64 delta;
+ struct sched_statistics *stats;
+ struct task_struct *p = NULL;
if (!schedstat_enabled())
return;
+ stats = __schedstats_from_se(se);
+
/*
* When the sched_schedstat changes from 0 to 1, some sched se
* maybe already in the runqueue, the se->statistics.wait_start
* will be 0.So it will let the delta wrong. We need to avoid this
* scenario.
*/
- if (unlikely(!schedstat_val(se->statistics.wait_start)))
+ if (unlikely(!schedstat_val(stats->wait_start)))
return;
- delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start);
-
- if (entity_is_task(se)) {
+ if (entity_is_task(se))
p = task_of(se);
- if (task_on_rq_migrating(p)) {
- /*
- * Preserve migrating task's wait time so wait_start
- * time stamp can be adjusted to accumulate wait time
- * prior to migration.
- */
- __schedstat_set(se->statistics.wait_start, delta);
- return;
- }
- trace_sched_stat_wait(p, delta);
- }
- __schedstat_set(se->statistics.wait_max,
- max(schedstat_val(se->statistics.wait_max), delta));
- __schedstat_inc(se->statistics.wait_count);
- __schedstat_add(se->statistics.wait_sum, delta);
- __schedstat_set(se->statistics.wait_start, 0);
+ __update_stats_wait_end(rq_of(cfs_rq), p, stats);
}
static inline void
-update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
+update_stats_enqueue_sleeper_fair(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
+ struct sched_statistics *stats;
struct task_struct *tsk = NULL;
- u64 sleep_start, block_start;
if (!schedstat_enabled())
return;
- sleep_start = schedstat_val(se->statistics.sleep_start);
- block_start = schedstat_val(se->statistics.block_start);
+ stats = __schedstats_from_se(se);
if (entity_is_task(se))
tsk = task_of(se);
- if (sleep_start) {
- u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start;
-
- if ((s64)delta < 0)
- delta = 0;
-
- if (unlikely(delta > schedstat_val(se->statistics.sleep_max)))
- __schedstat_set(se->statistics.sleep_max, delta);
-
- __schedstat_set(se->statistics.sleep_start, 0);
- __schedstat_add(se->statistics.sum_sleep_runtime, delta);
-
- if (tsk) {
- account_scheduler_latency(tsk, delta >> 10, 1);
- trace_sched_stat_sleep(tsk, delta);
- }
- }
- if (block_start) {
- u64 delta = rq_clock(rq_of(cfs_rq)) - block_start;
-
- if ((s64)delta < 0)
- delta = 0;
-
- if (unlikely(delta > schedstat_val(se->statistics.block_max)))
- __schedstat_set(se->statistics.block_max, delta);
-
- __schedstat_set(se->statistics.block_start, 0);
- __schedstat_add(se->statistics.sum_sleep_runtime, delta);
-
- if (tsk) {
- if (tsk->in_iowait) {
- __schedstat_add(se->statistics.iowait_sum, delta);
- __schedstat_inc(se->statistics.iowait_count);
- trace_sched_stat_iowait(tsk, delta);
- }
-
- trace_sched_stat_blocked(tsk, delta);
-
- /*
- * Blocking time is in units of nanosecs, so shift by
- * 20 to get a milliseconds-range estimation of the
- * amount of time that the task spent sleeping:
- */
- if (unlikely(prof_on == SLEEP_PROFILING)) {
- profile_hits(SLEEP_PROFILING,
- (void *)get_wchan(tsk),
- delta >> 20);
- }
- account_scheduler_latency(tsk, delta >> 10, 0);
- }
- }
+ __update_stats_enqueue_sleeper(rq_of(cfs_rq), tsk, stats);
}
/*
* Task is being enqueued - update stats:
*/
static inline void
-update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
+update_stats_enqueue_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
if (!schedstat_enabled())
return;
@@ -1003,14 +960,14 @@ update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* a dequeue/enqueue event is a NOP)
*/
if (se != cfs_rq->curr)
- update_stats_wait_start(cfs_rq, se);
+ update_stats_wait_start_fair(cfs_rq, se);
if (flags & ENQUEUE_WAKEUP)
- update_stats_enqueue_sleeper(cfs_rq, se);
+ update_stats_enqueue_sleeper_fair(cfs_rq, se);
}
static inline void
-update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
+update_stats_dequeue_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
if (!schedstat_enabled())
@@ -1021,7 +978,7 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* waiting task:
*/
if (se != cfs_rq->curr)
- update_stats_wait_end(cfs_rq, se);
+ update_stats_wait_end_fair(cfs_rq, se);
if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) {
struct task_struct *tsk = task_of(se);
@@ -1030,10 +987,10 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
/* XXX racy against TTWU */
state = READ_ONCE(tsk->__state);
if (state & TASK_INTERRUPTIBLE)
- __schedstat_set(se->statistics.sleep_start,
+ __schedstat_set(tsk->stats.sleep_start,
rq_clock(rq_of(cfs_rq)));
if (state & TASK_UNINTERRUPTIBLE)
- __schedstat_set(se->statistics.block_start,
+ __schedstat_set(tsk->stats.block_start,
rq_clock(rq_of(cfs_rq)));
}
}
@@ -1081,11 +1038,12 @@ struct numa_group {
unsigned long total_faults;
unsigned long max_faults_cpu;
/*
+ * faults[] array is split into two regions: faults_mem and faults_cpu.
+ *
* Faults_cpu is used to decide whether memory should move
* towards the CPU. As a consequence, these stats are weighted
* more by CPU use than by memory faults.
*/
- unsigned long *faults_cpu;
unsigned long faults[];
};
@@ -1259,8 +1217,8 @@ static inline unsigned long group_faults(struct task_struct *p, int nid)
static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
{
- return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] +
- group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
+ return group->faults[task_faults_idx(NUMA_CPU, nid, 0)] +
+ group->faults[task_faults_idx(NUMA_CPU, nid, 1)];
}
static inline unsigned long group_faults_priv(struct numa_group *ng)
@@ -1544,7 +1502,6 @@ struct task_numa_env {
static unsigned long cpu_load(struct rq *rq);
static unsigned long cpu_runnable(struct rq *rq);
-static unsigned long cpu_util(int cpu);
static inline long adjust_numa_imbalance(int imbalance,
int dst_running, int dst_weight);
@@ -1611,7 +1568,7 @@ static void update_numa_stats(struct task_numa_env *env,
ns->load += cpu_load(rq);
ns->runnable += cpu_runnable(rq);
- ns->util += cpu_util(cpu);
+ ns->util += cpu_util_cfs(cpu);
ns->nr_running += rq->cfs.h_nr_running;
ns->compute_capacity += capacity_of(cpu);
@@ -2116,7 +2073,7 @@ static void numa_migrate_preferred(struct task_struct *p)
}
/*
- * Find out how many nodes on the workload is actively running on. Do this by
+ * Find out how many nodes the workload is actively running on. Do this by
* tracking the nodes from which NUMA hinting faults are triggered. This can
* be different from the set of nodes where the workload's memory is currently
* located.
@@ -2170,7 +2127,7 @@ static void update_task_scan_period(struct task_struct *p,
/*
* If there were no record hinting faults then either the task is
- * completely idle or all activity is areas that are not of interest
+ * completely idle or all activity is in areas that are not of interest
* to automatic numa balancing. Related to that, if there were failed
* migration then it implies we are migrating too quickly or the local
* node is overloaded. In either case, scan slower
@@ -2427,7 +2384,7 @@ static void task_numa_placement(struct task_struct *p)
* is at the beginning of the numa_faults array.
*/
ng->faults[mem_idx] += diff;
- ng->faults_cpu[mem_idx] += f_diff;
+ ng->faults[cpu_idx] += f_diff;
ng->total_faults += diff;
group_faults += ng->faults[mem_idx];
}
@@ -2481,7 +2438,8 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
if (unlikely(!deref_curr_numa_group(p))) {
unsigned int size = sizeof(struct numa_group) +
- 4*nr_node_ids*sizeof(unsigned long);
+ NR_NUMA_HINT_FAULT_STATS *
+ nr_node_ids * sizeof(unsigned long);
grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
if (!grp)
@@ -2492,9 +2450,6 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
grp->max_faults_cpu = 0;
spin_lock_init(&grp->lock);
grp->gid = p->pid;
- /* Second half of the array tracks nids where faults happen */
- grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
- nr_node_ids;
for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
grp->faults[i] = p->numa_faults[i];
@@ -2995,6 +2950,8 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
}
#endif
cfs_rq->nr_running++;
+ if (se_is_idle(se))
+ cfs_rq->idle_nr_running++;
}
static void
@@ -3008,6 +2965,8 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
}
#endif
cfs_rq->nr_running--;
+ if (se_is_idle(se))
+ cfs_rq->idle_nr_running--;
}
/*
@@ -3280,7 +3239,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
* As is, the util number is not freq-invariant (we'd have to
* implement arch_scale_freq_capacity() for that).
*
- * See cpu_util().
+ * See cpu_util_cfs().
*/
cpufreq_update_util(rq, flags);
}
@@ -4110,7 +4069,8 @@ done:
trace_sched_util_est_se_tp(&p->se);
}
-static inline int task_fits_capacity(struct task_struct *p, long capacity)
+static inline int task_fits_capacity(struct task_struct *p,
+ unsigned long capacity)
{
return fits_capacity(uclamp_task_util(p), capacity);
}
@@ -4207,7 +4167,12 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
/* sleeps up to a single latency don't count. */
if (!initial) {
- unsigned long thresh = sysctl_sched_latency;
+ unsigned long thresh;
+
+ if (se_is_idle(se))
+ thresh = sysctl_sched_min_granularity;
+ else
+ thresh = sysctl_sched_latency;
/*
* Halve their sleep time's effect, to allow
@@ -4225,26 +4190,6 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
-static inline void check_schedstat_required(void)
-{
-#ifdef CONFIG_SCHEDSTATS
- if (schedstat_enabled())
- return;
-
- /* Force schedstat enabled if a dependent tracepoint is active */
- if (trace_sched_stat_wait_enabled() ||
- trace_sched_stat_sleep_enabled() ||
- trace_sched_stat_iowait_enabled() ||
- trace_sched_stat_blocked_enabled() ||
- trace_sched_stat_runtime_enabled()) {
- printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, "
- "stat_blocked and stat_runtime require the "
- "kernel parameter schedstats=enable or "
- "kernel.sched_schedstats=1\n");
- }
-#endif
-}
-
static inline bool cfs_bandwidth_used(void);
/*
@@ -4318,7 +4263,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
place_entity(cfs_rq, se, 0);
check_schedstat_required();
- update_stats_enqueue(cfs_rq, se, flags);
+ update_stats_enqueue_fair(cfs_rq, se, flags);
check_spread(cfs_rq, se);
if (!curr)
__enqueue_entity(cfs_rq, se);
@@ -4402,7 +4347,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
update_load_avg(cfs_rq, se, UPDATE_TG);
se_update_runnable(se);
- update_stats_dequeue(cfs_rq, se, flags);
+ update_stats_dequeue_fair(cfs_rq, se, flags);
clear_buddies(cfs_rq, se);
@@ -4487,7 +4432,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
* a CPU. So account for the time it spent waiting on the
* runqueue.
*/
- update_stats_wait_end(cfs_rq, se);
+ update_stats_wait_end_fair(cfs_rq, se);
__dequeue_entity(cfs_rq, se);
update_load_avg(cfs_rq, se, UPDATE_TG);
}
@@ -4502,9 +4447,12 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
*/
if (schedstat_enabled() &&
rq_of(cfs_rq)->cfs.load.weight >= 2*se->load.weight) {
- schedstat_set(se->statistics.slice_max,
- max((u64)schedstat_val(se->statistics.slice_max),
- se->sum_exec_runtime - se->prev_sum_exec_runtime));
+ struct sched_statistics *stats;
+
+ stats = __schedstats_from_se(se);
+ __schedstat_set(stats->slice_max,
+ max((u64)stats->slice_max,
+ se->sum_exec_runtime - se->prev_sum_exec_runtime));
}
se->prev_sum_exec_runtime = se->sum_exec_runtime;
@@ -4586,7 +4534,7 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
check_spread(cfs_rq, prev);
if (prev->on_rq) {
- update_stats_wait_start(cfs_rq, prev);
+ update_stats_wait_start_fair(cfs_rq, prev);
/* Put 'current' back into the tree. */
__enqueue_entity(cfs_rq, prev);
/* in !on_rq case, update occurred at dequeue */
@@ -4687,11 +4635,20 @@ static inline u64 sched_cfs_bandwidth_slice(void)
*/
void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
{
+ s64 runtime;
+
if (unlikely(cfs_b->quota == RUNTIME_INF))
return;
cfs_b->runtime += cfs_b->quota;
+ runtime = cfs_b->runtime_snap - cfs_b->runtime;
+ if (runtime > 0) {
+ cfs_b->burst_time += runtime;
+ cfs_b->nr_burst++;
+ }
+
cfs_b->runtime = min(cfs_b->runtime, cfs_b->quota + cfs_b->burst);
+ cfs_b->runtime_snap = cfs_b->runtime;
}
static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
@@ -4936,8 +4893,12 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
/* update hierarchical throttle state */
walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
- if (!cfs_rq->load.weight)
+ /* Nothing to run but something to decay (on_list)? Complete the branch */
+ if (!cfs_rq->load.weight) {
+ if (cfs_rq->on_list)
+ goto unthrottle_throttle;
return;
+ }
task_delta = cfs_rq->h_nr_running;
idle_task_delta = cfs_rq->idle_h_nr_running;
@@ -5548,11 +5509,9 @@ static inline void hrtick_update(struct rq *rq)
#endif
#ifdef CONFIG_SMP
-static inline unsigned long cpu_util(int cpu);
-
static inline bool cpu_overutilized(int cpu)
{
- return !fits_capacity(cpu_util(cpu), capacity_of(cpu));
+ return !fits_capacity(cpu_util_cfs(cpu), capacity_of(cpu));
}
static inline void update_overutilized_status(struct rq *rq)
@@ -5573,6 +5532,17 @@ static int sched_idle_rq(struct rq *rq)
rq->nr_running);
}
+/*
+ * Returns true if cfs_rq only has SCHED_IDLE entities enqueued. Note the use
+ * of idle_nr_running, which does not consider idle descendants of normal
+ * entities.
+ */
+static bool sched_idle_cfs_rq(struct cfs_rq *cfs_rq)
+{
+ return cfs_rq->nr_running &&
+ cfs_rq->nr_running == cfs_rq->idle_nr_running;
+}
+
#ifdef CONFIG_SMP
static int sched_idle_cpu(int cpu)
{
@@ -5783,6 +5753,7 @@ static struct {
cpumask_var_t idle_cpus_mask;
atomic_t nr_cpus;
int has_blocked; /* Idle CPUS has blocked load */
+ int needs_update; /* Newly idle CPUs need their next_balance collated */
unsigned long next_balance; /* in jiffy units */
unsigned long next_blocked; /* Next update of blocked load in jiffies */
} nohz ____cacheline_aligned;
@@ -5993,12 +5964,12 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
if (sched_feat(WA_WEIGHT) && target == nr_cpumask_bits)
target = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);
- schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
+ schedstat_inc(p->stats.nr_wakeups_affine_attempts);
if (target == nr_cpumask_bits)
return prev_cpu;
schedstat_inc(sd->ttwu_move_affine);
- schedstat_inc(p->se.statistics.nr_wakeups_affine);
+ schedstat_inc(p->stats.nr_wakeups_affine);
return target;
}
@@ -6372,7 +6343,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
return best_cpu;
}
-static inline bool asym_fits_capacity(int task_util, int cpu)
+static inline bool asym_fits_capacity(unsigned long task_util, int cpu)
{
if (static_branch_unlikely(&sched_asym_cpucapacity))
return fits_capacity(task_util, capacity_of(cpu));
@@ -6425,8 +6396,10 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
* pattern is IO completions.
*/
if (is_per_cpu_kthread(current) &&
+ in_task() &&
prev == smp_processor_id() &&
- this_rq()->nr_running <= 1) {
+ this_rq()->nr_running <= 1 &&
+ asym_fits_capacity(task_util, prev)) {
return prev;
}
@@ -6439,11 +6412,6 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
(available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr) &&
asym_fits_capacity(task_util, recent_used_cpu)) {
- /*
- * Replace recent_used_cpu with prev as it is a potential
- * candidate for the next wake:
- */
- p->recent_used_cpu = prev;
return recent_used_cpu;
}
@@ -6488,58 +6456,6 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
return target;
}
-/**
- * cpu_util - Estimates the amount of capacity of a CPU used by CFS tasks.
- * @cpu: the CPU to get the utilization of
- *
- * The unit of the return value must be the one of capacity so we can compare
- * the utilization with the capacity of the CPU that is available for CFS task
- * (ie cpu_capacity).
- *
- * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
- * recent utilization of currently non-runnable tasks on a CPU. It represents
- * the amount of utilization of a CPU in the range [0..capacity_orig] where
- * capacity_orig is the cpu_capacity available at the highest frequency
- * (arch_scale_freq_capacity()).
- * The utilization of a CPU converges towards a sum equal to or less than the
- * current capacity (capacity_curr <= capacity_orig) of the CPU because it is
- * the running time on this CPU scaled by capacity_curr.
- *
- * The estimated utilization of a CPU is defined to be the maximum between its
- * cfs_rq.avg.util_avg and the sum of the estimated utilization of the tasks
- * currently RUNNABLE on that CPU.
- * This allows to properly represent the expected utilization of a CPU which
- * has just got a big task running since a long sleep period. At the same time
- * however it preserves the benefits of the "blocked utilization" in
- * describing the potential for other tasks waking up on the same CPU.
- *
- * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
- * higher than capacity_orig because of unfortunate rounding in
- * cfs.avg.util_avg or just after migrating tasks and new task wakeups until
- * the average stabilizes with the new running time. We need to check that the
- * utilization stays within the range of [0..capacity_orig] and cap it if
- * necessary. Without utilization capping, a group could be seen as overloaded
- * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
- * available capacity. We allow utilization to overshoot capacity_curr (but not
- * capacity_orig) as it useful for predicting the capacity required after task
- * migrations (scheduler-driven DVFS).
- *
- * Return: the (estimated) utilization for the specified CPU
- */
-static inline unsigned long cpu_util(int cpu)
-{
- struct cfs_rq *cfs_rq;
- unsigned int util;
-
- cfs_rq = &cpu_rq(cpu)->cfs;
- util = READ_ONCE(cfs_rq->avg.util_avg);
-
- if (sched_feat(UTIL_EST))
- util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued));
-
- return min_t(unsigned long, util, capacity_orig_of(cpu));
-}
-
/*
* cpu_util_without: compute cpu utilization without any contributions from *p
* @cpu: the CPU which utilization is requested
@@ -6560,7 +6476,7 @@ static unsigned long cpu_util_without(int cpu, struct task_struct *p)
/* Task has no contribution or is new */
if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
- return cpu_util(cpu);
+ return cpu_util_cfs(cpu);
cfs_rq = &cpu_rq(cpu)->cfs;
util = READ_ONCE(cfs_rq->avg.util_avg);
@@ -6624,7 +6540,7 @@ static unsigned long cpu_util_without(int cpu, struct task_struct *p)
/*
* Utilization (estimated) can exceed the CPU capacity, thus let's
* clamp to the maximum CPU capacity to ensure consistency with
- * the cpu_util call.
+ * cpu_util.
*/
return min_t(unsigned long, util, capacity_orig_of(cpu));
}
@@ -6656,7 +6572,7 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
* During wake-up, the task isn't enqueued yet and doesn't
* appear in the cfs_rq->avg.util_est.enqueued of any rq,
* so just add it (if needed) to "simulate" what will be
- * cpu_util() after the task has been enqueued.
+ * cpu_util after the task has been enqueued.
*/
if (dst_cpu == cpu)
util_est += _task_util_est(p);
@@ -6947,6 +6863,11 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
break;
}
+ /*
+ * Usually only true for WF_EXEC and WF_FORK, as sched_domains
+ * usually do not have SD_BALANCE_WAKE set. That means wakeup
+ * will usually go to the fast path.
+ */
if (tmp->flags & sd_flag)
sd = tmp;
else if (!want_affine)
@@ -7802,7 +7723,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) {
int cpu;
- schedstat_inc(p->se.statistics.nr_failed_migrations_affine);
+ schedstat_inc(p->stats.nr_failed_migrations_affine);
env->flags |= LBF_SOME_PINNED;
@@ -7836,7 +7757,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
env->flags &= ~LBF_ALL_PINNED;
if (task_running(env->src_rq, p)) {
- schedstat_inc(p->se.statistics.nr_failed_migrations_running);
+ schedstat_inc(p->stats.nr_failed_migrations_running);
return 0;
}
@@ -7858,12 +7779,12 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
if (tsk_cache_hot == 1) {
schedstat_inc(env->sd->lb_hot_gained[env->idle]);
- schedstat_inc(p->se.statistics.nr_forced_migrations);
+ schedstat_inc(p->stats.nr_forced_migrations);
}
return 1;
}
- schedstat_inc(p->se.statistics.nr_failed_migrations_hot);
+ schedstat_inc(p->stats.nr_failed_migrations_hot);
return 0;
}
@@ -8598,6 +8519,99 @@ group_type group_classify(unsigned int imbalance_pct,
}
/**
+ * asym_smt_can_pull_tasks - Check whether the load balancing CPU can pull tasks
+ * @dst_cpu: Destination CPU of the load balancing
+ * @sds: Load-balancing data with statistics of the local group
+ * @sgs: Load-balancing statistics of the candidate busiest group
+ * @sg: The candidate busiest group
+ *
+ * Check the state of the SMT siblings of both @sds::local and @sg and decide
+ * if @dst_cpu can pull tasks.
+ *
+ * If @dst_cpu does not have SMT siblings, it can pull tasks if two or more of
+ * the SMT siblings of @sg are busy. If only one CPU in @sg is busy, pull tasks
+ * only if @dst_cpu has higher priority.
+ *
+ * If both @dst_cpu and @sg have SMT siblings, and @sg has exactly one more
+ * busy CPU than @sds::local, let @dst_cpu pull tasks if it has higher priority.
+ * Bigger imbalances in the number of busy CPUs will be dealt with in
+ * update_sd_pick_busiest().
+ *
+ * If @sg does not have SMT siblings, only pull tasks if all of the SMT siblings
+ * of @dst_cpu are idle and @sg has lower priority.
+ */
+static bool asym_smt_can_pull_tasks(int dst_cpu, struct sd_lb_stats *sds,
+ struct sg_lb_stats *sgs,
+ struct sched_group *sg)
+{
+#ifdef CONFIG_SCHED_SMT
+ bool local_is_smt, sg_is_smt;
+ int sg_busy_cpus;
+
+ local_is_smt = sds->local->flags & SD_SHARE_CPUCAPACITY;
+ sg_is_smt = sg->flags & SD_SHARE_CPUCAPACITY;
+
+ sg_busy_cpus = sgs->group_weight - sgs->idle_cpus;
+
+ if (!local_is_smt) {
+ /*
+ * If we are here, @dst_cpu is idle and does not have SMT
+ * siblings. Pull tasks if candidate group has two or more
+ * busy CPUs.
+ */
+ if (sg_busy_cpus >= 2) /* implies sg_is_smt */
+ return true;
+
+ /*
+ * @dst_cpu does not have SMT siblings. @sg may have SMT
+ * siblings and only one is busy. In such case, @dst_cpu
+ * can help if it has higher priority and is idle (i.e.,
+ * it has no running tasks).
+ */
+ return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu);
+ }
+
+ /* @dst_cpu has SMT siblings. */
+
+ if (sg_is_smt) {
+ int local_busy_cpus = sds->local->group_weight -
+ sds->local_stat.idle_cpus;
+ int busy_cpus_delta = sg_busy_cpus - local_busy_cpus;
+
+ if (busy_cpus_delta == 1)
+ return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu);
+
+ return false;
+ }
+
+ /*
+ * @sg does not have SMT siblings. Ensure that @sds::local does not end
+ * up with more than one busy SMT sibling and only pull tasks if there
+ * are not busy CPUs (i.e., no CPU has running tasks).
+ */
+ if (!sds->local_stat.sum_nr_running)
+ return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu);
+
+ return false;
+#else
+ /* Always return false so that callers deal with non-SMT cases. */
+ return false;
+#endif
+}
+
+static inline bool
+sched_asym(struct lb_env *env, struct sd_lb_stats *sds, struct sg_lb_stats *sgs,
+ struct sched_group *group)
+{
+ /* Only do SMT checks if either local or candidate have SMT siblings */
+ if ((sds->local->flags & SD_SHARE_CPUCAPACITY) ||
+ (group->flags & SD_SHARE_CPUCAPACITY))
+ return asym_smt_can_pull_tasks(env->dst_cpu, sds, sgs, group);
+
+ return sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu);
+}
+
+/**
* update_sg_lb_stats - Update sched_group's statistics for load balancing.
* @env: The load balancing environment.
* @group: sched_group whose statistics are to be updated.
@@ -8605,6 +8619,7 @@ group_type group_classify(unsigned int imbalance_pct,
* @sg_status: Holds flag indicating the status of the sched_group
*/
static inline void update_sg_lb_stats(struct lb_env *env,
+ struct sd_lb_stats *sds,
struct sched_group *group,
struct sg_lb_stats *sgs,
int *sg_status)
@@ -8613,13 +8628,13 @@ static inline void update_sg_lb_stats(struct lb_env *env,
memset(sgs, 0, sizeof(*sgs));
- local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(group));
+ local_group = group == sds->local;
for_each_cpu_and(i, sched_group_span(group), env->cpus) {
struct rq *rq = cpu_rq(i);
sgs->group_load += cpu_load(rq);
- sgs->group_util += cpu_util(i);
+ sgs->group_util += cpu_util_cfs(i);
sgs->group_runnable += cpu_runnable(rq);
sgs->sum_h_nr_running += rq->cfs.h_nr_running;
@@ -8656,18 +8671,17 @@ static inline void update_sg_lb_stats(struct lb_env *env,
}
}
- /* Check if dst CPU is idle and preferred to this group */
- if (env->sd->flags & SD_ASYM_PACKING &&
- env->idle != CPU_NOT_IDLE &&
- sgs->sum_h_nr_running &&
- sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu)) {
- sgs->group_asym_packing = 1;
- }
-
sgs->group_capacity = group->sgc->capacity;
sgs->group_weight = group->group_weight;
+ /* Check if dst CPU is idle and preferred to this group */
+ if (!local_group && env->sd->flags & SD_ASYM_PACKING &&
+ env->idle != CPU_NOT_IDLE && sgs->sum_h_nr_running &&
+ sched_asym(env, sds, sgs, group)) {
+ sgs->group_asym_packing = 1;
+ }
+
sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs);
/* Computing avg_load makes sense only when group is overloaded */
@@ -9176,7 +9190,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
update_group_capacity(env->sd, env->dst_cpu);
}
- update_sg_lb_stats(env, sg, sgs, &sg_status);
+ update_sg_lb_stats(env, sds, sg, sgs, &sg_status);
if (local_group)
goto next_group;
@@ -9599,6 +9613,12 @@ static struct rq *find_busiest_queue(struct lb_env *env,
nr_running == 1)
continue;
+ /* Make sure we only pull tasks from a CPU of lower priority */
+ if ((env->sd->flags & SD_ASYM_PACKING) &&
+ sched_asym_prefer(i, env->dst_cpu) &&
+ nr_running == 1)
+ continue;
+
switch (env->migration_type) {
case migrate_load:
/*
@@ -9632,7 +9652,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
break;
case migrate_util:
- util = cpu_util(cpu_of(rq));
+ util = cpu_util_cfs(i);
/*
* Don't try to pull utilization from a CPU with one
@@ -10172,6 +10192,30 @@ void update_max_interval(void)
max_load_balance_interval = HZ*num_online_cpus()/10;
}
+static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost)
+{
+ if (cost > sd->max_newidle_lb_cost) {
+ /*
+ * Track max cost of a domain to make sure to not delay the
+ * next wakeup on the CPU.
+ */
+ sd->max_newidle_lb_cost = cost;
+ sd->last_decay_max_lb_cost = jiffies;
+ } else if (time_after(jiffies, sd->last_decay_max_lb_cost + HZ)) {
+ /*
+ * Decay the newidle max times by ~1% per second to ensure that
+ * it is not outdated and the current max cost is actually
+ * shorter.
+ */
+ sd->max_newidle_lb_cost = (sd->max_newidle_lb_cost * 253) / 256;
+ sd->last_decay_max_lb_cost = jiffies;
+
+ return true;
+ }
+
+ return false;
+}
+
/*
* It checks each scheduling domain to see if it is due to be balanced,
* and initiates a balancing operation if so.
@@ -10195,14 +10239,9 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
for_each_domain(cpu, sd) {
/*
* Decay the newidle max times here because this is a regular
- * visit to all the domains. Decay ~1% per second.
+ * visit to all the domains.
*/
- if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
- sd->max_newidle_lb_cost =
- (sd->max_newidle_lb_cost * 253) / 256;
- sd->next_decay_max_lb_cost = jiffies + HZ;
- need_decay = 1;
- }
+ need_decay = update_newidle_cost(sd, 0);
max_cost += sd->max_newidle_lb_cost;
/*
@@ -10371,7 +10410,7 @@ static void nohz_balancer_kick(struct rq *rq)
goto out;
if (rq->nr_running >= 2) {
- flags = NOHZ_KICK_MASK;
+ flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
goto out;
}
@@ -10385,7 +10424,7 @@ static void nohz_balancer_kick(struct rq *rq)
* on.
*/
if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) {
- flags = NOHZ_KICK_MASK;
+ flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
goto unlock;
}
}
@@ -10399,7 +10438,7 @@ static void nohz_balancer_kick(struct rq *rq)
*/
for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
if (sched_asym_prefer(i, cpu)) {
- flags = NOHZ_KICK_MASK;
+ flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
goto unlock;
}
}
@@ -10412,7 +10451,7 @@ static void nohz_balancer_kick(struct rq *rq)
* to run the misfit task on.
*/
if (check_misfit_status(rq, sd)) {
- flags = NOHZ_KICK_MASK;
+ flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
goto unlock;
}
@@ -10439,13 +10478,16 @@ static void nohz_balancer_kick(struct rq *rq)
*/
nr_busy = atomic_read(&sds->nr_busy_cpus);
if (nr_busy > 1) {
- flags = NOHZ_KICK_MASK;
+ flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
goto unlock;
}
}
unlock:
rcu_read_unlock();
out:
+ if (READ_ONCE(nohz.needs_update))
+ flags |= NOHZ_NEXT_KICK;
+
if (flags)
kick_ilb(flags);
}
@@ -10542,12 +10584,13 @@ void nohz_balance_enter_idle(int cpu)
/*
* Ensures that if nohz_idle_balance() fails to observe our
* @idle_cpus_mask store, it must observe the @has_blocked
- * store.
+ * and @needs_update stores.
*/
smp_mb__after_atomic();
set_cpu_sd_state_idle(cpu);
+ WRITE_ONCE(nohz.needs_update, 1);
out:
/*
* Each time a cpu enter idle, we assume that it has blocked load and
@@ -10596,12 +10639,17 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
/*
* We assume there will be no idle load after this update and clear
* the has_blocked flag. If a cpu enters idle in the mean time, it will
- * set the has_blocked flag and trig another update of idle load.
+ * set the has_blocked flag and trigger another update of idle load.
* Because a cpu that becomes idle, is added to idle_cpus_mask before
* setting the flag, we are sure to not clear the state and not
* check the load of an idle cpu.
+ *
+ * Same applies to idle_cpus_mask vs needs_update.
*/
- WRITE_ONCE(nohz.has_blocked, 0);
+ if (flags & NOHZ_STATS_KICK)
+ WRITE_ONCE(nohz.has_blocked, 0);
+ if (flags & NOHZ_NEXT_KICK)
+ WRITE_ONCE(nohz.needs_update, 0);
/*
* Ensures that if we miss the CPU, we must see the has_blocked
@@ -10623,13 +10671,17 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
* balancing owner will pick it up.
*/
if (need_resched()) {
- has_blocked_load = true;
+ if (flags & NOHZ_STATS_KICK)
+ has_blocked_load = true;
+ if (flags & NOHZ_NEXT_KICK)
+ WRITE_ONCE(nohz.needs_update, 1);
goto abort;
}
rq = cpu_rq(balance_cpu);
- has_blocked_load |= update_nohz_stats(rq);
+ if (flags & NOHZ_STATS_KICK)
+ has_blocked_load |= update_nohz_stats(rq);
/*
* If time for next balance is due,
@@ -10660,8 +10712,9 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
if (likely(update_next_balance))
nohz.next_balance = next_balance;
- WRITE_ONCE(nohz.next_blocked,
- now + msecs_to_jiffies(LOAD_AVG_PERIOD));
+ if (flags & NOHZ_STATS_KICK)
+ WRITE_ONCE(nohz.next_blocked,
+ now + msecs_to_jiffies(LOAD_AVG_PERIOD));
abort:
/* There is still blocked load, enable periodic update */
@@ -10759,9 +10812,9 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
{
unsigned long next_balance = jiffies + HZ;
int this_cpu = this_rq->cpu;
+ u64 t0, t1, curr_cost = 0;
struct sched_domain *sd;
int pulled_task = 0;
- u64 curr_cost = 0;
update_misfit_status(NULL, this_rq);
@@ -10792,47 +10845,49 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
*/
rq_unpin_lock(this_rq, rf);
- if (this_rq->avg_idle < sysctl_sched_migration_cost ||
- !READ_ONCE(this_rq->rd->overload)) {
+ rcu_read_lock();
+ sd = rcu_dereference_check_sched_domain(this_rq->sd);
+
+ if (!READ_ONCE(this_rq->rd->overload) ||
+ (sd && this_rq->avg_idle < sd->max_newidle_lb_cost)) {
- rcu_read_lock();
- sd = rcu_dereference_check_sched_domain(this_rq->sd);
if (sd)
update_next_balance(sd, &next_balance);
rcu_read_unlock();
goto out;
}
+ rcu_read_unlock();
raw_spin_rq_unlock(this_rq);
+ t0 = sched_clock_cpu(this_cpu);
update_blocked_averages(this_cpu);
+
rcu_read_lock();
for_each_domain(this_cpu, sd) {
int continue_balancing = 1;
- u64 t0, domain_cost;
+ u64 domain_cost;
- if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
- update_next_balance(sd, &next_balance);
+ update_next_balance(sd, &next_balance);
+
+ if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost)
break;
- }
if (sd->flags & SD_BALANCE_NEWIDLE) {
- t0 = sched_clock_cpu(this_cpu);
pulled_task = load_balance(this_cpu, this_rq,
sd, CPU_NEWLY_IDLE,
&continue_balancing);
- domain_cost = sched_clock_cpu(this_cpu) - t0;
- if (domain_cost > sd->max_newidle_lb_cost)
- sd->max_newidle_lb_cost = domain_cost;
+ t1 = sched_clock_cpu(this_cpu);
+ domain_cost = t1 - t0;
+ update_newidle_cost(sd, domain_cost);
curr_cost += domain_cost;
+ t0 = t1;
}
- update_next_balance(sd, &next_balance);
-
/*
* Stop searching for tasks to pull if there are
* now runnable tasks on this rq.
@@ -10966,7 +11021,7 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr)
* MIN_NR_TASKS_DURING_FORCEIDLE - 1 tasks and use that to check
* if we need to give up the CPU.
*/
- if (rq->core->core_forceidle && rq->cfs.nr_running == 1 &&
+ if (rq->core->core_forceidle_count && rq->cfs.nr_running == 1 &&
__entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE))
resched_curr(rq);
}
@@ -11354,8 +11409,6 @@ void free_fair_sched_group(struct task_group *tg)
{
int i;
- destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
-
for_each_possible_cpu(i) {
if (tg->cfs_rq)
kfree(tg->cfs_rq[i]);
@@ -11390,7 +11443,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
if (!cfs_rq)
goto err;
- se = kzalloc_node(sizeof(struct sched_entity),
+ se = kzalloc_node(sizeof(struct sched_entity_stats),
GFP_KERNEL, cpu_to_node(i));
if (!se)
goto err_free_rq;
@@ -11432,6 +11485,8 @@ void unregister_fair_sched_group(struct task_group *tg)
struct rq *rq;
int cpu;
+ destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
+
for_each_possible_cpu(cpu) {
if (tg->se[cpu])
remove_entity_load_avg(tg->se[cpu]);
@@ -11556,7 +11611,7 @@ int sched_group_set_idle(struct task_group *tg, long idle)
for_each_possible_cpu(i) {
struct rq *rq = cpu_rq(i);
struct sched_entity *se = tg->se[i];
- struct cfs_rq *grp_cfs_rq = tg->cfs_rq[i];
+ struct cfs_rq *parent_cfs_rq, *grp_cfs_rq = tg->cfs_rq[i];
bool was_idle = cfs_rq_is_idle(grp_cfs_rq);
long idle_task_delta;
struct rq_flags rf;
@@ -11567,6 +11622,14 @@ int sched_group_set_idle(struct task_group *tg, long idle)
if (WARN_ON_ONCE(was_idle == cfs_rq_is_idle(grp_cfs_rq)))
goto next_cpu;
+ if (se->on_rq) {
+ parent_cfs_rq = cfs_rq_of(se);
+ if (cfs_rq_is_idle(grp_cfs_rq))
+ parent_cfs_rq->idle_nr_running++;
+ else
+ parent_cfs_rq->idle_nr_running--;
+ }
+
idle_task_delta = grp_cfs_rq->h_nr_running -
grp_cfs_rq->idle_h_nr_running;
if (!cfs_rq_is_idle(grp_cfs_rq))
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 7f8dace0964c..1cf435bbcd9c 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -46,11 +46,16 @@ SCHED_FEAT(DOUBLE_TICK, false)
*/
SCHED_FEAT(NONTASK_CAPACITY, true)
+#ifdef CONFIG_PREEMPT_RT
+SCHED_FEAT(TTWU_QUEUE, false)
+#else
+
/*
* Queue remote wakeups on the target CPU and process them
* using the scheduler IPI. Reduces rq->lock contention/bounces.
*/
SCHED_FEAT(TTWU_QUEUE, true)
+#endif
/*
* When doing wakeups, attempt to limit superfluous scans of the LLC domain.
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 912b47aa99d8..d17b0a5ce6ac 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -379,10 +379,10 @@ void play_idle_precise(u64 duration_ns, u64 latency_ns)
cpuidle_use_deepest_state(latency_ns);
it.done = 0;
- hrtimer_init_on_stack(&it.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hrtimer_init_on_stack(&it.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
it.timer.function = idle_inject_timer_fn;
hrtimer_start(&it.timer, ns_to_ktime(duration_ns),
- HRTIMER_MODE_REL_PINNED);
+ HRTIMER_MODE_REL_PINNED_HARD);
while (!READ_ONCE(it.done))
do_idle();
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 1652f2bb54b7..a679613a7cb7 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Pressure stall information for CPU, memory and IO
*
@@ -34,13 +35,19 @@
* delayed on that resource such that nobody is advancing and the CPU
* goes idle. This leaves both workload and CPU unproductive.
*
- * Naturally, the FULL state doesn't exist for the CPU resource at the
- * system level, but exist at the cgroup level, means all non-idle tasks
- * in a cgroup are delayed on the CPU resource which used by others outside
- * of the cgroup or throttled by the cgroup cpu.max configuration.
- *
* SOME = nr_delayed_tasks != 0
- * FULL = nr_delayed_tasks != 0 && nr_running_tasks == 0
+ * FULL = nr_delayed_tasks != 0 && nr_productive_tasks == 0
+ *
+ * What it means for a task to be productive is defined differently
+ * for each resource. For IO, productive means a running task. For
+ * memory, productive means a running task that isn't a reclaimer. For
+ * CPU, productive means an oncpu task.
+ *
+ * Naturally, the FULL state doesn't exist for the CPU resource at the
+ * system level, but exist at the cgroup level. At the cgroup level,
+ * FULL means all non-idle tasks in the cgroup are delayed on the CPU
+ * resource which is being used by others outside of the cgroup or
+ * throttled by the cgroup cpu.max configuration.
*
* The percentage of wallclock time spent in those compound stall
* states gives pressure numbers between 0 and 100 for each resource,
@@ -81,13 +88,13 @@
*
* threads = min(nr_nonidle_tasks, nr_cpus)
* SOME = min(nr_delayed_tasks / threads, 1)
- * FULL = (threads - min(nr_running_tasks, threads)) / threads
+ * FULL = (threads - min(nr_productive_tasks, threads)) / threads
*
* For the 257 number crunchers on 256 CPUs, this yields:
*
* threads = min(257, 256)
* SOME = min(1 / 256, 1) = 0.4%
- * FULL = (256 - min(257, 256)) / 256 = 0%
+ * FULL = (256 - min(256, 256)) / 256 = 0%
*
* For the 1 out of 4 memory-delayed tasks, this yields:
*
@@ -112,7 +119,7 @@
* For each runqueue, we track:
*
* tSOME[cpu] = time(nr_delayed_tasks[cpu] != 0)
- * tFULL[cpu] = time(nr_delayed_tasks[cpu] && !nr_running_tasks[cpu])
+ * tFULL[cpu] = time(nr_delayed_tasks[cpu] && !nr_productive_tasks[cpu])
* tNONIDLE[cpu] = time(nr_nonidle_tasks[cpu] != 0)
*
* and then periodically aggregate:
@@ -233,7 +240,8 @@ static bool test_state(unsigned int *tasks, enum psi_states state)
case PSI_MEM_SOME:
return unlikely(tasks[NR_MEMSTALL]);
case PSI_MEM_FULL:
- return unlikely(tasks[NR_MEMSTALL] && !tasks[NR_RUNNING]);
+ return unlikely(tasks[NR_MEMSTALL] &&
+ tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]);
case PSI_CPU_SOME:
return unlikely(tasks[NR_RUNNING] > tasks[NR_ONCPU]);
case PSI_CPU_FULL:
@@ -710,10 +718,11 @@ static void psi_group_change(struct psi_group *group, int cpu,
if (groupc->tasks[t]) {
groupc->tasks[t]--;
} else if (!psi_bug) {
- printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n",
+ printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u %u] clear=%x set=%x\n",
cpu, t, groupc->tasks[0],
groupc->tasks[1], groupc->tasks[2],
- groupc->tasks[3], clear, set);
+ groupc->tasks[3], groupc->tasks[4],
+ clear, set);
psi_bug = 1;
}
}
@@ -833,7 +842,6 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
/*
* When switching between tasks that have an identical
* runtime state, the cgroup that contains both tasks
- * runtime state, the cgroup that contains both tasks
* we reach the first common ancestor. Iterate @next's
* ancestors only until we encounter @prev's ONCPU.
*/
@@ -854,12 +862,15 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
int clear = TSK_ONCPU, set = 0;
/*
- * When we're going to sleep, psi_dequeue() lets us handle
- * TSK_RUNNING and TSK_IOWAIT here, where we can combine it
- * with TSK_ONCPU and save walking common ancestors twice.
+ * When we're going to sleep, psi_dequeue() lets us
+ * handle TSK_RUNNING, TSK_MEMSTALL_RUNNING and
+ * TSK_IOWAIT here, where we can combine it with
+ * TSK_ONCPU and save walking common ancestors twice.
*/
if (sleep) {
clear |= TSK_RUNNING;
+ if (prev->in_memstall)
+ clear |= TSK_MEMSTALL_RUNNING;
if (prev->in_iowait)
set |= TSK_IOWAIT;
}
@@ -908,7 +919,7 @@ void psi_memstall_enter(unsigned long *flags)
rq = this_rq_lock_irq(&rf);
current->in_memstall = 1;
- psi_task_change(current, 0, TSK_MEMSTALL);
+ psi_task_change(current, 0, TSK_MEMSTALL | TSK_MEMSTALL_RUNNING);
rq_unlock_irq(rq, &rf);
}
@@ -937,7 +948,7 @@ void psi_memstall_leave(unsigned long *flags)
rq = this_rq_lock_irq(&rf);
current->in_memstall = 0;
- psi_task_change(current, TSK_MEMSTALL, 0);
+ psi_task_change(current, TSK_MEMSTALL | TSK_MEMSTALL_RUNNING, 0);
rq_unlock_irq(rq, &rf);
}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 3daf42a0f462..7b4f4fbbb404 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -52,11 +52,8 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
rt_b->rt_period_timer.function = sched_rt_period_timer;
}
-static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
+static inline void do_start_rt_bandwidth(struct rt_bandwidth *rt_b)
{
- if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
- return;
-
raw_spin_lock(&rt_b->rt_runtime_lock);
if (!rt_b->rt_period_active) {
rt_b->rt_period_active = 1;
@@ -75,6 +72,14 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
raw_spin_unlock(&rt_b->rt_runtime_lock);
}
+static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
+{
+ if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
+ return;
+
+ do_start_rt_bandwidth(rt_b);
+}
+
void init_rt_rq(struct rt_rq *rt_rq)
{
struct rt_prio_array *array;
@@ -137,13 +142,17 @@ static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
return rt_rq->rq;
}
-void free_rt_sched_group(struct task_group *tg)
+void unregister_rt_sched_group(struct task_group *tg)
{
- int i;
-
if (tg->rt_se)
destroy_rt_bandwidth(&tg->rt_bandwidth);
+}
+
+void free_rt_sched_group(struct task_group *tg)
+{
+ int i;
+
for_each_possible_cpu(i) {
if (tg->rt_rq)
kfree(tg->rt_rq[i]);
@@ -250,6 +259,8 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
return &rq->rt;
}
+void unregister_rt_sched_group(struct task_group *tg) { }
+
void free_rt_sched_group(struct task_group *tg) { }
int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
@@ -1009,8 +1020,10 @@ static void update_curr_rt(struct rq *rq)
if (unlikely((s64)delta_exec <= 0))
return;
- schedstat_set(curr->se.statistics.exec_max,
- max(curr->se.statistics.exec_max, delta_exec));
+ schedstat_set(curr->stats.exec_max,
+ max(curr->stats.exec_max, delta_exec));
+
+ trace_sched_stat_runtime(curr, delta_exec, 0);
curr->se.sum_exec_runtime += delta_exec;
account_group_exec_runtime(curr, delta_exec);
@@ -1023,13 +1036,17 @@ static void update_curr_rt(struct rq *rq)
for_each_sched_rt_entity(rt_se) {
struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
+ int exceeded;
if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
raw_spin_lock(&rt_rq->rt_runtime_lock);
rt_rq->rt_time += delta_exec;
- if (sched_rt_runtime_exceeded(rt_rq))
+ exceeded = sched_rt_runtime_exceeded(rt_rq);
+ if (exceeded)
resched_curr(rq);
raw_spin_unlock(&rt_rq->rt_runtime_lock);
+ if (exceeded)
+ do_start_rt_bandwidth(sched_rt_bandwidth(rt_rq));
}
}
}
@@ -1271,6 +1288,112 @@ static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_arr
rt_se->on_list = 0;
}
+static inline struct sched_statistics *
+__schedstats_from_rt_se(struct sched_rt_entity *rt_se)
+{
+#ifdef CONFIG_RT_GROUP_SCHED
+ /* schedstats is not supported for rt group. */
+ if (!rt_entity_is_task(rt_se))
+ return NULL;
+#endif
+
+ return &rt_task_of(rt_se)->stats;
+}
+
+static inline void
+update_stats_wait_start_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
+{
+ struct sched_statistics *stats;
+ struct task_struct *p = NULL;
+
+ if (!schedstat_enabled())
+ return;
+
+ if (rt_entity_is_task(rt_se))
+ p = rt_task_of(rt_se);
+
+ stats = __schedstats_from_rt_se(rt_se);
+ if (!stats)
+ return;
+
+ __update_stats_wait_start(rq_of_rt_rq(rt_rq), p, stats);
+}
+
+static inline void
+update_stats_enqueue_sleeper_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
+{
+ struct sched_statistics *stats;
+ struct task_struct *p = NULL;
+
+ if (!schedstat_enabled())
+ return;
+
+ if (rt_entity_is_task(rt_se))
+ p = rt_task_of(rt_se);
+
+ stats = __schedstats_from_rt_se(rt_se);
+ if (!stats)
+ return;
+
+ __update_stats_enqueue_sleeper(rq_of_rt_rq(rt_rq), p, stats);
+}
+
+static inline void
+update_stats_enqueue_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se,
+ int flags)
+{
+ if (!schedstat_enabled())
+ return;
+
+ if (flags & ENQUEUE_WAKEUP)
+ update_stats_enqueue_sleeper_rt(rt_rq, rt_se);
+}
+
+static inline void
+update_stats_wait_end_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
+{
+ struct sched_statistics *stats;
+ struct task_struct *p = NULL;
+
+ if (!schedstat_enabled())
+ return;
+
+ if (rt_entity_is_task(rt_se))
+ p = rt_task_of(rt_se);
+
+ stats = __schedstats_from_rt_se(rt_se);
+ if (!stats)
+ return;
+
+ __update_stats_wait_end(rq_of_rt_rq(rt_rq), p, stats);
+}
+
+static inline void
+update_stats_dequeue_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se,
+ int flags)
+{
+ struct task_struct *p = NULL;
+
+ if (!schedstat_enabled())
+ return;
+
+ if (rt_entity_is_task(rt_se))
+ p = rt_task_of(rt_se);
+
+ if ((flags & DEQUEUE_SLEEP) && p) {
+ unsigned int state;
+
+ state = READ_ONCE(p->__state);
+ if (state & TASK_INTERRUPTIBLE)
+ __schedstat_set(p->stats.sleep_start,
+ rq_clock(rq_of_rt_rq(rt_rq)));
+
+ if (state & TASK_UNINTERRUPTIBLE)
+ __schedstat_set(p->stats.block_start,
+ rq_clock(rq_of_rt_rq(rt_rq)));
+ }
+}
+
static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
{
struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
@@ -1344,6 +1467,8 @@ static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
{
struct rq *rq = rq_of_rt_se(rt_se);
+ update_stats_enqueue_rt(rt_rq_of_se(rt_se), rt_se, flags);
+
dequeue_rt_stack(rt_se, flags);
for_each_sched_rt_entity(rt_se)
__enqueue_rt_entity(rt_se, flags);
@@ -1354,6 +1479,8 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
{
struct rq *rq = rq_of_rt_se(rt_se);
+ update_stats_dequeue_rt(rt_rq_of_se(rt_se), rt_se, flags);
+
dequeue_rt_stack(rt_se, flags);
for_each_sched_rt_entity(rt_se) {
@@ -1376,6 +1503,9 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
if (flags & ENQUEUE_WAKEUP)
rt_se->timeout = 0;
+ check_schedstat_required();
+ update_stats_wait_start_rt(rt_rq_of_se(rt_se), rt_se);
+
enqueue_rt_entity(rt_se, flags);
if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
@@ -1576,7 +1706,12 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flag
static inline void set_next_task_rt(struct rq *rq, struct task_struct *p, bool first)
{
+ struct sched_rt_entity *rt_se = &p->rt;
+ struct rt_rq *rt_rq = &rq->rt;
+
p->se.exec_start = rq_clock_task(rq);
+ if (on_rt_rq(&p->rt))
+ update_stats_wait_end_rt(rt_rq, rt_se);
/* The running task is never eligible for pushing */
dequeue_pushable_task(rq, p);
@@ -1650,6 +1785,12 @@ static struct task_struct *pick_next_task_rt(struct rq *rq)
static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
{
+ struct sched_rt_entity *rt_se = &p->rt;
+ struct rt_rq *rt_rq = &rq->rt;
+
+ if (on_rt_rq(&p->rt))
+ update_stats_wait_start_rt(rt_rq, rt_se);
+
update_curr_rt(rq);
update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1);
@@ -2779,8 +2920,12 @@ static int sched_rt_global_validate(void)
static void sched_rt_do_global(void)
{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
def_rt_bandwidth.rt_runtime = global_rt_runtime();
def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());
+ raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
}
int sched_rt_handler(struct ctl_table *table, int write, void *buffer,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 3d3e5793e117..de53be905739 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -37,7 +37,6 @@
#include <linux/binfmts.h>
#include <linux/bitops.h>
-#include <linux/blkdev.h>
#include <linux/compat.h>
#include <linux/context_tracking.h>
#include <linux/cpufreq.h>
@@ -369,6 +368,7 @@ struct cfs_bandwidth {
u64 quota;
u64 runtime;
u64 burst;
+ u64 runtime_snap;
s64 hierarchical_quota;
u8 idle;
@@ -381,7 +381,9 @@ struct cfs_bandwidth {
/* Statistics: */
int nr_periods;
int nr_throttled;
+ int nr_burst;
u64 throttled_time;
+ u64 burst_time;
#endif
};
@@ -486,6 +488,7 @@ extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b);
extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq);
+extern void unregister_rt_sched_group(struct task_group *tg);
extern void free_rt_sched_group(struct task_group *tg);
extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent);
extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
@@ -501,7 +504,7 @@ extern struct task_group *sched_create_group(struct task_group *parent);
extern void sched_online_group(struct task_group *tg,
struct task_group *parent);
extern void sched_destroy_group(struct task_group *tg);
-extern void sched_offline_group(struct task_group *tg);
+extern void sched_release_group(struct task_group *tg);
extern void sched_move_task(struct task_struct *tsk);
@@ -530,6 +533,7 @@ struct cfs_rq {
struct load_weight load;
unsigned int nr_running;
unsigned int h_nr_running; /* SCHED_{NORMAL,BATCH,IDLE} */
+ unsigned int idle_nr_running; /* SCHED_IDLE */
unsigned int idle_h_nr_running; /* SCHED_IDLE */
u64 exec_clock;
@@ -1107,8 +1111,10 @@ struct rq {
unsigned int core_task_seq;
unsigned int core_pick_seq;
unsigned long core_cookie;
- unsigned char core_forceidle;
+ unsigned int core_forceidle_count;
unsigned int core_forceidle_seq;
+ unsigned int core_forceidle_occupation;
+ u64 core_forceidle_start;
#endif
};
@@ -1249,16 +1255,11 @@ static inline bool sched_core_enqueued(struct task_struct *p)
}
extern void sched_core_enqueue(struct rq *rq, struct task_struct *p);
-extern void sched_core_dequeue(struct rq *rq, struct task_struct *p);
+extern void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags);
extern void sched_core_get(void);
extern void sched_core_put(void);
-extern unsigned long sched_core_alloc_cookie(void);
-extern void sched_core_put_cookie(unsigned long cookie);
-extern unsigned long sched_core_get_cookie(unsigned long cookie);
-extern unsigned long sched_core_update_cookie(struct task_struct *p, unsigned long cookie);
-
#else /* !CONFIG_SCHED_CORE */
static inline bool sched_core_enabled(struct rq *rq)
@@ -1422,11 +1423,6 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
extern void update_rq_clock(struct rq *rq);
-static inline u64 __rq_clock_broken(struct rq *rq)
-{
- return READ_ONCE(rq->clock);
-}
-
/*
* rq::clock_update_flags bits
*
@@ -1622,14 +1618,6 @@ rq_lock(struct rq *rq, struct rq_flags *rf)
}
static inline void
-rq_relock(struct rq *rq, struct rq_flags *rf)
- __acquires(rq->lock)
-{
- raw_spin_rq_lock(rq);
- rq_repin_lock(rq, rf);
-}
-
-static inline void
rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf)
__releases(rq->lock)
{
@@ -1809,6 +1797,7 @@ struct sched_group {
unsigned int group_weight;
struct sched_group_capacity *sgc;
int asym_prefer_cpu; /* CPU of highest priority in group */
+ int flags;
/*
* The CPUs this group covers.
@@ -1867,6 +1856,32 @@ static inline void flush_smp_call_function_from_idle(void) { }
#include "stats.h"
#include "autogroup.h"
+#if defined(CONFIG_SCHED_CORE) && defined(CONFIG_SCHEDSTATS)
+
+extern void __sched_core_account_forceidle(struct rq *rq);
+
+static inline void sched_core_account_forceidle(struct rq *rq)
+{
+ if (schedstat_enabled())
+ __sched_core_account_forceidle(rq);
+}
+
+extern void __sched_core_tick(struct rq *rq);
+
+static inline void sched_core_tick(struct rq *rq)
+{
+ if (sched_core_enabled(rq) && schedstat_enabled())
+ __sched_core_tick(rq);
+}
+
+#else
+
+static inline void sched_core_account_forceidle(struct rq *rq) {}
+
+static inline void sched_core_tick(struct rq *rq) {}
+
+#endif /* CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS */
+
#ifdef CONFIG_CGROUP_SCHED
/*
@@ -1926,11 +1941,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
* per-task data have been completed by this moment.
*/
smp_wmb();
-#ifdef CONFIG_THREAD_INFO_IN_TASK
- WRITE_ONCE(p->cpu, cpu);
-#else
WRITE_ONCE(task_thread_info(p)->cpu, cpu);
-#endif
p->wake_cpu = cpu;
#endif
}
@@ -2402,6 +2413,7 @@ extern const_debug unsigned int sysctl_sched_migration_cost;
#ifdef CONFIG_SCHED_DEBUG
extern unsigned int sysctl_sched_latency;
extern unsigned int sysctl_sched_min_granularity;
+extern unsigned int sysctl_sched_idle_min_granularity;
extern unsigned int sysctl_sched_wakeup_granularity;
extern int sysctl_resched_latency_warn_ms;
extern int sysctl_resched_latency_warn_once;
@@ -2709,12 +2721,18 @@ extern void cfs_bandwidth_usage_dec(void);
#define NOHZ_BALANCE_KICK_BIT 0
#define NOHZ_STATS_KICK_BIT 1
#define NOHZ_NEWILB_KICK_BIT 2
+#define NOHZ_NEXT_KICK_BIT 3
+/* Run rebalance_domains() */
#define NOHZ_BALANCE_KICK BIT(NOHZ_BALANCE_KICK_BIT)
+/* Update blocked load */
#define NOHZ_STATS_KICK BIT(NOHZ_STATS_KICK_BIT)
+/* Update blocked load when entering idle */
#define NOHZ_NEWILB_KICK BIT(NOHZ_NEWILB_KICK_BIT)
+/* Update nohz.next_balance */
+#define NOHZ_NEXT_KICK BIT(NOHZ_NEXT_KICK_BIT)
-#define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK)
+#define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK | NOHZ_NEXT_KICK)
#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)
@@ -2948,16 +2966,52 @@ static inline unsigned long cpu_util_dl(struct rq *rq)
return READ_ONCE(rq->avg_dl.util_avg);
}
-static inline unsigned long cpu_util_cfs(struct rq *rq)
+/**
+ * cpu_util_cfs() - Estimates the amount of CPU capacity used by CFS tasks.
+ * @cpu: the CPU to get the utilization for.
+ *
+ * The unit of the return value must be the same as the one of CPU capacity
+ * so that CPU utilization can be compared with CPU capacity.
+ *
+ * CPU utilization is the sum of running time of runnable tasks plus the
+ * recent utilization of currently non-runnable tasks on that CPU.
+ * It represents the amount of CPU capacity currently used by CFS tasks in
+ * the range [0..max CPU capacity] with max CPU capacity being the CPU
+ * capacity at f_max.
+ *
+ * The estimated CPU utilization is defined as the maximum between CPU
+ * utilization and sum of the estimated utilization of the currently
+ * runnable tasks on that CPU. It preserves a utilization "snapshot" of
+ * previously-executed tasks, which helps better deduce how busy a CPU will
+ * be when a long-sleeping task wakes up. The contribution to CPU utilization
+ * of such a task would be significantly decayed at this point of time.
+ *
+ * CPU utilization can be higher than the current CPU capacity
+ * (f_curr/f_max * max CPU capacity) or even the max CPU capacity because
+ * of rounding errors as well as task migrations or wakeups of new tasks.
+ * CPU utilization has to be capped to fit into the [0..max CPU capacity]
+ * range. Otherwise a group of CPUs (CPU0 util = 121% + CPU1 util = 80%)
+ * could be seen as over-utilized even though CPU1 has 20% of spare CPU
+ * capacity. CPU utilization is allowed to overshoot current CPU capacity
+ * though since this is useful for predicting the CPU capacity required
+ * after task migrations (scheduler-driven DVFS).
+ *
+ * Return: (Estimated) utilization for the specified CPU.
+ */
+static inline unsigned long cpu_util_cfs(int cpu)
{
- unsigned long util = READ_ONCE(rq->cfs.avg.util_avg);
+ struct cfs_rq *cfs_rq;
+ unsigned long util;
+
+ cfs_rq = &cpu_rq(cpu)->cfs;
+ util = READ_ONCE(cfs_rq->avg.util_avg);
if (sched_feat(UTIL_EST)) {
util = max_t(unsigned long, util,
- READ_ONCE(rq->cfs.avg.util_est.enqueued));
+ READ_ONCE(cfs_rq->avg.util_est.enqueued));
}
- return util;
+ return min(util, capacity_orig_of(cpu));
}
static inline unsigned long cpu_util_rt(struct rq *rq)
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index 3f93fc3b5648..07dde2928c79 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -4,6 +4,110 @@
*/
#include "sched.h"
+void __update_stats_wait_start(struct rq *rq, struct task_struct *p,
+ struct sched_statistics *stats)
+{
+ u64 wait_start, prev_wait_start;
+
+ wait_start = rq_clock(rq);
+ prev_wait_start = schedstat_val(stats->wait_start);
+
+ if (p && likely(wait_start > prev_wait_start))
+ wait_start -= prev_wait_start;
+
+ __schedstat_set(stats->wait_start, wait_start);
+}
+
+void __update_stats_wait_end(struct rq *rq, struct task_struct *p,
+ struct sched_statistics *stats)
+{
+ u64 delta = rq_clock(rq) - schedstat_val(stats->wait_start);
+
+ if (p) {
+ if (task_on_rq_migrating(p)) {
+ /*
+ * Preserve migrating task's wait time so wait_start
+ * time stamp can be adjusted to accumulate wait time
+ * prior to migration.
+ */
+ __schedstat_set(stats->wait_start, delta);
+
+ return;
+ }
+
+ trace_sched_stat_wait(p, delta);
+ }
+
+ __schedstat_set(stats->wait_max,
+ max(schedstat_val(stats->wait_max), delta));
+ __schedstat_inc(stats->wait_count);
+ __schedstat_add(stats->wait_sum, delta);
+ __schedstat_set(stats->wait_start, 0);
+}
+
+void __update_stats_enqueue_sleeper(struct rq *rq, struct task_struct *p,
+ struct sched_statistics *stats)
+{
+ u64 sleep_start, block_start;
+
+ sleep_start = schedstat_val(stats->sleep_start);
+ block_start = schedstat_val(stats->block_start);
+
+ if (sleep_start) {
+ u64 delta = rq_clock(rq) - sleep_start;
+
+ if ((s64)delta < 0)
+ delta = 0;
+
+ if (unlikely(delta > schedstat_val(stats->sleep_max)))
+ __schedstat_set(stats->sleep_max, delta);
+
+ __schedstat_set(stats->sleep_start, 0);
+ __schedstat_add(stats->sum_sleep_runtime, delta);
+
+ if (p) {
+ account_scheduler_latency(p, delta >> 10, 1);
+ trace_sched_stat_sleep(p, delta);
+ }
+ }
+
+ if (block_start) {
+ u64 delta = rq_clock(rq) - block_start;
+
+ if ((s64)delta < 0)
+ delta = 0;
+
+ if (unlikely(delta > schedstat_val(stats->block_max)))
+ __schedstat_set(stats->block_max, delta);
+
+ __schedstat_set(stats->block_start, 0);
+ __schedstat_add(stats->sum_sleep_runtime, delta);
+ __schedstat_add(stats->sum_block_runtime, delta);
+
+ if (p) {
+ if (p->in_iowait) {
+ __schedstat_add(stats->iowait_sum, delta);
+ __schedstat_inc(stats->iowait_count);
+ trace_sched_stat_iowait(p, delta);
+ }
+
+ trace_sched_stat_blocked(p, delta);
+
+ /*
+ * Blocking time is in units of nanosecs, so shift by
+ * 20 to get a milliseconds-range estimation of the
+ * amount of time that the task spent sleeping:
+ */
+ if (unlikely(prof_on == SLEEP_PROFILING)) {
+ profile_hits(SLEEP_PROFILING,
+ (void *)get_wchan(p),
+ delta >> 20);
+ }
+ account_scheduler_latency(p, delta >> 10, 0);
+ }
+ }
+}
+
/*
* Current schedstat API version.
*
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index d8f8eb0c655b..3a3c826dd83a 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -2,6 +2,8 @@
#ifdef CONFIG_SCHEDSTATS
+extern struct static_key_false sched_schedstats;
+
/*
* Expects runqueue lock to be held for atomicity of update
*/
@@ -40,7 +42,31 @@ rq_sched_info_dequeue(struct rq *rq, unsigned long long delta)
#define schedstat_val(var) (var)
#define schedstat_val_or_zero(var) ((schedstat_enabled()) ? (var) : 0)
+void __update_stats_wait_start(struct rq *rq, struct task_struct *p,
+ struct sched_statistics *stats);
+
+void __update_stats_wait_end(struct rq *rq, struct task_struct *p,
+ struct sched_statistics *stats);
+void __update_stats_enqueue_sleeper(struct rq *rq, struct task_struct *p,
+ struct sched_statistics *stats);
+
+static inline void
+check_schedstat_required(void)
+{
+ if (schedstat_enabled())
+ return;
+
+ /* Force schedstat enabled if a dependent tracepoint is active */
+ if (trace_sched_stat_wait_enabled() ||
+ trace_sched_stat_sleep_enabled() ||
+ trace_sched_stat_iowait_enabled() ||
+ trace_sched_stat_blocked_enabled() ||
+ trace_sched_stat_runtime_enabled())
+ printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, stat_blocked and stat_runtime require the kernel parameter schedstats=enable or kernel.sched_schedstats=1\n");
+}
+
#else /* !CONFIG_SCHEDSTATS: */
+
static inline void rq_sched_info_arrive (struct rq *rq, unsigned long long delta) { }
static inline void rq_sched_info_dequeue(struct rq *rq, unsigned long long delta) { }
static inline void rq_sched_info_depart (struct rq *rq, unsigned long long delta) { }
@@ -53,8 +79,31 @@ static inline void rq_sched_info_depart (struct rq *rq, unsigned long long delt
# define schedstat_set(var, val) do { } while (0)
# define schedstat_val(var) 0
# define schedstat_val_or_zero(var) 0
+
+# define __update_stats_wait_start(rq, p, stats) do { } while (0)
+# define __update_stats_wait_end(rq, p, stats) do { } while (0)
+# define __update_stats_enqueue_sleeper(rq, p, stats) do { } while (0)
+# define check_schedstat_required() do { } while (0)
+
#endif /* CONFIG_SCHEDSTATS */
+#ifdef CONFIG_FAIR_GROUP_SCHED
+struct sched_entity_stats {
+ struct sched_entity se;
+ struct sched_statistics stats;
+} __no_randomize_layout;
+#endif
+
+static inline struct sched_statistics *
+__schedstats_from_se(struct sched_entity *se)
+{
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ if (!entity_is_task(se))
+ return &container_of(se, struct sched_entity_stats, se)->stats;
+#endif
+ return &task_of(se)->stats;
+}
+
#ifdef CONFIG_PSI
/*
* PSI tracks state that persists across sleeps, such as iowaits and
@@ -69,6 +118,9 @@ static inline void psi_enqueue(struct task_struct *p, bool wakeup)
if (static_branch_likely(&psi_disabled))
return;
+ if (p->in_memstall)
+ set |= TSK_MEMSTALL_RUNNING;
+
if (!wakeup || p->sched_psi_wake_requeue) {
if (p->in_memstall)
set |= TSK_MEMSTALL;
@@ -99,7 +151,7 @@ static inline void psi_dequeue(struct task_struct *p, bool sleep)
return;
if (p->in_memstall)
- clear |= TSK_MEMSTALL;
+ clear |= (TSK_MEMSTALL | TSK_MEMSTALL_RUNNING);
psi_task_change(p, clear, 0);
}
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index f988ebe3febb..0b165a25f22f 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -78,8 +78,8 @@ static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
if (unlikely((s64)delta_exec < 0))
delta_exec = 0;
- schedstat_set(curr->se.statistics.exec_max,
- max(curr->se.statistics.exec_max, delta_exec));
+ schedstat_set(curr->stats.exec_max,
+ max(curr->stats.exec_max, delta_exec));
curr->se.sum_exec_runtime += delta_exec;
account_group_exec_runtime(curr, delta_exec);
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 4e8698e62f07..d201a7052a29 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -526,7 +526,7 @@ static int init_rootdomain(struct root_domain *rd)
#ifdef HAVE_RT_PUSH_IPI
rd->rto_cpu = -1;
raw_spin_lock_init(&rd->rto_lock);
- init_irq_work(&rd->rto_push_work, rto_push_irq_work_func);
+ rd->rto_push_work = IRQ_WORK_INIT_HARD(rto_push_irq_work_func);
#endif
rd->visit_gen = 0;
@@ -688,7 +688,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
{
struct rq *rq = cpu_rq(cpu);
struct sched_domain *tmp;
- int numa_distance = 0;
/* Remove the sched domains which do not contribute to scheduling. */
for (tmp = sd; tmp; ) {
@@ -716,13 +715,22 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
tmp = sd;
sd = sd->parent;
destroy_sched_domain(tmp);
- if (sd)
+ if (sd) {
+ struct sched_group *sg = sd->groups;
+
+ /*
+ * sched groups hold the flags of the child sched
+ * domain for convenience. Clear such flags since
+ * the child is being destroyed.
+ */
+ do {
+ sg->flags = 0;
+ } while (sg != sd->groups);
+
sd->child = NULL;
+ }
}
- for (tmp = sd; tmp; tmp = tmp->parent)
- numa_distance += !!(tmp->flags & SD_NUMA);
-
sched_domain_debug(sd, cpu);
rq_attach_root(rq, rd);
@@ -916,10 +924,12 @@ build_group_from_child_sched_domain(struct sched_domain *sd, int cpu)
return NULL;
sg_span = sched_group_span(sg);
- if (sd->child)
+ if (sd->child) {
cpumask_copy(sg_span, sched_domain_span(sd->child));
- else
+ sg->flags = sd->child->flags;
+ } else {
cpumask_copy(sg_span, sched_domain_span(sd));
+ }
atomic_inc(&sg->ref);
return sg;
@@ -1169,6 +1179,7 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd)
if (child) {
cpumask_copy(sched_group_span(sg), sched_domain_span(child));
cpumask_copy(group_balance_mask(sg), sched_group_span(sg));
+ sg->flags = child->flags;
} else {
cpumask_set_cpu(cpu, sched_group_span(sg));
cpumask_set_cpu(cpu, group_balance_mask(sg));
@@ -1481,7 +1492,6 @@ static int sched_domains_curr_level;
int sched_max_numa_distance;
static int *sched_domains_numa_distance;
static struct cpumask ***sched_domains_numa_masks;
-int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
static unsigned long __read_mostly *sched_numa_onlined_nodes;
#endif
@@ -1557,7 +1567,7 @@ sd_init(struct sched_domain_topology_level *tl,
.last_balance = jiffies,
.balance_interval = sd_weight,
.max_newidle_lb_cost = 0,
- .next_decay_max_lb_cost = jiffies,
+ .last_decay_max_lb_cost = jiffies,
.child = child,
#ifdef CONFIG_SCHED_DEBUG
.name = tl->name,
@@ -1627,6 +1637,11 @@ static struct sched_domain_topology_level default_topology[] = {
#ifdef CONFIG_SCHED_SMT
{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
#endif
+
+#ifdef CONFIG_SCHED_CLUSTER
+ { cpu_clustergroup_mask, cpu_cluster_flags, SD_INIT_NAME(CLS) },
+#endif
+
#ifdef CONFIG_SCHED_MC
{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
#endif
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 76577d1642a5..eca38107b32f 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -238,6 +238,13 @@ void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode)
}
EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
+void __wake_up_pollfree(struct wait_queue_head *wq_head)
+{
+ __wake_up(wq_head, TASK_NORMAL, 0, poll_to_key(EPOLLHUP | POLLFREE));
+ /* POLLFREE must have cleared the queue. */
+ WARN_ON_ONCE(waitqueue_active(wq_head));
+}
+
/*
* Note: we use "set_current_state()" _after_ the wait-queue add,
* because we need a memory barrier there on SMP, so that any
diff --git a/kernel/scs.c b/kernel/scs.c
index e2a71fc82fa0..579841be8864 100644
--- a/kernel/scs.c
+++ b/kernel/scs.c
@@ -78,6 +78,7 @@ void scs_free(void *s)
if (this_cpu_cmpxchg(scs_cache[i], 0, s) == NULL)
return;
+ kasan_unpoison_vmalloc(s, SCS_SIZE);
vfree_atomic(s);
}
diff --git a/kernel/signal.c b/kernel/signal.c
index 952741f6d0f9..dfcee3888b00 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -426,22 +426,10 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t gfp_flags,
*/
rcu_read_lock();
ucounts = task_ucounts(t);
- sigpending = inc_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING, 1);
- switch (sigpending) {
- case 1:
- if (likely(get_ucounts(ucounts)))
- break;
- fallthrough;
- case LONG_MAX:
- /*
- * we need to decrease the ucount in the userns tree on any
- * failure to avoid counts leaking.
- */
- dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING, 1);
- rcu_read_unlock();
- return NULL;
- }
+ sigpending = inc_rlimit_get_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING);
rcu_read_unlock();
+ if (!sigpending)
+ return NULL;
if (override_rlimit || likely(sigpending <= task_rlimit(t, RLIMIT_SIGPENDING))) {
q = kmem_cache_alloc(sigqueue_cachep, gfp_flags);
@@ -450,8 +438,7 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t gfp_flags,
}
if (unlikely(q == NULL)) {
- if (dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING, 1))
- put_ucounts(ucounts);
+ dec_rlimit_put_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING);
} else {
INIT_LIST_HEAD(&q->list);
q->flags = sigqueue_flags;
@@ -464,8 +451,8 @@ static void __sigqueue_free(struct sigqueue *q)
{
if (q->flags & SIGQUEUE_PREALLOC)
return;
- if (q->ucounts && dec_rlimit_ucounts(q->ucounts, UCOUNT_RLIMIT_SIGPENDING, 1)) {
- put_ucounts(q->ucounts);
+ if (q->ucounts) {
+ dec_rlimit_put_ucounts(q->ucounts, UCOUNT_RLIMIT_SIGPENDING);
q->ucounts = NULL;
}
kmem_cache_free(sigqueue_cachep, q);
@@ -1311,6 +1298,12 @@ int do_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p
return ret;
}
+enum sig_handler {
+ HANDLER_CURRENT, /* If reachable use the current handler */
+ HANDLER_SIG_DFL, /* Always use SIG_DFL handler semantics */
+ HANDLER_EXIT, /* Only visible as the process exit code */
+};
+
/*
* Force a signal that the process can't ignore: if necessary
* we unblock the signal and change any SIG_IGN to SIG_DFL.
@@ -1323,7 +1316,8 @@ int do_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p
* that is why we also clear SIGNAL_UNKILLABLE.
*/
static int
-force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t, bool sigdfl)
+force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t,
+ enum sig_handler handler)
{
unsigned long int flags;
int ret, blocked, ignored;
@@ -1334,8 +1328,10 @@ force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t, bool
action = &t->sighand->action[sig-1];
ignored = action->sa.sa_handler == SIG_IGN;
blocked = sigismember(&t->blocked, sig);
- if (blocked || ignored || sigdfl) {
+ if (blocked || ignored || (handler != HANDLER_CURRENT)) {
action->sa.sa_handler = SIG_DFL;
+ if (handler == HANDLER_EXIT)
+ action->sa.sa_flags |= SA_IMMUTABLE;
if (blocked) {
sigdelset(&t->blocked, sig);
recalc_sigpending_and_wake(t);
@@ -1355,7 +1351,7 @@ force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t, bool
int force_sig_info(struct kernel_siginfo *info)
{
- return force_sig_info_to_task(info, current, false);
+ return force_sig_info_to_task(info, current, HANDLER_CURRENT);
}
/*
@@ -1662,6 +1658,32 @@ void force_sig(int sig)
}
EXPORT_SYMBOL(force_sig);
+void force_fatal_sig(int sig)
+{
+ struct kernel_siginfo info;
+
+ clear_siginfo(&info);
+ info.si_signo = sig;
+ info.si_errno = 0;
+ info.si_code = SI_KERNEL;
+ info.si_pid = 0;
+ info.si_uid = 0;
+ force_sig_info_to_task(&info, current, HANDLER_SIG_DFL);
+}
+
+void force_exit_sig(int sig)
+{
+ struct kernel_siginfo info;
+
+ clear_siginfo(&info);
+ info.si_signo = sig;
+ info.si_errno = 0;
+ info.si_code = SI_KERNEL;
+ info.si_pid = 0;
+ info.si_uid = 0;
+ force_sig_info_to_task(&info, current, HANDLER_EXIT);
+}
+
/*
* When things go south during signal handling, we
* will force a SIGSEGV. And if the signal that caused
@@ -1670,15 +1692,10 @@ EXPORT_SYMBOL(force_sig);
*/
void force_sigsegv(int sig)
{
- struct task_struct *p = current;
-
- if (sig == SIGSEGV) {
- unsigned long flags;
- spin_lock_irqsave(&p->sighand->siglock, flags);
- p->sighand->action[sig - 1].sa.sa_handler = SIG_DFL;
- spin_unlock_irqrestore(&p->sighand->siglock, flags);
- }
- force_sig(SIGSEGV);
+ if (sig == SIGSEGV)
+ force_fatal_sig(SIGSEGV);
+ else
+ force_sig(SIGSEGV);
}
int force_sig_fault_to_task(int sig, int code, void __user *addr
@@ -1697,7 +1714,7 @@ int force_sig_fault_to_task(int sig, int code, void __user *addr
info.si_flags = flags;
info.si_isr = isr;
#endif
- return force_sig_info_to_task(&info, t, false);
+ return force_sig_info_to_task(&info, t, HANDLER_CURRENT);
}
int force_sig_fault(int sig, int code, void __user *addr
@@ -1817,7 +1834,8 @@ int force_sig_seccomp(int syscall, int reason, bool force_coredump)
info.si_errno = reason;
info.si_arch = syscall_get_arch(current);
info.si_syscall = syscall;
- return force_sig_info_to_task(&info, current, force_coredump);
+ return force_sig_info_to_task(&info, current,
+ force_coredump ? HANDLER_EXIT : HANDLER_CURRENT);
}
/* For the crazy architectures that include trap information in
@@ -2158,40 +2176,6 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
spin_unlock_irqrestore(&sighand->siglock, flags);
}
-static inline bool may_ptrace_stop(void)
-{
- if (!likely(current->ptrace))
- return false;
- /*
- * Are we in the middle of do_coredump?
- * If so and our tracer is also part of the coredump stopping
- * is a deadlock situation, and pointless because our tracer
- * is dead so don't allow us to stop.
- * If SIGKILL was already sent before the caller unlocked
- * ->siglock we must see ->core_state != NULL. Otherwise it
- * is safe to enter schedule().
- *
- * This is almost outdated, a task with the pending SIGKILL can't
- * block in TASK_TRACED. But PTRACE_EVENT_EXIT can be reported
- * after SIGKILL was already dequeued.
- */
- if (unlikely(current->mm->core_state) &&
- unlikely(current->mm == current->parent->mm))
- return false;
-
- return true;
-}
-
-/*
- * Return non-zero if there is a SIGKILL that should be waking us up.
- * Called with the siglock held.
- */
-static bool sigkill_pending(struct task_struct *tsk)
-{
- return sigismember(&tsk->pending.signal, SIGKILL) ||
- sigismember(&tsk->signal->shared_pending.signal, SIGKILL);
-}
-
/*
* This must be called with current->sighand->siglock held.
*
@@ -2209,7 +2193,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t
{
bool gstop_done = false;
- if (arch_ptrace_stop_needed(exit_code, info)) {
+ if (arch_ptrace_stop_needed()) {
/*
* The arch code has something special to do before a
* ptrace stop. This is allowed to block, e.g. for faults
@@ -2217,17 +2201,16 @@ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t
* calling arch_ptrace_stop, so we must release it now.
* To preserve proper semantics, we must do this before
* any signal bookkeeping like checking group_stop_count.
- * Meanwhile, a SIGKILL could come in before we retake the
- * siglock. That must prevent us from sleeping in TASK_TRACED.
- * So after regaining the lock, we must check for SIGKILL.
*/
spin_unlock_irq(&current->sighand->siglock);
- arch_ptrace_stop(exit_code, info);
+ arch_ptrace_stop();
spin_lock_irq(&current->sighand->siglock);
- if (sigkill_pending(current))
- return;
}
+ /*
+ * schedule() will not sleep if there is a pending signal that
+ * can awaken the task.
+ */
set_special_state(TASK_TRACED);
/*
@@ -2273,7 +2256,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t
spin_unlock_irq(&current->sighand->siglock);
read_lock(&tasklist_lock);
- if (may_ptrace_stop()) {
+ if (likely(current->ptrace)) {
/*
* Notify parents of the stop.
*
@@ -2752,7 +2735,8 @@ relock:
if (!signr)
break; /* will return 0 */
- if (unlikely(current->ptrace) && signr != SIGKILL) {
+ if (unlikely(current->ptrace) && (signr != SIGKILL) &&
+ !(sighand->action[signr -1].sa.sa_flags & SA_IMMUTABLE)) {
signr = ptrace_signal(signr, &ksig->info);
if (!signr)
continue;
@@ -4102,6 +4086,10 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
k = &p->sighand->action[sig-1];
spin_lock_irq(&p->sighand->siglock);
+ if (k->sa.sa_flags & SA_IMMUTABLE) {
+ spin_unlock_irq(&p->sighand->siglock);
+ return -EINVAL;
+ }
if (oact)
*oact = *k;
@@ -4151,11 +4139,29 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
return 0;
}
+#ifdef CONFIG_DYNAMIC_SIGFRAME
+static inline void sigaltstack_lock(void)
+ __acquires(&current->sighand->siglock)
+{
+ spin_lock_irq(&current->sighand->siglock);
+}
+
+static inline void sigaltstack_unlock(void)
+ __releases(&current->sighand->siglock)
+{
+ spin_unlock_irq(&current->sighand->siglock);
+}
+#else
+static inline void sigaltstack_lock(void) { }
+static inline void sigaltstack_unlock(void) { }
+#endif
+
static int
do_sigaltstack (const stack_t *ss, stack_t *oss, unsigned long sp,
size_t min_ss_size)
{
struct task_struct *t = current;
+ int ret = 0;
if (oss) {
memset(oss, 0, sizeof(stack_t));
@@ -4179,19 +4185,33 @@ do_sigaltstack (const stack_t *ss, stack_t *oss, unsigned long sp,
ss_mode != 0))
return -EINVAL;
+ /*
+ * Return before taking any locks if no actual
+ * sigaltstack changes were requested.
+ */
+ if (t->sas_ss_sp == (unsigned long)ss_sp &&
+ t->sas_ss_size == ss_size &&
+ t->sas_ss_flags == ss_flags)
+ return 0;
+
+ sigaltstack_lock();
if (ss_mode == SS_DISABLE) {
ss_size = 0;
ss_sp = NULL;
} else {
if (unlikely(ss_size < min_ss_size))
- return -ENOMEM;
+ ret = -ENOMEM;
+ if (!sigaltstack_size_valid(ss_size))
+ ret = -ENOMEM;
}
-
- t->sas_ss_sp = (unsigned long) ss_sp;
- t->sas_ss_size = ss_size;
- t->sas_ss_flags = ss_flags;
+ if (!ret) {
+ t->sas_ss_sp = (unsigned long) ss_sp;
+ t->sas_ss_size = ss_size;
+ t->sas_ss_flags = ss_flags;
+ }
+ sigaltstack_unlock();
}
- return 0;
+ return ret;
}
SYSCALL_DEFINE2(sigaltstack,const stack_t __user *,uss, stack_t __user *,uoss)
diff --git a/kernel/smp.c b/kernel/smp.c
index f43ede0ab183..01a7c1706a58 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -1170,14 +1170,12 @@ void wake_up_all_idle_cpus(void)
{
int cpu;
- preempt_disable();
- for_each_online_cpu(cpu) {
- if (cpu == smp_processor_id())
- continue;
-
- wake_up_if_idle(cpu);
+ for_each_possible_cpu(cpu) {
+ preempt_disable();
+ if (cpu != smp_processor_id() && cpu_online(cpu))
+ wake_up_if_idle(cpu);
+ preempt_enable();
}
- preempt_enable();
}
EXPORT_SYMBOL_GPL(wake_up_all_idle_cpus);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 322b65d45676..41f470929e99 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -595,7 +595,8 @@ void irq_enter_rcu(void)
{
__irq_enter_raw();
- if (is_idle_task(current) && (irq_count() == HARDIRQ_OFFSET))
+ if (tick_nohz_full_cpu(smp_processor_id()) ||
+ (is_idle_task(current) && (irq_count() == HARDIRQ_OFFSET)))
tick_irq_enter();
account_hardirq_enter(current);
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index 9f8117c7cfdd..9c625257023d 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -13,6 +13,7 @@
#include <linux/export.h>
#include <linux/kallsyms.h>
#include <linux/stacktrace.h>
+#include <linux/interrupt.h>
/**
* stack_trace_print - Print the entries in the stack trace
@@ -373,3 +374,32 @@ unsigned int stack_trace_save_user(unsigned long *store, unsigned int size)
#endif /* CONFIG_USER_STACKTRACE_SUPPORT */
#endif /* !CONFIG_ARCH_STACKWALK */
+
+static inline bool in_irqentry_text(unsigned long ptr)
+{
+ return (ptr >= (unsigned long)&__irqentry_text_start &&
+ ptr < (unsigned long)&__irqentry_text_end) ||
+ (ptr >= (unsigned long)&__softirqentry_text_start &&
+ ptr < (unsigned long)&__softirqentry_text_end);
+}
+
+/**
+ * filter_irq_stacks - Find first IRQ stack entry in trace
+ * @entries: Pointer to stack trace array
+ * @nr_entries: Number of entries in the storage array
+ *
+ * Return: Number of trace entries until IRQ stack starts.
+ */
+unsigned int filter_irq_stacks(unsigned long *entries, unsigned int nr_entries)
+{
+ unsigned int i;
+
+ for (i = 0; i < nr_entries; i++) {
+ if (in_irqentry_text(entries[i])) {
+ /* Include the irqentry function into the stack. */
+ return i + 1;
+ }
+ }
+ return nr_entries;
+}
+EXPORT_SYMBOL_GPL(filter_irq_stacks);
diff --git a/kernel/sys.c b/kernel/sys.c
index 72c7639e3c98..2450a9f33cb0 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1847,7 +1847,6 @@ SYSCALL_DEFINE1(umask, int, mask)
static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
{
struct fd exe;
- struct file *old_exe, *exe_file;
struct inode *inode;
int err;
@@ -1870,40 +1869,10 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
if (err)
goto exit;
- /*
- * Forbid mm->exe_file change if old file still mapped.
- */
- exe_file = get_mm_exe_file(mm);
- err = -EBUSY;
- if (exe_file) {
- struct vm_area_struct *vma;
-
- mmap_read_lock(mm);
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
- if (!vma->vm_file)
- continue;
- if (path_equal(&vma->vm_file->f_path,
- &exe_file->f_path))
- goto exit_err;
- }
-
- mmap_read_unlock(mm);
- fput(exe_file);
- }
-
- err = 0;
- /* set the new file, lockless */
- get_file(exe.file);
- old_exe = xchg(&mm->exe_file, exe.file);
- if (old_exe)
- fput(old_exe);
+ err = replace_mm_exe_file(mm, exe.file);
exit:
fdput(exe);
return err;
-exit_err:
- mmap_read_unlock(mm);
- fput(exe_file);
- goto exit;
}
/*
@@ -1961,13 +1930,6 @@ static int validate_prctl_map_addr(struct prctl_mm_map *prctl_map)
error = -EINVAL;
/*
- * @brk should be after @end_data in traditional maps.
- */
- if (prctl_map->start_brk <= prctl_map->end_data ||
- prctl_map->brk <= prctl_map->end_data)
- goto out;
-
- /*
* Neither we should allow to override limits if they set.
*/
if (check_data_rlimit(rlimit(RLIMIT_DATA), prctl_map->brk,
@@ -2299,6 +2261,66 @@ int __weak arch_prctl_spec_ctrl_set(struct task_struct *t, unsigned long which,
#define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LOCAL_THROTTLE)
+#ifdef CONFIG_ANON_VMA_NAME
+
+#define ANON_VMA_NAME_MAX_LEN 80
+#define ANON_VMA_NAME_INVALID_CHARS "\\`$[]"
+
+static inline bool is_valid_name_char(char ch)
+{
+ /* printable ascii characters, excluding ANON_VMA_NAME_INVALID_CHARS */
+ return ch > 0x1f && ch < 0x7f &&
+ !strchr(ANON_VMA_NAME_INVALID_CHARS, ch);
+}
+
+static int prctl_set_vma(unsigned long opt, unsigned long addr,
+ unsigned long size, unsigned long arg)
+{
+ struct mm_struct *mm = current->mm;
+ const char __user *uname;
+ char *name, *pch;
+ int error;
+
+ switch (opt) {
+ case PR_SET_VMA_ANON_NAME:
+ uname = (const char __user *)arg;
+ if (uname) {
+ name = strndup_user(uname, ANON_VMA_NAME_MAX_LEN);
+
+ if (IS_ERR(name))
+ return PTR_ERR(name);
+
+ for (pch = name; *pch != '\0'; pch++) {
+ if (!is_valid_name_char(*pch)) {
+ kfree(name);
+ return -EINVAL;
+ }
+ }
+ } else {
+ /* Reset the name */
+ name = NULL;
+ }
+
+ mmap_write_lock(mm);
+ error = madvise_set_anon_name(mm, addr, size, name);
+ mmap_write_unlock(mm);
+ kfree(name);
+ break;
+ default:
+ error = -EINVAL;
+ }
+
+ return error;
+}
+
+#else /* CONFIG_ANON_VMA_NAME */
+static int prctl_set_vma(unsigned long opt, unsigned long start,
+ unsigned long size, unsigned long arg)
+{
+ return -EINVAL;
+}
+#endif /* CONFIG_ANON_VMA_NAME */
+
SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
unsigned long, arg4, unsigned long, arg5)
{
@@ -2568,6 +2590,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
error = sched_core_share_pid(arg2, arg3, arg4, arg5);
break;
#endif
+ case PR_SET_VMA:
+ error = prctl_set_vma(arg2, arg3, arg4, arg5);
+ break;
default:
error = -EINVAL;
break;
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 64578adfe115..a492f159624f 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -143,13 +143,14 @@ COND_SYSCALL(capset);
/* __ARCH_WANT_SYS_CLONE3 */
COND_SYSCALL(clone3);
-/* kernel/futex.c */
+/* kernel/futex/syscalls.c */
COND_SYSCALL(futex);
COND_SYSCALL(futex_time32);
COND_SYSCALL(set_robust_list);
COND_SYSCALL_COMPAT(set_robust_list);
COND_SYSCALL(get_robust_list);
COND_SYSCALL_COMPAT(get_robust_list);
+COND_SYSCALL(futex_waitv);
/* kernel/hrtimer.c */
@@ -292,15 +293,11 @@ COND_SYSCALL(process_madvise);
COND_SYSCALL(process_mrelease);
COND_SYSCALL(remap_file_pages);
COND_SYSCALL(mbind);
-COND_SYSCALL_COMPAT(mbind);
COND_SYSCALL(get_mempolicy);
-COND_SYSCALL_COMPAT(get_mempolicy);
COND_SYSCALL(set_mempolicy);
-COND_SYSCALL_COMPAT(set_mempolicy);
COND_SYSCALL(migrate_pages);
-COND_SYSCALL_COMPAT(migrate_pages);
COND_SYSCALL(move_pages);
-COND_SYSCALL_COMPAT(move_pages);
+COND_SYSCALL(set_mempolicy_home_node);
COND_SYSCALL(perf_event_open);
COND_SYSCALL(accept4);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 083be6af29d7..ef77be575d87 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -33,6 +33,7 @@
#include <linux/security.h>
#include <linux/ctype.h>
#include <linux/kmemleak.h>
+#include <linux/filter.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/kernel.h>
@@ -122,6 +123,7 @@ static unsigned long long_max = LONG_MAX;
static int one_hundred = 100;
static int two_hundred = 200;
static int one_thousand = 1000;
+static int three_thousand = 3000;
#ifdef CONFIG_PRINTK
static int ten_thousand = 10000;
#endif
@@ -2959,7 +2961,7 @@ static struct ctl_table vm_table[] = {
.mode = 0644,
.proc_handler = watermark_scale_factor_sysctl_handler,
.extra1 = SYSCTL_ONE,
- .extra2 = &one_thousand,
+ .extra2 = &three_thousand,
},
{
.procname = "percpu_pagelist_high_fraction",
diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c
deleted file mode 100644
index 76c997fdbc9d..000000000000
--- a/kernel/test_kprobes.c
+++ /dev/null
@@ -1,313 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * test_kprobes.c - simple sanity test for *probes
- *
- * Copyright IBM Corp. 2008
- */
-
-#define pr_fmt(fmt) "Kprobe smoke test: " fmt
-
-#include <linux/kernel.h>
-#include <linux/kprobes.h>
-#include <linux/random.h>
-
-#define div_factor 3
-
-static u32 rand1, preh_val, posth_val;
-static int errors, handler_errors, num_tests;
-static u32 (*target)(u32 value);
-static u32 (*target2)(u32 value);
-
-static noinline u32 kprobe_target(u32 value)
-{
- return (value / div_factor);
-}
-
-static int kp_pre_handler(struct kprobe *p, struct pt_regs *regs)
-{
- if (preemptible()) {
- handler_errors++;
- pr_err("pre-handler is preemptible\n");
- }
- preh_val = (rand1 / div_factor);
- return 0;
-}
-
-static void kp_post_handler(struct kprobe *p, struct pt_regs *regs,
- unsigned long flags)
-{
- if (preemptible()) {
- handler_errors++;
- pr_err("post-handler is preemptible\n");
- }
- if (preh_val != (rand1 / div_factor)) {
- handler_errors++;
- pr_err("incorrect value in post_handler\n");
- }
- posth_val = preh_val + div_factor;
-}
-
-static struct kprobe kp = {
- .symbol_name = "kprobe_target",
- .pre_handler = kp_pre_handler,
- .post_handler = kp_post_handler
-};
-
-static int test_kprobe(void)
-{
- int ret;
-
- ret = register_kprobe(&kp);
- if (ret < 0) {
- pr_err("register_kprobe returned %d\n", ret);
- return ret;
- }
-
- ret = target(rand1);
- unregister_kprobe(&kp);
-
- if (preh_val == 0) {
- pr_err("kprobe pre_handler not called\n");
- handler_errors++;
- }
-
- if (posth_val == 0) {
- pr_err("kprobe post_handler not called\n");
- handler_errors++;
- }
-
- return 0;
-}
-
-static noinline u32 kprobe_target2(u32 value)
-{
- return (value / div_factor) + 1;
-}
-
-static int kp_pre_handler2(struct kprobe *p, struct pt_regs *regs)
-{
- preh_val = (rand1 / div_factor) + 1;
- return 0;
-}
-
-static void kp_post_handler2(struct kprobe *p, struct pt_regs *regs,
- unsigned long flags)
-{
- if (preh_val != (rand1 / div_factor) + 1) {
- handler_errors++;
- pr_err("incorrect value in post_handler2\n");
- }
- posth_val = preh_val + div_factor;
-}
-
-static struct kprobe kp2 = {
- .symbol_name = "kprobe_target2",
- .pre_handler = kp_pre_handler2,
- .post_handler = kp_post_handler2
-};
-
-static int test_kprobes(void)
-{
- int ret;
- struct kprobe *kps[2] = {&kp, &kp2};
-
- /* addr and flags should be cleard for reusing kprobe. */
- kp.addr = NULL;
- kp.flags = 0;
- ret = register_kprobes(kps, 2);
- if (ret < 0) {
- pr_err("register_kprobes returned %d\n", ret);
- return ret;
- }
-
- preh_val = 0;
- posth_val = 0;
- ret = target(rand1);
-
- if (preh_val == 0) {
- pr_err("kprobe pre_handler not called\n");
- handler_errors++;
- }
-
- if (posth_val == 0) {
- pr_err("kprobe post_handler not called\n");
- handler_errors++;
- }
-
- preh_val = 0;
- posth_val = 0;
- ret = target2(rand1);
-
- if (preh_val == 0) {
- pr_err("kprobe pre_handler2 not called\n");
- handler_errors++;
- }
-
- if (posth_val == 0) {
- pr_err("kprobe post_handler2 not called\n");
- handler_errors++;
- }
-
- unregister_kprobes(kps, 2);
- return 0;
-
-}
-
-#ifdef CONFIG_KRETPROBES
-static u32 krph_val;
-
-static int entry_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
-{
- if (preemptible()) {
- handler_errors++;
- pr_err("kretprobe entry handler is preemptible\n");
- }
- krph_val = (rand1 / div_factor);
- return 0;
-}
-
-static int return_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
-{
- unsigned long ret = regs_return_value(regs);
-
- if (preemptible()) {
- handler_errors++;
- pr_err("kretprobe return handler is preemptible\n");
- }
- if (ret != (rand1 / div_factor)) {
- handler_errors++;
- pr_err("incorrect value in kretprobe handler\n");
- }
- if (krph_val == 0) {
- handler_errors++;
- pr_err("call to kretprobe entry handler failed\n");
- }
-
- krph_val = rand1;
- return 0;
-}
-
-static struct kretprobe rp = {
- .handler = return_handler,
- .entry_handler = entry_handler,
- .kp.symbol_name = "kprobe_target"
-};
-
-static int test_kretprobe(void)
-{
- int ret;
-
- ret = register_kretprobe(&rp);
- if (ret < 0) {
- pr_err("register_kretprobe returned %d\n", ret);
- return ret;
- }
-
- ret = target(rand1);
- unregister_kretprobe(&rp);
- if (krph_val != rand1) {
- pr_err("kretprobe handler not called\n");
- handler_errors++;
- }
-
- return 0;
-}
-
-static int return_handler2(struct kretprobe_instance *ri, struct pt_regs *regs)
-{
- unsigned long ret = regs_return_value(regs);
-
- if (ret != (rand1 / div_factor) + 1) {
- handler_errors++;
- pr_err("incorrect value in kretprobe handler2\n");
- }
- if (krph_val == 0) {
- handler_errors++;
- pr_err("call to kretprobe entry handler failed\n");
- }
-
- krph_val = rand1;
- return 0;
-}
-
-static struct kretprobe rp2 = {
- .handler = return_handler2,
- .entry_handler = entry_handler,
- .kp.symbol_name = "kprobe_target2"
-};
-
-static int test_kretprobes(void)
-{
- int ret;
- struct kretprobe *rps[2] = {&rp, &rp2};
-
- /* addr and flags should be cleard for reusing kprobe. */
- rp.kp.addr = NULL;
- rp.kp.flags = 0;
- ret = register_kretprobes(rps, 2);
- if (ret < 0) {
- pr_err("register_kretprobe returned %d\n", ret);
- return ret;
- }
-
- krph_val = 0;
- ret = target(rand1);
- if (krph_val != rand1) {
- pr_err("kretprobe handler not called\n");
- handler_errors++;
- }
-
- krph_val = 0;
- ret = target2(rand1);
- if (krph_val != rand1) {
- pr_err("kretprobe handler2 not called\n");
- handler_errors++;
- }
- unregister_kretprobes(rps, 2);
- return 0;
-}
-#endif /* CONFIG_KRETPROBES */
-
-int init_test_probes(void)
-{
- int ret;
-
- target = kprobe_target;
- target2 = kprobe_target2;
-
- do {
- rand1 = prandom_u32();
- } while (rand1 <= div_factor);
-
- pr_info("started\n");
- num_tests++;
- ret = test_kprobe();
- if (ret < 0)
- errors++;
-
- num_tests++;
- ret = test_kprobes();
- if (ret < 0)
- errors++;
-
-#ifdef CONFIG_KRETPROBES
- num_tests++;
- ret = test_kretprobe();
- if (ret < 0)
- errors++;
-
- num_tests++;
- ret = test_kretprobes();
- if (ret < 0)
- errors++;
-#endif /* CONFIG_KRETPROBES */
-
- if (errors)
- pr_err("BUG: %d out of %d tests failed\n", errors, num_tests);
- else if (handler_errors)
- pr_err("BUG: %d error(s) running handlers\n", handler_errors);
- else
- pr_info("passed successfully\n");
-
- return 0;
-}
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index b8a14d2fb5ba..b7e52a642948 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -107,7 +107,7 @@ static u64 suspend_start;
* This delay could be due to SMIs, NMIs, or to VCPU preemptions. Used as
* a lower bound for cs->uncertainty_margin values when registering clocks.
*/
-#define WATCHDOG_MAX_SKEW (50 * NSEC_PER_USEC)
+#define WATCHDOG_MAX_SKEW (100 * NSEC_PER_USEC)
#ifdef CONFIG_CLOCKSOURCE_WATCHDOG
static void clocksource_watchdog_work(struct work_struct *work);
@@ -199,23 +199,30 @@ void clocksource_mark_unstable(struct clocksource *cs)
spin_unlock_irqrestore(&watchdog_lock, flags);
}
-ulong max_cswd_read_retries = 3;
+ulong max_cswd_read_retries = 2;
module_param(max_cswd_read_retries, ulong, 0644);
EXPORT_SYMBOL_GPL(max_cswd_read_retries);
static int verify_n_cpus = 8;
module_param(verify_n_cpus, int, 0644);
-static bool cs_watchdog_read(struct clocksource *cs, u64 *csnow, u64 *wdnow)
+enum wd_read_status {
+ WD_READ_SUCCESS,
+ WD_READ_UNSTABLE,
+ WD_READ_SKIP
+};
+
+static enum wd_read_status cs_watchdog_read(struct clocksource *cs, u64 *csnow, u64 *wdnow)
{
unsigned int nretries;
- u64 wd_end, wd_delta;
- int64_t wd_delay;
+ u64 wd_end, wd_end2, wd_delta;
+ int64_t wd_delay, wd_seq_delay;
for (nretries = 0; nretries <= max_cswd_read_retries; nretries++) {
local_irq_disable();
*wdnow = watchdog->read(watchdog);
*csnow = cs->read(cs);
wd_end = watchdog->read(watchdog);
+ wd_end2 = watchdog->read(watchdog);
local_irq_enable();
wd_delta = clocksource_delta(wd_end, *wdnow, watchdog->mask);
@@ -226,13 +233,34 @@ static bool cs_watchdog_read(struct clocksource *cs, u64 *csnow, u64 *wdnow)
pr_warn("timekeeping watchdog on CPU%d: %s retried %d times before success\n",
smp_processor_id(), watchdog->name, nretries);
}
- return true;
+ return WD_READ_SUCCESS;
}
+
+ /*
+ * Now compute delay in consecutive watchdog read to see if
+ * there is too much external interferences that cause
+ * significant delay in reading both clocksource and watchdog.
+ *
+ * If consecutive WD read-back delay > WATCHDOG_MAX_SKEW/2,
+ * report system busy, reinit the watchdog and skip the current
+ * watchdog test.
+ */
+ wd_delta = clocksource_delta(wd_end2, wd_end, watchdog->mask);
+ wd_seq_delay = clocksource_cyc2ns(wd_delta, watchdog->mult, watchdog->shift);
+ if (wd_seq_delay > WATCHDOG_MAX_SKEW/2)
+ goto skip_test;
}
pr_warn("timekeeping watchdog on CPU%d: %s read-back delay of %lldns, attempt %d, marking unstable\n",
smp_processor_id(), watchdog->name, wd_delay, nretries);
- return false;
+ return WD_READ_UNSTABLE;
+
+skip_test:
+ pr_info("timekeeping watchdog on CPU%d: %s wd-wd read-back delay of %lldns\n",
+ smp_processor_id(), watchdog->name, wd_seq_delay);
+ pr_info("wd-%s-wd read-back delay of %lldns, clock-skew test skipped!\n",
+ cs->name, wd_delay);
+ return WD_READ_SKIP;
}
static u64 csnow_mid;
@@ -356,6 +384,7 @@ static void clocksource_watchdog(struct timer_list *unused)
int next_cpu, reset_pending;
int64_t wd_nsec, cs_nsec;
struct clocksource *cs;
+ enum wd_read_status read_ret;
u32 md;
spin_lock(&watchdog_lock);
@@ -373,9 +402,12 @@ static void clocksource_watchdog(struct timer_list *unused)
continue;
}
- if (!cs_watchdog_read(cs, &csnow, &wdnow)) {
- /* Clock readout unreliable, so give it up. */
- __clocksource_unstable(cs);
+ read_ret = cs_watchdog_read(cs, &csnow, &wdnow);
+
+ if (read_ret != WD_READ_SUCCESS) {
+ if (read_ret == WD_READ_UNSTABLE)
+ /* Clock readout unreliable, so give it up. */
+ __clocksource_unstable(cs);
continue;
}
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index ee736861b18f..96b4e7810426 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -1159,13 +1159,28 @@ static void posix_cpu_timers_work(struct callback_head *work)
}
/*
+ * Clear existing posix CPU timers task work.
+ */
+void clear_posix_cputimers_work(struct task_struct *p)
+{
+ /*
+ * A copied work entry from the old task is not meaningful, clear it.
+ * N.B. init_task_work will not do this.
+ */
+ memset(&p->posix_cputimers_work.work, 0,
+ sizeof(p->posix_cputimers_work.work));
+ init_task_work(&p->posix_cputimers_work.work,
+ posix_cpu_timers_work);
+ p->posix_cputimers_work.scheduled = false;
+}
+
+/*
* Initialize posix CPU timers task work in init task. Out of line to
* keep the callback static and to avoid header recursion hell.
*/
void __init posix_cputimers_init_work(void)
{
- init_task_work(&current->posix_cputimers_work.work,
- posix_cpu_timers_work);
+ clear_posix_cputimers_work(current);
}
/*
@@ -1404,7 +1419,8 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clkid,
}
}
- *newval += now;
+ if (*newval)
+ *newval += now;
}
/*
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 6bffe5af8cb1..17a283ce2b20 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -1375,6 +1375,13 @@ static inline void tick_nohz_irq_enter(void)
now = ktime_get();
if (ts->idle_active)
tick_nohz_stop_idle(ts, now);
+ /*
+ * If all CPUs are idle. We may need to update a stale jiffies value.
+ * Note nohz_full is a special case: a timekeeper is guaranteed to stay
+ * alive but it might be busy looping with interrupts disabled in some
+ * rare case (typically stop machine). So we must make sure we have a
+ * last resort.
+ */
if (ts->tick_stopped)
tick_nohz_update_jiffies(now);
}
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index b348749a9fc6..dcdcb85121e4 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1306,8 +1306,7 @@ int do_settimeofday64(const struct timespec64 *ts)
timekeeping_forward_now(tk);
xt = tk_xtime(tk);
- ts_delta.tv_sec = ts->tv_sec - xt.tv_sec;
- ts_delta.tv_nsec = ts->tv_nsec - xt.tv_nsec;
+ ts_delta = timespec64_sub(*ts, xt);
if (timespec64_compare(&tk->wall_to_monotonic, &ts_delta) > 0) {
ret = -EINVAL;
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index e3d2c23c413d..85f1021ad459 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -2054,26 +2054,28 @@ unsigned long msleep_interruptible(unsigned int msecs)
EXPORT_SYMBOL(msleep_interruptible);
/**
- * usleep_range - Sleep for an approximate time
- * @min: Minimum time in usecs to sleep
- * @max: Maximum time in usecs to sleep
+ * usleep_range_state - Sleep for an approximate time in a given state
+ * @min: Minimum time in usecs to sleep
+ * @max: Maximum time in usecs to sleep
+ * @state: State of the current task that will be while sleeping
*
* In non-atomic context where the exact wakeup time is flexible, use
- * usleep_range() instead of udelay(). The sleep improves responsiveness
+ * usleep_range_state() instead of udelay(). The sleep improves responsiveness
* by avoiding the CPU-hogging busy-wait of udelay(), and the range reduces
* power usage by allowing hrtimers to take advantage of an already-
* scheduled interrupt instead of scheduling a new one just for this sleep.
*/
-void __sched usleep_range(unsigned long min, unsigned long max)
+void __sched usleep_range_state(unsigned long min, unsigned long max,
+ unsigned int state)
{
ktime_t exp = ktime_add_us(ktime_get(), min);
u64 delta = (u64)(max - min) * NSEC_PER_USEC;
for (;;) {
- __set_current_state(TASK_UNINTERRUPTIBLE);
+ __set_current_state(state);
/* Do not return before the requested sleep time has elapsed */
if (!schedule_hrtimeout_range(&exp, delta, HRTIMER_MODE_ABS))
break;
}
}
-EXPORT_SYMBOL(usleep_range);
+EXPORT_SYMBOL(usleep_range_state);
diff --git a/kernel/torture.c b/kernel/torture.c
index bb8f411c974b..ef27a6c82451 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -570,7 +570,7 @@ int torture_shuffle_init(long shuffint)
shuffle_idle_cpu = -1;
if (!alloc_cpumask_var(&shuffle_tmp_mask, GFP_KERNEL)) {
- VERBOSE_TOROUT_ERRSTRING("Failed to alloc mask");
+ TOROUT_ERRSTRING("Failed to alloc mask");
return -ENOMEM;
}
@@ -934,7 +934,7 @@ int _torture_create_kthread(int (*fn)(void *arg), void *arg, char *s, char *m,
*tp = kthread_run(fn, arg, "%s", s);
if (IS_ERR(*tp)) {
ret = PTR_ERR(*tp);
- VERBOSE_TOROUT_ERRSTRING(f);
+ TOROUT_ERRSTRING(f);
*tp = NULL;
}
torture_shuffle_task_register(*tp);
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 3ee23f4d437f..420ff4bc67fd 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -135,10 +135,9 @@ config TRACING_SUPPORT
depends on STACKTRACE_SUPPORT
default y
-if TRACING_SUPPORT
-
menuconfig FTRACE
bool "Tracers"
+ depends on TRACING_SUPPORT
default y if DEBUG_KERNEL
help
Enable the kernel tracing infrastructure.
@@ -1037,6 +1036,3 @@ config HIST_TRIGGERS_DEBUG
If unsure, say N.
endif # FTRACE
-
-endif # TRACING_SUPPORT
-
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index b1c47ccf4f73..bedc5caceec7 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -47,6 +47,7 @@ obj-$(CONFIG_TRACING) += trace_output.o
obj-$(CONFIG_TRACING) += trace_seq.o
obj-$(CONFIG_TRACING) += trace_stat.o
obj-$(CONFIG_TRACING) += trace_printk.o
+obj-$(CONFIG_TRACING) += pid_list.o
obj-$(CONFIG_TRACING_MAP) += tracing_map.o
obj-$(CONFIG_PREEMPTIRQ_DELAY_TEST) += preemptirq_delay_test.o
obj-$(CONFIG_SYNTH_EVENT_GEN_TEST) += synth_event_gen_test.o
@@ -77,6 +78,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o
endif
obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o
+obj-$(CONFIG_PROBE_EVENTS) += trace_eprobe.o
obj-$(CONFIG_TRACE_EVENT_INJECT) += trace_events_inject.o
obj-$(CONFIG_SYNTH_EVENTS) += trace_events_synth.o
obj-$(CONFIG_HIST_TRIGGERS) += trace_events_hist.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index c221e4c3f625..af68a67179b4 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -34,7 +34,7 @@ static struct trace_array *blk_tr;
static bool blk_tracer_enabled __read_mostly;
static LIST_HEAD(running_trace_list);
-static __cacheline_aligned_in_smp DEFINE_SPINLOCK(running_trace_lock);
+static __cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(running_trace_lock);
/* Select an alternative, minimalistic output than the original one */
#define TRACE_BLK_OPT_CLASSIC 0x1
@@ -121,12 +121,12 @@ static void trace_note_tsk(struct task_struct *tsk)
struct blk_trace *bt;
tsk->btrace_seq = blktrace_seq;
- spin_lock_irqsave(&running_trace_lock, flags);
+ raw_spin_lock_irqsave(&running_trace_lock, flags);
list_for_each_entry(bt, &running_trace_list, running_list) {
trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm,
sizeof(tsk->comm), 0);
}
- spin_unlock_irqrestore(&running_trace_lock, flags);
+ raw_spin_unlock_irqrestore(&running_trace_lock, flags);
}
static void trace_note_time(struct blk_trace *bt)
@@ -666,9 +666,9 @@ static int __blk_trace_startstop(struct request_queue *q, int start)
blktrace_seq++;
smp_mb();
bt->trace_state = Blktrace_running;
- spin_lock_irq(&running_trace_lock);
+ raw_spin_lock_irq(&running_trace_lock);
list_add(&bt->running_list, &running_trace_list);
- spin_unlock_irq(&running_trace_lock);
+ raw_spin_unlock_irq(&running_trace_lock);
trace_note_time(bt);
ret = 0;
@@ -676,9 +676,9 @@ static int __blk_trace_startstop(struct request_queue *q, int start)
} else {
if (bt->trace_state == Blktrace_running) {
bt->trace_state = Blktrace_stopped;
- spin_lock_irq(&running_trace_lock);
+ raw_spin_lock_irq(&running_trace_lock);
list_del_init(&bt->running_list);
- spin_unlock_irq(&running_trace_lock);
+ raw_spin_unlock_irq(&running_trace_lock);
relay_flush(bt->rchan);
ret = 0;
}
@@ -816,7 +816,7 @@ blk_trace_request_get_cgid(struct request *rq)
* Records an action against a request. Will log the bio offset + size.
*
**/
-static void blk_add_trace_rq(struct request *rq, int error,
+static void blk_add_trace_rq(struct request *rq, blk_status_t error,
unsigned int nr_bytes, u32 what, u64 cgid)
{
struct blk_trace *bt;
@@ -834,7 +834,8 @@ static void blk_add_trace_rq(struct request *rq, int error,
what |= BLK_TC_ACT(BLK_TC_FS);
__blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, req_op(rq),
- rq->cmd_flags, what, error, 0, NULL, cgid);
+ rq->cmd_flags, what, blk_status_to_errno(error), 0,
+ NULL, cgid);
rcu_read_unlock();
}
@@ -863,7 +864,7 @@ static void blk_add_trace_rq_requeue(void *ignore, struct request *rq)
}
static void blk_add_trace_rq_complete(void *ignore, struct request *rq,
- int error, unsigned int nr_bytes)
+ blk_status_t error, unsigned int nr_bytes)
{
blk_add_trace_rq(rq, error, nr_bytes, BLK_TA_COMPLETE,
blk_trace_request_get_cgid(rq));
@@ -1044,7 +1045,7 @@ static void blk_add_trace_rq_remap(void *ignore, struct request *rq, dev_t dev,
}
r.device_from = cpu_to_be32(dev);
- r.device_to = cpu_to_be32(disk_devt(rq->rq_disk));
+ r.device_to = cpu_to_be32(disk_devt(rq->q->disk));
r.sector_from = cpu_to_be64(from);
__blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq),
@@ -1605,6 +1606,14 @@ static int blk_trace_remove_queue(struct request_queue *q)
if (bt == NULL)
return -EINVAL;
+ if (bt->trace_state == Blktrace_running) {
+ bt->trace_state = Blktrace_stopped;
+ raw_spin_lock_irq(&running_trace_lock);
+ list_del_init(&bt->running_list);
+ raw_spin_unlock_irq(&running_trace_lock);
+ relay_flush(bt->rchan);
+ }
+
put_probe_ref();
synchronize_rcu();
blk_trace_free(bt);
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 8e2eb950aa82..21aa30644219 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -345,7 +345,7 @@ static const struct bpf_func_proto bpf_probe_write_user_proto = {
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_ANYTHING,
- .arg2_type = ARG_PTR_TO_MEM,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg3_type = ARG_CONST_SIZE,
};
@@ -394,11 +394,11 @@ static const struct bpf_func_proto bpf_trace_printk_proto = {
.func = bpf_trace_printk,
.gpl_only = true,
.ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_MEM,
+ .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg2_type = ARG_CONST_SIZE,
};
-const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
+static void __set_printk_clr_event(void)
{
/*
* This program might be calling bpf_trace_printk,
@@ -410,11 +410,57 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
*/
if (trace_set_clr_event("bpf_trace", "bpf_trace_printk", 1))
pr_warn_ratelimited("could not enable bpf_trace_printk events");
+}
+const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
+{
+ __set_printk_clr_event();
return &bpf_trace_printk_proto;
}
-#define MAX_SEQ_PRINTF_VARARGS 12
+BPF_CALL_4(bpf_trace_vprintk, char *, fmt, u32, fmt_size, const void *, data,
+ u32, data_len)
+{
+ static char buf[BPF_TRACE_PRINTK_SIZE];
+ unsigned long flags;
+ int ret, num_args;
+ u32 *bin_args;
+
+ if (data_len & 7 || data_len > MAX_BPRINTF_VARARGS * 8 ||
+ (data_len && !data))
+ return -EINVAL;
+ num_args = data_len / 8;
+
+ ret = bpf_bprintf_prepare(fmt, fmt_size, data, &bin_args, num_args);
+ if (ret < 0)
+ return ret;
+
+ raw_spin_lock_irqsave(&trace_printk_lock, flags);
+ ret = bstr_printf(buf, sizeof(buf), fmt, bin_args);
+
+ trace_bpf_trace_printk(buf);
+ raw_spin_unlock_irqrestore(&trace_printk_lock, flags);
+
+ bpf_bprintf_cleanup();
+
+ return ret;
+}
+
+static const struct bpf_func_proto bpf_trace_vprintk_proto = {
+ .func = bpf_trace_vprintk,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg2_type = ARG_CONST_SIZE,
+ .arg3_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
+ .arg4_type = ARG_CONST_SIZE_OR_ZERO,
+};
+
+const struct bpf_func_proto *bpf_get_trace_vprintk_proto(void)
+{
+ __set_printk_clr_event();
+ return &bpf_trace_vprintk_proto;
+}
BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size,
const void *, data, u32, data_len)
@@ -422,7 +468,7 @@ BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size,
int err, num_args;
u32 *bin_args;
- if (data_len & 7 || data_len > MAX_SEQ_PRINTF_VARARGS * 8 ||
+ if (data_len & 7 || data_len > MAX_BPRINTF_VARARGS * 8 ||
(data_len && !data))
return -EINVAL;
num_args = data_len / 8;
@@ -446,9 +492,9 @@ static const struct bpf_func_proto bpf_seq_printf_proto = {
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_BTF_ID,
.arg1_btf_id = &btf_seq_file_ids[0],
- .arg2_type = ARG_PTR_TO_MEM,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg3_type = ARG_CONST_SIZE,
- .arg4_type = ARG_PTR_TO_MEM_OR_NULL,
+ .arg4_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
};
@@ -463,7 +509,7 @@ static const struct bpf_func_proto bpf_seq_write_proto = {
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_BTF_ID,
.arg1_btf_id = &btf_seq_file_ids[0],
- .arg2_type = ARG_PTR_TO_MEM,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg3_type = ARG_CONST_SIZE_OR_ZERO,
};
@@ -487,7 +533,7 @@ static const struct bpf_func_proto bpf_seq_printf_btf_proto = {
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_BTF_ID,
.arg1_btf_id = &btf_seq_file_ids[0],
- .arg2_type = ARG_PTR_TO_MEM,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg3_type = ARG_CONST_SIZE_OR_ZERO,
.arg4_type = ARG_ANYTHING,
};
@@ -648,7 +694,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = {
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_CONST_MAP_PTR,
.arg3_type = ARG_ANYTHING,
- .arg4_type = ARG_PTR_TO_MEM,
+ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
};
@@ -718,7 +764,7 @@ const struct bpf_func_proto bpf_get_current_task_btf_proto = {
.func = bpf_get_current_task_btf,
.gpl_only = true,
.ret_type = RET_PTR_TO_BTF_ID,
- .ret_btf_id = &btf_task_struct_ids[0],
+ .ret_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
};
BPF_CALL_1(bpf_task_pt_regs, struct task_struct *, task)
@@ -733,7 +779,7 @@ const struct bpf_func_proto bpf_task_pt_regs_proto = {
.func = bpf_task_pt_regs,
.gpl_only = true,
.arg1_type = ARG_PTR_TO_BTF_ID,
- .arg1_btf_id = &btf_task_struct_ids[0],
+ .arg1_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
.ret_type = RET_PTR_TO_BTF_ID,
.ret_btf_id = &bpf_task_pt_regs_ids[0],
};
@@ -958,7 +1004,7 @@ const struct bpf_func_proto bpf_snprintf_btf_proto = {
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_MEM,
.arg2_type = ARG_CONST_SIZE,
- .arg3_type = ARG_PTR_TO_MEM,
+ .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg4_type = ARG_CONST_SIZE,
.arg5_type = ARG_ANYTHING,
};
@@ -966,7 +1012,7 @@ const struct bpf_func_proto bpf_snprintf_btf_proto = {
BPF_CALL_1(bpf_get_func_ip_tracing, void *, ctx)
{
/* This helper call is inlined by verifier. */
- return ((u64 *)ctx)[-1];
+ return ((u64 *)ctx)[-2];
}
static const struct bpf_func_proto bpf_get_func_ip_proto_tracing = {
@@ -1017,6 +1063,81 @@ static const struct bpf_func_proto bpf_get_attach_cookie_proto_pe = {
.arg1_type = ARG_PTR_TO_CTX,
};
+BPF_CALL_3(bpf_get_branch_snapshot, void *, buf, u32, size, u64, flags)
+{
+#ifndef CONFIG_X86
+ return -ENOENT;
+#else
+ static const u32 br_entry_size = sizeof(struct perf_branch_entry);
+ u32 entry_cnt = size / br_entry_size;
+
+ entry_cnt = static_call(perf_snapshot_branch_stack)(buf, entry_cnt);
+
+ if (unlikely(flags))
+ return -EINVAL;
+
+ if (!entry_cnt)
+ return -ENOENT;
+
+ return entry_cnt * br_entry_size;
+#endif
+}
+
+static const struct bpf_func_proto bpf_get_branch_snapshot_proto = {
+ .func = bpf_get_branch_snapshot,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg2_type = ARG_CONST_SIZE_OR_ZERO,
+};
+
+BPF_CALL_3(get_func_arg, void *, ctx, u32, n, u64 *, value)
+{
+ /* This helper call is inlined by verifier. */
+ u64 nr_args = ((u64 *)ctx)[-1];
+
+ if ((u64) n >= nr_args)
+ return -EINVAL;
+ *value = ((u64 *)ctx)[n];
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_get_func_arg_proto = {
+ .func = get_func_arg,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_PTR_TO_LONG,
+};
+
+BPF_CALL_2(get_func_ret, void *, ctx, u64 *, value)
+{
+ /* This helper call is inlined by verifier. */
+ u64 nr_args = ((u64 *)ctx)[-1];
+
+ *value = ((u64 *)ctx)[nr_args];
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_get_func_ret_proto = {
+ .func = get_func_ret,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_LONG,
+};
+
+BPF_CALL_1(get_func_arg_cnt, void *, ctx)
+{
+ /* This helper call is inlined by verifier. */
+ return ((u64 *)ctx)[-1];
+}
+
+static const struct bpf_func_proto bpf_get_func_arg_cnt_proto = {
+ .func = get_func_arg_cnt,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
static const struct bpf_func_proto *
bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
@@ -1037,8 +1158,6 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_ktime_get_ns_proto;
case BPF_FUNC_ktime_get_boot_ns:
return &bpf_ktime_get_boot_ns_proto;
- case BPF_FUNC_ktime_get_coarse_ns:
- return &bpf_ktime_get_coarse_ns_proto;
case BPF_FUNC_tail_call:
return &bpf_tail_call_proto;
case BPF_FUNC_get_current_pid_tgid:
@@ -1132,6 +1251,12 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_snprintf_proto;
case BPF_FUNC_get_func_ip:
return &bpf_get_func_ip_proto_tracing;
+ case BPF_FUNC_get_branch_snapshot:
+ return &bpf_get_branch_snapshot_proto;
+ case BPF_FUNC_find_vma:
+ return &bpf_find_vma_proto;
+ case BPF_FUNC_trace_vprintk:
+ return bpf_get_trace_vprintk_proto();
default:
return bpf_base_func_proto(func_id);
}
@@ -1209,7 +1334,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_tp = {
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_CONST_MAP_PTR,
.arg3_type = ARG_ANYTHING,
- .arg4_type = ARG_PTR_TO_MEM,
+ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
};
@@ -1324,9 +1449,6 @@ static const struct bpf_func_proto bpf_perf_prog_read_value_proto = {
BPF_CALL_4(bpf_read_branch_records, struct bpf_perf_event_data_kern *, ctx,
void *, buf, u32, size, u64, flags)
{
-#ifndef CONFIG_X86
- return -ENOENT;
-#else
static const u32 br_entry_size = sizeof(struct perf_branch_entry);
struct perf_branch_stack *br_stack = ctx->data->br_stack;
u32 to_copy;
@@ -1335,7 +1457,7 @@ BPF_CALL_4(bpf_read_branch_records, struct bpf_perf_event_data_kern *, ctx,
return -EINVAL;
if (unlikely(!br_stack))
- return -EINVAL;
+ return -ENOENT;
if (flags & BPF_F_GET_BRANCH_RECORDS_SIZE)
return br_stack->nr * br_entry_size;
@@ -1347,7 +1469,6 @@ BPF_CALL_4(bpf_read_branch_records, struct bpf_perf_event_data_kern *, ctx,
memcpy(buf, br_stack->entries, to_copy);
return to_copy;
-#endif
}
static const struct bpf_func_proto bpf_read_branch_records_proto = {
@@ -1435,7 +1556,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = {
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_CONST_MAP_PTR,
.arg3_type = ARG_ANYTHING,
- .arg4_type = ARG_PTR_TO_MEM,
+ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
};
@@ -1489,7 +1610,7 @@ static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = {
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_PTR_TO_MEM,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg3_type = ARG_CONST_SIZE_OR_ZERO,
.arg4_type = ARG_ANYTHING,
};
@@ -1530,6 +1651,8 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_skc_to_tcp_request_sock_proto;
case BPF_FUNC_skc_to_udp6_sock:
return &bpf_skc_to_udp6_sock_proto;
+ case BPF_FUNC_skc_to_unix_sock:
+ return &bpf_skc_to_unix_sock_proto;
case BPF_FUNC_sk_storage_get:
return &bpf_sk_storage_get_tracing_proto;
case BPF_FUNC_sk_storage_delete:
@@ -1553,6 +1676,12 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
NULL;
case BPF_FUNC_d_path:
return &bpf_d_path_proto;
+ case BPF_FUNC_get_func_arg:
+ return bpf_prog_has_trampoline(prog) ? &bpf_get_func_arg_proto : NULL;
+ case BPF_FUNC_get_func_ret:
+ return bpf_prog_has_trampoline(prog) ? &bpf_get_func_ret_proto : NULL;
+ case BPF_FUNC_get_func_arg_cnt:
+ return bpf_prog_has_trampoline(prog) ? &bpf_get_func_arg_cnt_proto : NULL;
default:
fn = raw_tp_prog_func_proto(func_id, prog);
if (!fn && prog->expected_attach_type == BPF_TRACE_ITER)
@@ -1566,13 +1695,7 @@ static bool raw_tp_prog_is_valid_access(int off, int size,
const struct bpf_prog *prog,
struct bpf_insn_access_aux *info)
{
- if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS)
- return false;
- if (type != BPF_READ)
- return false;
- if (off % size != 0)
- return false;
- return true;
+ return bpf_tracing_ctx_access(off, size, type);
}
static bool tracing_prog_is_valid_access(int off, int size,
@@ -1580,13 +1703,7 @@ static bool tracing_prog_is_valid_access(int off, int size,
const struct bpf_prog *prog,
struct bpf_insn_access_aux *info)
{
- if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS)
- return false;
- if (type != BPF_READ)
- return false;
- if (off % size != 0)
- return false;
- return btf_ctx_access(off, size, type, prog, info);
+ return bpf_tracing_btf_ctx_access(off, size, type, prog, info);
}
int __weak bpf_prog_test_run_tracing(struct bpf_prog *prog,
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index b8a0d1d564fb..22061d38fc00 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -115,6 +115,7 @@ int function_graph_enter(unsigned long ret, unsigned long func,
{
struct ftrace_graph_ent trace;
+#ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS
/*
* Skip graph tracing if the return location is served by direct trampoline,
* since call sequence and return addresses are unpredictable anyway.
@@ -124,6 +125,7 @@ int function_graph_enter(unsigned long ret, unsigned long func,
if (ftrace_direct_func_count &&
ftrace_find_rec_direct(ret - MCOUNT_INSN_SIZE))
return -EBUSY;
+#endif
trace.func = func;
trace.depth = ++current->curr_ret_depth;
@@ -333,10 +335,10 @@ unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx,
#endif /* HAVE_FUNCTION_GRAPH_RET_ADDR_PTR */
static struct ftrace_ops graph_ops = {
- .func = ftrace_stub,
+ .func = ftrace_graph_func,
.flags = FTRACE_OPS_FL_INITIALIZED |
FTRACE_OPS_FL_PID |
- FTRACE_OPS_FL_STUB,
+ FTRACE_OPS_GRAPH_STUB,
#ifdef FTRACE_GRAPH_TRAMP_ADDR
.trampoline = FTRACE_GRAPH_TRAMP_ADDR,
/* trampoline_size is only needed for dynamically allocated tramps */
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 7efbc8aaf7f6..be5f6b32a012 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -119,14 +119,9 @@ struct ftrace_ops __rcu *ftrace_ops_list __read_mostly = &ftrace_list_end;
ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
struct ftrace_ops global_ops;
-#if ARCH_SUPPORTS_FTRACE_OPS
-static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *op, struct ftrace_regs *fregs);
-#else
-/* See comment below, where ftrace_ops_list_func is defined */
-static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip);
-#define ftrace_ops_list_func ((ftrace_func_t)ftrace_ops_no_ops)
-#endif
+/* Defined by vmlinux.lds.h see the commment above arch_ftrace_ops_list_func for details */
+void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op, struct ftrace_regs *fregs);
static inline void ftrace_ops_init(struct ftrace_ops *ops)
{
@@ -323,7 +318,7 @@ int __register_ftrace_function(struct ftrace_ops *ops)
if (!ftrace_enabled && (ops->flags & FTRACE_OPS_FL_PERMANENT))
return -EBUSY;
- if (!core_kernel_data((unsigned long)ops))
+ if (!is_kernel_core_data((unsigned long)ops))
ops->flags |= FTRACE_OPS_FL_DYNAMIC;
add_ftrace_ops(&ftrace_ops_list, ops);
@@ -581,7 +576,7 @@ static void ftrace_profile_reset(struct ftrace_profile_stat *stat)
FTRACE_PROFILE_HASH_SIZE * sizeof(struct hlist_head));
}
-int ftrace_profile_pages_init(struct ftrace_profile_stat *stat)
+static int ftrace_profile_pages_init(struct ftrace_profile_stat *stat)
{
struct ftrace_profile_page *pg;
int functions;
@@ -988,8 +983,9 @@ static __init void ftrace_profile_tracefs(struct dentry *d_tracer)
}
}
- entry = tracefs_create_file("function_profile_enabled", 0644,
- d_tracer, NULL, &ftrace_profile_fops);
+ entry = tracefs_create_file("function_profile_enabled",
+ TRACE_MODE_WRITE, d_tracer, NULL,
+ &ftrace_profile_fops);
if (!entry)
pr_warn("Could not create tracefs 'function_profile_enabled' entry\n");
}
@@ -2208,7 +2204,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update)
}
/**
- * ftrace_update_record, set a record that now is tracing or not
+ * ftrace_update_record - set a record that now is tracing or not
* @rec: the record to update
* @enable: set to true if the record is tracing, false to force disable
*
@@ -2221,7 +2217,7 @@ int ftrace_update_record(struct dyn_ftrace *rec, bool enable)
}
/**
- * ftrace_test_record, check if the record has been enabled or not
+ * ftrace_test_record - check if the record has been enabled or not
* @rec: the record to test
* @enable: set to true to check if enabled, false if it is disabled
*
@@ -2394,6 +2390,39 @@ unsigned long ftrace_find_rec_direct(unsigned long ip)
return entry->direct;
}
+static struct ftrace_func_entry*
+ftrace_add_rec_direct(unsigned long ip, unsigned long addr,
+ struct ftrace_hash **free_hash)
+{
+ struct ftrace_func_entry *entry;
+
+ if (ftrace_hash_empty(direct_functions) ||
+ direct_functions->count > 2 * (1 << direct_functions->size_bits)) {
+ struct ftrace_hash *new_hash;
+ int size = ftrace_hash_empty(direct_functions) ? 0 :
+ direct_functions->count + 1;
+
+ if (size < 32)
+ size = 32;
+
+ new_hash = dup_hash(direct_functions, size);
+ if (!new_hash)
+ return NULL;
+
+ *free_hash = direct_functions;
+ direct_functions = new_hash;
+ }
+
+ entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+ if (!entry)
+ return NULL;
+
+ entry->ip = ip;
+ entry->direct = addr;
+ __add_hash_entry(direct_functions, entry);
+ return entry;
+}
+
static void call_direct_funcs(unsigned long ip, unsigned long pip,
struct ftrace_ops *ops, struct ftrace_regs *fregs)
{
@@ -2574,7 +2603,7 @@ struct ftrace_rec_iter {
};
/**
- * ftrace_rec_iter_start, start up iterating over traced functions
+ * ftrace_rec_iter_start - start up iterating over traced functions
*
* Returns an iterator handle that is used to iterate over all
* the records that represent address locations where functions
@@ -2605,7 +2634,7 @@ struct ftrace_rec_iter *ftrace_rec_iter_start(void)
}
/**
- * ftrace_rec_iter_next, get the next record to process.
+ * ftrace_rec_iter_next - get the next record to process.
* @iter: The handle to the iterator.
*
* Returns the next iterator after the given iterator @iter.
@@ -2630,7 +2659,7 @@ struct ftrace_rec_iter *ftrace_rec_iter_next(struct ftrace_rec_iter *iter)
}
/**
- * ftrace_rec_iter_record, get the record at the iterator location
+ * ftrace_rec_iter_record - get the record at the iterator location
* @iter: The current iterator location
*
* Returns the record that the current @iter is at.
@@ -2733,7 +2762,7 @@ static int __ftrace_modify_code(void *data)
}
/**
- * ftrace_run_stop_machine, go back to the stop machine method
+ * ftrace_run_stop_machine - go back to the stop machine method
* @command: The command to tell ftrace what to do
*
* If an arch needs to fall back to the stop machine method, the
@@ -2745,7 +2774,7 @@ void ftrace_run_stop_machine(int command)
}
/**
- * arch_ftrace_update_code, modify the code to trace or not trace
+ * arch_ftrace_update_code - modify the code to trace or not trace
* @command: The command that needs to be done
*
* Archs can override this function if it does not need to
@@ -5110,39 +5139,16 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr)
}
ret = -ENOMEM;
- if (ftrace_hash_empty(direct_functions) ||
- direct_functions->count > 2 * (1 << direct_functions->size_bits)) {
- struct ftrace_hash *new_hash;
- int size = ftrace_hash_empty(direct_functions) ? 0 :
- direct_functions->count + 1;
-
- if (size < 32)
- size = 32;
-
- new_hash = dup_hash(direct_functions, size);
- if (!new_hash)
- goto out_unlock;
-
- free_hash = direct_functions;
- direct_functions = new_hash;
- }
-
- entry = kmalloc(sizeof(*entry), GFP_KERNEL);
- if (!entry)
- goto out_unlock;
-
direct = ftrace_find_direct_func(addr);
if (!direct) {
direct = ftrace_alloc_direct_func(addr);
- if (!direct) {
- kfree(entry);
+ if (!direct)
goto out_unlock;
- }
}
- entry->ip = ip;
- entry->direct = addr;
- __add_hash_entry(direct_functions, entry);
+ entry = ftrace_add_rec_direct(ip, addr, &free_hash);
+ if (!entry)
+ goto out_unlock;
ret = ftrace_set_filter_ip(&direct_ops, ip, 0, 0);
if (ret)
@@ -5211,6 +5217,7 @@ int unregister_ftrace_direct(unsigned long ip, unsigned long addr)
{
struct ftrace_direct_func *direct;
struct ftrace_func_entry *entry;
+ struct ftrace_hash *hash;
int ret = -ENODEV;
mutex_lock(&direct_mutex);
@@ -5219,7 +5226,8 @@ int unregister_ftrace_direct(unsigned long ip, unsigned long addr)
if (!entry)
goto out_unlock;
- if (direct_functions->count == 1)
+ hash = direct_ops.func_hash->filter_hash;
+ if (hash->count == 1)
unregister_ftrace_function(&direct_ops);
ret = ftrace_set_filter_ip(&direct_ops, ip, 1, 0);
@@ -5395,6 +5403,221 @@ int modify_ftrace_direct(unsigned long ip,
return ret;
}
EXPORT_SYMBOL_GPL(modify_ftrace_direct);
+
+#define MULTI_FLAGS (FTRACE_OPS_FL_IPMODIFY | FTRACE_OPS_FL_DIRECT | \
+ FTRACE_OPS_FL_SAVE_REGS)
+
+static int check_direct_multi(struct ftrace_ops *ops)
+{
+ if (!(ops->flags & FTRACE_OPS_FL_INITIALIZED))
+ return -EINVAL;
+ if ((ops->flags & MULTI_FLAGS) != MULTI_FLAGS)
+ return -EINVAL;
+ return 0;
+}
+
+static void remove_direct_functions_hash(struct ftrace_hash *hash, unsigned long addr)
+{
+ struct ftrace_func_entry *entry, *del;
+ int size, i;
+
+ size = 1 << hash->size_bits;
+ for (i = 0; i < size; i++) {
+ hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
+ del = __ftrace_lookup_ip(direct_functions, entry->ip);
+ if (del && del->direct == addr) {
+ remove_hash_entry(direct_functions, del);
+ kfree(del);
+ }
+ }
+ }
+}
+
+/**
+ * register_ftrace_direct_multi - Call a custom trampoline directly
+ * for multiple functions registered in @ops
+ * @ops: The address of the struct ftrace_ops object
+ * @addr: The address of the trampoline to call at @ops functions
+ *
+ * This is used to connect a direct calls to @addr from the nop locations
+ * of the functions registered in @ops (with by ftrace_set_filter_ip
+ * function).
+ *
+ * The location that it calls (@addr) must be able to handle a direct call,
+ * and save the parameters of the function being traced, and restore them
+ * (or inject new ones if needed), before returning.
+ *
+ * Returns:
+ * 0 on success
+ * -EINVAL - The @ops object was already registered with this call or
+ * when there are no functions in @ops object.
+ * -EBUSY - Another direct function is already attached (there can be only one)
+ * -ENODEV - @ip does not point to a ftrace nop location (or not supported)
+ * -ENOMEM - There was an allocation failure.
+ */
+int register_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
+{
+ struct ftrace_hash *hash, *free_hash = NULL;
+ struct ftrace_func_entry *entry, *new;
+ int err = -EBUSY, size, i;
+
+ if (ops->func || ops->trampoline)
+ return -EINVAL;
+ if (!(ops->flags & FTRACE_OPS_FL_INITIALIZED))
+ return -EINVAL;
+ if (ops->flags & FTRACE_OPS_FL_ENABLED)
+ return -EINVAL;
+
+ hash = ops->func_hash->filter_hash;
+ if (ftrace_hash_empty(hash))
+ return -EINVAL;
+
+ mutex_lock(&direct_mutex);
+
+ /* Make sure requested entries are not already registered.. */
+ size = 1 << hash->size_bits;
+ for (i = 0; i < size; i++) {
+ hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
+ if (ftrace_find_rec_direct(entry->ip))
+ goto out_unlock;
+ }
+ }
+
+ /* ... and insert them to direct_functions hash. */
+ err = -ENOMEM;
+ for (i = 0; i < size; i++) {
+ hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
+ new = ftrace_add_rec_direct(entry->ip, addr, &free_hash);
+ if (!new)
+ goto out_remove;
+ entry->direct = addr;
+ }
+ }
+
+ ops->func = call_direct_funcs;
+ ops->flags = MULTI_FLAGS;
+ ops->trampoline = FTRACE_REGS_ADDR;
+
+ err = register_ftrace_function(ops);
+
+ out_remove:
+ if (err)
+ remove_direct_functions_hash(hash, addr);
+
+ out_unlock:
+ mutex_unlock(&direct_mutex);
+
+ if (free_hash) {
+ synchronize_rcu_tasks();
+ free_ftrace_hash(free_hash);
+ }
+ return err;
+}
+EXPORT_SYMBOL_GPL(register_ftrace_direct_multi);
+
+/**
+ * unregister_ftrace_direct_multi - Remove calls to custom trampoline
+ * previously registered by register_ftrace_direct_multi for @ops object.
+ * @ops: The address of the struct ftrace_ops object
+ *
+ * This is used to remove a direct calls to @addr from the nop locations
+ * of the functions registered in @ops (with by ftrace_set_filter_ip
+ * function).
+ *
+ * Returns:
+ * 0 on success
+ * -EINVAL - The @ops object was not properly registered.
+ */
+int unregister_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
+{
+ struct ftrace_hash *hash = ops->func_hash->filter_hash;
+ int err;
+
+ if (check_direct_multi(ops))
+ return -EINVAL;
+ if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
+ return -EINVAL;
+
+ mutex_lock(&direct_mutex);
+ err = unregister_ftrace_function(ops);
+ remove_direct_functions_hash(hash, addr);
+ mutex_unlock(&direct_mutex);
+
+ /* cleanup for possible another register call */
+ ops->func = NULL;
+ ops->trampoline = 0;
+ return err;
+}
+EXPORT_SYMBOL_GPL(unregister_ftrace_direct_multi);
+
+/**
+ * modify_ftrace_direct_multi - Modify an existing direct 'multi' call
+ * to call something else
+ * @ops: The address of the struct ftrace_ops object
+ * @addr: The address of the new trampoline to call at @ops functions
+ *
+ * This is used to unregister currently registered direct caller and
+ * register new one @addr on functions registered in @ops object.
+ *
+ * Note there's window between ftrace_shutdown and ftrace_startup calls
+ * where there will be no callbacks called.
+ *
+ * Returns: zero on success. Non zero on error, which includes:
+ * -EINVAL - The @ops object was not properly registered.
+ */
+int modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
+{
+ struct ftrace_hash *hash;
+ struct ftrace_func_entry *entry, *iter;
+ static struct ftrace_ops tmp_ops = {
+ .func = ftrace_stub,
+ .flags = FTRACE_OPS_FL_STUB,
+ };
+ int i, size;
+ int err;
+
+ if (check_direct_multi(ops))
+ return -EINVAL;
+ if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
+ return -EINVAL;
+
+ mutex_lock(&direct_mutex);
+
+ /* Enable the tmp_ops to have the same functions as the direct ops */
+ ftrace_ops_init(&tmp_ops);
+ tmp_ops.func_hash = ops->func_hash;
+
+ err = register_ftrace_function(&tmp_ops);
+ if (err)
+ goto out_direct;
+
+ /*
+ * Now the ftrace_ops_list_func() is called to do the direct callers.
+ * We can safely change the direct functions attached to each entry.
+ */
+ mutex_lock(&ftrace_lock);
+
+ hash = ops->func_hash->filter_hash;
+ size = 1 << hash->size_bits;
+ for (i = 0; i < size; i++) {
+ hlist_for_each_entry(iter, &hash->buckets[i], hlist) {
+ entry = __ftrace_lookup_ip(direct_functions, iter->ip);
+ if (!entry)
+ continue;
+ entry->direct = addr;
+ }
+ }
+
+ mutex_unlock(&ftrace_lock);
+
+ /* Removing the tmp_ops will add the updated direct callers to the functions */
+ unregister_ftrace_function(&tmp_ops);
+
+ out_direct:
+ mutex_unlock(&direct_mutex);
+ return err;
+}
+EXPORT_SYMBOL_GPL(modify_ftrace_direct_multi);
#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
/**
@@ -6109,10 +6332,10 @@ void ftrace_create_filter_files(struct ftrace_ops *ops,
struct dentry *parent)
{
- trace_create_file("set_ftrace_filter", 0644, parent,
+ trace_create_file("set_ftrace_filter", TRACE_MODE_WRITE, parent,
ops, &ftrace_filter_fops);
- trace_create_file("set_ftrace_notrace", 0644, parent,
+ trace_create_file("set_ftrace_notrace", TRACE_MODE_WRITE, parent,
ops, &ftrace_notrace_fops);
}
@@ -6139,19 +6362,19 @@ void ftrace_destroy_filter_files(struct ftrace_ops *ops)
static __init int ftrace_init_dyn_tracefs(struct dentry *d_tracer)
{
- trace_create_file("available_filter_functions", 0444,
+ trace_create_file("available_filter_functions", TRACE_MODE_READ,
d_tracer, NULL, &ftrace_avail_fops);
- trace_create_file("enabled_functions", 0444,
+ trace_create_file("enabled_functions", TRACE_MODE_READ,
d_tracer, NULL, &ftrace_enabled_fops);
ftrace_create_filter_files(&global_ops, d_tracer);
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
- trace_create_file("set_graph_function", 0644, d_tracer,
+ trace_create_file("set_graph_function", TRACE_MODE_WRITE, d_tracer,
NULL,
&ftrace_graph_fops);
- trace_create_file("set_graph_notrace", 0644, d_tracer,
+ trace_create_file("set_graph_notrace", TRACE_MODE_WRITE, d_tracer,
NULL,
&ftrace_graph_notrace_fops);
#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
@@ -6846,6 +7069,11 @@ void __init ftrace_free_init_mem(void)
ftrace_free_mem(NULL, start, end);
}
+int __init __weak ftrace_dyn_arch_init(void)
+{
+ return 0;
+}
+
void __init ftrace_init(void)
{
extern unsigned long __start_mcount_loc[];
@@ -6977,15 +7205,14 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op;
int bit;
- bit = trace_test_and_set_recursion(ip, parent_ip, TRACE_LIST_START, TRACE_LIST_MAX);
- if (bit < 0)
- return;
-
/*
- * Some of the ops may be dynamically allocated,
- * they must be freed after a synchronize_rcu().
+ * The ftrace_test_and_set_recursion() will disable preemption,
+ * which is required since some of the ops may be dynamically
+ * allocated, they must be freed after a synchronize_rcu().
*/
- preempt_disable_notrace();
+ bit = trace_test_and_set_recursion(ip, parent_ip, TRACE_LIST_START);
+ if (bit < 0)
+ return;
do_for_each_ftrace_op(op, ftrace_ops_list) {
/* Stub functions don't need to be called nor tested */
@@ -7010,7 +7237,6 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
}
} while_for_each_ftrace_op(op);
out:
- preempt_enable_notrace();
trace_clear_recursion(bit);
}
@@ -7026,21 +7252,23 @@ out:
* Note, CONFIG_DYNAMIC_FTRACE_WITH_REGS expects a full regs to be saved.
* An architecture can pass partial regs with ftrace_ops and still
* set the ARCH_SUPPORTS_FTRACE_OPS.
+ *
+ * In vmlinux.lds.h, ftrace_ops_list_func() is defined to be
+ * arch_ftrace_ops_list_func.
*/
#if ARCH_SUPPORTS_FTRACE_OPS
-static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *op, struct ftrace_regs *fregs)
+void arch_ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op, struct ftrace_regs *fregs)
{
__ftrace_ops_list_func(ip, parent_ip, NULL, fregs);
}
-NOKPROBE_SYMBOL(ftrace_ops_list_func);
#else
-static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip)
+void arch_ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip)
{
__ftrace_ops_list_func(ip, parent_ip, NULL, NULL);
}
-NOKPROBE_SYMBOL(ftrace_ops_no_ops);
#endif
+NOKPROBE_SYMBOL(arch_ftrace_ops_list_func);
/*
* If there's only one function registered but it does not support
@@ -7052,16 +7280,13 @@ static void ftrace_ops_assist_func(unsigned long ip, unsigned long parent_ip,
{
int bit;
- bit = trace_test_and_set_recursion(ip, parent_ip, TRACE_LIST_START, TRACE_LIST_MAX);
+ bit = trace_test_and_set_recursion(ip, parent_ip, TRACE_LIST_START);
if (bit < 0)
return;
- preempt_disable_notrace();
-
if (!(op->flags & FTRACE_OPS_FL_RCU) || rcu_is_watching())
op->func(ip, parent_ip, op, fregs);
- preempt_enable_notrace();
trace_clear_recursion(bit);
}
NOKPROBE_SYMBOL(ftrace_ops_assist_func);
@@ -7184,10 +7409,10 @@ static void clear_ftrace_pids(struct trace_array *tr, int type)
synchronize_rcu();
if ((type & TRACE_PIDS) && pid_list)
- trace_free_pid_list(pid_list);
+ trace_pid_list_free(pid_list);
if ((type & TRACE_NO_PIDS) && no_pid_list)
- trace_free_pid_list(no_pid_list);
+ trace_pid_list_free(no_pid_list);
}
void ftrace_clear_pids(struct trace_array *tr)
@@ -7428,7 +7653,7 @@ pid_write(struct file *filp, const char __user *ubuf,
if (filtered_pids) {
synchronize_rcu();
- trace_free_pid_list(filtered_pids);
+ trace_pid_list_free(filtered_pids);
} else if (pid_list && !other_pids) {
/* Register a probe to set whether to ignore the tracing of a task */
register_trace_sched_switch(ftrace_filter_pid_sched_switch_probe, tr);
@@ -7494,10 +7719,10 @@ static const struct file_operations ftrace_no_pid_fops = {
void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer)
{
- trace_create_file("set_ftrace_pid", 0644, d_tracer,
+ trace_create_file("set_ftrace_pid", TRACE_MODE_WRITE, d_tracer,
tr, &ftrace_pid_fops);
- trace_create_file("set_ftrace_notrace_pid", 0644, d_tracer,
- tr, &ftrace_no_pid_fops);
+ trace_create_file("set_ftrace_notrace_pid", TRACE_MODE_WRITE,
+ d_tracer, tr, &ftrace_no_pid_fops);
}
void __init ftrace_init_tracefs_toplevel(struct trace_array *tr,
@@ -7525,7 +7750,9 @@ void ftrace_kill(void)
}
/**
- * Test if ftrace is dead or not.
+ * ftrace_is_dead - Test if ftrace is dead or not.
+ *
+ * Returns 1 if ftrace is "dead", zero otherwise.
*/
int ftrace_is_dead(void)
{
diff --git a/kernel/trace/pid_list.c b/kernel/trace/pid_list.c
new file mode 100644
index 000000000000..a2ef1d18126a
--- /dev/null
+++ b/kernel/trace/pid_list.c
@@ -0,0 +1,495 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2021 VMware Inc, Steven Rostedt <rostedt@goodmis.org>
+ */
+#include <linux/spinlock.h>
+#include <linux/irq_work.h>
+#include <linux/slab.h>
+#include "trace.h"
+
+/* See pid_list.h for details */
+
+static inline union lower_chunk *get_lower_chunk(struct trace_pid_list *pid_list)
+{
+ union lower_chunk *chunk;
+
+ lockdep_assert_held(&pid_list->lock);
+
+ if (!pid_list->lower_list)
+ return NULL;
+
+ chunk = pid_list->lower_list;
+ pid_list->lower_list = chunk->next;
+ pid_list->free_lower_chunks--;
+ WARN_ON_ONCE(pid_list->free_lower_chunks < 0);
+ chunk->next = NULL;
+ /*
+ * If a refill needs to happen, it can not happen here
+ * as the scheduler run queue locks are held.
+ */
+ if (pid_list->free_lower_chunks <= CHUNK_REALLOC)
+ irq_work_queue(&pid_list->refill_irqwork);
+
+ return chunk;
+}
+
+static inline union upper_chunk *get_upper_chunk(struct trace_pid_list *pid_list)
+{
+ union upper_chunk *chunk;
+
+ lockdep_assert_held(&pid_list->lock);
+
+ if (!pid_list->upper_list)
+ return NULL;
+
+ chunk = pid_list->upper_list;
+ pid_list->upper_list = chunk->next;
+ pid_list->free_upper_chunks--;
+ WARN_ON_ONCE(pid_list->free_upper_chunks < 0);
+ chunk->next = NULL;
+ /*
+ * If a refill needs to happen, it can not happen here
+ * as the scheduler run queue locks are held.
+ */
+ if (pid_list->free_upper_chunks <= CHUNK_REALLOC)
+ irq_work_queue(&pid_list->refill_irqwork);
+
+ return chunk;
+}
+
+static inline void put_lower_chunk(struct trace_pid_list *pid_list,
+ union lower_chunk *chunk)
+{
+ lockdep_assert_held(&pid_list->lock);
+
+ chunk->next = pid_list->lower_list;
+ pid_list->lower_list = chunk;
+ pid_list->free_lower_chunks++;
+}
+
+static inline void put_upper_chunk(struct trace_pid_list *pid_list,
+ union upper_chunk *chunk)
+{
+ lockdep_assert_held(&pid_list->lock);
+
+ chunk->next = pid_list->upper_list;
+ pid_list->upper_list = chunk;
+ pid_list->free_upper_chunks++;
+}
+
+static inline bool upper_empty(union upper_chunk *chunk)
+{
+ /*
+ * If chunk->data has no lower chunks, it will be the same
+ * as a zeroed bitmask. Use find_first_bit() to test it
+ * and if it doesn't find any bits set, then the array
+ * is empty.
+ */
+ int bit = find_first_bit((unsigned long *)chunk->data,
+ sizeof(chunk->data) * 8);
+ return bit >= sizeof(chunk->data) * 8;
+}
+
+static inline int pid_split(unsigned int pid, unsigned int *upper1,
+ unsigned int *upper2, unsigned int *lower)
+{
+ /* MAX_PID should cover all pids */
+ BUILD_BUG_ON(MAX_PID < PID_MAX_LIMIT);
+
+ /* In case a bad pid is passed in, then fail */
+ if (unlikely(pid >= MAX_PID))
+ return -1;
+
+ *upper1 = (pid >> UPPER1_SHIFT) & UPPER_MASK;
+ *upper2 = (pid >> UPPER2_SHIFT) & UPPER_MASK;
+ *lower = pid & LOWER_MASK;
+
+ return 0;
+}
+
+static inline unsigned int pid_join(unsigned int upper1,
+ unsigned int upper2, unsigned int lower)
+{
+ return ((upper1 & UPPER_MASK) << UPPER1_SHIFT) |
+ ((upper2 & UPPER_MASK) << UPPER2_SHIFT) |
+ (lower & LOWER_MASK);
+}
+
+/**
+ * trace_pid_list_is_set - test if the pid is set in the list
+ * @pid_list: The pid list to test
+ * @pid: The pid to to see if set in the list.
+ *
+ * Tests if @pid is is set in the @pid_list. This is usually called
+ * from the scheduler when a task is scheduled. Its pid is checked
+ * if it should be traced or not.
+ *
+ * Return true if the pid is in the list, false otherwise.
+ */
+bool trace_pid_list_is_set(struct trace_pid_list *pid_list, unsigned int pid)
+{
+ union upper_chunk *upper_chunk;
+ union lower_chunk *lower_chunk;
+ unsigned long flags;
+ unsigned int upper1;
+ unsigned int upper2;
+ unsigned int lower;
+ bool ret = false;
+
+ if (!pid_list)
+ return false;
+
+ if (pid_split(pid, &upper1, &upper2, &lower) < 0)
+ return false;
+
+ raw_spin_lock_irqsave(&pid_list->lock, flags);
+ upper_chunk = pid_list->upper[upper1];
+ if (upper_chunk) {
+ lower_chunk = upper_chunk->data[upper2];
+ if (lower_chunk)
+ ret = test_bit(lower, lower_chunk->data);
+ }
+ raw_spin_unlock_irqrestore(&pid_list->lock, flags);
+
+ return ret;
+}
+
+/**
+ * trace_pid_list_set - add a pid to the list
+ * @pid_list: The pid list to add the @pid to.
+ * @pid: The pid to add.
+ *
+ * Adds @pid to @pid_list. This is usually done explicitly by a user
+ * adding a task to be traced, or indirectly by the fork function
+ * when children should be traced and a task's pid is in the list.
+ *
+ * Return 0 on success, negative otherwise.
+ */
+int trace_pid_list_set(struct trace_pid_list *pid_list, unsigned int pid)
+{
+ union upper_chunk *upper_chunk;
+ union lower_chunk *lower_chunk;
+ unsigned long flags;
+ unsigned int upper1;
+ unsigned int upper2;
+ unsigned int lower;
+ int ret;
+
+ if (!pid_list)
+ return -ENODEV;
+
+ if (pid_split(pid, &upper1, &upper2, &lower) < 0)
+ return -EINVAL;
+
+ raw_spin_lock_irqsave(&pid_list->lock, flags);
+ upper_chunk = pid_list->upper[upper1];
+ if (!upper_chunk) {
+ upper_chunk = get_upper_chunk(pid_list);
+ if (!upper_chunk) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ pid_list->upper[upper1] = upper_chunk;
+ }
+ lower_chunk = upper_chunk->data[upper2];
+ if (!lower_chunk) {
+ lower_chunk = get_lower_chunk(pid_list);
+ if (!lower_chunk) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ upper_chunk->data[upper2] = lower_chunk;
+ }
+ set_bit(lower, lower_chunk->data);
+ ret = 0;
+ out:
+ raw_spin_unlock_irqrestore(&pid_list->lock, flags);
+ return ret;
+}
+
+/**
+ * trace_pid_list_clear - remove a pid from the list
+ * @pid_list: The pid list to remove the @pid from.
+ * @pid: The pid to remove.
+ *
+ * Removes @pid from @pid_list. This is usually done explicitly by a user
+ * removing tasks from tracing, or indirectly by the exit function
+ * when a task that is set to be traced exits.
+ *
+ * Return 0 on success, negative otherwise.
+ */
+int trace_pid_list_clear(struct trace_pid_list *pid_list, unsigned int pid)
+{
+ union upper_chunk *upper_chunk;
+ union lower_chunk *lower_chunk;
+ unsigned long flags;
+ unsigned int upper1;
+ unsigned int upper2;
+ unsigned int lower;
+
+ if (!pid_list)
+ return -ENODEV;
+
+ if (pid_split(pid, &upper1, &upper2, &lower) < 0)
+ return -EINVAL;
+
+ raw_spin_lock_irqsave(&pid_list->lock, flags);
+ upper_chunk = pid_list->upper[upper1];
+ if (!upper_chunk)
+ goto out;
+
+ lower_chunk = upper_chunk->data[upper2];
+ if (!lower_chunk)
+ goto out;
+
+ clear_bit(lower, lower_chunk->data);
+
+ /* if there's no more bits set, add it to the free list */
+ if (find_first_bit(lower_chunk->data, LOWER_MAX) >= LOWER_MAX) {
+ put_lower_chunk(pid_list, lower_chunk);
+ upper_chunk->data[upper2] = NULL;
+ if (upper_empty(upper_chunk)) {
+ put_upper_chunk(pid_list, upper_chunk);
+ pid_list->upper[upper1] = NULL;
+ }
+ }
+ out:
+ raw_spin_unlock_irqrestore(&pid_list->lock, flags);
+ return 0;
+}
+
+/**
+ * trace_pid_list_next - return the next pid in the list
+ * @pid_list: The pid list to examine.
+ * @pid: The pid to start from
+ * @next: The pointer to place the pid that is set starting from @pid.
+ *
+ * Looks for the next consecutive pid that is in @pid_list starting
+ * at the pid specified by @pid. If one is set (including @pid), then
+ * that pid is placed into @next.
+ *
+ * Return 0 when a pid is found, -1 if there are no more pids included.
+ */
+int trace_pid_list_next(struct trace_pid_list *pid_list, unsigned int pid,
+ unsigned int *next)
+{
+ union upper_chunk *upper_chunk;
+ union lower_chunk *lower_chunk;
+ unsigned long flags;
+ unsigned int upper1;
+ unsigned int upper2;
+ unsigned int lower;
+
+ if (!pid_list)
+ return -ENODEV;
+
+ if (pid_split(pid, &upper1, &upper2, &lower) < 0)
+ return -EINVAL;
+
+ raw_spin_lock_irqsave(&pid_list->lock, flags);
+ for (; upper1 <= UPPER_MASK; upper1++, upper2 = 0) {
+ upper_chunk = pid_list->upper[upper1];
+
+ if (!upper_chunk)
+ continue;
+
+ for (; upper2 <= UPPER_MASK; upper2++, lower = 0) {
+ lower_chunk = upper_chunk->data[upper2];
+ if (!lower_chunk)
+ continue;
+
+ lower = find_next_bit(lower_chunk->data, LOWER_MAX,
+ lower);
+ if (lower < LOWER_MAX)
+ goto found;
+ }
+ }
+
+ found:
+ raw_spin_unlock_irqrestore(&pid_list->lock, flags);
+ if (upper1 > UPPER_MASK)
+ return -1;
+
+ *next = pid_join(upper1, upper2, lower);
+ return 0;
+}
+
+/**
+ * trace_pid_list_first - return the first pid in the list
+ * @pid_list: The pid list to examine.
+ * @pid: The pointer to place the pid first found pid that is set.
+ *
+ * Looks for the first pid that is set in @pid_list, and places it
+ * into @pid if found.
+ *
+ * Return 0 when a pid is found, -1 if there are no pids set.
+ */
+int trace_pid_list_first(struct trace_pid_list *pid_list, unsigned int *pid)
+{
+ return trace_pid_list_next(pid_list, 0, pid);
+}
+
+static void pid_list_refill_irq(struct irq_work *iwork)
+{
+ struct trace_pid_list *pid_list = container_of(iwork, struct trace_pid_list,
+ refill_irqwork);
+ union upper_chunk *upper = NULL;
+ union lower_chunk *lower = NULL;
+ union upper_chunk **upper_next = &upper;
+ union lower_chunk **lower_next = &lower;
+ int upper_count;
+ int lower_count;
+ int ucnt = 0;
+ int lcnt = 0;
+
+ again:
+ raw_spin_lock(&pid_list->lock);
+ upper_count = CHUNK_ALLOC - pid_list->free_upper_chunks;
+ lower_count = CHUNK_ALLOC - pid_list->free_lower_chunks;
+ raw_spin_unlock(&pid_list->lock);
+
+ if (upper_count <= 0 && lower_count <= 0)
+ return;
+
+ while (upper_count-- > 0) {
+ union upper_chunk *chunk;
+
+ chunk = kzalloc(sizeof(*chunk), GFP_KERNEL);
+ if (!chunk)
+ break;
+ *upper_next = chunk;
+ upper_next = &chunk->next;
+ ucnt++;
+ }
+
+ while (lower_count-- > 0) {
+ union lower_chunk *chunk;
+
+ chunk = kzalloc(sizeof(*chunk), GFP_KERNEL);
+ if (!chunk)
+ break;
+ *lower_next = chunk;
+ lower_next = &chunk->next;
+ lcnt++;
+ }
+
+ raw_spin_lock(&pid_list->lock);
+ if (upper) {
+ *upper_next = pid_list->upper_list;
+ pid_list->upper_list = upper;
+ pid_list->free_upper_chunks += ucnt;
+ }
+ if (lower) {
+ *lower_next = pid_list->lower_list;
+ pid_list->lower_list = lower;
+ pid_list->free_lower_chunks += lcnt;
+ }
+ raw_spin_unlock(&pid_list->lock);
+
+ /*
+ * On success of allocating all the chunks, both counters
+ * will be less than zero. If they are not, then an allocation
+ * failed, and we should not try again.
+ */
+ if (upper_count >= 0 || lower_count >= 0)
+ return;
+ /*
+ * When the locks were released, free chunks could have
+ * been used and allocation needs to be done again. Might as
+ * well allocate it now.
+ */
+ goto again;
+}
+
+/**
+ * trace_pid_list_alloc - create a new pid_list
+ *
+ * Allocates a new pid_list to store pids into.
+ *
+ * Returns the pid_list on success, NULL otherwise.
+ */
+struct trace_pid_list *trace_pid_list_alloc(void)
+{
+ struct trace_pid_list *pid_list;
+ int i;
+
+ /* According to linux/thread.h, pids can be no bigger that 30 bits */
+ WARN_ON_ONCE(pid_max > (1 << 30));
+
+ pid_list = kzalloc(sizeof(*pid_list), GFP_KERNEL);
+ if (!pid_list)
+ return NULL;
+
+ init_irq_work(&pid_list->refill_irqwork, pid_list_refill_irq);
+
+ raw_spin_lock_init(&pid_list->lock);
+
+ for (i = 0; i < CHUNK_ALLOC; i++) {
+ union upper_chunk *chunk;
+
+ chunk = kzalloc(sizeof(*chunk), GFP_KERNEL);
+ if (!chunk)
+ break;
+ chunk->next = pid_list->upper_list;
+ pid_list->upper_list = chunk;
+ pid_list->free_upper_chunks++;
+ }
+
+ for (i = 0; i < CHUNK_ALLOC; i++) {
+ union lower_chunk *chunk;
+
+ chunk = kzalloc(sizeof(*chunk), GFP_KERNEL);
+ if (!chunk)
+ break;
+ chunk->next = pid_list->lower_list;
+ pid_list->lower_list = chunk;
+ pid_list->free_lower_chunks++;
+ }
+
+ return pid_list;
+}
+
+/**
+ * trace_pid_list_free - Frees an allocated pid_list.
+ *
+ * Frees the memory for a pid_list that was allocated.
+ */
+void trace_pid_list_free(struct trace_pid_list *pid_list)
+{
+ union upper_chunk *upper;
+ union lower_chunk *lower;
+ int i, j;
+
+ if (!pid_list)
+ return;
+
+ irq_work_sync(&pid_list->refill_irqwork);
+
+ while (pid_list->lower_list) {
+ union lower_chunk *chunk;
+
+ chunk = pid_list->lower_list;
+ pid_list->lower_list = pid_list->lower_list->next;
+ kfree(chunk);
+ }
+
+ while (pid_list->upper_list) {
+ union upper_chunk *chunk;
+
+ chunk = pid_list->upper_list;
+ pid_list->upper_list = pid_list->upper_list->next;
+ kfree(chunk);
+ }
+
+ for (i = 0; i < UPPER1_SIZE; i++) {
+ upper = pid_list->upper[i];
+ if (upper) {
+ for (j = 0; j < UPPER2_SIZE; j++) {
+ lower = upper->data[j];
+ kfree(lower);
+ }
+ kfree(upper);
+ }
+ }
+ kfree(pid_list);
+}
diff --git a/kernel/trace/pid_list.h b/kernel/trace/pid_list.h
new file mode 100644
index 000000000000..62e73f1ac85f
--- /dev/null
+++ b/kernel/trace/pid_list.h
@@ -0,0 +1,88 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/* Do not include this file directly. */
+
+#ifndef _TRACE_INTERNAL_PID_LIST_H
+#define _TRACE_INTERNAL_PID_LIST_H
+
+/*
+ * In order to keep track of what pids to trace, a tree is created much
+ * like page tables are used. This creates a sparse bit map, where
+ * the tree is filled in when needed. A PID is at most 30 bits (see
+ * linux/thread.h), and is broken up into 3 sections based on the bit map
+ * of the bits. The 8 MSB is the "upper1" section. The next 8 MSB is the
+ * "upper2" section and the 14 LSB is the "lower" section.
+ *
+ * A trace_pid_list structure holds the "upper1" section, in an
+ * array of 256 pointers (1 or 2K in size) to "upper_chunk" unions, where
+ * each has an array of 256 pointers (1 or 2K in size) to the "lower_chunk"
+ * structures, where each has an array of size 2K bytes representing a bitmask
+ * of the 14 LSB of the PID (256 * 8 = 2048)
+ *
+ * When a trace_pid_list is allocated, it includes the 256 pointer array
+ * of the upper1 unions. Then a "cache" of upper and lower is allocated
+ * where these will be assigned as needed.
+ *
+ * When a bit is set in the pid_list bitmask, the pid to use has
+ * the 8 MSB masked, and this is used to index the array in the
+ * pid_list to find the next upper union. If the element is NULL,
+ * then one is retrieved from the upper_list cache. If none is
+ * available, then -ENOMEM is returned.
+ *
+ * The next 8 MSB is used to index into the "upper2" section. If this
+ * element is NULL, then it is retrieved from the lower_list cache.
+ * Again, if one is not available -ENOMEM is returned.
+ *
+ * Finally the 14 LSB of the PID is used to set the bit in the 16384
+ * bitmask (made up of 2K bytes).
+ *
+ * When the second upper section or the lower section has their last
+ * bit cleared, they are added back to the free list to be reused
+ * when needed.
+ */
+
+#define UPPER_BITS 8
+#define UPPER_MAX (1 << UPPER_BITS)
+#define UPPER1_SIZE (1 << UPPER_BITS)
+#define UPPER2_SIZE (1 << UPPER_BITS)
+
+#define LOWER_BITS 14
+#define LOWER_MAX (1 << LOWER_BITS)
+#define LOWER_SIZE (LOWER_MAX / BITS_PER_LONG)
+
+#define UPPER1_SHIFT (LOWER_BITS + UPPER_BITS)
+#define UPPER2_SHIFT LOWER_BITS
+#define LOWER_MASK (LOWER_MAX - 1)
+
+#define UPPER_MASK (UPPER_MAX - 1)
+
+/* According to linux/thread.h pids can not be bigger than or equal to 1 << 30 */
+#define MAX_PID (1 << 30)
+
+/* Just keep 6 chunks of both upper and lower in the cache on alloc */
+#define CHUNK_ALLOC 6
+
+/* Have 2 chunks free, trigger a refill of the cache */
+#define CHUNK_REALLOC 2
+
+union lower_chunk {
+ union lower_chunk *next;
+ unsigned long data[LOWER_SIZE]; // 2K in size
+};
+
+union upper_chunk {
+ union upper_chunk *next;
+ union lower_chunk *data[UPPER2_SIZE]; // 1 or 2K in size
+};
+
+struct trace_pid_list {
+ raw_spinlock_t lock;
+ struct irq_work refill_irqwork;
+ union upper_chunk *upper[UPPER1_SIZE]; // 1 or 2K in size
+ union upper_chunk *upper_list;
+ union lower_chunk *lower_list;
+ int free_upper_chunks;
+ int free_lower_chunks;
+};
+
+#endif /* _TRACE_INTERNAL_PID_LIST_H */
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index e592d1df6f88..05dfc7a12d3d 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2111,7 +2111,7 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
}
}
- get_online_cpus();
+ cpus_read_lock();
/*
* Fire off all the required work handlers
* We can't schedule on offline CPUs, but it's not necessary
@@ -2143,7 +2143,7 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
cpu_buffer->nr_pages_to_update = 0;
}
- put_online_cpus();
+ cpus_read_unlock();
} else {
cpu_buffer = buffer->buffers[cpu_id];
@@ -2171,7 +2171,7 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
goto out_err;
}
- get_online_cpus();
+ cpus_read_lock();
/* Can't run something on an offline CPU. */
if (!cpu_online(cpu_id))
@@ -2183,7 +2183,7 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
}
cpu_buffer->nr_pages_to_update = 0;
- put_online_cpus();
+ cpus_read_unlock();
}
out:
@@ -3167,14 +3167,9 @@ static __always_inline int
trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
{
unsigned int val = cpu_buffer->current_context;
- unsigned long pc = preempt_count();
- int bit;
+ int bit = interrupt_context_level();
- if (!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))
- bit = RB_CTX_NORMAL;
- else
- bit = pc & NMI_MASK ? RB_CTX_NMI :
- pc & HARDIRQ_MASK ? RB_CTX_IRQ : RB_CTX_SOFTIRQ;
+ bit = RB_CTX_NORMAL - bit;
if (unlikely(val & (1 << (bit + cpu_buffer->nest)))) {
/*
@@ -5233,6 +5228,9 @@ void ring_buffer_reset(struct trace_buffer *buffer)
struct ring_buffer_per_cpu *cpu_buffer;
int cpu;
+ /* prevent another thread from changing buffer sizes */
+ mutex_lock(&buffer->mutex);
+
for_each_buffer_cpu(buffer, cpu) {
cpu_buffer = buffer->buffers[cpu];
@@ -5251,6 +5249,8 @@ void ring_buffer_reset(struct trace_buffer *buffer)
atomic_dec(&cpu_buffer->record_disabled);
atomic_dec(&cpu_buffer->resize_disabled);
}
+
+ mutex_unlock(&buffer->mutex);
}
EXPORT_SYMBOL_GPL(ring_buffer_reset);
@@ -5898,16 +5898,13 @@ static __init int test_ringbuffer(void)
rb_data[cpu].buffer = buffer;
rb_data[cpu].cpu = cpu;
rb_data[cpu].cnt = cpu;
- rb_threads[cpu] = kthread_create(rb_test, &rb_data[cpu],
- "rbtester/%d", cpu);
+ rb_threads[cpu] = kthread_run_on_cpu(rb_test, &rb_data[cpu],
+ cpu, "rbtester/%u");
if (WARN_ON(IS_ERR(rb_threads[cpu]))) {
pr_cont("FAILED\n");
ret = PTR_ERR(rb_threads[cpu]);
goto out_free;
}
-
- kthread_bind(rb_threads[cpu], cpu);
- wake_up_process(rb_threads[cpu]);
}
/* Now create the rb hammer! */
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 2755534b0737..78ea542ce3bc 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -512,12 +512,6 @@ int call_filter_check_discard(struct trace_event_call *call, void *rec,
return 0;
}
-void trace_free_pid_list(struct trace_pid_list *pid_list)
-{
- vfree(pid_list->pids);
- kfree(pid_list);
-}
-
/**
* trace_find_filtered_pid - check if a pid exists in a filtered_pid list
* @filtered_pids: The list of pids to check
@@ -528,14 +522,7 @@ void trace_free_pid_list(struct trace_pid_list *pid_list)
bool
trace_find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid)
{
- /*
- * If pid_max changed after filtered_pids was created, we
- * by default ignore all pids greater than the previous pid_max.
- */
- if (search_pid >= filtered_pids->pid_max)
- return false;
-
- return test_bit(search_pid, filtered_pids->pids);
+ return trace_pid_list_is_set(filtered_pids, search_pid);
}
/**
@@ -592,15 +579,11 @@ void trace_filter_add_remove_task(struct trace_pid_list *pid_list,
return;
}
- /* Sorry, but we don't support pid_max changing after setting */
- if (task->pid >= pid_list->pid_max)
- return;
-
/* "self" is set for forks, and NULL for exits */
if (self)
- set_bit(task->pid, pid_list->pids);
+ trace_pid_list_set(pid_list, task->pid);
else
- clear_bit(task->pid, pid_list->pids);
+ trace_pid_list_clear(pid_list, task->pid);
}
/**
@@ -617,18 +600,19 @@ void trace_filter_add_remove_task(struct trace_pid_list *pid_list,
*/
void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos)
{
- unsigned long pid = (unsigned long)v;
+ long pid = (unsigned long)v;
+ unsigned int next;
(*pos)++;
/* pid already is +1 of the actual previous bit */
- pid = find_next_bit(pid_list->pids, pid_list->pid_max, pid);
+ if (trace_pid_list_next(pid_list, pid, &next) < 0)
+ return NULL;
- /* Return pid + 1 to allow zero to be represented */
- if (pid < pid_list->pid_max)
- return (void *)(pid + 1);
+ pid = next;
- return NULL;
+ /* Return pid + 1 to allow zero to be represented */
+ return (void *)(pid + 1);
}
/**
@@ -645,12 +629,14 @@ void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos)
void *trace_pid_start(struct trace_pid_list *pid_list, loff_t *pos)
{
unsigned long pid;
+ unsigned int first;
loff_t l = 0;
- pid = find_first_bit(pid_list->pids, pid_list->pid_max);
- if (pid >= pid_list->pid_max)
+ if (trace_pid_list_first(pid_list, &first) < 0)
return NULL;
+ pid = first;
+
/* Return pid + 1 so that zero can be the exit value */
for (pid++; pid && l < *pos;
pid = (unsigned long)trace_pid_next(pid_list, (void *)pid, &l))
@@ -686,7 +672,7 @@ int trace_pid_write(struct trace_pid_list *filtered_pids,
unsigned long val;
int nr_pids = 0;
ssize_t read = 0;
- ssize_t ret = 0;
+ ssize_t ret;
loff_t pos;
pid_t pid;
@@ -699,34 +685,23 @@ int trace_pid_write(struct trace_pid_list *filtered_pids,
* the user. If the operation fails, then the current list is
* not modified.
*/
- pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL);
+ pid_list = trace_pid_list_alloc();
if (!pid_list) {
trace_parser_put(&parser);
return -ENOMEM;
}
- pid_list->pid_max = READ_ONCE(pid_max);
-
- /* Only truncating will shrink pid_max */
- if (filtered_pids && filtered_pids->pid_max > pid_list->pid_max)
- pid_list->pid_max = filtered_pids->pid_max;
-
- pid_list->pids = vzalloc((pid_list->pid_max + 7) >> 3);
- if (!pid_list->pids) {
- trace_parser_put(&parser);
- kfree(pid_list);
- return -ENOMEM;
- }
-
if (filtered_pids) {
/* copy the current bits to the new max */
- for_each_set_bit(pid, filtered_pids->pids,
- filtered_pids->pid_max) {
- set_bit(pid, pid_list->pids);
+ ret = trace_pid_list_first(filtered_pids, &pid);
+ while (!ret) {
+ trace_pid_list_set(pid_list, pid);
+ ret = trace_pid_list_next(filtered_pids, pid + 1, &pid);
nr_pids++;
}
}
+ ret = 0;
while (cnt > 0) {
pos = 0;
@@ -742,12 +717,13 @@ int trace_pid_write(struct trace_pid_list *filtered_pids,
ret = -EINVAL;
if (kstrtoul(parser.buffer, 0, &val))
break;
- if (val >= pid_list->pid_max)
- break;
pid = (pid_t)val;
- set_bit(pid, pid_list->pids);
+ if (trace_pid_list_set(pid_list, pid) < 0) {
+ ret = -1;
+ break;
+ }
nr_pids++;
trace_parser_clear(&parser);
@@ -756,13 +732,13 @@ int trace_pid_write(struct trace_pid_list *filtered_pids,
trace_parser_put(&parser);
if (ret < 0) {
- trace_free_pid_list(pid_list);
+ trace_pid_list_free(pid_list);
return ret;
}
if (!nr_pids) {
/* Cleared the list of pids */
- trace_free_pid_list(pid_list);
+ trace_pid_list_free(pid_list);
read = ret;
pid_list = NULL;
}
@@ -1714,7 +1690,8 @@ static void trace_create_maxlat_file(struct trace_array *tr,
{
INIT_WORK(&tr->fsnotify_work, latency_fsnotify_workfn);
init_irq_work(&tr->fsnotify_irqwork, latency_fsnotify_workfn_irq);
- tr->d_max_latency = trace_create_file("tracing_max_latency", 0644,
+ tr->d_max_latency = trace_create_file("tracing_max_latency",
+ TRACE_MODE_WRITE,
d_tracer, &tr->max_latency,
&tracing_max_lat_fops);
}
@@ -1744,16 +1721,15 @@ void latency_fsnotify(struct trace_array *tr)
irq_work_queue(&tr->fsnotify_irqwork);
}
-/*
- * (defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)) && \
- * defined(CONFIG_FSNOTIFY)
- */
-#else
+#elif defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) \
+ || defined(CONFIG_OSNOISE_TRACER)
#define trace_create_maxlat_file(tr, d_tracer) \
- trace_create_file("tracing_max_latency", 0644, d_tracer, \
- &tr->max_latency, &tracing_max_lat_fops)
+ trace_create_file("tracing_max_latency", TRACE_MODE_WRITE, \
+ d_tracer, &tr->max_latency, &tracing_max_lat_fops)
+#else
+#define trace_create_maxlat_file(tr, d_tracer) do { } while (0)
#endif
#ifdef CONFIG_TRACER_MAX_TRACE
@@ -2603,6 +2579,15 @@ enum print_line_t trace_handle_return(struct trace_seq *s)
}
EXPORT_SYMBOL_GPL(trace_handle_return);
+static unsigned short migration_disable_value(void)
+{
+#if defined(CONFIG_SMP)
+ return current->migration_disabled;
+#else
+ return 0;
+#endif
+}
+
unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status)
{
unsigned int trace_flags = irqs_status;
@@ -2621,7 +2606,8 @@ unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status)
trace_flags |= TRACE_FLAG_NEED_RESCHED;
if (test_preempt_need_resched())
trace_flags |= TRACE_FLAG_PREEMPT_RESCHED;
- return (trace_flags << 16) | (pc & 0xff);
+ return (trace_flags << 16) | (min_t(unsigned int, pc & 0xff, 0xf)) |
+ (min_t(unsigned int, migration_disable_value(), 0xf)) << 4;
}
struct ring_buffer_event *
@@ -3221,7 +3207,7 @@ struct trace_buffer_struct {
char buffer[4][TRACE_BUF_SIZE];
};
-static struct trace_buffer_struct *trace_percpu_buffer;
+static struct trace_buffer_struct __percpu *trace_percpu_buffer;
/*
* This allows for lockless recording. If we're nested too deeply, then
@@ -3231,7 +3217,7 @@ static char *get_trace_buf(void)
{
struct trace_buffer_struct *buffer = this_cpu_ptr(trace_percpu_buffer);
- if (!buffer || buffer->nesting >= 4)
+ if (!trace_percpu_buffer || buffer->nesting >= 4)
return NULL;
buffer->nesting++;
@@ -3250,7 +3236,7 @@ static void put_trace_buf(void)
static int alloc_percpu_trace_buffer(void)
{
- struct trace_buffer_struct *buffers;
+ struct trace_buffer_struct __percpu *buffers;
if (trace_percpu_buffer)
return 0;
@@ -3697,11 +3683,11 @@ static bool trace_safe_str(struct trace_iterator *iter, const char *str)
return false;
event = container_of(trace_event, struct trace_event_call, event);
- if (!event->mod)
+ if ((event->flags & TRACE_EVENT_FL_DYNAMIC) || !event->module)
return false;
/* Would rather have rodata, but this will suffice */
- if (within_module_core(addr, event->mod))
+ if (within_module_core(addr, event->module))
return true;
return false;
@@ -3826,6 +3812,18 @@ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt,
iter->fmt[i] = '\0';
trace_seq_vprintf(&iter->seq, iter->fmt, ap);
+ /*
+ * If iter->seq is full, the above call no longer guarantees
+ * that ap is in sync with fmt processing, and further calls
+ * to va_arg() can return wrong positional arguments.
+ *
+ * Ensure that ap is no longer used in this case.
+ */
+ if (iter->seq.full) {
+ p = "";
+ break;
+ }
+
if (star)
len = va_arg(ap, int);
@@ -4189,9 +4187,10 @@ static void print_lat_help_header(struct seq_file *m)
"# | / _----=> need-resched \n"
"# || / _---=> hardirq/softirq \n"
"# ||| / _--=> preempt-depth \n"
- "# |||| / delay \n"
- "# cmd pid ||||| time | caller \n"
- "# \\ / ||||| \\ | / \n");
+ "# |||| / _-=> migrate-disable \n"
+ "# ||||| / delay \n"
+ "# cmd pid |||||| time | caller \n"
+ "# \\ / |||||| \\ | / \n");
}
static void print_event_info(struct array_buffer *buf, struct seq_file *m)
@@ -4229,9 +4228,10 @@ static void print_func_help_header_irq(struct array_buffer *buf, struct seq_file
seq_printf(m, "# %.*s / _----=> need-resched\n", prec, space);
seq_printf(m, "# %.*s| / _---=> hardirq/softirq\n", prec, space);
seq_printf(m, "# %.*s|| / _--=> preempt-depth\n", prec, space);
- seq_printf(m, "# %.*s||| / delay\n", prec, space);
- seq_printf(m, "# TASK-PID %.*s CPU# |||| TIMESTAMP FUNCTION\n", prec, " TGID ");
- seq_printf(m, "# | | %.*s | |||| | |\n", prec, " | ");
+ seq_printf(m, "# %.*s||| / _-=> migrate-disable\n", prec, space);
+ seq_printf(m, "# %.*s|||| / delay\n", prec, space);
+ seq_printf(m, "# TASK-PID %.*s CPU# ||||| TIMESTAMP FUNCTION\n", prec, " TGID ");
+ seq_printf(m, "# | | %.*s | ||||| | |\n", prec, " | ");
}
void
@@ -5543,6 +5543,7 @@ static const char readme_msg[] =
#ifdef CONFIG_HIST_TRIGGERS
"\t s:[synthetic/]<event> <field> [<field>]\n"
#endif
+ "\t e[:[<group>/]<event>] <attached-group>.<attached-event> [<args>]\n"
"\t -:[<group>/]<event>\n"
#ifdef CONFIG_KPROBE_EVENTS
"\t place: [<module>:]<symbol>[+<offset>]|<memaddr>\n"
@@ -5552,7 +5553,7 @@ static const char readme_msg[] =
" place (uprobe): <path>:<offset>[%return][(ref_ctr_offset)]\n"
#endif
"\t args: <name>=fetcharg[:type]\n"
- "\t fetcharg: %<register>, @<address>, @<symbol>[+|-<offset>],\n"
+ "\t fetcharg: (%<register>|$<efield>), @<address>, @<symbol>[+|-<offset>],\n"
#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
"\t $stack<index>, $stack, $retval, $comm, $arg<N>,\n"
#else
@@ -5567,6 +5568,8 @@ static const char readme_msg[] =
"\t stype: u8/u16/u32/u64, s8/s16/s32/s64, pid_t,\n"
"\t [unsigned] char/int/long\n"
#endif
+ "\t efield: For event probes ('e' types), the field is on of the fields\n"
+ "\t of the <attached-group>/<attached-event>.\n"
#endif
" events/\t\t- Directory containing all trace event subsystems:\n"
" enable\t\t- Write 0/1 to enable/disable tracing of all events\n"
@@ -5614,6 +5617,7 @@ static const char readme_msg[] =
#ifdef CONFIG_HIST_TRIGGERS
" hist trigger\t- If set, event hits are aggregated into a hash table\n"
"\t Format: hist:keys=<field1[,field2,...]>\n"
+ "\t [:<var1>=<field|var_ref|numeric_literal>[,<var2>=...]]\n"
"\t [:values=<field1[,field2,...]>]\n"
"\t [:sort=<field1[,field2,...]>]\n"
"\t [:size=#entries]\n"
@@ -5625,6 +5629,16 @@ static const char readme_msg[] =
"\t common_timestamp - to record current timestamp\n"
"\t common_cpu - to record the CPU the event happened on\n"
"\n"
+ "\t A hist trigger variable can be:\n"
+ "\t - a reference to a field e.g. x=current_timestamp,\n"
+ "\t - a reference to another variable e.g. y=$x,\n"
+ "\t - a numeric literal: e.g. ms_per_sec=1000,\n"
+ "\t - an arithmetic expression: e.g. time_secs=current_timestamp/1000\n"
+ "\n"
+ "\t hist trigger aritmethic expressions support addition(+), subtraction(-),\n"
+ "\t multiplication(*) and division(/) operators. An operand can be either a\n"
+ "\t variable reference, field or numeric literal.\n"
+ "\n"
"\t When a matching event is hit, an entry is added to a hash\n"
"\t table using the key(s) and value(s) named, and the value of a\n"
"\t sum called 'hitcount' is incremented. Keys and values\n"
@@ -5654,6 +5668,7 @@ static const char readme_msg[] =
"\t .execname display a common_pid as a program name\n"
"\t .syscall display a syscall id as a syscall name\n"
"\t .log2 display log2 value rather than raw number\n"
+ "\t .buckets=size display values in groups of size rather than raw number\n"
"\t .usecs display a common_timestamp in microseconds\n\n"
"\t The 'pause' parameter can be used to pause an existing hist\n"
"\t trigger or to start a hist trigger but not log any events\n"
@@ -6062,7 +6077,7 @@ trace_insert_eval_map_file(struct module *mod, struct trace_eval_map **start,
static void trace_create_eval_file(struct dentry *d_tracer)
{
- trace_create_file("eval_map", 0444, d_tracer,
+ trace_create_file("eval_map", TRACE_MODE_READ, d_tracer,
NULL, &tracing_eval_map_fops);
}
@@ -6703,9 +6718,7 @@ waitagain:
cnt = PAGE_SIZE - 1;
/* reset all but tr, trace, and overruns */
- memset(&iter->seq, 0,
- sizeof(struct trace_iterator) -
- offsetof(struct trace_iterator, seq));
+ memset_startat(iter, 0, seq);
cpumask_clear(iter->started);
trace_seq_init(&iter->seq);
iter->pos = -1;
@@ -8575,27 +8588,27 @@ tracing_init_tracefs_percpu(struct trace_array *tr, long cpu)
}
/* per cpu trace_pipe */
- trace_create_cpu_file("trace_pipe", 0444, d_cpu,
+ trace_create_cpu_file("trace_pipe", TRACE_MODE_READ, d_cpu,
tr, cpu, &tracing_pipe_fops);
/* per cpu trace */
- trace_create_cpu_file("trace", 0644, d_cpu,
+ trace_create_cpu_file("trace", TRACE_MODE_WRITE, d_cpu,
tr, cpu, &tracing_fops);
- trace_create_cpu_file("trace_pipe_raw", 0444, d_cpu,
+ trace_create_cpu_file("trace_pipe_raw", TRACE_MODE_READ, d_cpu,
tr, cpu, &tracing_buffers_fops);
- trace_create_cpu_file("stats", 0444, d_cpu,
+ trace_create_cpu_file("stats", TRACE_MODE_READ, d_cpu,
tr, cpu, &tracing_stats_fops);
- trace_create_cpu_file("buffer_size_kb", 0444, d_cpu,
+ trace_create_cpu_file("buffer_size_kb", TRACE_MODE_READ, d_cpu,
tr, cpu, &tracing_entries_fops);
#ifdef CONFIG_TRACER_SNAPSHOT
- trace_create_cpu_file("snapshot", 0644, d_cpu,
+ trace_create_cpu_file("snapshot", TRACE_MODE_WRITE, d_cpu,
tr, cpu, &snapshot_fops);
- trace_create_cpu_file("snapshot_raw", 0444, d_cpu,
+ trace_create_cpu_file("snapshot_raw", TRACE_MODE_READ, d_cpu,
tr, cpu, &snapshot_raw_fops);
#endif
}
@@ -8801,8 +8814,8 @@ create_trace_option_file(struct trace_array *tr,
topt->opt = opt;
topt->tr = tr;
- topt->entry = trace_create_file(opt->name, 0644, t_options, topt,
- &trace_options_fops);
+ topt->entry = trace_create_file(opt->name, TRACE_MODE_WRITE,
+ t_options, topt, &trace_options_fops);
}
@@ -8877,7 +8890,7 @@ create_trace_option_core_file(struct trace_array *tr,
if (!t_options)
return NULL;
- return trace_create_file(option, 0644, t_options,
+ return trace_create_file(option, TRACE_MODE_WRITE, t_options,
(void *)&tr->trace_flags_index[index],
&trace_options_core_fops);
}
@@ -9402,28 +9415,28 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
struct trace_event_file *file;
int cpu;
- trace_create_file("available_tracers", 0444, d_tracer,
+ trace_create_file("available_tracers", TRACE_MODE_READ, d_tracer,
tr, &show_traces_fops);
- trace_create_file("current_tracer", 0644, d_tracer,
+ trace_create_file("current_tracer", TRACE_MODE_WRITE, d_tracer,
tr, &set_tracer_fops);
- trace_create_file("tracing_cpumask", 0644, d_tracer,
+ trace_create_file("tracing_cpumask", TRACE_MODE_WRITE, d_tracer,
tr, &tracing_cpumask_fops);
- trace_create_file("trace_options", 0644, d_tracer,
+ trace_create_file("trace_options", TRACE_MODE_WRITE, d_tracer,
tr, &tracing_iter_fops);
- trace_create_file("trace", 0644, d_tracer,
+ trace_create_file("trace", TRACE_MODE_WRITE, d_tracer,
tr, &tracing_fops);
- trace_create_file("trace_pipe", 0444, d_tracer,
+ trace_create_file("trace_pipe", TRACE_MODE_READ, d_tracer,
tr, &tracing_pipe_fops);
- trace_create_file("buffer_size_kb", 0644, d_tracer,
+ trace_create_file("buffer_size_kb", TRACE_MODE_WRITE, d_tracer,
tr, &tracing_entries_fops);
- trace_create_file("buffer_total_size_kb", 0444, d_tracer,
+ trace_create_file("buffer_total_size_kb", TRACE_MODE_READ, d_tracer,
tr, &tracing_total_entries_fops);
trace_create_file("free_buffer", 0200, d_tracer,
@@ -9434,42 +9447,40 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
file = __find_event_file(tr, "ftrace", "print");
if (file && file->dir)
- trace_create_file("trigger", 0644, file->dir, file,
- &event_trigger_fops);
+ trace_create_file("trigger", TRACE_MODE_WRITE, file->dir,
+ file, &event_trigger_fops);
tr->trace_marker_file = file;
trace_create_file("trace_marker_raw", 0220, d_tracer,
tr, &tracing_mark_raw_fops);
- trace_create_file("trace_clock", 0644, d_tracer, tr,
+ trace_create_file("trace_clock", TRACE_MODE_WRITE, d_tracer, tr,
&trace_clock_fops);
- trace_create_file("tracing_on", 0644, d_tracer,
+ trace_create_file("tracing_on", TRACE_MODE_WRITE, d_tracer,
tr, &rb_simple_fops);
- trace_create_file("timestamp_mode", 0444, d_tracer, tr,
+ trace_create_file("timestamp_mode", TRACE_MODE_READ, d_tracer, tr,
&trace_time_stamp_mode_fops);
tr->buffer_percent = 50;
- trace_create_file("buffer_percent", 0444, d_tracer,
+ trace_create_file("buffer_percent", TRACE_MODE_READ, d_tracer,
tr, &buffer_percent_fops);
create_trace_options_dir(tr);
-#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)
trace_create_maxlat_file(tr, d_tracer);
-#endif
if (ftrace_create_function_files(tr, d_tracer))
MEM_FAIL(1, "Could not allocate function filter files");
#ifdef CONFIG_TRACER_SNAPSHOT
- trace_create_file("snapshot", 0644, d_tracer,
+ trace_create_file("snapshot", TRACE_MODE_WRITE, d_tracer,
tr, &snapshot_fops);
#endif
- trace_create_file("error_log", 0644, d_tracer,
+ trace_create_file("error_log", TRACE_MODE_WRITE, d_tracer,
tr, &tracing_err_log_fops);
for_each_tracing_cpu(cpu)
@@ -9662,19 +9673,19 @@ static __init int tracer_init_tracefs(void)
init_tracer_tracefs(&global_trace, NULL);
ftrace_init_tracefs_toplevel(&global_trace, NULL);
- trace_create_file("tracing_thresh", 0644, NULL,
+ trace_create_file("tracing_thresh", TRACE_MODE_WRITE, NULL,
&global_trace, &tracing_thresh_fops);
- trace_create_file("README", 0444, NULL,
+ trace_create_file("README", TRACE_MODE_READ, NULL,
NULL, &tracing_readme_fops);
- trace_create_file("saved_cmdlines", 0444, NULL,
+ trace_create_file("saved_cmdlines", TRACE_MODE_READ, NULL,
NULL, &tracing_saved_cmdlines_fops);
- trace_create_file("saved_cmdlines_size", 0644, NULL,
+ trace_create_file("saved_cmdlines_size", TRACE_MODE_WRITE, NULL,
NULL, &tracing_saved_cmdlines_size_fops);
- trace_create_file("saved_tgids", 0444, NULL,
+ trace_create_file("saved_tgids", TRACE_MODE_READ, NULL,
NULL, &tracing_saved_tgids_fops);
trace_eval_init();
@@ -9686,7 +9697,7 @@ static __init int tracer_init_tracefs(void)
#endif
#ifdef CONFIG_DYNAMIC_FTRACE
- trace_create_file("dyn_ftrace_total_info", 0444, NULL,
+ trace_create_file("dyn_ftrace_total_info", TRACE_MODE_READ, NULL,
NULL, &tracing_dyn_info_fops);
#endif
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 4a0e693000c6..38715aa6cfdf 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -22,11 +22,16 @@
#include <linux/ctype.h>
#include <linux/once_lite.h>
+#include "pid_list.h"
+
#ifdef CONFIG_FTRACE_SYSCALLS
#include <asm/unistd.h> /* For NR_SYSCALLS */
#include <asm/syscall.h> /* some archs define it here */
#endif
+#define TRACE_MODE_WRITE 0640
+#define TRACE_MODE_READ 0440
+
enum trace_type {
__TRACE_FIRST_TYPE = 0,
@@ -126,6 +131,11 @@ struct kprobe_trace_entry_head {
unsigned long ip;
};
+struct eprobe_trace_entry_head {
+ struct trace_entry ent;
+ unsigned int type;
+};
+
struct kretprobe_trace_entry_head {
struct trace_entry ent;
unsigned long func;
@@ -183,10 +193,14 @@ struct trace_options {
struct trace_option_dentry *topts;
};
-struct trace_pid_list {
- int pid_max;
- unsigned long *pids;
-};
+struct trace_pid_list *trace_pid_list_alloc(void);
+void trace_pid_list_free(struct trace_pid_list *pid_list);
+bool trace_pid_list_is_set(struct trace_pid_list *pid_list, unsigned int pid);
+int trace_pid_list_set(struct trace_pid_list *pid_list, unsigned int pid);
+int trace_pid_list_clear(struct trace_pid_list *pid_list, unsigned int pid);
+int trace_pid_list_first(struct trace_pid_list *pid_list, unsigned int *pid);
+int trace_pid_list_next(struct trace_pid_list *pid_list, unsigned int pid,
+ unsigned int *next);
enum {
TRACE_PIDS = BIT(0),
@@ -876,7 +890,7 @@ static inline int ftrace_graph_addr(struct ftrace_graph_ent *trace)
* is set, and called by an interrupt handler, we still
* want to trace it.
*/
- if (in_irq())
+ if (in_hardirq())
trace_recursion_set(TRACE_IRQ_BIT);
else
trace_recursion_clear(TRACE_IRQ_BIT);
@@ -1352,14 +1366,26 @@ __event_trigger_test_discard(struct trace_event_file *file,
if (eflags & EVENT_FILE_FL_TRIGGER_COND)
*tt = event_triggers_call(file, buffer, entry, event);
- if (test_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags) ||
- (unlikely(file->flags & EVENT_FILE_FL_FILTERED) &&
- !filter_match_preds(file->filter, entry))) {
- __trace_event_discard_commit(buffer, event);
- return true;
- }
+ if (likely(!(file->flags & (EVENT_FILE_FL_SOFT_DISABLED |
+ EVENT_FILE_FL_FILTERED |
+ EVENT_FILE_FL_PID_FILTER))))
+ return false;
+
+ if (file->flags & EVENT_FILE_FL_SOFT_DISABLED)
+ goto discard;
+
+ if (file->flags & EVENT_FILE_FL_FILTERED &&
+ !filter_match_preds(file->filter, entry))
+ goto discard;
+
+ if ((file->flags & EVENT_FILE_FL_PID_FILTER) &&
+ trace_event_ignore_this_pid(file))
+ goto discard;
return false;
+ discard:
+ __trace_event_discard_commit(buffer, event);
+ return true;
}
/**
@@ -1508,9 +1534,14 @@ static inline int register_trigger_hist_enable_disable_cmds(void) { return 0; }
extern int register_trigger_cmds(void);
extern void clear_event_triggers(struct trace_array *tr);
+enum {
+ EVENT_TRIGGER_FL_PROBE = BIT(0),
+};
+
struct event_trigger_data {
unsigned long count;
int ref;
+ int flags;
struct event_trigger_ops *ops;
struct event_command *cmd_ops;
struct event_filter __rcu *filter;
@@ -1918,6 +1949,14 @@ static inline bool is_good_name(const char *name)
return true;
}
+/* Convert certain expected symbols into '_' when generating event names */
+static inline void sanitize_event_name(char *name)
+{
+ while (*name++ != '\0')
+ if (*name == ':' || *name == '.')
+ *name = '_';
+}
+
/*
* This is a generic way to read and write a u64 value from a file in tracefs.
*
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index 94ef2d099e32..0580287d7a0d 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -171,6 +171,293 @@ trace_boot_add_synth_event(struct xbc_node *node, const char *event)
}
#endif
+#ifdef CONFIG_HIST_TRIGGERS
+static int __init __printf(3, 4)
+append_printf(char **bufp, char *end, const char *fmt, ...)
+{
+ va_list args;
+ int ret;
+
+ if (*bufp == end)
+ return -ENOSPC;
+
+ va_start(args, fmt);
+ ret = vsnprintf(*bufp, end - *bufp, fmt, args);
+ if (ret < end - *bufp) {
+ *bufp += ret;
+ } else {
+ *bufp = end;
+ ret = -ERANGE;
+ }
+ va_end(args);
+
+ return ret;
+}
+
+static int __init
+append_str_nospace(char **bufp, char *end, const char *str)
+{
+ char *p = *bufp;
+ int len;
+
+ while (p < end - 1 && *str != '\0') {
+ if (!isspace(*str))
+ *(p++) = *str;
+ str++;
+ }
+ *p = '\0';
+ if (p == end - 1) {
+ *bufp = end;
+ return -ENOSPC;
+ }
+ len = p - *bufp;
+ *bufp = p;
+ return (int)len;
+}
+
+static int __init
+trace_boot_hist_add_array(struct xbc_node *hnode, char **bufp,
+ char *end, const char *key)
+{
+ struct xbc_node *anode;
+ const char *p;
+ char sep;
+
+ p = xbc_node_find_value(hnode, key, &anode);
+ if (p) {
+ if (!anode) {
+ pr_err("hist.%s requires value(s).\n", key);
+ return -EINVAL;
+ }
+
+ append_printf(bufp, end, ":%s", key);
+ sep = '=';
+ xbc_array_for_each_value(anode, p) {
+ append_printf(bufp, end, "%c%s", sep, p);
+ if (sep == '=')
+ sep = ',';
+ }
+ } else
+ return -ENOENT;
+
+ return 0;
+}
+
+static int __init
+trace_boot_hist_add_one_handler(struct xbc_node *hnode, char **bufp,
+ char *end, const char *handler,
+ const char *param)
+{
+ struct xbc_node *knode, *anode;
+ const char *p;
+ char sep;
+
+ /* Compose 'handler' parameter */
+ p = xbc_node_find_value(hnode, param, NULL);
+ if (!p) {
+ pr_err("hist.%s requires '%s' option.\n",
+ xbc_node_get_data(hnode), param);
+ return -EINVAL;
+ }
+ append_printf(bufp, end, ":%s(%s)", handler, p);
+
+ /* Compose 'action' parameter */
+ knode = xbc_node_find_subkey(hnode, "trace");
+ if (!knode)
+ knode = xbc_node_find_subkey(hnode, "save");
+
+ if (knode) {
+ anode = xbc_node_get_child(knode);
+ if (!anode || !xbc_node_is_value(anode)) {
+ pr_err("hist.%s.%s requires value(s).\n",
+ xbc_node_get_data(hnode),
+ xbc_node_get_data(knode));
+ return -EINVAL;
+ }
+
+ append_printf(bufp, end, ".%s", xbc_node_get_data(knode));
+ sep = '(';
+ xbc_array_for_each_value(anode, p) {
+ append_printf(bufp, end, "%c%s", sep, p);
+ if (sep == '(')
+ sep = ',';
+ }
+ append_printf(bufp, end, ")");
+ } else if (xbc_node_find_subkey(hnode, "snapshot")) {
+ append_printf(bufp, end, ".snapshot()");
+ } else {
+ pr_err("hist.%s requires an action.\n",
+ xbc_node_get_data(hnode));
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int __init
+trace_boot_hist_add_handlers(struct xbc_node *hnode, char **bufp,
+ char *end, const char *param)
+{
+ struct xbc_node *node;
+ const char *p, *handler;
+ int ret;
+
+ handler = xbc_node_get_data(hnode);
+
+ xbc_node_for_each_subkey(hnode, node) {
+ p = xbc_node_get_data(node);
+ if (!isdigit(p[0]))
+ continue;
+ /* All digit started node should be instances. */
+ ret = trace_boot_hist_add_one_handler(node, bufp, end, handler, param);
+ if (ret < 0)
+ break;
+ }
+
+ if (xbc_node_find_subkey(hnode, param))
+ ret = trace_boot_hist_add_one_handler(hnode, bufp, end, handler, param);
+
+ return ret;
+}
+
+/*
+ * Histogram boottime tracing syntax.
+ *
+ * ftrace.[instance.INSTANCE.]event.GROUP.EVENT.hist[.N] {
+ * keys = <KEY>[,...]
+ * values = <VAL>[,...]
+ * sort = <SORT-KEY>[,...]
+ * size = <ENTRIES>
+ * name = <HISTNAME>
+ * var { <VAR> = <EXPR> ... }
+ * pause|continue|clear
+ * onmax|onchange[.N] { var = <VAR>; <ACTION> [= <PARAM>] }
+ * onmatch[.N] { event = <EVENT>; <ACTION> [= <PARAM>] }
+ * filter = <FILTER>
+ * }
+ *
+ * Where <ACTION> are;
+ *
+ * trace = <EVENT>, <ARG1>[, ...]
+ * save = <ARG1>[, ...]
+ * snapshot
+ */
+static int __init
+trace_boot_compose_hist_cmd(struct xbc_node *hnode, char *buf, size_t size)
+{
+ struct xbc_node *node, *knode;
+ char *end = buf + size;
+ const char *p;
+ int ret = 0;
+
+ append_printf(&buf, end, "hist");
+
+ ret = trace_boot_hist_add_array(hnode, &buf, end, "keys");
+ if (ret < 0) {
+ if (ret == -ENOENT)
+ pr_err("hist requires keys.\n");
+ return -EINVAL;
+ }
+
+ ret = trace_boot_hist_add_array(hnode, &buf, end, "values");
+ if (ret == -EINVAL)
+ return ret;
+ ret = trace_boot_hist_add_array(hnode, &buf, end, "sort");
+ if (ret == -EINVAL)
+ return ret;
+
+ p = xbc_node_find_value(hnode, "size", NULL);
+ if (p)
+ append_printf(&buf, end, ":size=%s", p);
+
+ p = xbc_node_find_value(hnode, "name", NULL);
+ if (p)
+ append_printf(&buf, end, ":name=%s", p);
+
+ node = xbc_node_find_subkey(hnode, "var");
+ if (node) {
+ xbc_node_for_each_key_value(node, knode, p) {
+ /* Expression must not include spaces. */
+ append_printf(&buf, end, ":%s=",
+ xbc_node_get_data(knode));
+ append_str_nospace(&buf, end, p);
+ }
+ }
+
+ /* Histogram control attributes (mutual exclusive) */
+ if (xbc_node_find_value(hnode, "pause", NULL))
+ append_printf(&buf, end, ":pause");
+ else if (xbc_node_find_value(hnode, "continue", NULL))
+ append_printf(&buf, end, ":continue");
+ else if (xbc_node_find_value(hnode, "clear", NULL))
+ append_printf(&buf, end, ":clear");
+
+ /* Histogram handler and actions */
+ node = xbc_node_find_subkey(hnode, "onmax");
+ if (node && trace_boot_hist_add_handlers(node, &buf, end, "var") < 0)
+ return -EINVAL;
+ node = xbc_node_find_subkey(hnode, "onchange");
+ if (node && trace_boot_hist_add_handlers(node, &buf, end, "var") < 0)
+ return -EINVAL;
+ node = xbc_node_find_subkey(hnode, "onmatch");
+ if (node && trace_boot_hist_add_handlers(node, &buf, end, "event") < 0)
+ return -EINVAL;
+
+ p = xbc_node_find_value(hnode, "filter", NULL);
+ if (p)
+ append_printf(&buf, end, " if %s", p);
+
+ if (buf == end) {
+ pr_err("hist exceeds the max command length.\n");
+ return -E2BIG;
+ }
+
+ return 0;
+}
+
+static void __init
+trace_boot_init_histograms(struct trace_event_file *file,
+ struct xbc_node *hnode, char *buf, size_t size)
+{
+ struct xbc_node *node;
+ const char *p;
+ char *tmp;
+
+ xbc_node_for_each_subkey(hnode, node) {
+ p = xbc_node_get_data(node);
+ if (!isdigit(p[0]))
+ continue;
+ /* All digit started node should be instances. */
+ if (trace_boot_compose_hist_cmd(node, buf, size) == 0) {
+ tmp = kstrdup(buf, GFP_KERNEL);
+ if (!tmp)
+ return;
+ if (trigger_process_regex(file, buf) < 0)
+ pr_err("Failed to apply hist trigger: %s\n", tmp);
+ kfree(tmp);
+ }
+ }
+
+ if (xbc_node_find_subkey(hnode, "keys")) {
+ if (trace_boot_compose_hist_cmd(hnode, buf, size) == 0) {
+ tmp = kstrdup(buf, GFP_KERNEL);
+ if (!tmp)
+ return;
+ if (trigger_process_regex(file, buf) < 0)
+ pr_err("Failed to apply hist trigger: %s\n", tmp);
+ kfree(tmp);
+ }
+ }
+}
+#else
+static void __init
+trace_boot_init_histograms(struct trace_event_file *file,
+ struct xbc_node *hnode, char *buf, size_t size)
+{
+ /* do nothing */
+}
+#endif
+
static void __init
trace_boot_init_one_event(struct trace_array *tr, struct xbc_node *gnode,
struct xbc_node *enode)
@@ -205,12 +492,18 @@ trace_boot_init_one_event(struct trace_array *tr, struct xbc_node *gnode,
pr_err("Failed to apply filter: %s\n", buf);
}
- xbc_node_for_each_array_value(enode, "actions", anode, p) {
- if (strlcpy(buf, p, ARRAY_SIZE(buf)) >= ARRAY_SIZE(buf))
- pr_err("action string is too long: %s\n", p);
- else if (trigger_process_regex(file, buf) < 0)
- pr_err("Failed to apply an action: %s\n", buf);
- }
+ if (IS_ENABLED(CONFIG_HIST_TRIGGERS)) {
+ xbc_node_for_each_array_value(enode, "actions", anode, p) {
+ if (strlcpy(buf, p, ARRAY_SIZE(buf)) >= ARRAY_SIZE(buf))
+ pr_err("action string is too long: %s\n", p);
+ else if (trigger_process_regex(file, buf) < 0)
+ pr_err("Failed to apply an action: %s\n", p);
+ }
+ anode = xbc_node_find_subkey(enode, "hist");
+ if (anode)
+ trace_boot_init_histograms(file, anode, buf, ARRAY_SIZE(buf));
+ } else if (xbc_node_find_value(enode, "actions", NULL))
+ pr_err("Failed to apply event actions because CONFIG_HIST_TRIGGERS is not set.\n");
if (xbc_node_find_value(enode, "enable", NULL)) {
if (trace_event_enable_disable(file, 1, 0) < 0)
@@ -228,18 +521,18 @@ trace_boot_init_events(struct trace_array *tr, struct xbc_node *node)
bool enable, enable_all = false;
const char *data;
- node = xbc_node_find_child(node, "event");
+ node = xbc_node_find_subkey(node, "event");
if (!node)
return;
/* per-event key starts with "event.GROUP.EVENT" */
- xbc_node_for_each_child(node, gnode) {
+ xbc_node_for_each_subkey(node, gnode) {
data = xbc_node_get_data(gnode);
if (!strcmp(data, "enable")) {
enable_all = true;
continue;
}
enable = false;
- xbc_node_for_each_child(gnode, enode) {
+ xbc_node_for_each_subkey(gnode, enode) {
data = xbc_node_get_data(enode);
if (!strcmp(data, "enable")) {
enable = true;
@@ -331,11 +624,11 @@ trace_boot_init_instances(struct xbc_node *node)
struct trace_array *tr;
const char *p;
- node = xbc_node_find_child(node, "instance");
+ node = xbc_node_find_subkey(node, "instance");
if (!node)
return;
- xbc_node_for_each_child(node, inode) {
+ xbc_node_for_each_subkey(node, inode) {
p = xbc_node_get_data(inode);
if (!p || *p == '\0')
continue;
diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c
index e57cc0870892..e34e8182ee4b 100644
--- a/kernel/trace/trace_dynevent.c
+++ b/kernel/trace/trace_dynevent.c
@@ -13,11 +13,49 @@
#include <linux/tracefs.h>
#include "trace.h"
+#include "trace_output.h" /* for trace_event_sem */
#include "trace_dynevent.h"
static DEFINE_MUTEX(dyn_event_ops_mutex);
static LIST_HEAD(dyn_event_ops_list);
+bool trace_event_dyn_try_get_ref(struct trace_event_call *dyn_call)
+{
+ struct trace_event_call *call;
+ bool ret = false;
+
+ if (WARN_ON_ONCE(!(dyn_call->flags & TRACE_EVENT_FL_DYNAMIC)))
+ return false;
+
+ down_read(&trace_event_sem);
+ list_for_each_entry(call, &ftrace_events, list) {
+ if (call == dyn_call) {
+ atomic_inc(&dyn_call->refcnt);
+ ret = true;
+ }
+ }
+ up_read(&trace_event_sem);
+ return ret;
+}
+
+void trace_event_dyn_put_ref(struct trace_event_call *call)
+{
+ if (WARN_ON_ONCE(!(call->flags & TRACE_EVENT_FL_DYNAMIC)))
+ return;
+
+ if (WARN_ON_ONCE(atomic_read(&call->refcnt) <= 0)) {
+ atomic_set(&call->refcnt, 0);
+ return;
+ }
+
+ atomic_dec(&call->refcnt);
+}
+
+bool trace_event_dyn_busy(struct trace_event_call *call)
+{
+ return atomic_read(&call->refcnt) != 0;
+}
+
int dyn_event_register(struct dyn_event_operations *ops)
{
if (!ops || !ops->create || !ops->show || !ops->is_busy ||
@@ -224,7 +262,7 @@ static __init int init_dynamic_event(void)
if (ret)
return 0;
- entry = tracefs_create_file("dynamic_events", 0644, NULL,
+ entry = tracefs_create_file("dynamic_events", TRACE_MODE_WRITE, NULL,
NULL, &dynamic_events_ops);
/* Event list interface */
diff --git a/kernel/trace/trace_dynevent.h b/kernel/trace/trace_dynevent.h
index 7754936b57ee..936477a111d3 100644
--- a/kernel/trace/trace_dynevent.h
+++ b/kernel/trace/trace_dynevent.h
@@ -76,13 +76,15 @@ int dyn_event_init(struct dyn_event *ev, struct dyn_event_operations *ops)
return 0;
}
-static inline int dyn_event_add(struct dyn_event *ev)
+static inline int dyn_event_add(struct dyn_event *ev,
+ struct trace_event_call *call)
{
lockdep_assert_held(&event_mutex);
if (!ev || !ev->ops)
return -EINVAL;
+ call->flags |= TRACE_EVENT_FL_DYNAMIC;
list_add_tail(&ev->list, &dyn_event_list);
return 0;
}
diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c
new file mode 100644
index 000000000000..928867f527e7
--- /dev/null
+++ b/kernel/trace/trace_eprobe.c
@@ -0,0 +1,959 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * event probes
+ *
+ * Part of this code was copied from kernel/trace/trace_kprobe.c written by
+ * Masami Hiramatsu <mhiramat@kernel.org>
+ *
+ * Copyright (C) 2021, VMware Inc, Steven Rostedt <rostedt@goodmis.org>
+ * Copyright (C) 2021, VMware Inc, Tzvetomir Stoyanov tz.stoyanov@gmail.com>
+ *
+ */
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/ftrace.h>
+
+#include "trace_dynevent.h"
+#include "trace_probe.h"
+#include "trace_probe_tmpl.h"
+
+#define EPROBE_EVENT_SYSTEM "eprobes"
+
+struct trace_eprobe {
+ /* tracepoint system */
+ const char *event_system;
+
+ /* tracepoint event */
+ const char *event_name;
+
+ struct trace_event_call *event;
+
+ struct dyn_event devent;
+ struct trace_probe tp;
+};
+
+struct eprobe_data {
+ struct trace_event_file *file;
+ struct trace_eprobe *ep;
+};
+
+static int __trace_eprobe_create(int argc, const char *argv[]);
+
+static void trace_event_probe_cleanup(struct trace_eprobe *ep)
+{
+ if (!ep)
+ return;
+ trace_probe_cleanup(&ep->tp);
+ kfree(ep->event_name);
+ kfree(ep->event_system);
+ if (ep->event)
+ trace_event_put_ref(ep->event);
+ kfree(ep);
+}
+
+static struct trace_eprobe *to_trace_eprobe(struct dyn_event *ev)
+{
+ return container_of(ev, struct trace_eprobe, devent);
+}
+
+static int eprobe_dyn_event_create(const char *raw_command)
+{
+ return trace_probe_create(raw_command, __trace_eprobe_create);
+}
+
+static int eprobe_dyn_event_show(struct seq_file *m, struct dyn_event *ev)
+{
+ struct trace_eprobe *ep = to_trace_eprobe(ev);
+ int i;
+
+ seq_printf(m, "e:%s/%s", trace_probe_group_name(&ep->tp),
+ trace_probe_name(&ep->tp));
+ seq_printf(m, " %s.%s", ep->event_system, ep->event_name);
+
+ for (i = 0; i < ep->tp.nr_args; i++)
+ seq_printf(m, " %s=%s", ep->tp.args[i].name, ep->tp.args[i].comm);
+ seq_putc(m, '\n');
+
+ return 0;
+}
+
+static int unregister_trace_eprobe(struct trace_eprobe *ep)
+{
+ /* If other probes are on the event, just unregister eprobe */
+ if (trace_probe_has_sibling(&ep->tp))
+ goto unreg;
+
+ /* Enabled event can not be unregistered */
+ if (trace_probe_is_enabled(&ep->tp))
+ return -EBUSY;
+
+ /* Will fail if probe is being used by ftrace or perf */
+ if (trace_probe_unregister_event_call(&ep->tp))
+ return -EBUSY;
+
+unreg:
+ dyn_event_remove(&ep->devent);
+ trace_probe_unlink(&ep->tp);
+
+ return 0;
+}
+
+static int eprobe_dyn_event_release(struct dyn_event *ev)
+{
+ struct trace_eprobe *ep = to_trace_eprobe(ev);
+ int ret = unregister_trace_eprobe(ep);
+
+ if (!ret)
+ trace_event_probe_cleanup(ep);
+ return ret;
+}
+
+static bool eprobe_dyn_event_is_busy(struct dyn_event *ev)
+{
+ struct trace_eprobe *ep = to_trace_eprobe(ev);
+
+ return trace_probe_is_enabled(&ep->tp);
+}
+
+static bool eprobe_dyn_event_match(const char *system, const char *event,
+ int argc, const char **argv, struct dyn_event *ev)
+{
+ struct trace_eprobe *ep = to_trace_eprobe(ev);
+ const char *slash;
+
+ /*
+ * We match the following:
+ * event only - match all eprobes with event name
+ * system and event only - match all system/event probes
+ *
+ * The below has the above satisfied with more arguments:
+ *
+ * attached system/event - If the arg has the system and event
+ * the probe is attached to, match
+ * probes with the attachment.
+ *
+ * If any more args are given, then it requires a full match.
+ */
+
+ /*
+ * If system exists, but this probe is not part of that system
+ * do not match.
+ */
+ if (system && strcmp(trace_probe_group_name(&ep->tp), system) != 0)
+ return false;
+
+ /* Must match the event name */
+ if (strcmp(trace_probe_name(&ep->tp), event) != 0)
+ return false;
+
+ /* No arguments match all */
+ if (argc < 1)
+ return true;
+
+ /* First argument is the system/event the probe is attached to */
+
+ slash = strchr(argv[0], '/');
+ if (!slash)
+ slash = strchr(argv[0], '.');
+ if (!slash)
+ return false;
+
+ if (strncmp(ep->event_system, argv[0], slash - argv[0]))
+ return false;
+ if (strcmp(ep->event_name, slash + 1))
+ return false;
+
+ argc--;
+ argv++;
+
+ /* If there are no other args, then match */
+ if (argc < 1)
+ return true;
+
+ return trace_probe_match_command_args(&ep->tp, argc, argv);
+}
+
+static struct dyn_event_operations eprobe_dyn_event_ops = {
+ .create = eprobe_dyn_event_create,
+ .show = eprobe_dyn_event_show,
+ .is_busy = eprobe_dyn_event_is_busy,
+ .free = eprobe_dyn_event_release,
+ .match = eprobe_dyn_event_match,
+};
+
+static struct trace_eprobe *alloc_event_probe(const char *group,
+ const char *this_event,
+ struct trace_event_call *event,
+ int nargs)
+{
+ struct trace_eprobe *ep;
+ const char *event_name;
+ const char *sys_name;
+ int ret = -ENOMEM;
+
+ if (!event)
+ return ERR_PTR(-ENODEV);
+
+ sys_name = event->class->system;
+ event_name = trace_event_name(event);
+
+ ep = kzalloc(struct_size(ep, tp.args, nargs), GFP_KERNEL);
+ if (!ep) {
+ trace_event_put_ref(event);
+ goto error;
+ }
+ ep->event = event;
+ ep->event_name = kstrdup(event_name, GFP_KERNEL);
+ if (!ep->event_name)
+ goto error;
+ ep->event_system = kstrdup(sys_name, GFP_KERNEL);
+ if (!ep->event_system)
+ goto error;
+
+ ret = trace_probe_init(&ep->tp, this_event, group, false);
+ if (ret < 0)
+ goto error;
+
+ dyn_event_init(&ep->devent, &eprobe_dyn_event_ops);
+ return ep;
+error:
+ trace_event_probe_cleanup(ep);
+ return ERR_PTR(ret);
+}
+
+static int trace_eprobe_tp_arg_update(struct trace_eprobe *ep, int i)
+{
+ struct probe_arg *parg = &ep->tp.args[i];
+ struct ftrace_event_field *field;
+ struct list_head *head;
+
+ head = trace_get_fields(ep->event);
+ list_for_each_entry(field, head, link) {
+ if (!strcmp(parg->code->data, field->name)) {
+ kfree(parg->code->data);
+ parg->code->data = field;
+ return 0;
+ }
+ }
+ kfree(parg->code->data);
+ parg->code->data = NULL;
+ return -ENOENT;
+}
+
+static int eprobe_event_define_fields(struct trace_event_call *event_call)
+{
+ int ret;
+ struct eprobe_trace_entry_head field;
+ struct trace_probe *tp;
+
+ tp = trace_probe_primary_from_call(event_call);
+ if (WARN_ON_ONCE(!tp))
+ return -ENOENT;
+
+ DEFINE_FIELD(unsigned int, type, FIELD_STRING_TYPE, 0);
+
+ return traceprobe_define_arg_fields(event_call, sizeof(field), tp);
+}
+
+static struct trace_event_fields eprobe_fields_array[] = {
+ { .type = TRACE_FUNCTION_TYPE,
+ .define_fields = eprobe_event_define_fields },
+ {}
+};
+
+/* Event entry printers */
+static enum print_line_t
+print_eprobe_event(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ struct eprobe_trace_entry_head *field;
+ struct trace_event_call *pevent;
+ struct trace_event *probed_event;
+ struct trace_seq *s = &iter->seq;
+ struct trace_probe *tp;
+
+ field = (struct eprobe_trace_entry_head *)iter->ent;
+ tp = trace_probe_primary_from_call(
+ container_of(event, struct trace_event_call, event));
+ if (WARN_ON_ONCE(!tp))
+ goto out;
+
+ trace_seq_printf(s, "%s: (", trace_probe_name(tp));
+
+ probed_event = ftrace_find_event(field->type);
+ if (probed_event) {
+ pevent = container_of(probed_event, struct trace_event_call, event);
+ trace_seq_printf(s, "%s.%s", pevent->class->system,
+ trace_event_name(pevent));
+ } else {
+ trace_seq_printf(s, "%u", field->type);
+ }
+
+ trace_seq_putc(s, ')');
+
+ if (print_probe_args(s, tp->args, tp->nr_args,
+ (u8 *)&field[1], field) < 0)
+ goto out;
+
+ trace_seq_putc(s, '\n');
+ out:
+ return trace_handle_return(s);
+}
+
+static unsigned long get_event_field(struct fetch_insn *code, void *rec)
+{
+ struct ftrace_event_field *field = code->data;
+ unsigned long val;
+ void *addr;
+
+ addr = rec + field->offset;
+
+ switch (field->size) {
+ case 1:
+ if (field->is_signed)
+ val = *(char *)addr;
+ else
+ val = *(unsigned char *)addr;
+ break;
+ case 2:
+ if (field->is_signed)
+ val = *(short *)addr;
+ else
+ val = *(unsigned short *)addr;
+ break;
+ case 4:
+ if (field->is_signed)
+ val = *(int *)addr;
+ else
+ val = *(unsigned int *)addr;
+ break;
+ default:
+ if (field->is_signed)
+ val = *(long *)addr;
+ else
+ val = *(unsigned long *)addr;
+ break;
+ }
+ return val;
+}
+
+static int get_eprobe_size(struct trace_probe *tp, void *rec)
+{
+ struct probe_arg *arg;
+ int i, len, ret = 0;
+
+ for (i = 0; i < tp->nr_args; i++) {
+ arg = tp->args + i;
+ if (unlikely(arg->dynamic)) {
+ unsigned long val;
+
+ val = get_event_field(arg->code, rec);
+ len = process_fetch_insn_bottom(arg->code + 1, val, NULL, NULL);
+ if (len > 0)
+ ret += len;
+ }
+ }
+
+ return ret;
+}
+
+/* Kprobe specific fetch functions */
+
+/* Note that we don't verify it, since the code does not come from user space */
+static int
+process_fetch_insn(struct fetch_insn *code, void *rec, void *dest,
+ void *base)
+{
+ unsigned long val;
+
+ val = get_event_field(code, rec);
+ return process_fetch_insn_bottom(code + 1, val, dest, base);
+}
+NOKPROBE_SYMBOL(process_fetch_insn)
+
+/* Return the length of string -- including null terminal byte */
+static nokprobe_inline int
+fetch_store_strlen_user(unsigned long addr)
+{
+ const void __user *uaddr = (__force const void __user *)addr;
+
+ return strnlen_user_nofault(uaddr, MAX_STRING_SIZE);
+}
+
+/* Return the length of string -- including null terminal byte */
+static nokprobe_inline int
+fetch_store_strlen(unsigned long addr)
+{
+ int ret, len = 0;
+ u8 c;
+
+#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
+ if (addr < TASK_SIZE)
+ return fetch_store_strlen_user(addr);
+#endif
+
+ do {
+ ret = copy_from_kernel_nofault(&c, (u8 *)addr + len, 1);
+ len++;
+ } while (c && ret == 0 && len < MAX_STRING_SIZE);
+
+ return (ret < 0) ? ret : len;
+}
+
+/*
+ * Fetch a null-terminated string from user. Caller MUST set *(u32 *)buf
+ * with max length and relative data location.
+ */
+static nokprobe_inline int
+fetch_store_string_user(unsigned long addr, void *dest, void *base)
+{
+ const void __user *uaddr = (__force const void __user *)addr;
+ int maxlen = get_loc_len(*(u32 *)dest);
+ void *__dest;
+ long ret;
+
+ if (unlikely(!maxlen))
+ return -ENOMEM;
+
+ __dest = get_loc_data(dest, base);
+
+ ret = strncpy_from_user_nofault(__dest, uaddr, maxlen);
+ if (ret >= 0)
+ *(u32 *)dest = make_data_loc(ret, __dest - base);
+
+ return ret;
+}
+
+/*
+ * Fetch a null-terminated string. Caller MUST set *(u32 *)buf with max
+ * length and relative data location.
+ */
+static nokprobe_inline int
+fetch_store_string(unsigned long addr, void *dest, void *base)
+{
+ int maxlen = get_loc_len(*(u32 *)dest);
+ void *__dest;
+ long ret;
+
+#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
+ if ((unsigned long)addr < TASK_SIZE)
+ return fetch_store_string_user(addr, dest, base);
+#endif
+
+ if (unlikely(!maxlen))
+ return -ENOMEM;
+
+ __dest = get_loc_data(dest, base);
+
+ /*
+ * Try to get string again, since the string can be changed while
+ * probing.
+ */
+ ret = strncpy_from_kernel_nofault(__dest, (void *)addr, maxlen);
+ if (ret >= 0)
+ *(u32 *)dest = make_data_loc(ret, __dest - base);
+
+ return ret;
+}
+
+static nokprobe_inline int
+probe_mem_read_user(void *dest, void *src, size_t size)
+{
+ const void __user *uaddr = (__force const void __user *)src;
+
+ return copy_from_user_nofault(dest, uaddr, size);
+}
+
+static nokprobe_inline int
+probe_mem_read(void *dest, void *src, size_t size)
+{
+#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
+ if ((unsigned long)src < TASK_SIZE)
+ return probe_mem_read_user(dest, src, size);
+#endif
+ return copy_from_kernel_nofault(dest, src, size);
+}
+
+/* eprobe handler */
+static inline void
+__eprobe_trace_func(struct eprobe_data *edata, void *rec)
+{
+ struct eprobe_trace_entry_head *entry;
+ struct trace_event_call *call = trace_probe_event_call(&edata->ep->tp);
+ struct trace_event_buffer fbuffer;
+ int dsize;
+
+ if (WARN_ON_ONCE(call != edata->file->event_call))
+ return;
+
+ if (trace_trigger_soft_disabled(edata->file))
+ return;
+
+ fbuffer.trace_ctx = tracing_gen_ctx();
+ fbuffer.trace_file = edata->file;
+
+ dsize = get_eprobe_size(&edata->ep->tp, rec);
+ fbuffer.regs = NULL;
+
+ fbuffer.event =
+ trace_event_buffer_lock_reserve(&fbuffer.buffer, edata->file,
+ call->event.type,
+ sizeof(*entry) + edata->ep->tp.size + dsize,
+ fbuffer.trace_ctx);
+ if (!fbuffer.event)
+ return;
+
+ entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event);
+ if (edata->ep->event)
+ entry->type = edata->ep->event->event.type;
+ else
+ entry->type = 0;
+ store_trace_args(&entry[1], &edata->ep->tp, rec, sizeof(*entry), dsize);
+
+ trace_event_buffer_commit(&fbuffer);
+}
+
+/*
+ * The event probe implementation uses event triggers to get access to
+ * the event it is attached to, but is not an actual trigger. The below
+ * functions are just stubs to fulfill what is needed to use the trigger
+ * infrastructure.
+ */
+static int eprobe_trigger_init(struct event_trigger_ops *ops,
+ struct event_trigger_data *data)
+{
+ return 0;
+}
+
+static void eprobe_trigger_free(struct event_trigger_ops *ops,
+ struct event_trigger_data *data)
+{
+
+}
+
+static int eprobe_trigger_print(struct seq_file *m,
+ struct event_trigger_ops *ops,
+ struct event_trigger_data *data)
+{
+ /* Do not print eprobe event triggers */
+ return 0;
+}
+
+static void eprobe_trigger_func(struct event_trigger_data *data,
+ struct trace_buffer *buffer, void *rec,
+ struct ring_buffer_event *rbe)
+{
+ struct eprobe_data *edata = data->private_data;
+
+ __eprobe_trace_func(edata, rec);
+}
+
+static struct event_trigger_ops eprobe_trigger_ops = {
+ .func = eprobe_trigger_func,
+ .print = eprobe_trigger_print,
+ .init = eprobe_trigger_init,
+ .free = eprobe_trigger_free,
+};
+
+static int eprobe_trigger_cmd_func(struct event_command *cmd_ops,
+ struct trace_event_file *file,
+ char *glob, char *cmd, char *param)
+{
+ return -1;
+}
+
+static int eprobe_trigger_reg_func(char *glob, struct event_trigger_ops *ops,
+ struct event_trigger_data *data,
+ struct trace_event_file *file)
+{
+ return -1;
+}
+
+static void eprobe_trigger_unreg_func(char *glob, struct event_trigger_ops *ops,
+ struct event_trigger_data *data,
+ struct trace_event_file *file)
+{
+
+}
+
+static struct event_trigger_ops *eprobe_trigger_get_ops(char *cmd,
+ char *param)
+{
+ return &eprobe_trigger_ops;
+}
+
+static struct event_command event_trigger_cmd = {
+ .name = "eprobe",
+ .trigger_type = ETT_EVENT_EPROBE,
+ .flags = EVENT_CMD_FL_NEEDS_REC,
+ .func = eprobe_trigger_cmd_func,
+ .reg = eprobe_trigger_reg_func,
+ .unreg = eprobe_trigger_unreg_func,
+ .unreg_all = NULL,
+ .get_trigger_ops = eprobe_trigger_get_ops,
+ .set_filter = NULL,
+};
+
+static struct event_trigger_data *
+new_eprobe_trigger(struct trace_eprobe *ep, struct trace_event_file *file)
+{
+ struct event_trigger_data *trigger;
+ struct eprobe_data *edata;
+
+ edata = kzalloc(sizeof(*edata), GFP_KERNEL);
+ trigger = kzalloc(sizeof(*trigger), GFP_KERNEL);
+ if (!trigger || !edata) {
+ kfree(edata);
+ kfree(trigger);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ trigger->flags = EVENT_TRIGGER_FL_PROBE;
+ trigger->count = -1;
+ trigger->ops = &eprobe_trigger_ops;
+
+ /*
+ * EVENT PROBE triggers are not registered as commands with
+ * register_event_command(), as they are not controlled by the user
+ * from the trigger file
+ */
+ trigger->cmd_ops = &event_trigger_cmd;
+
+ INIT_LIST_HEAD(&trigger->list);
+ RCU_INIT_POINTER(trigger->filter, NULL);
+
+ edata->file = file;
+ edata->ep = ep;
+ trigger->private_data = edata;
+
+ return trigger;
+}
+
+static int enable_eprobe(struct trace_eprobe *ep,
+ struct trace_event_file *eprobe_file)
+{
+ struct event_trigger_data *trigger;
+ struct trace_event_file *file;
+ struct trace_array *tr = eprobe_file->tr;
+
+ file = find_event_file(tr, ep->event_system, ep->event_name);
+ if (!file)
+ return -ENOENT;
+ trigger = new_eprobe_trigger(ep, eprobe_file);
+ if (IS_ERR(trigger))
+ return PTR_ERR(trigger);
+
+ list_add_tail_rcu(&trigger->list, &file->triggers);
+
+ trace_event_trigger_enable_disable(file, 1);
+ update_cond_flag(file);
+
+ return 0;
+}
+
+static struct trace_event_functions eprobe_funcs = {
+ .trace = print_eprobe_event
+};
+
+static int disable_eprobe(struct trace_eprobe *ep,
+ struct trace_array *tr)
+{
+ struct event_trigger_data *trigger;
+ struct trace_event_file *file;
+ struct eprobe_data *edata;
+
+ file = find_event_file(tr, ep->event_system, ep->event_name);
+ if (!file)
+ return -ENOENT;
+
+ list_for_each_entry(trigger, &file->triggers, list) {
+ if (!(trigger->flags & EVENT_TRIGGER_FL_PROBE))
+ continue;
+ edata = trigger->private_data;
+ if (edata->ep == ep)
+ break;
+ }
+ if (list_entry_is_head(trigger, &file->triggers, list))
+ return -ENODEV;
+
+ list_del_rcu(&trigger->list);
+
+ trace_event_trigger_enable_disable(file, 0);
+ update_cond_flag(file);
+
+ /* Make sure nothing is using the edata or trigger */
+ tracepoint_synchronize_unregister();
+
+ kfree(edata);
+ kfree(trigger);
+
+ return 0;
+}
+
+static int enable_trace_eprobe(struct trace_event_call *call,
+ struct trace_event_file *file)
+{
+ struct trace_probe *pos, *tp;
+ struct trace_eprobe *ep;
+ bool enabled;
+ int ret = 0;
+
+ tp = trace_probe_primary_from_call(call);
+ if (WARN_ON_ONCE(!tp))
+ return -ENODEV;
+ enabled = trace_probe_is_enabled(tp);
+
+ /* This also changes "enabled" state */
+ if (file) {
+ ret = trace_probe_add_file(tp, file);
+ if (ret)
+ return ret;
+ } else
+ trace_probe_set_flag(tp, TP_FLAG_PROFILE);
+
+ if (enabled)
+ return 0;
+
+ list_for_each_entry(pos, trace_probe_probe_list(tp), list) {
+ ep = container_of(pos, struct trace_eprobe, tp);
+ ret = enable_eprobe(ep, file);
+ if (ret)
+ break;
+ enabled = true;
+ }
+
+ if (ret) {
+ /* Failed to enable one of them. Roll back all */
+ if (enabled)
+ disable_eprobe(ep, file->tr);
+ if (file)
+ trace_probe_remove_file(tp, file);
+ else
+ trace_probe_clear_flag(tp, TP_FLAG_PROFILE);
+ }
+
+ return ret;
+}
+
+static int disable_trace_eprobe(struct trace_event_call *call,
+ struct trace_event_file *file)
+{
+ struct trace_probe *pos, *tp;
+ struct trace_eprobe *ep;
+
+ tp = trace_probe_primary_from_call(call);
+ if (WARN_ON_ONCE(!tp))
+ return -ENODEV;
+
+ if (file) {
+ if (!trace_probe_get_file_link(tp, file))
+ return -ENOENT;
+ if (!trace_probe_has_single_file(tp))
+ goto out;
+ trace_probe_clear_flag(tp, TP_FLAG_TRACE);
+ } else
+ trace_probe_clear_flag(tp, TP_FLAG_PROFILE);
+
+ if (!trace_probe_is_enabled(tp)) {
+ list_for_each_entry(pos, trace_probe_probe_list(tp), list) {
+ ep = container_of(pos, struct trace_eprobe, tp);
+ disable_eprobe(ep, file->tr);
+ }
+ }
+
+ out:
+ if (file)
+ /*
+ * Synchronization is done in below function. For perf event,
+ * file == NULL and perf_trace_event_unreg() calls
+ * tracepoint_synchronize_unregister() to ensure synchronize
+ * event. We don't need to care about it.
+ */
+ trace_probe_remove_file(tp, file);
+
+ return 0;
+}
+
+static int eprobe_register(struct trace_event_call *event,
+ enum trace_reg type, void *data)
+{
+ struct trace_event_file *file = data;
+
+ switch (type) {
+ case TRACE_REG_REGISTER:
+ return enable_trace_eprobe(event, file);
+ case TRACE_REG_UNREGISTER:
+ return disable_trace_eprobe(event, file);
+#ifdef CONFIG_PERF_EVENTS
+ case TRACE_REG_PERF_REGISTER:
+ case TRACE_REG_PERF_UNREGISTER:
+ case TRACE_REG_PERF_OPEN:
+ case TRACE_REG_PERF_CLOSE:
+ case TRACE_REG_PERF_ADD:
+ case TRACE_REG_PERF_DEL:
+ return 0;
+#endif
+ }
+ return 0;
+}
+
+static inline void init_trace_eprobe_call(struct trace_eprobe *ep)
+{
+ struct trace_event_call *call = trace_probe_event_call(&ep->tp);
+
+ call->flags = TRACE_EVENT_FL_EPROBE;
+ call->event.funcs = &eprobe_funcs;
+ call->class->fields_array = eprobe_fields_array;
+ call->class->reg = eprobe_register;
+}
+
+static struct trace_event_call *
+find_and_get_event(const char *system, const char *event_name)
+{
+ struct trace_event_call *tp_event;
+ const char *name;
+
+ list_for_each_entry(tp_event, &ftrace_events, list) {
+ /* Skip other probes and ftrace events */
+ if (tp_event->flags &
+ (TRACE_EVENT_FL_IGNORE_ENABLE |
+ TRACE_EVENT_FL_KPROBE |
+ TRACE_EVENT_FL_UPROBE |
+ TRACE_EVENT_FL_EPROBE))
+ continue;
+ if (!tp_event->class->system ||
+ strcmp(system, tp_event->class->system))
+ continue;
+ name = trace_event_name(tp_event);
+ if (!name || strcmp(event_name, name))
+ continue;
+ if (!trace_event_try_get_ref(tp_event)) {
+ return NULL;
+ break;
+ }
+ return tp_event;
+ break;
+ }
+ return NULL;
+}
+
+static int trace_eprobe_tp_update_arg(struct trace_eprobe *ep, const char *argv[], int i)
+{
+ unsigned int flags = TPARG_FL_KERNEL | TPARG_FL_TPOINT;
+ int ret;
+
+ ret = traceprobe_parse_probe_arg(&ep->tp, i, argv[i], flags);
+ if (ret)
+ return ret;
+
+ if (ep->tp.args[i].code->op == FETCH_OP_TP_ARG)
+ ret = trace_eprobe_tp_arg_update(ep, i);
+
+ return ret;
+}
+
+static int __trace_eprobe_create(int argc, const char *argv[])
+{
+ /*
+ * Argument syntax:
+ * e[:[GRP/]ENAME] SYSTEM.EVENT [FETCHARGS]
+ * Fetch args:
+ * <name>=$<field>[:TYPE]
+ */
+ const char *event = NULL, *group = EPROBE_EVENT_SYSTEM;
+ const char *sys_event = NULL, *sys_name = NULL;
+ struct trace_event_call *event_call;
+ struct trace_eprobe *ep = NULL;
+ char buf1[MAX_EVENT_NAME_LEN];
+ char buf2[MAX_EVENT_NAME_LEN];
+ int ret = 0;
+ int i;
+
+ if (argc < 2 || argv[0][0] != 'e')
+ return -ECANCELED;
+
+ trace_probe_log_init("event_probe", argc, argv);
+
+ event = strchr(&argv[0][1], ':');
+ if (event) {
+ event++;
+ ret = traceprobe_parse_event_name(&event, &group, buf1,
+ event - argv[0]);
+ if (ret)
+ goto parse_error;
+ } else {
+ strscpy(buf1, argv[1], MAX_EVENT_NAME_LEN);
+ sanitize_event_name(buf1);
+ event = buf1;
+ }
+ if (!is_good_name(event) || !is_good_name(group))
+ goto parse_error;
+
+ sys_event = argv[1];
+ ret = traceprobe_parse_event_name(&sys_event, &sys_name, buf2,
+ sys_event - argv[1]);
+ if (ret || !sys_name)
+ goto parse_error;
+ if (!is_good_name(sys_event) || !is_good_name(sys_name))
+ goto parse_error;
+
+ mutex_lock(&event_mutex);
+ event_call = find_and_get_event(sys_name, sys_event);
+ ep = alloc_event_probe(group, event, event_call, argc - 2);
+ mutex_unlock(&event_mutex);
+
+ if (IS_ERR(ep)) {
+ ret = PTR_ERR(ep);
+ /* This must return -ENOMEM or missing event, else there is a bug */
+ WARN_ON_ONCE(ret != -ENOMEM && ret != -ENODEV);
+ ep = NULL;
+ goto error;
+ }
+
+ argc -= 2; argv += 2;
+ /* parse arguments */
+ for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
+ trace_probe_log_set_index(i + 2);
+ ret = trace_eprobe_tp_update_arg(ep, argv, i);
+ if (ret)
+ goto error;
+ }
+ ret = traceprobe_set_print_fmt(&ep->tp, PROBE_PRINT_EVENT);
+ if (ret < 0)
+ goto error;
+ init_trace_eprobe_call(ep);
+ mutex_lock(&event_mutex);
+ ret = trace_probe_register_event_call(&ep->tp);
+ if (ret) {
+ if (ret == -EEXIST) {
+ trace_probe_log_set_index(0);
+ trace_probe_log_err(0, EVENT_EXIST);
+ }
+ mutex_unlock(&event_mutex);
+ goto error;
+ }
+ ret = dyn_event_add(&ep->devent, &ep->tp.event->call);
+ mutex_unlock(&event_mutex);
+ return ret;
+parse_error:
+ ret = -EINVAL;
+error:
+ trace_event_probe_cleanup(ep);
+ return ret;
+}
+
+/*
+ * Register dynevent at core_initcall. This allows kernel to setup eprobe
+ * events in postcore_initcall without tracefs.
+ */
+static __init int trace_events_eprobe_init_early(void)
+{
+ int err = 0;
+
+ err = dyn_event_register(&eprobe_dyn_event_ops);
+ if (err)
+ pr_warn("Could not register eprobe_dyn_event_ops\n");
+
+ return err;
+}
+core_initcall(trace_events_eprobe_init_early);
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 03be4435d103..a114549720d6 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -177,7 +177,7 @@ static void perf_trace_event_unreg(struct perf_event *p_event)
}
}
out:
- module_put(tp_event->mod);
+ trace_event_put_ref(tp_event);
}
static int perf_trace_event_open(struct perf_event *p_event)
@@ -224,10 +224,10 @@ int perf_trace_init(struct perf_event *p_event)
list_for_each_entry(tp_event, &ftrace_events, list) {
if (tp_event->event.type == event_id &&
tp_event->class && tp_event->class->reg &&
- try_module_get(tp_event->mod)) {
+ trace_event_try_get_ref(tp_event)) {
ret = perf_trace_event_init(tp_event, p_event);
if (ret)
- module_put(tp_event->mod);
+ trace_event_put_ref(tp_event);
break;
}
}
@@ -400,7 +400,8 @@ void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp)
BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long));
if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
- "perf buffer not large enough"))
+ "perf buffer not large enough, wanted %d, have %d",
+ size, PERF_MAX_TRACE_SIZE))
return NULL;
*rctxp = rctx = perf_swevent_get_recursion_context();
@@ -441,13 +442,13 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
if (!rcu_is_watching())
return;
- if ((unsigned long)ops->private != smp_processor_id())
- return;
-
bit = ftrace_test_recursion_trylock(ip, parent_ip);
if (bit < 0)
return;
+ if ((unsigned long)ops->private != smp_processor_id())
+ goto out;
+
event = container_of(ops, struct perf_event, ftrace_ops);
/*
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 80e96989770e..92be9cb1d7d4 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -181,6 +181,7 @@ static int trace_define_common_fields(void)
__common_field(unsigned short, type);
__common_field(unsigned char, flags);
+ /* Holds both preempt_count and migrate_disable */
__common_field(unsigned char, preempt_count);
__common_field(int, pid);
@@ -884,10 +885,10 @@ static void __ftrace_clear_event_pids(struct trace_array *tr, int type)
tracepoint_synchronize_unregister();
if ((type & TRACE_PIDS) && pid_list)
- trace_free_pid_list(pid_list);
+ trace_pid_list_free(pid_list);
if ((type & TRACE_NO_PIDS) && no_pid_list)
- trace_free_pid_list(no_pid_list);
+ trace_pid_list_free(no_pid_list);
}
static void ftrace_clear_event_pids(struct trace_array *tr, int type)
@@ -1966,7 +1967,7 @@ event_pid_write(struct file *filp, const char __user *ubuf,
if (filtered_pids) {
tracepoint_synchronize_unregister();
- trace_free_pid_list(filtered_pids);
+ trace_pid_list_free(filtered_pids);
} else if (pid_list && !other_pids) {
register_pid_events(tr);
}
@@ -2311,7 +2312,8 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
/* the ftrace system is special, do not create enable or filter files */
if (strcmp(name, "ftrace") != 0) {
- entry = tracefs_create_file("filter", 0644, dir->entry, dir,
+ entry = tracefs_create_file("filter", TRACE_MODE_WRITE,
+ dir->entry, dir,
&ftrace_subsystem_filter_fops);
if (!entry) {
kfree(system->filter);
@@ -2319,7 +2321,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
pr_warn("Could not create tracefs '%s/filter' entry\n", name);
}
- trace_create_file("enable", 0644, dir->entry, dir,
+ trace_create_file("enable", TRACE_MODE_WRITE, dir->entry, dir,
&ftrace_system_enable_fops);
}
@@ -2401,12 +2403,12 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file)
}
if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE))
- trace_create_file("enable", 0644, file->dir, file,
+ trace_create_file("enable", TRACE_MODE_WRITE, file->dir, file,
&ftrace_enable_fops);
#ifdef CONFIG_PERF_EVENTS
if (call->event.type && call->class->reg)
- trace_create_file("id", 0444, file->dir,
+ trace_create_file("id", TRACE_MODE_READ, file->dir,
(void *)(long)call->event.type,
&ftrace_event_id_fops);
#endif
@@ -2422,22 +2424,22 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file)
* triggers or filters.
*/
if (!(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) {
- trace_create_file("filter", 0644, file->dir, file,
- &ftrace_event_filter_fops);
+ trace_create_file("filter", TRACE_MODE_WRITE, file->dir,
+ file, &ftrace_event_filter_fops);
- trace_create_file("trigger", 0644, file->dir, file,
- &event_trigger_fops);
+ trace_create_file("trigger", TRACE_MODE_WRITE, file->dir,
+ file, &event_trigger_fops);
}
#ifdef CONFIG_HIST_TRIGGERS
- trace_create_file("hist", 0444, file->dir, file,
+ trace_create_file("hist", TRACE_MODE_READ, file->dir, file,
&event_hist_fops);
#endif
#ifdef CONFIG_HIST_TRIGGERS_DEBUG
- trace_create_file("hist_debug", 0444, file->dir, file,
+ trace_create_file("hist_debug", TRACE_MODE_READ, file->dir, file,
&event_hist_debug_fops);
#endif
- trace_create_file("format", 0444, file->dir, call,
+ trace_create_file("format", TRACE_MODE_READ, file->dir, call,
&ftrace_event_format_fops);
#ifdef CONFIG_TRACE_EVENT_INJECT
@@ -2525,7 +2527,10 @@ __register_event(struct trace_event_call *call, struct module *mod)
return ret;
list_add(&call->list, &ftrace_events);
- call->mod = mod;
+ if (call->flags & TRACE_EVENT_FL_DYNAMIC)
+ atomic_set(&call->refcnt, 0);
+ else
+ call->module = mod;
return 0;
}
@@ -2673,12 +2678,24 @@ static struct trace_event_file *
trace_create_new_event(struct trace_event_call *call,
struct trace_array *tr)
{
+ struct trace_pid_list *no_pid_list;
+ struct trace_pid_list *pid_list;
struct trace_event_file *file;
+ unsigned int first;
file = kmem_cache_alloc(file_cachep, GFP_TRACE);
if (!file)
return NULL;
+ pid_list = rcu_dereference_protected(tr->filtered_pids,
+ lockdep_is_held(&event_mutex));
+ no_pid_list = rcu_dereference_protected(tr->filtered_no_pids,
+ lockdep_is_held(&event_mutex));
+
+ if (!trace_pid_list_first(pid_list, &first) ||
+ !trace_pid_list_first(no_pid_list, &first))
+ file->flags |= EVENT_FILE_FL_PID_FILTER;
+
file->event_call = call;
file->tr = tr;
atomic_set(&file->sm_ref, 0);
@@ -2839,7 +2856,9 @@ static void trace_module_remove_events(struct module *mod)
down_write(&trace_event_sem);
list_for_each_entry_safe(call, p, &ftrace_events, list) {
- if (call->mod == mod)
+ if ((call->flags & TRACE_EVENT_FL_DYNAMIC) || !call->module)
+ continue;
+ if (call->module == mod)
__trace_remove_event_call(call);
}
up_write(&trace_event_sem);
@@ -2982,7 +3001,7 @@ struct trace_event_file *trace_get_event_file(const char *instance,
}
/* Don't let event modules unload while in use */
- ret = try_module_get(file->event_call->mod);
+ ret = trace_event_try_get_ref(file->event_call);
if (!ret) {
trace_array_put(tr);
ret = -EBUSY;
@@ -3012,7 +3031,7 @@ EXPORT_SYMBOL_GPL(trace_get_event_file);
void trace_put_event_file(struct trace_event_file *file)
{
mutex_lock(&event_mutex);
- module_put(file->event_call->mod);
+ trace_event_put_ref(file->event_call);
mutex_unlock(&event_mutex);
trace_array_put(file->tr);
@@ -3147,7 +3166,7 @@ static int free_probe_data(void *data)
if (!edata->ref) {
/* Remove the SOFT_MODE flag */
__ftrace_event_enable_disable(edata->file, 0, 1);
- module_put(edata->file->event_call->mod);
+ trace_event_put_ref(edata->file->event_call);
kfree(edata);
}
return 0;
@@ -3280,7 +3299,7 @@ event_enable_func(struct trace_array *tr, struct ftrace_hash *hash,
out_reg:
/* Don't let event modules unload while probe registered */
- ret = try_module_get(file->event_call->mod);
+ ret = trace_event_try_get_ref(file->event_call);
if (!ret) {
ret = -EBUSY;
goto out_free;
@@ -3310,7 +3329,7 @@ event_enable_func(struct trace_array *tr, struct ftrace_hash *hash,
out_disable:
__ftrace_event_enable_disable(file, 0, 1);
out_put:
- module_put(file->event_call->mod);
+ trace_event_put_ref(file->event_call);
out_free:
kfree(data);
goto out;
@@ -3376,7 +3395,8 @@ void __trace_early_add_events(struct trace_array *tr)
list_for_each_entry(call, &ftrace_events, list) {
/* Early boot up should not have any modules loaded */
- if (WARN_ON_ONCE(call->mod))
+ if (!(call->flags & TRACE_EVENT_FL_DYNAMIC) &&
+ WARN_ON_ONCE(call->module))
continue;
ret = __trace_early_add_new_event(call, tr);
@@ -3426,7 +3446,7 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
struct dentry *d_events;
struct dentry *entry;
- entry = tracefs_create_file("set_event", 0644, parent,
+ entry = tracefs_create_file("set_event", TRACE_MODE_WRITE, parent,
tr, &ftrace_set_event_fops);
if (!entry) {
pr_warn("Could not create tracefs 'set_event' entry\n");
@@ -3439,7 +3459,7 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
return -ENOMEM;
}
- entry = trace_create_file("enable", 0644, d_events,
+ entry = trace_create_file("enable", TRACE_MODE_WRITE, d_events,
tr, &ftrace_tr_enable_fops);
if (!entry) {
pr_warn("Could not create tracefs 'enable' entry\n");
@@ -3448,24 +3468,25 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
/* There are not as crucial, just warn if they are not created */
- entry = tracefs_create_file("set_event_pid", 0644, parent,
+ entry = tracefs_create_file("set_event_pid", TRACE_MODE_WRITE, parent,
tr, &ftrace_set_event_pid_fops);
if (!entry)
pr_warn("Could not create tracefs 'set_event_pid' entry\n");
- entry = tracefs_create_file("set_event_notrace_pid", 0644, parent,
- tr, &ftrace_set_event_notrace_pid_fops);
+ entry = tracefs_create_file("set_event_notrace_pid",
+ TRACE_MODE_WRITE, parent, tr,
+ &ftrace_set_event_notrace_pid_fops);
if (!entry)
pr_warn("Could not create tracefs 'set_event_notrace_pid' entry\n");
/* ring buffer internal formats */
- entry = trace_create_file("header_page", 0444, d_events,
+ entry = trace_create_file("header_page", TRACE_MODE_READ, d_events,
ring_buffer_print_page_header,
&ftrace_show_header_fops);
if (!entry)
pr_warn("Could not create tracefs 'header_page' entry\n");
- entry = trace_create_file("header_event", 0444, d_events,
+ entry = trace_create_file("header_event", TRACE_MODE_READ, d_events,
ring_buffer_print_entry_header,
&ftrace_show_header_fops);
if (!entry)
@@ -3682,8 +3703,8 @@ __init int event_trace_init(void)
if (!tr)
return -ENODEV;
- entry = tracefs_create_file("available_events", 0444, NULL,
- tr, &ftrace_avail_fops);
+ entry = tracefs_create_file("available_events", TRACE_MODE_READ,
+ NULL, tr, &ftrace_avail_fops);
if (!entry)
pr_warn("Could not create tracefs 'available_events' entry\n");
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index a48aa2a2875b..319f9c8ca7e7 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -66,7 +66,10 @@
C(EMPTY_SORT_FIELD, "Empty sort field"), \
C(TOO_MANY_SORT_FIELDS, "Too many sort fields (Max = 2)"), \
C(INVALID_SORT_FIELD, "Sort field must be a key or a val"), \
- C(INVALID_STR_OPERAND, "String type can not be an operand in expression"),
+ C(INVALID_STR_OPERAND, "String type can not be an operand in expression"), \
+ C(EXPECT_NUMBER, "Expecting numeric literal"), \
+ C(UNARY_MINUS_SUBEXPR, "Unary minus not supported in sub-expressions"), \
+ C(DIVISION_BY_ZERO, "Division by zero"),
#undef C
#define C(a, b) HIST_ERR_##a
@@ -89,12 +92,16 @@ typedef u64 (*hist_field_fn_t) (struct hist_field *field,
#define HIST_FIELD_OPERANDS_MAX 2
#define HIST_FIELDS_MAX (TRACING_MAP_FIELDS_MAX + TRACING_MAP_VARS_MAX)
#define HIST_ACTIONS_MAX 8
+#define HIST_CONST_DIGITS_MAX 21
+#define HIST_DIV_SHIFT 20 /* For optimizing division by constants */
enum field_op_id {
FIELD_OP_NONE,
FIELD_OP_PLUS,
FIELD_OP_MINUS,
FIELD_OP_UNARY_MINUS,
+ FIELD_OP_DIV,
+ FIELD_OP_MULT,
};
/*
@@ -121,6 +128,7 @@ struct hist_field {
unsigned int size;
unsigned int offset;
unsigned int is_signed;
+ unsigned long buckets;
const char *type;
struct hist_field *operands[HIST_FIELD_OPERANDS_MAX];
struct hist_trigger_data *hist_data;
@@ -151,6 +159,11 @@ struct hist_field {
bool read_once;
unsigned int var_str_idx;
+
+ /* Numeric literals are represented as u64 */
+ u64 constant;
+ /* Used to optimize division by constants */
+ u64 div_multiplier;
};
static u64 hist_field_none(struct hist_field *field,
@@ -162,6 +175,15 @@ static u64 hist_field_none(struct hist_field *field,
return 0;
}
+static u64 hist_field_const(struct hist_field *field,
+ struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
+ struct ring_buffer_event *rbe,
+ void *event)
+{
+ return field->constant;
+}
+
static u64 hist_field_counter(struct hist_field *field,
struct tracing_map_elt *elt,
struct trace_buffer *buffer,
@@ -219,6 +241,27 @@ static u64 hist_field_log2(struct hist_field *hist_field,
return (u64) ilog2(roundup_pow_of_two(val));
}
+static u64 hist_field_bucket(struct hist_field *hist_field,
+ struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
+ struct ring_buffer_event *rbe,
+ void *event)
+{
+ struct hist_field *operand = hist_field->operands[0];
+ unsigned long buckets = hist_field->buckets;
+
+ u64 val = operand->fn(operand, elt, buffer, rbe, event);
+
+ if (WARN_ON_ONCE(!buckets))
+ return val;
+
+ if (val >= LONG_MAX)
+ val = div64_ul(val, buckets);
+ else
+ val = (u64)((unsigned long)val / buckets);
+ return val * buckets;
+}
+
static u64 hist_field_plus(struct hist_field *hist_field,
struct tracing_map_elt *elt,
struct trace_buffer *buffer,
@@ -249,6 +292,106 @@ static u64 hist_field_minus(struct hist_field *hist_field,
return val1 - val2;
}
+static u64 hist_field_div(struct hist_field *hist_field,
+ struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
+ struct ring_buffer_event *rbe,
+ void *event)
+{
+ struct hist_field *operand1 = hist_field->operands[0];
+ struct hist_field *operand2 = hist_field->operands[1];
+
+ u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event);
+ u64 val2 = operand2->fn(operand2, elt, buffer, rbe, event);
+
+ /* Return -1 for the undefined case */
+ if (!val2)
+ return -1;
+
+ /* Use shift if the divisor is a power of 2 */
+ if (!(val2 & (val2 - 1)))
+ return val1 >> __ffs64(val2);
+
+ return div64_u64(val1, val2);
+}
+
+static u64 div_by_power_of_two(struct hist_field *hist_field,
+ struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
+ struct ring_buffer_event *rbe,
+ void *event)
+{
+ struct hist_field *operand1 = hist_field->operands[0];
+ struct hist_field *operand2 = hist_field->operands[1];
+
+ u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event);
+
+ return val1 >> __ffs64(operand2->constant);
+}
+
+static u64 div_by_not_power_of_two(struct hist_field *hist_field,
+ struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
+ struct ring_buffer_event *rbe,
+ void *event)
+{
+ struct hist_field *operand1 = hist_field->operands[0];
+ struct hist_field *operand2 = hist_field->operands[1];
+
+ u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event);
+
+ return div64_u64(val1, operand2->constant);
+}
+
+static u64 div_by_mult_and_shift(struct hist_field *hist_field,
+ struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
+ struct ring_buffer_event *rbe,
+ void *event)
+{
+ struct hist_field *operand1 = hist_field->operands[0];
+ struct hist_field *operand2 = hist_field->operands[1];
+
+ u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event);
+
+ /*
+ * If the divisor is a constant, do a multiplication and shift instead.
+ *
+ * Choose Z = some power of 2. If Y <= Z, then:
+ * X / Y = (X * (Z / Y)) / Z
+ *
+ * (Z / Y) is a constant (mult) which is calculated at parse time, so:
+ * X / Y = (X * mult) / Z
+ *
+ * The division by Z can be replaced by a shift since Z is a power of 2:
+ * X / Y = (X * mult) >> HIST_DIV_SHIFT
+ *
+ * As long, as X < Z the results will not be off by more than 1.
+ */
+ if (val1 < (1 << HIST_DIV_SHIFT)) {
+ u64 mult = operand2->div_multiplier;
+
+ return (val1 * mult + ((1 << HIST_DIV_SHIFT) - 1)) >> HIST_DIV_SHIFT;
+ }
+
+ return div64_u64(val1, operand2->constant);
+}
+
+static u64 hist_field_mult(struct hist_field *hist_field,
+ struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
+ struct ring_buffer_event *rbe,
+ void *event)
+{
+ struct hist_field *operand1 = hist_field->operands[0];
+ struct hist_field *operand2 = hist_field->operands[1];
+
+ u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event);
+ u64 val2 = operand2->fn(operand2, elt, buffer, rbe, event);
+
+ return val1 * val2;
+}
+
static u64 hist_field_unary_minus(struct hist_field *hist_field,
struct tracing_map_elt *elt,
struct trace_buffer *buffer,
@@ -318,6 +461,8 @@ enum hist_field_flags {
HIST_FIELD_FL_VAR_REF = 1 << 14,
HIST_FIELD_FL_CPU = 1 << 15,
HIST_FIELD_FL_ALIAS = 1 << 16,
+ HIST_FIELD_FL_BUCKET = 1 << 17,
+ HIST_FIELD_FL_CONST = 1 << 18,
};
struct var_defs {
@@ -485,7 +630,8 @@ struct track_data {
struct hist_elt_data {
char *comm;
u64 *var_ref_vals;
- char *field_var_str[SYNTH_FIELDS_MAX];
+ char **field_var_str;
+ int n_field_var_str;
};
struct snapshot_context {
@@ -493,6 +639,25 @@ struct snapshot_context {
void *key;
};
+/*
+ * Returns the specific division function to use if the divisor
+ * is constant. This avoids extra branches when the trigger is hit.
+ */
+static hist_field_fn_t hist_field_get_div_fn(struct hist_field *divisor)
+{
+ u64 div = divisor->constant;
+
+ if (!(div & (div - 1)))
+ return div_by_power_of_two;
+
+ /* If the divisor is too large, do a regular division */
+ if (div > (1 << HIST_DIV_SHIFT))
+ return div_by_not_power_of_two;
+
+ divisor->div_multiplier = div64_u64((u64)(1 << HIST_DIV_SHIFT), div);
+ return div_by_mult_and_shift;
+}
+
static void track_data_free(struct track_data *track_data)
{
struct hist_elt_data *elt_data;
@@ -1109,7 +1274,8 @@ static const char *hist_field_name(struct hist_field *field,
if (field->field)
field_name = field->field->name;
else if (field->flags & HIST_FIELD_FL_LOG2 ||
- field->flags & HIST_FIELD_FL_ALIAS)
+ field->flags & HIST_FIELD_FL_ALIAS ||
+ field->flags & HIST_FIELD_FL_BUCKET)
field_name = hist_field_name(field->operands[0], ++level);
else if (field->flags & HIST_FIELD_FL_CPU)
field_name = "common_cpu";
@@ -1377,9 +1543,11 @@ static void hist_elt_data_free(struct hist_elt_data *elt_data)
{
unsigned int i;
- for (i = 0; i < SYNTH_FIELDS_MAX; i++)
+ for (i = 0; i < elt_data->n_field_var_str; i++)
kfree(elt_data->field_var_str[i]);
+ kfree(elt_data->field_var_str);
+
kfree(elt_data->comm);
kfree(elt_data);
}
@@ -1396,17 +1564,17 @@ static int hist_trigger_elt_data_alloc(struct tracing_map_elt *elt)
struct hist_trigger_data *hist_data = elt->map->private_data;
unsigned int size = TASK_COMM_LEN;
struct hist_elt_data *elt_data;
- struct hist_field *key_field;
+ struct hist_field *hist_field;
unsigned int i, n_str;
elt_data = kzalloc(sizeof(*elt_data), GFP_KERNEL);
if (!elt_data)
return -ENOMEM;
- for_each_hist_key_field(i, hist_data) {
- key_field = hist_data->fields[i];
+ for_each_hist_field(i, hist_data) {
+ hist_field = hist_data->fields[i];
- if (key_field->flags & HIST_FIELD_FL_EXECNAME) {
+ if (hist_field->flags & HIST_FIELD_FL_EXECNAME) {
elt_data->comm = kzalloc(size, GFP_KERNEL);
if (!elt_data->comm) {
kfree(elt_data);
@@ -1427,6 +1595,13 @@ static int hist_trigger_elt_data_alloc(struct tracing_map_elt *elt)
size = STR_VAR_LEN_MAX;
+ elt_data->field_var_str = kcalloc(n_str, sizeof(char *), GFP_KERNEL);
+ if (!elt_data->field_var_str) {
+ hist_elt_data_free(elt_data);
+ return -EINVAL;
+ }
+ elt_data->n_field_var_str = n_str;
+
for (i = 0; i < n_str; i++) {
elt_data->field_var_str[i] = kzalloc(size, GFP_KERNEL);
if (!elt_data->field_var_str[i]) {
@@ -1470,6 +1645,8 @@ static const char *get_hist_field_flags(struct hist_field *hist_field)
flags_str = "syscall";
else if (hist_field->flags & HIST_FIELD_FL_LOG2)
flags_str = "log2";
+ else if (hist_field->flags & HIST_FIELD_FL_BUCKET)
+ flags_str = "buckets";
else if (hist_field->flags & HIST_FIELD_FL_TIMESTAMP_USECS)
flags_str = "usecs";
@@ -1480,6 +1657,12 @@ static void expr_field_str(struct hist_field *field, char *expr)
{
if (field->flags & HIST_FIELD_FL_VAR_REF)
strcat(expr, "$");
+ else if (field->flags & HIST_FIELD_FL_CONST) {
+ char str[HIST_CONST_DIGITS_MAX];
+
+ snprintf(str, HIST_CONST_DIGITS_MAX, "%llu", field->constant);
+ strcat(expr, str);
+ }
strcat(expr, hist_field_name(field, 0));
@@ -1535,6 +1718,12 @@ static char *expr_str(struct hist_field *field, unsigned int level)
case FIELD_OP_PLUS:
strcat(expr, "+");
break;
+ case FIELD_OP_DIV:
+ strcat(expr, "/");
+ break;
+ case FIELD_OP_MULT:
+ strcat(expr, "*");
+ break;
default:
kfree(expr);
return NULL;
@@ -1545,34 +1734,92 @@ static char *expr_str(struct hist_field *field, unsigned int level)
return expr;
}
-static int contains_operator(char *str)
+/*
+ * If field_op != FIELD_OP_NONE, *sep points to the root operator
+ * of the expression tree to be evaluated.
+ */
+static int contains_operator(char *str, char **sep)
{
enum field_op_id field_op = FIELD_OP_NONE;
- char *op;
+ char *minus_op, *plus_op, *div_op, *mult_op;
+
- op = strpbrk(str, "+-");
- if (!op)
- return FIELD_OP_NONE;
+ /*
+ * Report the last occurrence of the operators first, so that the
+ * expression is evaluated left to right. This is important since
+ * subtraction and division are not associative.
+ *
+ * e.g
+ * 64/8/4/2 is 1, i.e 64/8/4/2 = ((64/8)/4)/2
+ * 14-7-5-2 is 0, i.e 14-7-5-2 = ((14-7)-5)-2
+ */
- switch (*op) {
- case '-':
+ /*
+ * First, find lower precedence addition and subtraction
+ * since the expression will be evaluated recursively.
+ */
+ minus_op = strrchr(str, '-');
+ if (minus_op) {
/*
- * Unfortunately, the modifier ".sym-offset"
- * can confuse things.
+ * Unary minus is not supported in sub-expressions. If
+ * present, it is always the next root operator.
*/
- if (op - str >= 4 && !strncmp(op - 4, ".sym-offset", 11))
- return FIELD_OP_NONE;
-
- if (*str == '-')
+ if (minus_op == str) {
field_op = FIELD_OP_UNARY_MINUS;
- else
- field_op = FIELD_OP_MINUS;
- break;
- case '+':
- field_op = FIELD_OP_PLUS;
- break;
- default:
- break;
+ goto out;
+ }
+
+ field_op = FIELD_OP_MINUS;
+ }
+
+ plus_op = strrchr(str, '+');
+ if (plus_op || minus_op) {
+ /*
+ * For operators of the same precedence use to rightmost as the
+ * root, so that the expression is evaluated left to right.
+ */
+ if (plus_op > minus_op)
+ field_op = FIELD_OP_PLUS;
+ goto out;
+ }
+
+ /*
+ * Multiplication and division have higher precedence than addition and
+ * subtraction.
+ */
+ div_op = strrchr(str, '/');
+ if (div_op)
+ field_op = FIELD_OP_DIV;
+
+ mult_op = strrchr(str, '*');
+ /*
+ * For operators of the same precedence use to rightmost as the
+ * root, so that the expression is evaluated left to right.
+ */
+ if (mult_op > div_op)
+ field_op = FIELD_OP_MULT;
+
+out:
+ if (sep) {
+ switch (field_op) {
+ case FIELD_OP_UNARY_MINUS:
+ case FIELD_OP_MINUS:
+ *sep = minus_op;
+ break;
+ case FIELD_OP_PLUS:
+ *sep = plus_op;
+ break;
+ case FIELD_OP_DIV:
+ *sep = div_op;
+ break;
+ case FIELD_OP_MULT:
+ *sep = mult_op;
+ break;
+ case FIELD_OP_NONE:
+ default:
+ *sep = NULL;
+ break;
+ }
}
return field_op;
@@ -1590,7 +1837,9 @@ static void __destroy_hist_field(struct hist_field *hist_field)
kfree(hist_field->var.name);
kfree(hist_field->name);
- kfree(hist_field->type);
+
+ /* Can likely be a const */
+ kfree_const(hist_field->type);
kfree(hist_field->system);
kfree(hist_field->event_name);
@@ -1647,6 +1896,13 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
if (flags & HIST_FIELD_FL_HITCOUNT) {
hist_field->fn = hist_field_counter;
hist_field->size = sizeof(u64);
+ hist_field->type = "u64";
+ goto out;
+ }
+
+ if (flags & HIST_FIELD_FL_CONST) {
+ hist_field->fn = hist_field_const;
+ hist_field->size = sizeof(u64);
hist_field->type = kstrdup("u64", GFP_KERNEL);
if (!hist_field->type)
goto free;
@@ -1658,12 +1914,13 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
goto out;
}
- if (flags & HIST_FIELD_FL_LOG2) {
- unsigned long fl = flags & ~HIST_FIELD_FL_LOG2;
- hist_field->fn = hist_field_log2;
+ if (flags & (HIST_FIELD_FL_LOG2 | HIST_FIELD_FL_BUCKET)) {
+ unsigned long fl = flags & ~(HIST_FIELD_FL_LOG2 | HIST_FIELD_FL_BUCKET);
+ hist_field->fn = flags & HIST_FIELD_FL_LOG2 ? hist_field_log2 :
+ hist_field_bucket;
hist_field->operands[0] = create_hist_field(hist_data, field, fl, NULL);
hist_field->size = hist_field->operands[0]->size;
- hist_field->type = kstrdup(hist_field->operands[0]->type, GFP_KERNEL);
+ hist_field->type = kstrdup_const(hist_field->operands[0]->type, GFP_KERNEL);
if (!hist_field->type)
goto free;
goto out;
@@ -1672,18 +1929,14 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
if (flags & HIST_FIELD_FL_TIMESTAMP) {
hist_field->fn = hist_field_timestamp;
hist_field->size = sizeof(u64);
- hist_field->type = kstrdup("u64", GFP_KERNEL);
- if (!hist_field->type)
- goto free;
+ hist_field->type = "u64";
goto out;
}
if (flags & HIST_FIELD_FL_CPU) {
hist_field->fn = hist_field_cpu;
hist_field->size = sizeof(int);
- hist_field->type = kstrdup("unsigned int", GFP_KERNEL);
- if (!hist_field->type)
- goto free;
+ hist_field->type = "unsigned int";
goto out;
}
@@ -1696,20 +1949,21 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
flags |= HIST_FIELD_FL_STRING;
hist_field->size = MAX_FILTER_STR_VAL;
- hist_field->type = kstrdup(field->type, GFP_KERNEL);
+ hist_field->type = kstrdup_const(field->type, GFP_KERNEL);
if (!hist_field->type)
goto free;
- if (field->filter_type == FILTER_STATIC_STRING)
+ if (field->filter_type == FILTER_STATIC_STRING) {
hist_field->fn = hist_field_string;
- else if (field->filter_type == FILTER_DYN_STRING)
+ hist_field->size = field->size;
+ } else if (field->filter_type == FILTER_DYN_STRING)
hist_field->fn = hist_field_dynstring;
else
hist_field->fn = hist_field_pstring;
} else {
hist_field->size = field->size;
hist_field->is_signed = field->is_signed;
- hist_field->type = kstrdup(field->type, GFP_KERNEL);
+ hist_field->type = kstrdup_const(field->type, GFP_KERNEL);
if (!hist_field->type)
goto free;
@@ -1795,7 +2049,7 @@ static int init_var_ref(struct hist_field *ref_field,
}
}
- ref_field->type = kstrdup(var_field->type, GFP_KERNEL);
+ ref_field->type = kstrdup_const(var_field->type, GFP_KERNEL);
if (!ref_field->type) {
err = -ENOMEM;
goto free;
@@ -1892,7 +2146,7 @@ static char *field_name_from_var(struct hist_trigger_data *hist_data,
if (strcmp(var_name, name) == 0) {
field = hist_data->attrs->var_defs.expr[i];
- if (contains_operator(field) || is_var_ref(field))
+ if (contains_operator(field, NULL) || is_var_ref(field))
continue;
return field;
}
@@ -1953,7 +2207,7 @@ static struct hist_field *parse_var_ref(struct hist_trigger_data *hist_data,
static struct ftrace_event_field *
parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file,
- char *field_str, unsigned long *flags)
+ char *field_str, unsigned long *flags, unsigned long *buckets)
{
struct ftrace_event_field *field = NULL;
char *field_name, *modifier, *str;
@@ -1969,7 +2223,11 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file,
*flags |= HIST_FIELD_FL_HEX;
else if (strcmp(modifier, "sym") == 0)
*flags |= HIST_FIELD_FL_SYM;
- else if (strcmp(modifier, "sym-offset") == 0)
+ /*
+ * 'sym-offset' occurrences in the trigger string are modified
+ * to 'symXoffset' to simplify arithmetic expression parsing.
+ */
+ else if (strcmp(modifier, "symXoffset") == 0)
*flags |= HIST_FIELD_FL_SYM_OFFSET;
else if ((strcmp(modifier, "execname") == 0) &&
(strcmp(field_name, "common_pid") == 0))
@@ -1980,7 +2238,22 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file,
*flags |= HIST_FIELD_FL_LOG2;
else if (strcmp(modifier, "usecs") == 0)
*flags |= HIST_FIELD_FL_TIMESTAMP_USECS;
- else {
+ else if (strncmp(modifier, "bucket", 6) == 0) {
+ int ret;
+
+ modifier += 6;
+
+ if (*modifier == 's')
+ modifier++;
+ if (*modifier != '=')
+ goto error;
+ modifier++;
+ ret = kstrtoul(modifier, 0, buckets);
+ if (ret || !(*buckets))
+ goto error;
+ *flags |= HIST_FIELD_FL_BUCKET;
+ } else {
+ error:
hist_err(tr, HIST_ERR_BAD_FIELD_MODIFIER, errpos(modifier));
field = ERR_PTR(-EINVAL);
goto out;
@@ -2042,6 +2315,29 @@ static struct hist_field *create_alias(struct hist_trigger_data *hist_data,
return alias;
}
+static struct hist_field *parse_const(struct hist_trigger_data *hist_data,
+ char *str, char *var_name,
+ unsigned long *flags)
+{
+ struct trace_array *tr = hist_data->event_file->tr;
+ struct hist_field *field = NULL;
+ u64 constant;
+
+ if (kstrtoull(str, 0, &constant)) {
+ hist_err(tr, HIST_ERR_EXPECT_NUMBER, errpos(str));
+ return NULL;
+ }
+
+ *flags |= HIST_FIELD_FL_CONST;
+ field = create_hist_field(hist_data, NULL, *flags, var_name);
+ if (!field)
+ return NULL;
+
+ field->constant = constant;
+
+ return field;
+}
+
static struct hist_field *parse_atom(struct hist_trigger_data *hist_data,
struct trace_event_file *file, char *str,
unsigned long *flags, char *var_name)
@@ -2049,8 +2345,18 @@ static struct hist_field *parse_atom(struct hist_trigger_data *hist_data,
char *s, *ref_system = NULL, *ref_event = NULL, *ref_var = str;
struct ftrace_event_field *field = NULL;
struct hist_field *hist_field = NULL;
+ unsigned long buckets = 0;
int ret = 0;
+ if (isdigit(str[0])) {
+ hist_field = parse_const(hist_data, str, var_name, flags);
+ if (!hist_field) {
+ ret = -EINVAL;
+ goto out;
+ }
+ return hist_field;
+ }
+
s = strchr(str, '.');
if (s) {
s = strchr(++s, '.');
@@ -2086,7 +2392,7 @@ static struct hist_field *parse_atom(struct hist_trigger_data *hist_data,
} else
str = s;
- field = parse_field(hist_data, file, str, flags);
+ field = parse_field(hist_data, file, str, flags, &buckets);
if (IS_ERR(field)) {
ret = PTR_ERR(field);
goto out;
@@ -2097,6 +2403,7 @@ static struct hist_field *parse_atom(struct hist_trigger_data *hist_data,
ret = -ENOMEM;
goto out;
}
+ hist_field->buckets = buckets;
return hist_field;
out:
@@ -2106,21 +2413,24 @@ static struct hist_field *parse_atom(struct hist_trigger_data *hist_data,
static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,
struct trace_event_file *file,
char *str, unsigned long flags,
- char *var_name, unsigned int level);
+ char *var_name, unsigned int *n_subexprs);
static struct hist_field *parse_unary(struct hist_trigger_data *hist_data,
struct trace_event_file *file,
char *str, unsigned long flags,
- char *var_name, unsigned int level)
+ char *var_name, unsigned int *n_subexprs)
{
struct hist_field *operand1, *expr = NULL;
unsigned long operand_flags;
int ret = 0;
char *s;
+ /* Unary minus operator, increment n_subexprs */
+ ++*n_subexprs;
+
/* we support only -(xxx) i.e. explicit parens required */
- if (level > 3) {
+ if (*n_subexprs > 3) {
hist_err(file->tr, HIST_ERR_TOO_MANY_SUBEXPR, errpos(str));
ret = -EINVAL;
goto free;
@@ -2137,8 +2447,16 @@ static struct hist_field *parse_unary(struct hist_trigger_data *hist_data,
}
s = strrchr(str, ')');
- if (s)
+ if (s) {
+ /* unary minus not supported in sub-expressions */
+ if (*(s+1) != '\0') {
+ hist_err(file->tr, HIST_ERR_UNARY_MINUS_SUBEXPR,
+ errpos(str));
+ ret = -EINVAL;
+ goto free;
+ }
*s = '\0';
+ }
else {
ret = -EINVAL; /* no closing ')' */
goto free;
@@ -2152,7 +2470,7 @@ static struct hist_field *parse_unary(struct hist_trigger_data *hist_data,
}
operand_flags = 0;
- operand1 = parse_expr(hist_data, file, str, operand_flags, NULL, ++level);
+ operand1 = parse_expr(hist_data, file, str, operand_flags, NULL, n_subexprs);
if (IS_ERR(operand1)) {
ret = PTR_ERR(operand1);
goto free;
@@ -2171,7 +2489,7 @@ static struct hist_field *parse_unary(struct hist_trigger_data *hist_data,
expr->operands[0] = operand1;
expr->operator = FIELD_OP_UNARY_MINUS;
expr->name = expr_str(expr, 0);
- expr->type = kstrdup(operand1->type, GFP_KERNEL);
+ expr->type = kstrdup_const(operand1->type, GFP_KERNEL);
if (!expr->type) {
ret = -ENOMEM;
goto free;
@@ -2183,9 +2501,15 @@ static struct hist_field *parse_unary(struct hist_trigger_data *hist_data,
return ERR_PTR(ret);
}
+/*
+ * If the operands are var refs, return pointers the
+ * variable(s) referenced in var1 and var2, else NULL.
+ */
static int check_expr_operands(struct trace_array *tr,
struct hist_field *operand1,
- struct hist_field *operand2)
+ struct hist_field *operand2,
+ struct hist_field **var1,
+ struct hist_field **var2)
{
unsigned long operand1_flags = operand1->flags;
unsigned long operand2_flags = operand2->flags;
@@ -2198,6 +2522,7 @@ static int check_expr_operands(struct trace_array *tr,
if (!var)
return -EINVAL;
operand1_flags = var->flags;
+ *var1 = var;
}
if ((operand2_flags & HIST_FIELD_FL_VAR_REF) ||
@@ -2208,6 +2533,7 @@ static int check_expr_operands(struct trace_array *tr,
if (!var)
return -EINVAL;
operand2_flags = var->flags;
+ *var2 = var;
}
if ((operand1_flags & HIST_FIELD_FL_TIMESTAMP_USECS) !=
@@ -2222,74 +2548,102 @@ static int check_expr_operands(struct trace_array *tr,
static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,
struct trace_event_file *file,
char *str, unsigned long flags,
- char *var_name, unsigned int level)
+ char *var_name, unsigned int *n_subexprs)
{
struct hist_field *operand1 = NULL, *operand2 = NULL, *expr = NULL;
- unsigned long operand_flags;
+ struct hist_field *var1 = NULL, *var2 = NULL;
+ unsigned long operand_flags, operand2_flags;
int field_op, ret = -EINVAL;
char *sep, *operand1_str;
+ hist_field_fn_t op_fn;
+ bool combine_consts;
- if (level > 3) {
+ if (*n_subexprs > 3) {
hist_err(file->tr, HIST_ERR_TOO_MANY_SUBEXPR, errpos(str));
return ERR_PTR(-EINVAL);
}
- field_op = contains_operator(str);
+ field_op = contains_operator(str, &sep);
if (field_op == FIELD_OP_NONE)
return parse_atom(hist_data, file, str, &flags, var_name);
if (field_op == FIELD_OP_UNARY_MINUS)
- return parse_unary(hist_data, file, str, flags, var_name, ++level);
+ return parse_unary(hist_data, file, str, flags, var_name, n_subexprs);
- switch (field_op) {
- case FIELD_OP_MINUS:
- sep = "-";
- break;
- case FIELD_OP_PLUS:
- sep = "+";
- break;
- default:
- goto free;
- }
+ /* Binary operator found, increment n_subexprs */
+ ++*n_subexprs;
- operand1_str = strsep(&str, sep);
- if (!operand1_str || !str)
- goto free;
+ /* Split the expression string at the root operator */
+ if (!sep)
+ return ERR_PTR(-EINVAL);
+
+ *sep = '\0';
+ operand1_str = str;
+ str = sep+1;
+
+ /* Binary operator requires both operands */
+ if (*operand1_str == '\0' || *str == '\0')
+ return ERR_PTR(-EINVAL);
operand_flags = 0;
- operand1 = parse_atom(hist_data, file, operand1_str,
- &operand_flags, NULL);
- if (IS_ERR(operand1)) {
- ret = PTR_ERR(operand1);
- operand1 = NULL;
- goto free;
- }
+
+ /* LHS of string is an expression e.g. a+b in a+b+c */
+ operand1 = parse_expr(hist_data, file, operand1_str, operand_flags, NULL, n_subexprs);
+ if (IS_ERR(operand1))
+ return ERR_CAST(operand1);
+
if (operand1->flags & HIST_FIELD_FL_STRING) {
hist_err(file->tr, HIST_ERR_INVALID_STR_OPERAND, errpos(operand1_str));
ret = -EINVAL;
- goto free;
+ goto free_op1;
}
- /* rest of string could be another expression e.g. b+c in a+b+c */
+ /* RHS of string is another expression e.g. c in a+b+c */
operand_flags = 0;
- operand2 = parse_expr(hist_data, file, str, operand_flags, NULL, ++level);
+ operand2 = parse_expr(hist_data, file, str, operand_flags, NULL, n_subexprs);
if (IS_ERR(operand2)) {
ret = PTR_ERR(operand2);
- operand2 = NULL;
- goto free;
+ goto free_op1;
}
if (operand2->flags & HIST_FIELD_FL_STRING) {
hist_err(file->tr, HIST_ERR_INVALID_STR_OPERAND, errpos(str));
ret = -EINVAL;
- goto free;
+ goto free_operands;
+ }
+
+ switch (field_op) {
+ case FIELD_OP_MINUS:
+ op_fn = hist_field_minus;
+ break;
+ case FIELD_OP_PLUS:
+ op_fn = hist_field_plus;
+ break;
+ case FIELD_OP_DIV:
+ op_fn = hist_field_div;
+ break;
+ case FIELD_OP_MULT:
+ op_fn = hist_field_mult;
+ break;
+ default:
+ ret = -EINVAL;
+ goto free_operands;
}
- ret = check_expr_operands(file->tr, operand1, operand2);
+ ret = check_expr_operands(file->tr, operand1, operand2, &var1, &var2);
if (ret)
- goto free;
+ goto free_operands;
- flags |= HIST_FIELD_FL_EXPR;
+ operand_flags = var1 ? var1->flags : operand1->flags;
+ operand2_flags = var2 ? var2->flags : operand2->flags;
+
+ /*
+ * If both operands are constant, the expression can be
+ * collapsed to a single constant.
+ */
+ combine_consts = operand_flags & operand2_flags & HIST_FIELD_FL_CONST;
+
+ flags |= combine_consts ? HIST_FIELD_FL_CONST : HIST_FIELD_FL_EXPR;
flags |= operand1->flags &
(HIST_FIELD_FL_TIMESTAMP | HIST_FIELD_FL_TIMESTAMP_USECS);
@@ -2297,44 +2651,79 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,
expr = create_hist_field(hist_data, NULL, flags, var_name);
if (!expr) {
ret = -ENOMEM;
- goto free;
+ goto free_operands;
}
operand1->read_once = true;
operand2->read_once = true;
+ /* The operands are now owned and free'd by 'expr' */
expr->operands[0] = operand1;
expr->operands[1] = operand2;
- /* The operand sizes should be the same, so just pick one */
- expr->size = operand1->size;
+ if (field_op == FIELD_OP_DIV &&
+ operand2_flags & HIST_FIELD_FL_CONST) {
+ u64 divisor = var2 ? var2->constant : operand2->constant;
- expr->operator = field_op;
- expr->name = expr_str(expr, 0);
- expr->type = kstrdup(operand1->type, GFP_KERNEL);
- if (!expr->type) {
- ret = -ENOMEM;
- goto free;
+ if (!divisor) {
+ hist_err(file->tr, HIST_ERR_DIVISION_BY_ZERO, errpos(str));
+ ret = -EDOM;
+ goto free_expr;
+ }
+
+ /*
+ * Copy the divisor here so we don't have to look it up
+ * later if this is a var ref
+ */
+ operand2->constant = divisor;
+ op_fn = hist_field_get_div_fn(operand2);
}
- switch (field_op) {
- case FIELD_OP_MINUS:
- expr->fn = hist_field_minus;
- break;
- case FIELD_OP_PLUS:
- expr->fn = hist_field_plus;
- break;
- default:
- ret = -EINVAL;
- goto free;
+ if (combine_consts) {
+ if (var1)
+ expr->operands[0] = var1;
+ if (var2)
+ expr->operands[1] = var2;
+
+ expr->constant = op_fn(expr, NULL, NULL, NULL, NULL);
+
+ expr->operands[0] = NULL;
+ expr->operands[1] = NULL;
+
+ /*
+ * var refs won't be destroyed immediately
+ * See: destroy_hist_field()
+ */
+ destroy_hist_field(operand2, 0);
+ destroy_hist_field(operand1, 0);
+
+ expr->name = expr_str(expr, 0);
+ } else {
+ expr->fn = op_fn;
+
+ /* The operand sizes should be the same, so just pick one */
+ expr->size = operand1->size;
+
+ expr->operator = field_op;
+ expr->type = kstrdup_const(operand1->type, GFP_KERNEL);
+ if (!expr->type) {
+ ret = -ENOMEM;
+ goto free_expr;
+ }
+
+ expr->name = expr_str(expr, 0);
}
return expr;
- free:
- destroy_hist_field(operand1, 0);
+
+free_operands:
destroy_hist_field(operand2, 0);
- destroy_hist_field(expr, 0);
+free_op1:
+ destroy_hist_field(operand1, 0);
+ return ERR_PTR(ret);
+free_expr:
+ destroy_hist_field(expr, 0);
return ERR_PTR(ret);
}
@@ -2456,7 +2845,7 @@ find_synthetic_field_var(struct hist_trigger_data *target_hist_data,
* events. However, for convenience, users are allowed to directly
* specify an event field in an action, which will be automatically
* converted into a variable on their behalf.
-
+ *
* If a user specifies a field on an event that isn't the event the
* histogram currently being defined (the target event histogram), the
* only way that can be accomplished is if a new hist trigger is
@@ -2640,8 +3029,10 @@ static inline void __update_field_vars(struct tracing_map_elt *elt,
if (val->flags & HIST_FIELD_FL_STRING) {
char *str = elt_data->field_var_str[j++];
char *val_str = (char *)(uintptr_t)var_val;
+ unsigned int size;
- strscpy(str, val_str, STR_VAR_LEN_MAX);
+ size = min(val->size, STR_VAR_LEN_MAX);
+ strscpy(str, val_str, size);
var_val = (u64)(uintptr_t)str;
}
tracing_map_set_var(elt, var_idx, var_val);
@@ -2699,10 +3090,10 @@ static struct hist_field *create_var(struct hist_trigger_data *hist_data,
var->var.hist_data = var->hist_data = hist_data;
var->size = size;
var->var.name = kstrdup(name, GFP_KERNEL);
- var->type = kstrdup(type, GFP_KERNEL);
+ var->type = kstrdup_const(type, GFP_KERNEL);
if (!var->var.name || !var->type) {
+ kfree_const(var->type);
kfree(var->var.name);
- kfree(var->type);
kfree(var);
var = ERR_PTR(-ENOMEM);
}
@@ -3366,7 +3757,7 @@ static int check_synth_field(struct synth_event *event,
if (strcmp(field->type, hist_field->type) != 0) {
if (field->size != hist_field->size ||
- field->is_signed != hist_field->is_signed)
+ (!field->is_string && field->is_signed != hist_field->is_signed))
return -EINVAL;
}
@@ -3701,9 +4092,9 @@ static int __create_val_field(struct hist_trigger_data *hist_data,
unsigned long flags)
{
struct hist_field *hist_field;
- int ret = 0;
+ int ret = 0, n_subexprs = 0;
- hist_field = parse_expr(hist_data, file, field_str, flags, var_name, 0);
+ hist_field = parse_expr(hist_data, file, field_str, flags, var_name, &n_subexprs);
if (IS_ERR(hist_field)) {
ret = PTR_ERR(hist_field);
goto out;
@@ -3731,6 +4122,41 @@ static int create_val_field(struct hist_trigger_data *hist_data,
return __create_val_field(hist_data, val_idx, file, NULL, field_str, 0);
}
+static const char *no_comm = "(no comm)";
+
+static u64 hist_field_execname(struct hist_field *hist_field,
+ struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
+ struct ring_buffer_event *rbe,
+ void *event)
+{
+ struct hist_elt_data *elt_data;
+
+ if (WARN_ON_ONCE(!elt))
+ return (u64)(unsigned long)no_comm;
+
+ elt_data = elt->private_data;
+
+ if (WARN_ON_ONCE(!elt_data->comm))
+ return (u64)(unsigned long)no_comm;
+
+ return (u64)(unsigned long)(elt_data->comm);
+}
+
+/* Convert a var that points to common_pid.execname to a string */
+static void update_var_execname(struct hist_field *hist_field)
+{
+ hist_field->flags = HIST_FIELD_FL_STRING | HIST_FIELD_FL_VAR |
+ HIST_FIELD_FL_EXECNAME;
+ hist_field->size = MAX_FILTER_STR_VAL;
+ hist_field->is_signed = 0;
+
+ kfree_const(hist_field->type);
+ hist_field->type = "char[]";
+
+ hist_field->fn = hist_field_execname;
+}
+
static int create_var_field(struct hist_trigger_data *hist_data,
unsigned int val_idx,
struct trace_event_file *file,
@@ -3755,6 +4181,9 @@ static int create_var_field(struct hist_trigger_data *hist_data,
ret = __create_val_field(hist_data, val_idx, file, var_name, expr_str, flags);
+ if (!ret && hist_data->fields[val_idx]->flags & HIST_FIELD_FL_EXECNAME)
+ update_var_execname(hist_data->fields[val_idx]);
+
if (!ret && hist_data->fields[val_idx]->flags & HIST_FIELD_FL_STRING)
hist_data->fields[val_idx]->var_str_idx = hist_data->n_var_str++;
@@ -3806,7 +4235,7 @@ static int create_key_field(struct hist_trigger_data *hist_data,
struct hist_field *hist_field = NULL;
unsigned long flags = 0;
unsigned int key_size;
- int ret = 0;
+ int ret = 0, n_subexprs = 0;
if (WARN_ON(key_idx >= HIST_FIELDS_MAX))
return -EINVAL;
@@ -3819,7 +4248,7 @@ static int create_key_field(struct hist_trigger_data *hist_data,
hist_field = create_hist_field(hist_data, NULL, flags, NULL);
} else {
hist_field = parse_expr(hist_data, file, field_str, flags,
- NULL, 0);
+ NULL, &n_subexprs);
if (IS_ERR(hist_field)) {
ret = PTR_ERR(hist_field);
goto out;
@@ -4490,6 +4919,7 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data,
if (hist_field->flags & HIST_FIELD_FL_STRING) {
unsigned int str_start, var_str_idx, idx;
char *str, *val_str;
+ unsigned int size;
str_start = hist_data->n_field_var_str +
hist_data->n_save_var_str;
@@ -4498,7 +4928,9 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data,
str = elt_data->field_var_str[idx];
val_str = (char *)(uintptr_t)hist_val;
- strscpy(str, val_str, STR_VAR_LEN_MAX);
+
+ size = min(hist_field->size, STR_VAR_LEN_MAX);
+ strscpy(str, val_str, size);
hist_val = (u64)(uintptr_t)str;
}
@@ -4618,7 +5050,6 @@ static void hist_trigger_stacktrace_print(struct seq_file *m,
unsigned long *stacktrace_entries,
unsigned int max_entries)
{
- char str[KSYM_SYMBOL_LEN];
unsigned int spaces = 8;
unsigned int i;
@@ -4627,8 +5058,7 @@ static void hist_trigger_stacktrace_print(struct seq_file *m,
return;
seq_printf(m, "%*c", 1 + spaces, ' ');
- sprint_symbol(str, stacktrace_entries[i]);
- seq_printf(m, "%s\n", str);
+ seq_printf(m, "%pS\n", (void*)stacktrace_entries[i]);
}
}
@@ -4638,7 +5068,6 @@ static void hist_trigger_print_key(struct seq_file *m,
struct tracing_map_elt *elt)
{
struct hist_field *key_field;
- char str[KSYM_SYMBOL_LEN];
bool multiline = false;
const char *field_name;
unsigned int i;
@@ -4659,14 +5088,12 @@ static void hist_trigger_print_key(struct seq_file *m,
seq_printf(m, "%s: %llx", field_name, uval);
} else if (key_field->flags & HIST_FIELD_FL_SYM) {
uval = *(u64 *)(key + key_field->offset);
- sprint_symbol_no_offset(str, uval);
- seq_printf(m, "%s: [%llx] %-45s", field_name,
- uval, str);
+ seq_printf(m, "%s: [%llx] %-45ps", field_name,
+ uval, (void *)(uintptr_t)uval);
} else if (key_field->flags & HIST_FIELD_FL_SYM_OFFSET) {
uval = *(u64 *)(key + key_field->offset);
- sprint_symbol(str, uval);
- seq_printf(m, "%s: [%llx] %-55s", field_name,
- uval, str);
+ seq_printf(m, "%s: [%llx] %-55pS", field_name,
+ uval, (void *)(uintptr_t)uval);
} else if (key_field->flags & HIST_FIELD_FL_EXECNAME) {
struct hist_elt_data *elt_data = elt->private_data;
char *comm;
@@ -4698,6 +5125,11 @@ static void hist_trigger_print_key(struct seq_file *m,
} else if (key_field->flags & HIST_FIELD_FL_LOG2) {
seq_printf(m, "%s: ~ 2^%-2llu", field_name,
*(u64 *)(key + key_field->offset));
+ } else if (key_field->flags & HIST_FIELD_FL_BUCKET) {
+ unsigned long buckets = key_field->buckets;
+ uval = *(u64 *)(key + key_field->offset);
+ seq_printf(m, "%s: ~ %llu-%llu", field_name,
+ uval, uval + buckets -1);
} else if (key_field->flags & HIST_FIELD_FL_STRING) {
seq_printf(m, "%s: %-50s", field_name,
(char *)(key + key_field->offset));
@@ -4857,6 +5289,8 @@ static void hist_field_debug_show_flags(struct seq_file *m,
if (flags & HIST_FIELD_FL_ALIAS)
seq_puts(m, " HIST_FIELD_FL_ALIAS\n");
+ else if (flags & HIST_FIELD_FL_CONST)
+ seq_puts(m, " HIST_FIELD_FL_CONST\n");
}
static int hist_field_debug_show(struct seq_file *m,
@@ -4878,6 +5312,9 @@ static int hist_field_debug_show(struct seq_file *m,
field->var.idx);
}
+ if (field->flags & HIST_FIELD_FL_CONST)
+ seq_printf(m, " constant: %llu\n", field->constant);
+
if (field->flags & HIST_FIELD_FL_ALIAS)
seq_printf(m, " var_ref_idx (into hist_data->var_refs[]): %u\n",
field->var_ref_idx);
@@ -5120,6 +5557,8 @@ static void hist_field_print(struct seq_file *m, struct hist_field *hist_field)
if (hist_field->flags & HIST_FIELD_FL_CPU)
seq_puts(m, "common_cpu");
+ else if (hist_field->flags & HIST_FIELD_FL_CONST)
+ seq_printf(m, "%llu", hist_field->constant);
else if (field_name) {
if (hist_field->flags & HIST_FIELD_FL_VAR_REF ||
hist_field->flags & HIST_FIELD_FL_ALIAS)
@@ -5137,6 +5576,8 @@ static void hist_field_print(struct seq_file *m, struct hist_field *hist_field)
seq_printf(m, ".%s", flags);
}
}
+ if (hist_field->buckets)
+ seq_printf(m, "=%ld", hist_field->buckets);
}
static int event_hist_trigger_print(struct seq_file *m,
@@ -5700,7 +6141,7 @@ static int event_hist_trigger_func(struct event_command *cmd_ops,
struct synth_event *se;
const char *se_name;
bool remove = false;
- char *trigger, *p;
+ char *trigger, *p, *start;
int ret = 0;
lockdep_assert_held(&event_mutex);
@@ -5748,6 +6189,16 @@ static int event_hist_trigger_func(struct event_command *cmd_ops,
trigger = strstrip(trigger);
}
+ /*
+ * To simplify arithmetic expression parsing, replace occurrences of
+ * '.sym-offset' modifier with '.symXoffset'
+ */
+ start = strstr(trigger, ".sym-offset");
+ while (start) {
+ *(start + 4) = 'X';
+ start = strstr(start + 11, ".sym-offset");
+ }
+
attrs = parse_hist_trigger_attrs(file->tr, trigger);
if (IS_ERR(attrs))
return PTR_ERR(attrs);
diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c
index 9315fc03e303..ca9c13b2ecf4 100644
--- a/kernel/trace/trace_events_synth.c
+++ b/kernel/trace/trace_events_synth.c
@@ -1237,9 +1237,8 @@ static int __create_synth_event(const char *name, const char *raw_fields)
argv + consumed, &consumed,
&field_version);
if (IS_ERR(field)) {
- argv_free(argv);
ret = PTR_ERR(field);
- goto err;
+ goto err_free_arg;
}
/*
@@ -1262,18 +1261,19 @@ static int __create_synth_event(const char *name, const char *raw_fields)
if (cmd_version > 1 && n_fields_this_loop >= 1) {
synth_err(SYNTH_ERR_INVALID_CMD, errpos(field_str));
ret = -EINVAL;
- goto err;
+ goto err_free_arg;
}
fields[n_fields++] = field;
if (n_fields == SYNTH_FIELDS_MAX) {
synth_err(SYNTH_ERR_TOO_MANY_FIELDS, 0);
ret = -EINVAL;
- goto err;
+ goto err_free_arg;
}
n_fields_this_loop++;
}
+ argv_free(argv);
if (consumed < argc) {
synth_err(SYNTH_ERR_INVALID_CMD, 0);
@@ -1281,7 +1281,6 @@ static int __create_synth_event(const char *name, const char *raw_fields)
goto err;
}
- argv_free(argv);
}
if (n_fields == 0) {
@@ -1298,7 +1297,7 @@ static int __create_synth_event(const char *name, const char *raw_fields)
}
ret = register_synth_event(event);
if (!ret)
- dyn_event_add(&event->devent);
+ dyn_event_add(&event->devent, &event->call);
else
free_synth_event(event);
out:
@@ -1307,6 +1306,8 @@ static int __create_synth_event(const char *name, const char *raw_fields)
kfree(saved_fields);
return ret;
+ err_free_arg:
+ argv_free(argv);
err:
for (i = 0; i < n_fields; i++)
free_synth_field(fields[i]);
@@ -1369,13 +1370,15 @@ static int destroy_synth_event(struct synth_event *se)
int ret;
if (se->ref)
- ret = -EBUSY;
- else {
- ret = unregister_synth_event(se);
- if (!ret) {
- dyn_event_remove(&se->devent);
- free_synth_event(se);
- }
+ return -EBUSY;
+
+ if (trace_event_dyn_busy(&se->call))
+ return -EBUSY;
+
+ ret = unregister_synth_event(se);
+ if (!ret) {
+ dyn_event_remove(&se->devent);
+ free_synth_event(se);
}
return ret;
@@ -2102,6 +2105,9 @@ static int synth_event_release(struct dyn_event *ev)
if (event->ref)
return -EBUSY;
+ if (trace_event_dyn_busy(&event->call))
+ return -EBUSY;
+
ret = unregister_synth_event(event);
if (ret)
return ret;
@@ -2222,8 +2228,8 @@ static __init int trace_events_synth_init(void)
if (err)
goto err;
- entry = tracefs_create_file("synthetic_events", 0644, NULL,
- NULL, &synth_events_fops);
+ entry = tracefs_create_file("synthetic_events", TRACE_MODE_WRITE,
+ NULL, NULL, &synth_events_fops);
if (!entry) {
err = -ENODEV;
goto err;
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index cf84d0f6583a..3d5c07239a2a 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -124,6 +124,18 @@ static void *trigger_next(struct seq_file *m, void *t, loff_t *pos)
return seq_list_next(t, &event_file->triggers, pos);
}
+static bool check_user_trigger(struct trace_event_file *file)
+{
+ struct event_trigger_data *data;
+
+ list_for_each_entry_rcu(data, &file->triggers, list) {
+ if (data->flags & EVENT_TRIGGER_FL_PROBE)
+ continue;
+ return true;
+ }
+ return false;
+}
+
static void *trigger_start(struct seq_file *m, loff_t *pos)
{
struct trace_event_file *event_file;
@@ -134,7 +146,7 @@ static void *trigger_start(struct seq_file *m, loff_t *pos)
if (unlikely(!event_file))
return ERR_PTR(-ENODEV);
- if (list_empty(&event_file->triggers))
+ if (list_empty(&event_file->triggers) || !check_user_trigger(event_file))
return *pos == 0 ? SHOW_AVAILABLE_TRIGGERS : NULL;
return seq_list_start(&event_file->triggers, *pos);
@@ -1334,7 +1346,7 @@ void event_enable_trigger_free(struct event_trigger_ops *ops,
if (!data->ref) {
/* Remove the SOFT_MODE flag */
trace_event_enable_disable(enable_data->file, 0, 1);
- module_put(enable_data->file->event_call->mod);
+ trace_event_put_ref(enable_data->file->event_call);
trigger_data_free(data);
kfree(enable_data);
}
@@ -1481,7 +1493,7 @@ int event_enable_trigger_func(struct event_command *cmd_ops,
out_reg:
/* Don't let event modules unload while probe registered */
- ret = try_module_get(event_enable_file->event_call->mod);
+ ret = trace_event_try_get_ref(event_enable_file->event_call);
if (!ret) {
ret = -EBUSY;
goto out_free;
@@ -1510,7 +1522,7 @@ int event_enable_trigger_func(struct event_command *cmd_ops,
out_disable:
trace_event_enable_disable(event_enable_file, 0, 1);
out_put:
- module_put(event_enable_file->event_call->mod);
+ trace_event_put_ref(event_enable_file->event_call);
out_free:
if (cmd_ops->set_filter)
cmd_ops->set_filter(NULL, trigger_data, NULL);
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 1f0e63f5d1f9..9f1bfbe105e8 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -186,7 +186,6 @@ function_trace_call(unsigned long ip, unsigned long parent_ip,
return;
trace_ctx = tracing_gen_ctx();
- preempt_disable_notrace();
cpu = smp_processor_id();
data = per_cpu_ptr(tr->array_buffer.data, cpu);
@@ -194,7 +193,6 @@ function_trace_call(unsigned long ip, unsigned long parent_ip,
trace_function(tr, ip, parent_ip, trace_ctx);
ftrace_test_recursion_unlock(bit);
- preempt_enable_notrace();
}
#ifdef CONFIG_UNWINDER_ORC
@@ -298,8 +296,6 @@ function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip,
if (bit < 0)
return;
- preempt_disable_notrace();
-
cpu = smp_processor_id();
data = per_cpu_ptr(tr->array_buffer.data, cpu);
if (atomic_read(&data->disabled))
@@ -324,7 +320,6 @@ function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip,
out:
ftrace_test_recursion_unlock(bit);
- preempt_enable_notrace();
}
static void
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 0de6837722da..203204cadf92 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -120,7 +120,7 @@ static inline int ftrace_graph_ignore_irqs(void)
if (!ftrace_graph_skip_irqs || trace_recursion_test(TRACE_IRQ_BIT))
return 0;
- return in_irq();
+ return in_hardirq();
}
int trace_graph_entry(struct ftrace_graph_ent *trace)
@@ -1340,7 +1340,7 @@ static __init int init_graph_tracefs(void)
if (ret)
return 0;
- trace_create_file("max_graph_depth", 0644, NULL,
+ trace_create_file("max_graph_depth", TRACE_MODE_WRITE, NULL,
NULL, &graph_depth_fops);
return 0;
diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c
index 14f46aae1981..d440ddd5fd8b 100644
--- a/kernel/trace/trace_hwlat.c
+++ b/kernel/trace/trace_hwlat.c
@@ -79,8 +79,8 @@ struct hwlat_kthread_data {
int nmi_cpu;
};
-struct hwlat_kthread_data hwlat_single_cpu_data;
-DEFINE_PER_CPU(struct hwlat_kthread_data, hwlat_per_cpu_data);
+static struct hwlat_kthread_data hwlat_single_cpu_data;
+static DEFINE_PER_CPU(struct hwlat_kthread_data, hwlat_per_cpu_data);
/* Tells NMIs to call back to the hwlat tracer to record timestamps */
bool trace_hwlat_callback_enabled;
@@ -325,10 +325,10 @@ static void move_to_next_cpu(void)
if (!cpumask_equal(current_mask, current->cpus_ptr))
goto change_mode;
- get_online_cpus();
+ cpus_read_lock();
cpumask_and(current_mask, cpu_online_mask, tr->tracing_cpumask);
next_cpu = cpumask_next(raw_smp_processor_id(), current_mask);
- put_online_cpus();
+ cpus_read_unlock();
if (next_cpu >= nr_cpu_ids)
next_cpu = cpumask_first(current_mask);
@@ -398,7 +398,7 @@ static void stop_single_kthread(void)
struct hwlat_kthread_data *kdata = get_cpu_data();
struct task_struct *kthread;
- get_online_cpus();
+ cpus_read_lock();
kthread = kdata->kthread;
if (!kthread)
@@ -408,7 +408,7 @@ static void stop_single_kthread(void)
kdata->kthread = NULL;
out_put_cpus:
- put_online_cpus();
+ cpus_read_unlock();
}
@@ -425,14 +425,14 @@ static int start_single_kthread(struct trace_array *tr)
struct task_struct *kthread;
int next_cpu;
- get_online_cpus();
+ cpus_read_lock();
if (kdata->kthread)
goto out_put_cpus;
kthread = kthread_create(kthread_fn, NULL, "hwlatd");
if (IS_ERR(kthread)) {
pr_err(BANNER "could not start sampling thread\n");
- put_online_cpus();
+ cpus_read_unlock();
return -ENOMEM;
}
@@ -452,7 +452,7 @@ static int start_single_kthread(struct trace_array *tr)
wake_up_process(kthread);
out_put_cpus:
- put_online_cpus();
+ cpus_read_unlock();
return 0;
}
@@ -479,10 +479,10 @@ static void stop_per_cpu_kthreads(void)
{
unsigned int cpu;
- get_online_cpus();
+ cpus_read_lock();
for_each_online_cpu(cpu)
stop_cpu_kthread(cpu);
- put_online_cpus();
+ cpus_read_unlock();
}
/*
@@ -491,18 +491,14 @@ static void stop_per_cpu_kthreads(void)
static int start_cpu_kthread(unsigned int cpu)
{
struct task_struct *kthread;
- char comm[24];
- snprintf(comm, 24, "hwlatd/%d", cpu);
-
- kthread = kthread_create_on_cpu(kthread_fn, NULL, cpu, comm);
+ kthread = kthread_run_on_cpu(kthread_fn, NULL, cpu, "hwlatd/%u");
if (IS_ERR(kthread)) {
pr_err(BANNER "could not start sampling thread\n");
return -ENOMEM;
}
per_cpu(hwlat_per_cpu_data, cpu).kthread = kthread;
- wake_up_process(kthread);
return 0;
}
@@ -515,7 +511,7 @@ static void hwlat_hotplug_workfn(struct work_struct *dummy)
mutex_lock(&trace_types_lock);
mutex_lock(&hwlat_data.lock);
- get_online_cpus();
+ cpus_read_lock();
if (!hwlat_busy || hwlat_data.thread_mode != MODE_PER_CPU)
goto out_unlock;
@@ -526,7 +522,7 @@ static void hwlat_hotplug_workfn(struct work_struct *dummy)
start_cpu_kthread(cpu);
out_unlock:
- put_online_cpus();
+ cpus_read_unlock();
mutex_unlock(&hwlat_data.lock);
mutex_unlock(&trace_types_lock);
}
@@ -582,7 +578,7 @@ static int start_per_cpu_kthreads(struct trace_array *tr)
unsigned int cpu;
int retval;
- get_online_cpus();
+ cpus_read_lock();
/*
* Run only on CPUs in which hwlat is allowed to run.
*/
@@ -596,12 +592,12 @@ static int start_per_cpu_kthreads(struct trace_array *tr)
if (retval)
goto out_error;
}
- put_online_cpus();
+ cpus_read_unlock();
return 0;
out_error:
- put_online_cpus();
+ cpus_read_unlock();
stop_per_cpu_kthreads();
return retval;
}
@@ -782,21 +778,21 @@ static int init_tracefs(void)
if (!top_dir)
return -ENOMEM;
- hwlat_sample_window = tracefs_create_file("window", 0640,
+ hwlat_sample_window = tracefs_create_file("window", TRACE_MODE_WRITE,
top_dir,
&hwlat_window,
&trace_min_max_fops);
if (!hwlat_sample_window)
goto err;
- hwlat_sample_width = tracefs_create_file("width", 0644,
+ hwlat_sample_width = tracefs_create_file("width", TRACE_MODE_WRITE,
top_dir,
&hwlat_width,
&trace_min_max_fops);
if (!hwlat_sample_width)
goto err;
- hwlat_thread_mode = trace_create_file("mode", 0644,
+ hwlat_thread_mode = trace_create_file("mode", TRACE_MODE_WRITE,
top_dir,
NULL,
&thread_mode_fops);
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c
index 9da76104f7a2..59857a1ee44c 100644
--- a/kernel/trace/trace_kdb.c
+++ b/kernel/trace/trace_kdb.c
@@ -147,11 +147,17 @@ static int kdb_ftdump(int argc, const char **argv)
return 0;
}
+static kdbtab_t ftdump_cmd = {
+ .name = "ftdump",
+ .func = kdb_ftdump,
+ .usage = "[skip_#entries] [cpu]",
+ .help = "Dump ftrace log; -skip dumps last #entries",
+ .flags = KDB_ENABLE_ALWAYS_SAFE,
+};
+
static __init int kdb_ftrace_register(void)
{
- kdb_register_flags("ftdump", kdb_ftdump, "[skip_#entries] [cpu]",
- "Dump ftrace log; -skip dumps last #entries", 0,
- KDB_ENABLE_ALWAYS_SAFE);
+ kdb_register(&ftdump_cmd);
return 0;
}
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index ea6178cb5e33..4e1257f50aa3 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -7,6 +7,7 @@
*/
#define pr_fmt(fmt) "trace_kprobe: " fmt
+#include <linux/bpf-cgroup.h>
#include <linux/security.h>
#include <linux/module.h>
#include <linux/uaccess.h>
@@ -80,10 +81,6 @@ static struct trace_kprobe *to_trace_kprobe(struct dyn_event *ev)
for_each_dyn_event(dpos) \
if (is_trace_kprobe(dpos) && (pos = to_trace_kprobe(dpos)))
-#define SIZEOF_TRACE_KPROBE(n) \
- (offsetof(struct trace_kprobe, tp.args) + \
- (sizeof(struct probe_arg) * (n)))
-
static nokprobe_inline bool trace_kprobe_is_return(struct trace_kprobe *tk)
{
return tk->rp.handler != NULL;
@@ -101,7 +98,7 @@ static nokprobe_inline unsigned long trace_kprobe_offset(struct trace_kprobe *tk
static nokprobe_inline bool trace_kprobe_has_gone(struct trace_kprobe *tk)
{
- return !!(kprobe_gone(&tk->rp.kp));
+ return kprobe_gone(&tk->rp.kp);
}
static nokprobe_inline bool trace_kprobe_within_module(struct trace_kprobe *tk,
@@ -265,7 +262,7 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group,
struct trace_kprobe *tk;
int ret = -ENOMEM;
- tk = kzalloc(SIZEOF_TRACE_KPROBE(nargs), GFP_KERNEL);
+ tk = kzalloc(struct_size(tk, tp.args, nargs), GFP_KERNEL);
if (!tk)
return ERR_PTR(ret);
@@ -543,6 +540,10 @@ static int unregister_trace_kprobe(struct trace_kprobe *tk)
if (trace_probe_is_enabled(&tk->tp))
return -EBUSY;
+ /* If there's a reference to the dynamic event */
+ if (trace_event_dyn_busy(trace_probe_event_call(&tk->tp)))
+ return -EBUSY;
+
/* Will fail if probe is being used by ftrace or perf */
if (unregister_kprobe_event(tk))
return -EBUSY;
@@ -618,7 +619,7 @@ static int append_trace_kprobe(struct trace_kprobe *tk, struct trace_kprobe *to)
if (ret)
trace_probe_unlink(&tk->tp);
else
- dyn_event_add(&tk->devent);
+ dyn_event_add(&tk->devent, trace_probe_event_call(&tk->tp));
return ret;
}
@@ -647,7 +648,11 @@ static int register_trace_kprobe(struct trace_kprobe *tk)
/* Register new event */
ret = register_kprobe_event(tk);
if (ret) {
- pr_warn("Failed to register probe event(%d)\n", ret);
+ if (ret == -EEXIST) {
+ trace_probe_log_set_index(0);
+ trace_probe_log_err(0, EVENT_EXIST);
+ } else
+ pr_warn("Failed to register probe event(%d)\n", ret);
goto end;
}
@@ -661,7 +666,7 @@ static int register_trace_kprobe(struct trace_kprobe *tk)
if (ret < 0)
unregister_kprobe_event(tk);
else
- dyn_event_add(&tk->devent);
+ dyn_event_add(&tk->devent, trace_probe_event_call(&tk->tp));
end:
mutex_unlock(&event_mutex);
@@ -703,14 +708,6 @@ static struct notifier_block trace_kprobe_module_nb = {
.priority = 1 /* Invoked after kprobe module callback */
};
-/* Convert certain expected symbols into '_' when generating event names */
-static inline void sanitize_event_name(char *name)
-{
- while (*name++ != '\0')
- if (*name == ':' || *name == '.')
- *name = '_';
-}
-
static int __trace_kprobe_create(int argc, const char *argv[])
{
/*
@@ -742,6 +739,7 @@ static int __trace_kprobe_create(int argc, const char *argv[])
bool is_return = false;
char *symbol = NULL, *tmp = NULL;
const char *event = NULL, *group = KPROBE_EVENT_SYSTEM;
+ enum probe_print_type ptype;
int maxactive = 0;
long offset = 0;
void *addr = NULL;
@@ -869,20 +867,14 @@ static int __trace_kprobe_create(int argc, const char *argv[])
/* parse arguments */
for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
- tmp = kstrdup(argv[i], GFP_KERNEL);
- if (!tmp) {
- ret = -ENOMEM;
- goto error;
- }
-
trace_probe_log_set_index(i + 2);
- ret = traceprobe_parse_probe_arg(&tk->tp, i, tmp, flags);
- kfree(tmp);
+ ret = traceprobe_parse_probe_arg(&tk->tp, i, argv[i], flags);
if (ret)
goto error; /* This can be -ENOMEM */
}
- ret = traceprobe_set_print_fmt(&tk->tp, is_return);
+ ptype = is_return ? PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL;
+ ret = traceprobe_set_print_fmt(&tk->tp, ptype);
if (ret < 0)
goto error;
@@ -1330,9 +1322,10 @@ probe_mem_read(void *dest, void *src, size_t size)
/* Note that we don't verify it, since the code does not come from user space */
static int
-process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs, void *dest,
+process_fetch_insn(struct fetch_insn *code, void *rec, void *dest,
void *base)
{
+ struct pt_regs *regs = rec;
unsigned long val;
retry:
@@ -1806,6 +1799,7 @@ struct trace_event_call *
create_local_trace_kprobe(char *func, void *addr, unsigned long offs,
bool is_return)
{
+ enum probe_print_type ptype;
struct trace_kprobe *tk;
int ret;
char *event;
@@ -1829,7 +1823,9 @@ create_local_trace_kprobe(char *func, void *addr, unsigned long offs,
init_trace_event_call(tk);
- if (traceprobe_set_print_fmt(&tk->tp, trace_kprobe_is_return(tk)) < 0) {
+ ptype = trace_kprobe_is_return(tk) ?
+ PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL;
+ if (traceprobe_set_print_fmt(&tk->tp, ptype) < 0) {
ret = -ENOMEM;
goto error;
}
@@ -1930,16 +1926,16 @@ static __init int init_kprobe_trace(void)
if (ret)
return 0;
- entry = tracefs_create_file("kprobe_events", 0644, NULL,
- NULL, &kprobe_events_ops);
+ entry = tracefs_create_file("kprobe_events", TRACE_MODE_WRITE,
+ NULL, NULL, &kprobe_events_ops);
/* Event list interface */
if (!entry)
pr_warn("Could not create tracefs 'kprobe_events' entry\n");
/* Profile interface */
- entry = tracefs_create_file("kprobe_profile", 0444, NULL,
- NULL, &kprobe_profile_ops);
+ entry = tracefs_create_file("kprobe_profile", TRACE_MODE_READ,
+ NULL, NULL, &kprobe_profile_ops);
if (!entry)
pr_warn("Could not create tracefs 'kprobe_profile' entry\n");
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index b61eefe5ccf5..89d6cbac6f10 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -38,8 +38,6 @@
#define CREATE_TRACE_POINTS
#include <trace/events/osnoise.h>
-static struct trace_array *osnoise_trace;
-
/*
* Default values.
*/
@@ -51,6 +49,100 @@ static struct trace_array *osnoise_trace;
#define DEFAULT_TIMERLAT_PRIO 95 /* FIFO 95 */
/*
+ * trace_array of the enabled osnoise/timerlat instances.
+ */
+struct osnoise_instance {
+ struct list_head list;
+ struct trace_array *tr;
+};
+
+static struct list_head osnoise_instances;
+
+static bool osnoise_has_registered_instances(void)
+{
+ return !!list_first_or_null_rcu(&osnoise_instances,
+ struct osnoise_instance,
+ list);
+}
+
+/*
+ * osnoise_instance_registered - check if a tr is already registered
+ */
+static int osnoise_instance_registered(struct trace_array *tr)
+{
+ struct osnoise_instance *inst;
+ int found = 0;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(inst, &osnoise_instances, list) {
+ if (inst->tr == tr)
+ found = 1;
+ }
+ rcu_read_unlock();
+
+ return found;
+}
+
+/*
+ * osnoise_register_instance - register a new trace instance
+ *
+ * Register a trace_array *tr in the list of instances running
+ * osnoise/timerlat tracers.
+ */
+static int osnoise_register_instance(struct trace_array *tr)
+{
+ struct osnoise_instance *inst;
+
+ /*
+ * register/unregister serialization is provided by trace's
+ * trace_types_lock.
+ */
+ lockdep_assert_held(&trace_types_lock);
+
+ inst = kmalloc(sizeof(*inst), GFP_KERNEL);
+ if (!inst)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD_RCU(&inst->list);
+ inst->tr = tr;
+ list_add_tail_rcu(&inst->list, &osnoise_instances);
+
+ return 0;
+}
+
+/*
+ * osnoise_unregister_instance - unregister a registered trace instance
+ *
+ * Remove the trace_array *tr from the list of instances running
+ * osnoise/timerlat tracers.
+ */
+static void osnoise_unregister_instance(struct trace_array *tr)
+{
+ struct osnoise_instance *inst;
+ int found = 0;
+
+ /*
+ * register/unregister serialization is provided by trace's
+ * trace_types_lock.
+ */
+ lockdep_assert_held(&trace_types_lock);
+
+ list_for_each_entry_rcu(inst, &osnoise_instances, list) {
+ if (inst->tr == tr) {
+ list_del_rcu(&inst->list);
+ found = 1;
+ break;
+ }
+ }
+
+ if (!found)
+ return;
+
+ synchronize_rcu();
+ kfree(inst);
+}
+
+/*
* NMI runtime info.
*/
struct osn_nmi {
@@ -248,10 +340,56 @@ static struct osnoise_data {
#endif
};
-/*
- * Boolean variable used to inform that the tracer is currently sampling.
- */
-static bool osnoise_busy;
+#ifdef CONFIG_TIMERLAT_TRACER
+static inline bool timerlat_enabled(void)
+{
+ return osnoise_data.timerlat_tracer;
+}
+
+static inline int timerlat_softirq_exit(struct osnoise_variables *osn_var)
+{
+ struct timerlat_variables *tlat_var = this_cpu_tmr_var();
+ /*
+ * If the timerlat is enabled, but the irq handler did
+ * not run yet enabling timerlat_tracer, do not trace.
+ */
+ if (!tlat_var->tracing_thread) {
+ osn_var->softirq.arrival_time = 0;
+ osn_var->softirq.delta_start = 0;
+ return 0;
+ }
+ return 1;
+}
+
+static inline int timerlat_thread_exit(struct osnoise_variables *osn_var)
+{
+ struct timerlat_variables *tlat_var = this_cpu_tmr_var();
+ /*
+ * If the timerlat is enabled, but the irq handler did
+ * not run yet enabling timerlat_tracer, do not trace.
+ */
+ if (!tlat_var->tracing_thread) {
+ osn_var->thread.delta_start = 0;
+ osn_var->thread.arrival_time = 0;
+ return 0;
+ }
+ return 1;
+}
+#else /* CONFIG_TIMERLAT_TRACER */
+static inline bool timerlat_enabled(void)
+{
+ return false;
+}
+
+static inline int timerlat_softirq_exit(struct osnoise_variables *osn_var)
+{
+ return 1;
+}
+static inline int timerlat_thread_exit(struct osnoise_variables *osn_var)
+{
+ return 1;
+}
+#endif
#ifdef CONFIG_PREEMPT_RT
/*
@@ -294,19 +432,19 @@ static void print_osnoise_headers(struct seq_file *s)
seq_puts(s, "# _-----=> irqs-off\n");
seq_puts(s, "# / _----=> need-resched\n");
seq_puts(s, "# | / _---=> hardirq/softirq\n");
- seq_puts(s, "# || / _--=> preempt-depth ");
- seq_puts(s, " MAX\n");
-
- seq_puts(s, "# || / ");
+ seq_puts(s, "# || / _--=> preempt-depth\n");
+ seq_puts(s, "# ||| / _-=> migrate-disable ");
+ seq_puts(s, " MAX\n");
+ seq_puts(s, "# |||| / delay ");
seq_puts(s, " SINGLE Interference counters:\n");
- seq_puts(s, "# |||| RUNTIME ");
+ seq_puts(s, "# ||||| RUNTIME ");
seq_puts(s, " NOISE %% OF CPU NOISE +-----------------------------+\n");
- seq_puts(s, "# TASK-PID CPU# |||| TIMESTAMP IN US ");
+ seq_puts(s, "# TASK-PID CPU# ||||| TIMESTAMP IN US ");
seq_puts(s, " IN US AVAILABLE IN US HW NMI IRQ SIRQ THREAD\n");
- seq_puts(s, "# | | | |||| | | ");
+ seq_puts(s, "# | | | ||||| | | ");
seq_puts(s, " | | | | | | | |\n");
}
#endif /* CONFIG_PREEMPT_RT */
@@ -315,19 +453,24 @@ static void print_osnoise_headers(struct seq_file *s)
* osnoise_taint - report an osnoise error.
*/
#define osnoise_taint(msg) ({ \
- struct trace_array *tr = osnoise_trace; \
+ struct osnoise_instance *inst; \
+ struct trace_buffer *buffer; \
\
- trace_array_printk_buf(tr->array_buffer.buffer, _THIS_IP_, msg); \
+ rcu_read_lock(); \
+ list_for_each_entry_rcu(inst, &osnoise_instances, list) { \
+ buffer = inst->tr->array_buffer.buffer; \
+ trace_array_printk_buf(buffer, _THIS_IP_, msg); \
+ } \
+ rcu_read_unlock(); \
osnoise_data.tainted = true; \
})
/*
* Record an osnoise_sample into the tracer buffer.
*/
-static void trace_osnoise_sample(struct osnoise_sample *sample)
+static void
+__trace_osnoise_sample(struct osnoise_sample *sample, struct trace_buffer *buffer)
{
- struct trace_array *tr = osnoise_trace;
- struct trace_buffer *buffer = tr->array_buffer.buffer;
struct trace_event_call *call = &event_osnoise;
struct ring_buffer_event *event;
struct osnoise_entry *entry;
@@ -350,6 +493,22 @@ static void trace_osnoise_sample(struct osnoise_sample *sample)
trace_buffer_unlock_commit_nostack(buffer, event);
}
+/*
+ * Record an osnoise_sample on all osnoise instances.
+ */
+static void trace_osnoise_sample(struct osnoise_sample *sample)
+{
+ struct osnoise_instance *inst;
+ struct trace_buffer *buffer;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(inst, &osnoise_instances, list) {
+ buffer = inst->tr->array_buffer.buffer;
+ __trace_osnoise_sample(sample, buffer);
+ }
+ rcu_read_unlock();
+}
+
#ifdef CONFIG_TIMERLAT_TRACER
/*
* Print the timerlat header info.
@@ -378,23 +537,20 @@ static void print_timerlat_headers(struct seq_file *s)
seq_puts(s, "# / _----=> need-resched\n");
seq_puts(s, "# | / _---=> hardirq/softirq\n");
seq_puts(s, "# || / _--=> preempt-depth\n");
- seq_puts(s, "# || /\n");
- seq_puts(s, "# |||| ACTIVATION\n");
- seq_puts(s, "# TASK-PID CPU# |||| TIMESTAMP ID ");
- seq_puts(s, " CONTEXT LATENCY\n");
- seq_puts(s, "# | | | |||| | | ");
+ seq_puts(s, "# ||| / _-=> migrate-disable\n");
+ seq_puts(s, "# |||| / delay\n");
+ seq_puts(s, "# ||||| ACTIVATION\n");
+ seq_puts(s, "# TASK-PID CPU# ||||| TIMESTAMP ID ");
+ seq_puts(s, " CONTEXT LATENCY\n");
+ seq_puts(s, "# | | | ||||| | | ");
seq_puts(s, " | |\n");
}
#endif /* CONFIG_PREEMPT_RT */
-/*
- * Record an timerlat_sample into the tracer buffer.
- */
-static void trace_timerlat_sample(struct timerlat_sample *sample)
+static void
+__trace_timerlat_sample(struct timerlat_sample *sample, struct trace_buffer *buffer)
{
- struct trace_array *tr = osnoise_trace;
struct trace_event_call *call = &event_osnoise;
- struct trace_buffer *buffer = tr->array_buffer.buffer;
struct ring_buffer_event *event;
struct timerlat_entry *entry;
@@ -411,6 +567,22 @@ static void trace_timerlat_sample(struct timerlat_sample *sample)
trace_buffer_unlock_commit_nostack(buffer, event);
}
+/*
+ * Record an timerlat_sample into the tracer buffer.
+ */
+static void trace_timerlat_sample(struct timerlat_sample *sample)
+{
+ struct osnoise_instance *inst;
+ struct trace_buffer *buffer;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(inst, &osnoise_instances, list) {
+ buffer = inst->tr->array_buffer.buffer;
+ __trace_timerlat_sample(sample, buffer);
+ }
+ rcu_read_unlock();
+}
+
#ifdef CONFIG_STACKTRACE
#define MAX_CALLS 256
@@ -450,29 +622,18 @@ static void timerlat_save_stack(int skip)
return;
}
-/*
- * timerlat_dump_stack - dump a stack trace previously saved
- *
- * Dump a saved stack trace into the trace buffer.
- */
-static void timerlat_dump_stack(void)
+
+static void
+__timerlat_dump_stack(struct trace_buffer *buffer, struct trace_stack *fstack, unsigned int size)
{
struct trace_event_call *call = &event_osnoise;
- struct trace_array *tr = osnoise_trace;
- struct trace_buffer *buffer = tr->array_buffer.buffer;
struct ring_buffer_event *event;
- struct trace_stack *fstack;
struct stack_entry *entry;
- unsigned int size;
-
- preempt_disable_notrace();
- fstack = this_cpu_ptr(&trace_stack);
- size = fstack->stack_size;
event = trace_buffer_lock_reserve(buffer, TRACE_STACK, sizeof(*entry) + size,
tracing_gen_ctx());
if (!event)
- goto out;
+ return;
entry = ring_buffer_event_data(event);
@@ -481,12 +642,39 @@ static void timerlat_dump_stack(void)
if (!call_filter_check_discard(call, entry, buffer, event))
trace_buffer_unlock_commit_nostack(buffer, event);
+}
-out:
+/*
+ * timerlat_dump_stack - dump a stack trace previously saved
+ */
+static void timerlat_dump_stack(u64 latency)
+{
+ struct osnoise_instance *inst;
+ struct trace_buffer *buffer;
+ struct trace_stack *fstack;
+ unsigned int size;
+
+ /*
+ * trace only if latency > print_stack config, if enabled.
+ */
+ if (!osnoise_data.print_stack || osnoise_data.print_stack > latency)
+ return;
+
+ preempt_disable_notrace();
+ fstack = this_cpu_ptr(&trace_stack);
+ size = fstack->stack_size;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(inst, &osnoise_instances, list) {
+ buffer = inst->tr->array_buffer.buffer;
+ __timerlat_dump_stack(buffer, fstack, size);
+
+ }
+ rcu_read_unlock();
preempt_enable_notrace();
}
-#else
-#define timerlat_dump_stack() do {} while (0)
+#else /* CONFIG_STACKTRACE */
+#define timerlat_dump_stack(u64 latency) do {} while (0)
#define timerlat_save_stack(a) do {} while (0)
#endif /* CONFIG_STACKTRACE */
#endif /* CONFIG_TIMERLAT_TRACER */
@@ -866,21 +1054,9 @@ static void trace_softirq_exit_callback(void *data, unsigned int vec_nr)
if (!osn_var->sampling)
return;
-#ifdef CONFIG_TIMERLAT_TRACER
- /*
- * If the timerlat is enabled, but the irq handler did
- * not run yet enabling timerlat_tracer, do not trace.
- */
- if (unlikely(osnoise_data.timerlat_tracer)) {
- struct timerlat_variables *tlat_var;
- tlat_var = this_cpu_tmr_var();
- if (!tlat_var->tracing_thread) {
- osn_var->softirq.arrival_time = 0;
- osn_var->softirq.delta_start = 0;
+ if (unlikely(timerlat_enabled()))
+ if (!timerlat_softirq_exit(osn_var))
return;
- }
- }
-#endif
duration = get_int_safe_duration(osn_var, &osn_var->softirq.delta_start);
trace_softirq_noise(vec_nr, osn_var->softirq.arrival_time, duration);
@@ -974,17 +1150,9 @@ thread_exit(struct osnoise_variables *osn_var, struct task_struct *t)
if (!osn_var->sampling)
return;
-#ifdef CONFIG_TIMERLAT_TRACER
- if (osnoise_data.timerlat_tracer) {
- struct timerlat_variables *tlat_var;
- tlat_var = this_cpu_tmr_var();
- if (!tlat_var->tracing_thread) {
- osn_var->thread.delta_start = 0;
- osn_var->thread.arrival_time = 0;
+ if (unlikely(timerlat_enabled()))
+ if (!timerlat_thread_exit(osn_var))
return;
- }
- }
-#endif
duration = get_int_safe_duration(osn_var, &osn_var->thread.delta_start);
@@ -1077,12 +1245,37 @@ diff_osn_sample_stats(struct osnoise_variables *osn_var, struct osnoise_sample *
*/
static __always_inline void osnoise_stop_tracing(void)
{
- struct trace_array *tr = osnoise_trace;
+ struct osnoise_instance *inst;
+ struct trace_array *tr;
- trace_array_printk_buf(tr->array_buffer.buffer, _THIS_IP_,
- "stop tracing hit on cpu %d\n", smp_processor_id());
+ rcu_read_lock();
+ list_for_each_entry_rcu(inst, &osnoise_instances, list) {
+ tr = inst->tr;
+ trace_array_printk_buf(tr->array_buffer.buffer, _THIS_IP_,
+ "stop tracing hit on cpu %d\n", smp_processor_id());
- tracer_tracing_off(tr);
+ tracer_tracing_off(tr);
+ }
+ rcu_read_unlock();
+}
+
+/*
+ * notify_new_max_latency - Notify a new max latency via fsnotify interface.
+ */
+static void notify_new_max_latency(u64 latency)
+{
+ struct osnoise_instance *inst;
+ struct trace_array *tr;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(inst, &osnoise_instances, list) {
+ tr = inst->tr;
+ if (tr->max_latency < latency) {
+ tr->max_latency = latency;
+ latency_fsnotify(tr);
+ }
+ }
+ rcu_read_unlock();
}
/*
@@ -1096,7 +1289,6 @@ static __always_inline void osnoise_stop_tracing(void)
static int run_osnoise(void)
{
struct osnoise_variables *osn_var = this_cpu_osn_var();
- struct trace_array *tr = osnoise_trace;
u64 start, sample, last_sample;
u64 last_int_count, int_count;
s64 noise = 0, max_noise = 0;
@@ -1231,11 +1423,7 @@ static int run_osnoise(void)
trace_osnoise_sample(&s);
- /* Keep a running maximum ever recorded osnoise "latency" */
- if (max_noise > tr->max_latency) {
- tr->max_latency = max_noise;
- latency_fsnotify(tr);
- }
+ notify_new_max_latency(max_noise);
if (osnoise_data.stop_tracing_total)
if (s.noise > osnoise_data.stop_tracing_total)
@@ -1293,7 +1481,6 @@ static int osnoise_main(void *data)
static enum hrtimer_restart timerlat_irq(struct hrtimer *timer)
{
struct osnoise_variables *osn_var = this_cpu_osn_var();
- struct trace_array *tr = osnoise_trace;
struct timerlat_variables *tlat;
struct timerlat_sample s;
u64 now;
@@ -1332,9 +1519,11 @@ static enum hrtimer_restart timerlat_irq(struct hrtimer *timer)
* running, the thread needs to receive the softirq delta_start. The
* reason being is that the softirq will be the last to be unfolded,
* resseting the thread delay to zero.
+ *
+ * The PREEMPT_RT is a special case, though. As softirqs run as threads
+ * on RT, moving the thread is enough.
*/
-#ifndef CONFIG_PREEMPT_RT
- if (osn_var->softirq.delta_start) {
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT) && osn_var->softirq.delta_start) {
copy_int_safe_time(osn_var, &osn_var->thread.delta_start,
&osn_var->softirq.delta_start);
@@ -1344,13 +1533,6 @@ static enum hrtimer_restart timerlat_irq(struct hrtimer *timer)
copy_int_safe_time(osn_var, &osn_var->thread.delta_start,
&osn_var->irq.delta_start);
}
-#else /* CONFIG_PREEMPT_RT */
- /*
- * The sofirqs run as threads on RT, so there is not need
- * to keep track of it.
- */
- copy_int_safe_time(osn_var, &osn_var->thread.delta_start, &osn_var->irq.delta_start);
-#endif /* CONFIG_PREEMPT_RT */
/*
* Compute the current time with the expected time.
@@ -1364,11 +1546,7 @@ static enum hrtimer_restart timerlat_irq(struct hrtimer *timer)
trace_timerlat_sample(&s);
- /* Keep a running maximum ever recorded os noise "latency" */
- if (diff > tr->max_latency) {
- tr->max_latency = diff;
- latency_fsnotify(tr);
- }
+ notify_new_max_latency(diff);
if (osnoise_data.stop_tracing)
if (time_to_us(diff) >= osnoise_data.stop_tracing)
@@ -1456,11 +1634,7 @@ static int timerlat_main(void *data)
trace_timerlat_sample(&s);
-#ifdef CONFIG_STACKTRACE
- if (osnoise_data.print_stack)
- if (osnoise_data.print_stack <= time_to_us(diff))
- timerlat_dump_stack();
-#endif /* CONFIG_STACKTRACE */
+ timerlat_dump_stack(time_to_us(diff));
tlat->tracing_thread = false;
if (osnoise_data.stop_tracing_total)
@@ -1473,6 +1647,11 @@ static int timerlat_main(void *data)
hrtimer_cancel(&tlat->timer);
return 0;
}
+#else /* CONFIG_TIMERLAT_TRACER */
+static int timerlat_main(void *data)
+{
+ return 0;
+}
#endif /* CONFIG_TIMERLAT_TRACER */
/*
@@ -1498,12 +1677,12 @@ static void stop_per_cpu_kthreads(void)
{
int cpu;
- get_online_cpus();
+ cpus_read_lock();
for_each_online_cpu(cpu)
stop_kthread(cpu);
- put_online_cpus();
+ cpus_read_unlock();
}
/*
@@ -1515,17 +1694,14 @@ static int start_kthread(unsigned int cpu)
void *main = osnoise_main;
char comm[24];
-#ifdef CONFIG_TIMERLAT_TRACER
- if (osnoise_data.timerlat_tracer) {
+ if (timerlat_enabled()) {
snprintf(comm, 24, "timerlat/%d", cpu);
main = timerlat_main;
} else {
snprintf(comm, 24, "osnoise/%d", cpu);
}
-#else
- snprintf(comm, 24, "osnoise/%d", cpu);
-#endif
- kthread = kthread_create_on_cpu(main, NULL, cpu, comm);
+
+ kthread = kthread_run_on_cpu(main, NULL, cpu, comm);
if (IS_ERR(kthread)) {
pr_err(BANNER "could not start sampling thread\n");
@@ -1534,7 +1710,6 @@ static int start_kthread(unsigned int cpu)
}
per_cpu(per_cpu_osnoise_var, cpu).kthread = kthread;
- wake_up_process(kthread);
return 0;
}
@@ -1545,21 +1720,17 @@ static int start_kthread(unsigned int cpu)
* This starts the kernel thread that will look for osnoise on many
* cpus.
*/
-static int start_per_cpu_kthreads(struct trace_array *tr)
+static int start_per_cpu_kthreads(void)
{
struct cpumask *current_mask = &save_cpumask;
- int retval;
+ int retval = 0;
int cpu;
- get_online_cpus();
+ cpus_read_lock();
/*
- * Run only on CPUs in which trace and osnoise are allowed to run.
+ * Run only on online CPUs in which osnoise is allowed to run.
*/
- cpumask_and(current_mask, tr->tracing_cpumask, &osnoise_cpumask);
- /*
- * And the CPU is online.
- */
- cpumask_and(current_mask, cpu_online_mask, current_mask);
+ cpumask_and(current_mask, cpu_online_mask, &osnoise_cpumask);
for_each_possible_cpu(cpu)
per_cpu(per_cpu_osnoise_var, cpu).kthread = NULL;
@@ -1568,40 +1739,35 @@ static int start_per_cpu_kthreads(struct trace_array *tr)
retval = start_kthread(cpu);
if (retval) {
stop_per_cpu_kthreads();
- return retval;
+ break;
}
}
- put_online_cpus();
+ cpus_read_unlock();
- return 0;
+ return retval;
}
#ifdef CONFIG_HOTPLUG_CPU
static void osnoise_hotplug_workfn(struct work_struct *dummy)
{
- struct trace_array *tr = osnoise_trace;
unsigned int cpu = smp_processor_id();
-
mutex_lock(&trace_types_lock);
- if (!osnoise_busy)
+ if (!osnoise_has_registered_instances())
goto out_unlock_trace;
mutex_lock(&interface_lock);
- get_online_cpus();
+ cpus_read_lock();
if (!cpumask_test_cpu(cpu, &osnoise_cpumask))
goto out_unlock;
- if (!cpumask_test_cpu(cpu, tr->tracing_cpumask))
- goto out_unlock;
-
start_kthread(cpu);
out_unlock:
- put_online_cpus();
+ cpus_read_unlock();
mutex_unlock(&interface_lock);
out_unlock_trace:
mutex_unlock(&trace_types_lock);
@@ -1686,9 +1852,6 @@ out_unlock:
return count;
}
-static void osnoise_tracer_start(struct trace_array *tr);
-static void osnoise_tracer_stop(struct trace_array *tr);
-
/*
* osnoise_cpus_write - Write function for "cpus" entry
* @filp: The active open file structure
@@ -1700,19 +1863,15 @@ static void osnoise_tracer_stop(struct trace_array *tr);
* interface to the osnoise trace. By default, it lists all CPUs,
* in this way, allowing osnoise threads to run on any online CPU
* of the system. It serves to restrict the execution of osnoise to the
- * set of CPUs writing via this interface. Note that osnoise also
- * respects the "tracing_cpumask." Hence, osnoise threads will run only
- * on the set of CPUs allowed here AND on "tracing_cpumask." Why not
- * have just "tracing_cpumask?" Because the user might be interested
- * in tracing what is running on other CPUs. For instance, one might
- * run osnoise in one HT CPU while observing what is running on the
- * sibling HT CPU.
+ * set of CPUs writing via this interface. Why not use "tracing_cpumask"?
+ * Because the user might be interested in tracing what is running on
+ * other CPUs. For instance, one might run osnoise in one HT CPU
+ * while observing what is running on the sibling HT CPU.
*/
static ssize_t
osnoise_cpus_write(struct file *filp, const char __user *ubuf, size_t count,
loff_t *ppos)
{
- struct trace_array *tr = osnoise_trace;
cpumask_var_t osnoise_cpumask_new;
int running, err;
char buf[256];
@@ -1731,27 +1890,26 @@ osnoise_cpus_write(struct file *filp, const char __user *ubuf, size_t count,
goto err_free;
/*
- * trace_types_lock is taken to avoid concurrency on start/stop
- * and osnoise_busy.
+ * trace_types_lock is taken to avoid concurrency on start/stop.
*/
mutex_lock(&trace_types_lock);
- running = osnoise_busy;
+ running = osnoise_has_registered_instances();
if (running)
- osnoise_tracer_stop(tr);
+ stop_per_cpu_kthreads();
mutex_lock(&interface_lock);
/*
* osnoise_cpumask is read by CPU hotplug operations.
*/
- get_online_cpus();
+ cpus_read_lock();
cpumask_copy(&osnoise_cpumask, osnoise_cpumask_new);
- put_online_cpus();
+ cpus_read_unlock();
mutex_unlock(&interface_lock);
if (running)
- osnoise_tracer_start(tr);
+ start_per_cpu_kthreads();
mutex_unlock(&trace_types_lock);
free_cpumask_var(osnoise_cpumask_new);
@@ -1835,6 +1993,47 @@ static const struct file_operations cpus_fops = {
.llseek = generic_file_llseek,
};
+#ifdef CONFIG_TIMERLAT_TRACER
+#ifdef CONFIG_STACKTRACE
+static int init_timerlat_stack_tracefs(struct dentry *top_dir)
+{
+ struct dentry *tmp;
+
+ tmp = tracefs_create_file("print_stack", TRACE_MODE_WRITE, top_dir,
+ &osnoise_print_stack, &trace_min_max_fops);
+ if (!tmp)
+ return -ENOMEM;
+
+ return 0;
+}
+#else /* CONFIG_STACKTRACE */
+static int init_timerlat_stack_tracefs(struct dentry *top_dir)
+{
+ return 0;
+}
+#endif /* CONFIG_STACKTRACE */
+
+/*
+ * init_timerlat_tracefs - A function to initialize the timerlat interface files
+ */
+static int init_timerlat_tracefs(struct dentry *top_dir)
+{
+ struct dentry *tmp;
+
+ tmp = tracefs_create_file("timerlat_period_us", TRACE_MODE_WRITE, top_dir,
+ &timerlat_period, &trace_min_max_fops);
+ if (!tmp)
+ return -ENOMEM;
+
+ return init_timerlat_stack_tracefs(top_dir);
+}
+#else /* CONFIG_TIMERLAT_TRACER */
+static int init_timerlat_tracefs(struct dentry *top_dir)
+{
+ return 0;
+}
+#endif /* CONFIG_TIMERLAT_TRACER */
+
/*
* init_tracefs - A function to initialize the tracefs interface files
*
@@ -1856,42 +2055,33 @@ static int init_tracefs(void)
if (!top_dir)
return 0;
- tmp = tracefs_create_file("period_us", 0640, top_dir,
+ tmp = tracefs_create_file("period_us", TRACE_MODE_WRITE, top_dir,
&osnoise_period, &trace_min_max_fops);
if (!tmp)
goto err;
- tmp = tracefs_create_file("runtime_us", 0644, top_dir,
+ tmp = tracefs_create_file("runtime_us", TRACE_MODE_WRITE, top_dir,
&osnoise_runtime, &trace_min_max_fops);
if (!tmp)
goto err;
- tmp = tracefs_create_file("stop_tracing_us", 0640, top_dir,
+ tmp = tracefs_create_file("stop_tracing_us", TRACE_MODE_WRITE, top_dir,
&osnoise_stop_tracing_in, &trace_min_max_fops);
if (!tmp)
goto err;
- tmp = tracefs_create_file("stop_tracing_total_us", 0640, top_dir,
+ tmp = tracefs_create_file("stop_tracing_total_us", TRACE_MODE_WRITE, top_dir,
&osnoise_stop_tracing_total, &trace_min_max_fops);
if (!tmp)
goto err;
- tmp = trace_create_file("cpus", 0644, top_dir, NULL, &cpus_fops);
+ tmp = trace_create_file("cpus", TRACE_MODE_WRITE, top_dir, NULL, &cpus_fops);
if (!tmp)
goto err;
-#ifdef CONFIG_TIMERLAT_TRACER
-#ifdef CONFIG_STACKTRACE
- tmp = tracefs_create_file("print_stack", 0640, top_dir,
- &osnoise_print_stack, &trace_min_max_fops);
- if (!tmp)
- goto err;
-#endif
- tmp = tracefs_create_file("timerlat_period_us", 0640, top_dir,
- &timerlat_period, &trace_min_max_fops);
- if (!tmp)
+ ret = init_timerlat_tracefs(top_dir);
+ if (ret)
goto err;
-#endif
return 0;
@@ -1932,74 +2122,110 @@ out_unhook_irq:
return -EINVAL;
}
-static int __osnoise_tracer_start(struct trace_array *tr)
+/*
+ * osnoise_workload_start - start the workload and hook to events
+ */
+static int osnoise_workload_start(void)
{
int retval;
+ /*
+ * Instances need to be registered after calling workload
+ * start. Hence, if there is already an instance, the
+ * workload was already registered. Otherwise, this
+ * code is on the way to register the first instance,
+ * and the workload will start.
+ */
+ if (osnoise_has_registered_instances())
+ return 0;
+
osn_var_reset_all();
retval = osnoise_hook_events();
if (retval)
return retval;
+
/*
- * Make sure NMIs see reseted values.
+ * Make sure that ftrace_nmi_enter/exit() see reset values
+ * before enabling trace_osnoise_callback_enabled.
*/
barrier();
trace_osnoise_callback_enabled = true;
- retval = start_per_cpu_kthreads(tr);
+ retval = start_per_cpu_kthreads();
if (retval) {
unhook_irq_events();
return retval;
}
- osnoise_busy = true;
-
return 0;
}
+/*
+ * osnoise_workload_stop - stop the workload and unhook the events
+ */
+static void osnoise_workload_stop(void)
+{
+ /*
+ * Instances need to be unregistered before calling
+ * stop. Hence, if there is a registered instance, more
+ * than one instance is running, and the workload will not
+ * yet stop. Otherwise, this code is on the way to disable
+ * the last instance, and the workload can stop.
+ */
+ if (osnoise_has_registered_instances())
+ return;
+
+ trace_osnoise_callback_enabled = false;
+ /*
+ * Make sure that ftrace_nmi_enter/exit() see
+ * trace_osnoise_callback_enabled as false before continuing.
+ */
+ barrier();
+
+ stop_per_cpu_kthreads();
+
+ unhook_irq_events();
+ unhook_softirq_events();
+ unhook_thread_events();
+}
+
static void osnoise_tracer_start(struct trace_array *tr)
{
int retval;
- if (osnoise_busy)
+ /*
+ * If the instance is already registered, there is no need to
+ * register it again.
+ */
+ if (osnoise_instance_registered(tr))
return;
- retval = __osnoise_tracer_start(tr);
+ retval = osnoise_workload_start();
if (retval)
pr_err(BANNER "Error starting osnoise tracer\n");
+ osnoise_register_instance(tr);
}
static void osnoise_tracer_stop(struct trace_array *tr)
{
- if (!osnoise_busy)
- return;
-
- trace_osnoise_callback_enabled = false;
- barrier();
-
- stop_per_cpu_kthreads();
-
- unhook_irq_events();
- unhook_softirq_events();
- unhook_thread_events();
-
- osnoise_busy = false;
+ osnoise_unregister_instance(tr);
+ osnoise_workload_stop();
}
static int osnoise_tracer_init(struct trace_array *tr)
{
-
- /* Only allow one instance to enable this */
- if (osnoise_busy)
+ /*
+ * Only allow osnoise tracer if timerlat tracer is not running
+ * already.
+ */
+ if (timerlat_enabled())
return -EBUSY;
- osnoise_trace = tr;
tr->max_latency = 0;
osnoise_tracer_start(tr);
-
return 0;
}
@@ -2023,45 +2249,55 @@ static void timerlat_tracer_start(struct trace_array *tr)
{
int retval;
- if (osnoise_busy)
+ /*
+ * If the instance is already registered, there is no need to
+ * register it again.
+ */
+ if (osnoise_instance_registered(tr))
return;
- osnoise_data.timerlat_tracer = 1;
-
- retval = __osnoise_tracer_start(tr);
+ retval = osnoise_workload_start();
if (retval)
- goto out_err;
+ pr_err(BANNER "Error starting timerlat tracer\n");
+
+ osnoise_register_instance(tr);
return;
-out_err:
- pr_err(BANNER "Error starting timerlat tracer\n");
}
static void timerlat_tracer_stop(struct trace_array *tr)
{
int cpu;
- if (!osnoise_busy)
- return;
-
- for_each_online_cpu(cpu)
- per_cpu(per_cpu_osnoise_var, cpu).sampling = 0;
+ osnoise_unregister_instance(tr);
- osnoise_tracer_stop(tr);
+ /*
+ * Instruct the threads to stop only if this is the last instance.
+ */
+ if (!osnoise_has_registered_instances()) {
+ for_each_online_cpu(cpu)
+ per_cpu(per_cpu_osnoise_var, cpu).sampling = 0;
+ }
- osnoise_data.timerlat_tracer = 0;
+ osnoise_workload_stop();
}
static int timerlat_tracer_init(struct trace_array *tr)
{
- /* Only allow one instance to enable this */
- if (osnoise_busy)
+ /*
+ * Only allow timerlat tracer if osnoise tracer is not running already.
+ */
+ if (osnoise_has_registered_instances() && !osnoise_data.timerlat_tracer)
return -EBUSY;
- osnoise_trace = tr;
+ /*
+ * If this is the first instance, set timerlat_tracer to block
+ * osnoise tracer start.
+ */
+ if (!osnoise_has_registered_instances())
+ osnoise_data.timerlat_tracer = 1;
tr->max_latency = 0;
-
timerlat_tracer_start(tr);
return 0;
@@ -2070,6 +2306,13 @@ static int timerlat_tracer_init(struct trace_array *tr)
static void timerlat_tracer_reset(struct trace_array *tr)
{
timerlat_tracer_stop(tr);
+
+ /*
+ * If this is the last instance, reset timerlat_tracer allowing
+ * osnoise to be started.
+ */
+ if (!osnoise_has_registered_instances())
+ osnoise_data.timerlat_tracer = 0;
}
static struct tracer timerlat_tracer __read_mostly = {
@@ -2081,6 +2324,16 @@ static struct tracer timerlat_tracer __read_mostly = {
.print_header = print_timerlat_headers,
.allow_instances = true,
};
+
+__init static int init_timerlat_tracer(void)
+{
+ return register_tracer(&timerlat_tracer);
+}
+#else /* CONFIG_TIMERLAT_TRACER */
+__init static int init_timerlat_tracer(void)
+{
+ return 0;
+}
#endif /* CONFIG_TIMERLAT_TRACER */
__init static int init_osnoise_tracer(void)
@@ -2097,15 +2350,16 @@ __init static int init_osnoise_tracer(void)
return ret;
}
-#ifdef CONFIG_TIMERLAT_TRACER
- ret = register_tracer(&timerlat_tracer);
+ ret = init_timerlat_tracer();
if (ret) {
- pr_err(BANNER "Error registering timerlat\n");
+ pr_err(BANNER "Error registering timerlat!\n");
return ret;
}
-#endif
+
osnoise_init_hotplug_support();
+ INIT_LIST_HEAD_RCU(&osnoise_instances);
+
init_tracefs();
return 0;
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index a0bf446bb034..3547e7176ff7 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -8,6 +8,7 @@
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/ftrace.h>
+#include <linux/kprobes.h>
#include <linux/sched/clock.h>
#include <linux/sched/mm.h>
@@ -346,22 +347,12 @@ int trace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...)
}
EXPORT_SYMBOL_GPL(trace_output_call);
-#ifdef CONFIG_KRETPROBES
-static inline const char *kretprobed(const char *name)
+static inline const char *kretprobed(const char *name, unsigned long addr)
{
- static const char tramp_name[] = "kretprobe_trampoline";
- int size = sizeof(tramp_name);
-
- if (strncmp(tramp_name, name, size) == 0)
+ if (is_kretprobe_trampoline(addr))
return "[unknown/kretprobe'd]";
return name;
}
-#else
-static inline const char *kretprobed(const char *name)
-{
- return name;
-}
-#endif /* CONFIG_KRETPROBES */
void
trace_seq_print_sym(struct trace_seq *s, unsigned long address, bool offset)
@@ -374,7 +365,7 @@ trace_seq_print_sym(struct trace_seq *s, unsigned long address, bool offset)
sprint_symbol(str, address);
else
kallsyms_lookup(address, NULL, NULL, NULL, str);
- name = kretprobed(str);
+ name = kretprobed(str, address);
if (name && strlen(name)) {
trace_seq_puts(s, name);
@@ -492,8 +483,13 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
trace_seq_printf(s, "%c%c%c",
irqs_off, need_resched, hardsoft_irq);
- if (entry->preempt_count)
- trace_seq_printf(s, "%x", entry->preempt_count);
+ if (entry->preempt_count & 0xf)
+ trace_seq_printf(s, "%x", entry->preempt_count & 0xf);
+ else
+ trace_seq_putc(s, '.');
+
+ if (entry->preempt_count & 0xf0)
+ trace_seq_printf(s, "%x", entry->preempt_count >> 4);
else
trace_seq_putc(s, '.');
@@ -656,7 +652,7 @@ int trace_print_lat_context(struct trace_iterator *iter)
trace_seq_printf(
s, "%16s %7d %3d %d %08x %08lx ",
comm, entry->pid, iter->cpu, entry->flags,
- entry->preempt_count, iter->idx);
+ entry->preempt_count & 0xf, iter->idx);
} else {
lat_print_generic(s, entry, iter->cpu);
}
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index 4b320fe7df70..29f6e95439b6 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -384,7 +384,7 @@ static __init int init_trace_printk_function_export(void)
if (ret)
return 0;
- trace_create_file("printk_formats", 0444, NULL,
+ trace_create_file("printk_formats", TRACE_MODE_READ, NULL,
NULL, &ftrace_formats_fops);
return 0;
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 15413ad7cef2..3ed2a3f37297 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -233,6 +233,9 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup,
int len;
slash = strchr(event, '/');
+ if (!slash)
+ slash = strchr(event, '.');
+
if (slash) {
if (slash == event) {
trace_probe_log_err(offset, NO_GROUP_NAME);
@@ -316,6 +319,13 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t,
code->op = FETCH_OP_ARG;
code->param = (unsigned int)param - 1;
#endif
+ } else if (flags & TPARG_FL_TPOINT) {
+ if (code->data)
+ return -EFAULT;
+ code->data = kstrdup(arg, GFP_KERNEL);
+ if (!code->data)
+ return -ENOMEM;
+ code->op = FETCH_OP_TP_ARG;
} else
goto inval_var;
@@ -540,26 +550,34 @@ static int __parse_bitfield_probe_arg(const char *bf,
}
/* String length checking wrapper */
-static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size,
+static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size,
struct probe_arg *parg, unsigned int flags, int offset)
{
struct fetch_insn *code, *scode, *tmp = NULL;
char *t, *t2, *t3;
+ char *arg;
int ret, len;
+ arg = kstrdup(argv, GFP_KERNEL);
+ if (!arg)
+ return -ENOMEM;
+
+ ret = -EINVAL;
len = strlen(arg);
if (len > MAX_ARGSTR_LEN) {
trace_probe_log_err(offset, ARG_TOO_LONG);
- return -EINVAL;
+ goto out;
} else if (len == 0) {
trace_probe_log_err(offset, NO_ARG_BODY);
- return -EINVAL;
+ goto out;
}
+ ret = -ENOMEM;
parg->comm = kstrdup(arg, GFP_KERNEL);
if (!parg->comm)
- return -ENOMEM;
+ goto out;
+ ret = -EINVAL;
t = strchr(arg, ':');
if (t) {
*t = '\0';
@@ -571,22 +589,22 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size,
offset += t2 + strlen(t2) - arg;
trace_probe_log_err(offset,
ARRAY_NO_CLOSE);
- return -EINVAL;
+ goto out;
} else if (t3[1] != '\0') {
trace_probe_log_err(offset + t3 + 1 - arg,
BAD_ARRAY_SUFFIX);
- return -EINVAL;
+ goto out;
}
*t3 = '\0';
if (kstrtouint(t2, 0, &parg->count) || !parg->count) {
trace_probe_log_err(offset + t2 - arg,
BAD_ARRAY_NUM);
- return -EINVAL;
+ goto out;
}
if (parg->count > MAX_ARRAY_LEN) {
trace_probe_log_err(offset + t2 - arg,
ARRAY_TOO_BIG);
- return -EINVAL;
+ goto out;
}
}
}
@@ -598,29 +616,30 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size,
if (strcmp(arg, "$comm") == 0 || strncmp(arg, "\\\"", 2) == 0) {
/* The type of $comm must be "string", and not an array. */
if (parg->count || (t && strcmp(t, "string")))
- return -EINVAL;
+ goto out;
parg->type = find_fetch_type("string");
} else
parg->type = find_fetch_type(t);
if (!parg->type) {
trace_probe_log_err(offset + (t ? (t - arg) : 0), BAD_TYPE);
- return -EINVAL;
+ goto out;
}
parg->offset = *size;
*size += parg->type->size * (parg->count ?: 1);
+ ret = -ENOMEM;
if (parg->count) {
len = strlen(parg->type->fmttype) + 6;
parg->fmt = kmalloc(len, GFP_KERNEL);
if (!parg->fmt)
- return -ENOMEM;
+ goto out;
snprintf(parg->fmt, len, "%s[%d]", parg->type->fmttype,
parg->count);
}
code = tmp = kcalloc(FETCH_INSN_MAX, sizeof(*code), GFP_KERNEL);
if (!code)
- return -ENOMEM;
+ goto out;
code[FETCH_INSN_MAX - 1].op = FETCH_OP_END;
ret = parse_probe_arg(arg, parg->type, &code, &code[FETCH_INSN_MAX - 1],
@@ -628,19 +647,20 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size,
if (ret)
goto fail;
+ ret = -EINVAL;
/* Store operation */
if (!strcmp(parg->type->name, "string") ||
!strcmp(parg->type->name, "ustring")) {
if (code->op != FETCH_OP_DEREF && code->op != FETCH_OP_UDEREF &&
code->op != FETCH_OP_IMM && code->op != FETCH_OP_COMM &&
- code->op != FETCH_OP_DATA) {
+ code->op != FETCH_OP_DATA && code->op != FETCH_OP_TP_ARG) {
trace_probe_log_err(offset + (t ? (t - arg) : 0),
BAD_STRING);
- ret = -EINVAL;
goto fail;
}
if ((code->op == FETCH_OP_IMM || code->op == FETCH_OP_COMM ||
- code->op == FETCH_OP_DATA) || parg->count) {
+ code->op == FETCH_OP_DATA) || code->op == FETCH_OP_TP_ARG ||
+ parg->count) {
/*
* IMM, DATA and COMM is pointing actual address, those
* must be kept, and if parg->count != 0, this is an
@@ -650,7 +670,6 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size,
code++;
if (code->op != FETCH_OP_NOP) {
trace_probe_log_err(offset, TOO_MANY_OPS);
- ret = -EINVAL;
goto fail;
}
}
@@ -672,7 +691,6 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size,
code++;
if (code->op != FETCH_OP_NOP) {
trace_probe_log_err(offset, TOO_MANY_OPS);
- ret = -EINVAL;
goto fail;
}
code->op = FETCH_OP_ST_RAW;
@@ -687,6 +705,7 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size,
goto fail;
}
}
+ ret = -EINVAL;
/* Loop(Array) operation */
if (parg->count) {
if (scode->op != FETCH_OP_ST_MEM &&
@@ -694,13 +713,11 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size,
scode->op != FETCH_OP_ST_USTRING) {
trace_probe_log_err(offset + (t ? (t - arg) : 0),
BAD_STRING);
- ret = -EINVAL;
goto fail;
}
code++;
if (code->op != FETCH_OP_NOP) {
trace_probe_log_err(offset, TOO_MANY_OPS);
- ret = -EINVAL;
goto fail;
}
code->op = FETCH_OP_LP_ARRAY;
@@ -709,6 +726,7 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size,
code++;
code->op = FETCH_OP_END;
+ ret = 0;
/* Shrink down the code buffer */
parg->code = kcalloc(code - tmp + 1, sizeof(*code), GFP_KERNEL);
if (!parg->code)
@@ -724,6 +742,8 @@ fail:
kfree(code->data);
}
kfree(tmp);
+out:
+ kfree(arg);
return ret;
}
@@ -745,11 +765,11 @@ static int traceprobe_conflict_field_name(const char *name,
return 0;
}
-int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, char *arg,
+int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, const char *arg,
unsigned int flags)
{
struct probe_arg *parg = &tp->args[i];
- char *body;
+ const char *body;
/* Increment count for freeing args in error case */
tp->nr_args++;
@@ -839,19 +859,29 @@ int traceprobe_update_arg(struct probe_arg *arg)
/* When len=0, we just calculate the needed length */
#define LEN_OR_ZERO (len ? len - pos : 0)
static int __set_print_fmt(struct trace_probe *tp, char *buf, int len,
- bool is_return)
+ enum probe_print_type ptype)
{
struct probe_arg *parg;
int i, j;
int pos = 0;
const char *fmt, *arg;
- if (!is_return) {
+ switch (ptype) {
+ case PROBE_PRINT_NORMAL:
fmt = "(%lx)";
arg = "REC->" FIELD_STRING_IP;
- } else {
+ break;
+ case PROBE_PRINT_RETURN:
fmt = "(%lx <- %lx)";
arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP;
+ break;
+ case PROBE_PRINT_EVENT:
+ fmt = "(%u)";
+ arg = "REC->" FIELD_STRING_TYPE;
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return 0;
}
pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt);
@@ -900,20 +930,20 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len,
}
#undef LEN_OR_ZERO
-int traceprobe_set_print_fmt(struct trace_probe *tp, bool is_return)
+int traceprobe_set_print_fmt(struct trace_probe *tp, enum probe_print_type ptype)
{
struct trace_event_call *call = trace_probe_event_call(tp);
int len;
char *print_fmt;
/* First: called with 0 length to calculate the needed length */
- len = __set_print_fmt(tp, NULL, 0, is_return);
+ len = __set_print_fmt(tp, NULL, 0, ptype);
print_fmt = kmalloc(len + 1, GFP_KERNEL);
if (!print_fmt)
return -ENOMEM;
/* Second: actually write the @print_fmt */
- __set_print_fmt(tp, print_fmt, len + 1, is_return);
+ __set_print_fmt(tp, print_fmt, len + 1, ptype);
call->print_fmt = print_fmt;
return 0;
@@ -1029,11 +1059,36 @@ error:
return ret;
}
+static struct trace_event_call *
+find_trace_event_call(const char *system, const char *event_name)
+{
+ struct trace_event_call *tp_event;
+ const char *name;
+
+ list_for_each_entry(tp_event, &ftrace_events, list) {
+ if (!tp_event->class->system ||
+ strcmp(system, tp_event->class->system))
+ continue;
+ name = trace_event_name(tp_event);
+ if (!name || strcmp(event_name, name))
+ continue;
+ return tp_event;
+ }
+
+ return NULL;
+}
+
int trace_probe_register_event_call(struct trace_probe *tp)
{
struct trace_event_call *call = trace_probe_event_call(tp);
int ret;
+ lockdep_assert_held(&event_mutex);
+
+ if (find_trace_event_call(trace_probe_group_name(tp),
+ trace_probe_name(tp)))
+ return -EEXIST;
+
ret = register_trace_event(&call->event);
if (!ret)
return -ENODEV;
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 227d518e5ba5..99e7a5df025e 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -38,6 +38,7 @@
#define FIELD_STRING_IP "__probe_ip"
#define FIELD_STRING_RETIP "__probe_ret_ip"
#define FIELD_STRING_FUNC "__probe_func"
+#define FIELD_STRING_TYPE "__probe_type"
#undef DEFINE_FIELD
#define DEFINE_FIELD(type, item, name, is_signed) \
@@ -102,6 +103,7 @@ enum fetch_op {
FETCH_OP_MOD_BF, /* Bitfield: .basesize, .lshift, .rshift */
// Stage 5 (loop) op
FETCH_OP_LP_ARRAY, /* Array: .param = loop count */
+ FETCH_OP_TP_ARG, /* Trace Point argument */
FETCH_OP_END,
FETCH_NOP_SYMBOL, /* Unresolved Symbol holder */
};
@@ -351,10 +353,11 @@ int trace_probe_create(const char *raw_command, int (*createfn)(int, const char
#define TPARG_FL_RETURN BIT(0)
#define TPARG_FL_KERNEL BIT(1)
#define TPARG_FL_FENTRY BIT(2)
-#define TPARG_FL_MASK GENMASK(2, 0)
+#define TPARG_FL_TPOINT BIT(3)
+#define TPARG_FL_MASK GENMASK(3, 0)
extern int traceprobe_parse_probe_arg(struct trace_probe *tp, int i,
- char *arg, unsigned int flags);
+ const char *argv, unsigned int flags);
extern int traceprobe_update_arg(struct probe_arg *arg);
extern void traceprobe_free_probe_arg(struct probe_arg *arg);
@@ -363,7 +366,13 @@ extern int traceprobe_split_symbol_offset(char *symbol, long *offset);
int traceprobe_parse_event_name(const char **pevent, const char **pgroup,
char *buf, int offset);
-extern int traceprobe_set_print_fmt(struct trace_probe *tp, bool is_return);
+enum probe_print_type {
+ PROBE_PRINT_NORMAL,
+ PROBE_PRINT_RETURN,
+ PROBE_PRINT_EVENT,
+};
+
+extern int traceprobe_set_print_fmt(struct trace_probe *tp, enum probe_print_type ptype);
#ifdef CONFIG_PERF_EVENTS
extern struct trace_event_call *
@@ -399,6 +408,7 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call,
C(NO_EVENT_NAME, "Event name is not specified"), \
C(EVENT_TOO_LONG, "Event name is too long"), \
C(BAD_EVENT_NAME, "Event name must follow the same rules as C identifiers"), \
+ C(EVENT_EXIST, "Given group/event name is already used by another event"), \
C(RETVAL_ON_PROBE, "$retval is not available on probe"), \
C(BAD_STACK_NUM, "Invalid stack number"), \
C(BAD_ARG_NUM, "Invalid argument number"), \
diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h
index f003c5d02a3a..b3bdb8ddb862 100644
--- a/kernel/trace/trace_probe_tmpl.h
+++ b/kernel/trace/trace_probe_tmpl.h
@@ -54,7 +54,7 @@ fetch_apply_bitfield(struct fetch_insn *code, void *buf)
* If dest is NULL, don't store result and return required dynamic data size.
*/
static int
-process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs,
+process_fetch_insn(struct fetch_insn *code, void *rec,
void *dest, void *base);
static nokprobe_inline int fetch_store_strlen(unsigned long addr);
static nokprobe_inline int
@@ -188,7 +188,7 @@ __get_data_size(struct trace_probe *tp, struct pt_regs *regs)
/* Store the value of each argument */
static nokprobe_inline void
-store_trace_args(void *data, struct trace_probe *tp, struct pt_regs *regs,
+store_trace_args(void *data, struct trace_probe *tp, void *rec,
int header_size, int maxlen)
{
struct probe_arg *arg;
@@ -203,7 +203,7 @@ store_trace_args(void *data, struct trace_probe *tp, struct pt_regs *regs,
/* Point the dynamic data area if needed */
if (unlikely(arg->dynamic))
*dl = make_data_loc(maxlen, dyndata - base);
- ret = process_fetch_insn(arg->code, regs, dl, base);
+ ret = process_fetch_insn(arg->code, rec, dl, base);
if (unlikely(ret < 0 && arg->dynamic)) {
*dl = make_data_loc(0, dyndata - base);
} else {
diff --git a/kernel/trace/trace_recursion_record.c b/kernel/trace/trace_recursion_record.c
index b2edac1fe156..4d4b78c8ca25 100644
--- a/kernel/trace/trace_recursion_record.c
+++ b/kernel/trace/trace_recursion_record.c
@@ -226,8 +226,8 @@ __init static int create_recursed_functions(void)
{
struct dentry *dentry;
- dentry = trace_create_file("recursed_functions", 0644, NULL, NULL,
- &recursed_functions_fops);
+ dentry = trace_create_file("recursed_functions", TRACE_MODE_WRITE,
+ NULL, NULL, &recursed_functions_fops);
if (!dentry)
pr_warn("WARNING: Failed to create recursed_functions\n");
return 0;
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index adf7ef194005..afd937a46496 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -287,6 +287,40 @@ static int trace_selftest_ops(struct trace_array *tr, int cnt)
if (trace_selftest_test_probe3_cnt != 4)
goto out_free;
+ /* Remove trace function from probe 3 */
+ func1_name = "!" __stringify(DYN_FTRACE_TEST_NAME);
+ len1 = strlen(func1_name);
+
+ ftrace_set_filter(&test_probe3, func1_name, len1, 0);
+
+ DYN_FTRACE_TEST_NAME();
+
+ print_counts();
+
+ if (trace_selftest_test_probe1_cnt != 3)
+ goto out_free;
+ if (trace_selftest_test_probe2_cnt != 2)
+ goto out_free;
+ if (trace_selftest_test_probe3_cnt != 4)
+ goto out_free;
+ if (cnt > 1) {
+ if (trace_selftest_test_global_cnt == 0)
+ goto out_free;
+ }
+ if (trace_selftest_test_dyn_cnt == 0)
+ goto out_free;
+
+ DYN_FTRACE_TEST_NAME2();
+
+ print_counts();
+
+ if (trace_selftest_test_probe1_cnt != 3)
+ goto out_free;
+ if (trace_selftest_test_probe2_cnt != 3)
+ goto out_free;
+ if (trace_selftest_test_probe3_cnt != 5)
+ goto out_free;
+
ret = 0;
out_free:
unregister_ftrace_function(dyn_ops);
@@ -750,6 +784,12 @@ static struct fgraph_ops fgraph_ops __initdata = {
.retfunc = &trace_graph_return,
};
+#if defined(CONFIG_DYNAMIC_FTRACE) && \
+ defined(CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS)
+#define TEST_DIRECT_TRAMP
+noinline __noclone static void trace_direct_tramp(void) { }
+#endif
+
/*
* Pretty much the same than for the function tracer from which the selftest
* has been borrowed.
@@ -760,6 +800,7 @@ trace_selftest_startup_function_graph(struct tracer *trace,
{
int ret;
unsigned long count;
+ char *func_name __maybe_unused;
#ifdef CONFIG_DYNAMIC_FTRACE
if (ftrace_filter_param) {
@@ -808,8 +849,57 @@ trace_selftest_startup_function_graph(struct tracer *trace,
goto out;
}
- /* Don't test dynamic tracing, the function tracer already did */
+#ifdef TEST_DIRECT_TRAMP
+ tracing_reset_online_cpus(&tr->array_buffer);
+ set_graph_array(tr);
+
+ /*
+ * Some archs *cough*PowerPC*cough* add characters to the
+ * start of the function names. We simply put a '*' to
+ * accommodate them.
+ */
+ func_name = "*" __stringify(DYN_FTRACE_TEST_NAME);
+ ftrace_set_global_filter(func_name, strlen(func_name), 1);
+
+ /*
+ * Register direct function together with graph tracer
+ * and make sure we get graph trace.
+ */
+ ret = register_ftrace_direct((unsigned long) DYN_FTRACE_TEST_NAME,
+ (unsigned long) trace_direct_tramp);
+ if (ret)
+ goto out;
+
+ ret = register_ftrace_graph(&fgraph_ops);
+ if (ret) {
+ warn_failed_init_tracer(trace, ret);
+ goto out;
+ }
+
+ DYN_FTRACE_TEST_NAME();
+
+ count = 0;
+ tracing_stop();
+ /* check the trace buffer */
+ ret = trace_test_buffer(&tr->array_buffer, &count);
+
+ unregister_ftrace_graph(&fgraph_ops);
+
+ ret = unregister_ftrace_direct((unsigned long) DYN_FTRACE_TEST_NAME,
+ (unsigned long) trace_direct_tramp);
+ if (ret)
+ goto out;
+
+ tracing_start();
+
+ if (!ret && !count) {
+ ret = -1;
+ goto out;
+ }
+#endif
+
+ /* Don't test dynamic tracing, the function tracer already did */
out:
/* Stop it if we failed */
if (ret)
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 63c285042051..5a48dba912ea 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -559,14 +559,14 @@ static __init int stack_trace_init(void)
if (ret)
return 0;
- trace_create_file("stack_max_size", 0644, NULL,
+ trace_create_file("stack_max_size", TRACE_MODE_WRITE, NULL,
&stack_trace_max_size, &stack_max_size_fops);
- trace_create_file("stack_trace", 0444, NULL,
+ trace_create_file("stack_trace", TRACE_MODE_READ, NULL,
NULL, &stack_trace_fops);
#ifdef CONFIG_DYNAMIC_FTRACE
- trace_create_file("stack_trace_filter", 0644, NULL,
+ trace_create_file("stack_trace_filter", TRACE_MODE_WRITE, NULL,
&trace_ops, &stack_trace_filter_fops);
#endif
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index 8d141c3825a9..bb247beec447 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -297,9 +297,9 @@ static int init_stat_file(struct stat_session *session)
if (!stat_dir && (ret = tracing_stat_init()))
return ret;
- session->file = tracefs_create_file(session->ts->name, 0644,
- stat_dir,
- session, &tracing_stat_fops);
+ session->file = tracefs_create_file(session->ts->name, TRACE_MODE_WRITE,
+ stat_dir, session,
+ &tracing_stat_fops);
if (!session->file)
return -ENOMEM;
return 0;
diff --git a/kernel/trace/trace_synth.h b/kernel/trace/trace_synth.h
index 4007fe95cf42..b29595fe3ac5 100644
--- a/kernel/trace/trace_synth.h
+++ b/kernel/trace/trace_synth.h
@@ -5,7 +5,7 @@
#include "trace_dynevent.h"
#define SYNTH_SYSTEM "synthetic"
-#define SYNTH_FIELDS_MAX 32
+#define SYNTH_FIELDS_MAX 64
#define STR_VAR_LEN_MAX MAX_FILTER_STR_VAL /* must be multiple of sizeof(u64) */
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 9b50869a5ddb..4f35514a48f3 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -7,6 +7,7 @@
*/
#define pr_fmt(fmt) "trace_uprobe: " fmt
+#include <linux/bpf-cgroup.h>
#include <linux/security.h>
#include <linux/ctype.h>
#include <linux/module.h>
@@ -83,10 +84,6 @@ static struct trace_uprobe *to_trace_uprobe(struct dyn_event *ev)
for_each_dyn_event(dpos) \
if (is_trace_uprobe(dpos) && (pos = to_trace_uprobe(dpos)))
-#define SIZEOF_TRACE_UPROBE(n) \
- (offsetof(struct trace_uprobe, tp.args) + \
- (sizeof(struct probe_arg) * (n)))
-
static int register_uprobe_event(struct trace_uprobe *tu);
static int unregister_uprobe_event(struct trace_uprobe *tu);
@@ -217,9 +214,10 @@ static unsigned long translate_user_vaddr(unsigned long file_offset)
/* Note that we don't verify it, since the code does not come from user space */
static int
-process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs, void *dest,
+process_fetch_insn(struct fetch_insn *code, void *rec, void *dest,
void *base)
{
+ struct pt_regs *regs = rec;
unsigned long val;
/* 1st stage: get value from context */
@@ -340,7 +338,7 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret)
struct trace_uprobe *tu;
int ret;
- tu = kzalloc(SIZEOF_TRACE_UPROBE(nargs), GFP_KERNEL);
+ tu = kzalloc(struct_size(tu, tp.args, nargs), GFP_KERNEL);
if (!tu)
return ERR_PTR(-ENOMEM);
@@ -393,6 +391,10 @@ static int unregister_trace_uprobe(struct trace_uprobe *tu)
if (trace_probe_has_sibling(&tu->tp))
goto unreg;
+ /* If there's a reference to the dynamic event */
+ if (trace_event_dyn_busy(trace_probe_event_call(&tu->tp)))
+ return -EBUSY;
+
ret = unregister_uprobe_event(tu);
if (ret)
return ret;
@@ -455,7 +457,7 @@ static int append_trace_uprobe(struct trace_uprobe *tu, struct trace_uprobe *to)
/* Append to existing event */
ret = trace_probe_append(&tu->tp, &to->tp);
if (!ret)
- dyn_event_add(&tu->devent);
+ dyn_event_add(&tu->devent, trace_probe_event_call(&tu->tp));
return ret;
}
@@ -514,11 +516,15 @@ static int register_trace_uprobe(struct trace_uprobe *tu)
ret = register_uprobe_event(tu);
if (ret) {
- pr_warn("Failed to register probe event(%d)\n", ret);
+ if (ret == -EEXIST) {
+ trace_probe_log_set_index(0);
+ trace_probe_log_err(0, EVENT_EXIST);
+ } else
+ pr_warn("Failed to register probe event(%d)\n", ret);
goto end;
}
- dyn_event_add(&tu->devent);
+ dyn_event_add(&tu->devent, trace_probe_event_call(&tu->tp));
end:
mutex_unlock(&event_mutex);
@@ -536,6 +542,7 @@ static int __trace_uprobe_create(int argc, const char **argv)
const char *event = NULL, *group = UPROBE_EVENT_SYSTEM;
char *arg, *filename, *rctr, *rctr_end, *tmp;
char buf[MAX_EVENT_NAME_LEN];
+ enum probe_print_type ptype;
struct path path;
unsigned long offset, ref_ctr_offset;
bool is_return = false;
@@ -680,21 +687,15 @@ static int __trace_uprobe_create(int argc, const char **argv)
/* parse arguments */
for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
- tmp = kstrdup(argv[i], GFP_KERNEL);
- if (!tmp) {
- ret = -ENOMEM;
- goto error;
- }
-
trace_probe_log_set_index(i + 2);
- ret = traceprobe_parse_probe_arg(&tu->tp, i, tmp,
+ ret = traceprobe_parse_probe_arg(&tu->tp, i, argv[i],
is_return ? TPARG_FL_RETURN : 0);
- kfree(tmp);
if (ret)
goto error;
}
- ret = traceprobe_set_print_fmt(&tu->tp, is_ret_probe(tu));
+ ptype = is_ret_probe(tu) ? PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL;
+ ret = traceprobe_set_print_fmt(&tu->tp, ptype);
if (ret < 0)
goto error;
@@ -1313,6 +1314,7 @@ static int uprobe_perf_open(struct trace_event_call *call,
return 0;
list_for_each_entry(pos, trace_probe_probe_list(tp), list) {
+ tu = container_of(pos, struct trace_uprobe, tp);
err = uprobe_apply(tu->inode, tu->offset, &tu->consumer, true);
if (err) {
uprobe_perf_close(call, event);
@@ -1585,6 +1587,7 @@ struct trace_event_call *
create_local_trace_uprobe(char *name, unsigned long offs,
unsigned long ref_ctr_offset, bool is_return)
{
+ enum probe_print_type ptype;
struct trace_uprobe *tu;
struct path path;
int ret;
@@ -1619,7 +1622,8 @@ create_local_trace_uprobe(char *name, unsigned long offs,
tu->filename = kstrdup(name, GFP_KERNEL);
init_trace_event_call(tu);
- if (traceprobe_set_print_fmt(&tu->tp, is_ret_probe(tu)) < 0) {
+ ptype = is_ret_probe(tu) ? PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL;
+ if (traceprobe_set_print_fmt(&tu->tp, ptype) < 0) {
ret = -ENOMEM;
goto error;
}
@@ -1653,10 +1657,10 @@ static __init int init_uprobe_trace(void)
if (ret)
return 0;
- trace_create_file("uprobe_events", 0644, NULL,
+ trace_create_file("uprobe_events", TRACE_MODE_WRITE, NULL,
NULL, &uprobe_events_ops);
/* Profile interface */
- trace_create_file("uprobe_profile", 0444, NULL,
+ trace_create_file("uprobe_profile", TRACE_MODE_READ, NULL,
NULL, &uprobe_profile_ops);
return 0;
}
diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c
index d6bddb157ef2..9628b5571846 100644
--- a/kernel/trace/tracing_map.c
+++ b/kernel/trace/tracing_map.c
@@ -15,6 +15,7 @@
#include <linux/jhash.h>
#include <linux/slab.h>
#include <linux/sort.h>
+#include <linux/kmemleak.h>
#include "tracing_map.h"
#include "trace.h"
@@ -307,6 +308,7 @@ static void tracing_map_array_free(struct tracing_map_array *a)
for (i = 0; i < a->n_pages; i++) {
if (!a->pages[i])
break;
+ kmemleak_free(a->pages[i]);
free_page((unsigned long)a->pages[i]);
}
@@ -342,6 +344,7 @@ static struct tracing_map_array *tracing_map_array_alloc(unsigned int n_elts,
a->pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
if (!a->pages[i])
goto free;
+ kmemleak_alloc(a->pages[i], PAGE_SIZE, 1, GFP_KERNEL);
}
out:
return a;
@@ -834,29 +837,35 @@ int tracing_map_init(struct tracing_map *map)
return err;
}
-static int cmp_entries_dup(const struct tracing_map_sort_entry **a,
- const struct tracing_map_sort_entry **b)
+static int cmp_entries_dup(const void *A, const void *B)
{
+ const struct tracing_map_sort_entry *a, *b;
int ret = 0;
- if (memcmp((*a)->key, (*b)->key, (*a)->elt->map->key_size))
+ a = *(const struct tracing_map_sort_entry **)A;
+ b = *(const struct tracing_map_sort_entry **)B;
+
+ if (memcmp(a->key, b->key, a->elt->map->key_size))
ret = 1;
return ret;
}
-static int cmp_entries_sum(const struct tracing_map_sort_entry **a,
- const struct tracing_map_sort_entry **b)
+static int cmp_entries_sum(const void *A, const void *B)
{
const struct tracing_map_elt *elt_a, *elt_b;
+ const struct tracing_map_sort_entry *a, *b;
struct tracing_map_sort_key *sort_key;
struct tracing_map_field *field;
tracing_map_cmp_fn_t cmp_fn;
void *val_a, *val_b;
int ret = 0;
- elt_a = (*a)->elt;
- elt_b = (*b)->elt;
+ a = *(const struct tracing_map_sort_entry **)A;
+ b = *(const struct tracing_map_sort_entry **)B;
+
+ elt_a = a->elt;
+ elt_b = b->elt;
sort_key = &elt_a->map->sort_key;
@@ -873,18 +882,21 @@ static int cmp_entries_sum(const struct tracing_map_sort_entry **a,
return ret;
}
-static int cmp_entries_key(const struct tracing_map_sort_entry **a,
- const struct tracing_map_sort_entry **b)
+static int cmp_entries_key(const void *A, const void *B)
{
const struct tracing_map_elt *elt_a, *elt_b;
+ const struct tracing_map_sort_entry *a, *b;
struct tracing_map_sort_key *sort_key;
struct tracing_map_field *field;
tracing_map_cmp_fn_t cmp_fn;
void *val_a, *val_b;
int ret = 0;
- elt_a = (*a)->elt;
- elt_b = (*b)->elt;
+ a = *(const struct tracing_map_sort_entry **)A;
+ b = *(const struct tracing_map_sort_entry **)B;
+
+ elt_a = a->elt;
+ elt_b = b->elt;
sort_key = &elt_a->map->sort_key;
@@ -989,10 +1001,8 @@ static void sort_secondary(struct tracing_map *map,
struct tracing_map_sort_key *primary_key,
struct tracing_map_sort_key *secondary_key)
{
- int (*primary_fn)(const struct tracing_map_sort_entry **,
- const struct tracing_map_sort_entry **);
- int (*secondary_fn)(const struct tracing_map_sort_entry **,
- const struct tracing_map_sort_entry **);
+ int (*primary_fn)(const void *, const void *);
+ int (*secondary_fn)(const void *, const void *);
unsigned i, start = 0, n_sub = 1;
if (is_key(map, primary_key->field_idx))
@@ -1061,8 +1071,7 @@ int tracing_map_sort_entries(struct tracing_map *map,
unsigned int n_sort_keys,
struct tracing_map_sort_entry ***sort_entries)
{
- int (*cmp_entries_fn)(const struct tracing_map_sort_entry **,
- const struct tracing_map_sort_entry **);
+ int (*cmp_entries_fn)(const void *, const void *);
struct tracing_map_sort_entry *sort_entry, **entries;
int i, n_entries, ret;
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index efd14c79fab4..64ea283f2f86 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -577,7 +577,7 @@ bool trace_module_has_bad_taint(struct module *mod)
static BLOCKING_NOTIFIER_HEAD(tracepoint_notify_list);
/**
- * register_tracepoint_notifier - register tracepoint coming/going notifier
+ * register_tracepoint_module_notifier - register tracepoint coming/going notifier
* @nb: notifier block
*
* Notifiers registered with this function are called on module
@@ -603,7 +603,7 @@ end:
EXPORT_SYMBOL_GPL(register_tracepoint_module_notifier);
/**
- * unregister_tracepoint_notifier - unregister tracepoint coming/going notifier
+ * unregister_tracepoint_module_notifier - unregister tracepoint coming/going notifier
* @nb: notifier block
*
* The notifier block callback should expect a "struct tp_module" data
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 257ffb993ea2..f00de83d0246 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -137,7 +137,7 @@ static void __acct_update_integrals(struct task_struct *tsk,
* the rest of the math is done in xacct_add_tsk.
*/
tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm) >> 10;
- tsk->acct_vm_mem1 += delta * tsk->mm->total_vm >> 10;
+ tsk->acct_vm_mem1 += delta * READ_ONCE(tsk->mm->total_vm) >> 10;
}
/**
diff --git a/kernel/ucount.c b/kernel/ucount.c
index bb51849e6375..7b32c356ebc5 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -150,9 +150,15 @@ static void hlist_add_ucounts(struct ucounts *ucounts)
spin_unlock_irq(&ucounts_lock);
}
+static inline bool get_ucounts_or_wrap(struct ucounts *ucounts)
+{
+ /* Returns true on a successful get, false if the count wraps. */
+ return !atomic_add_negative(1, &ucounts->count);
+}
+
struct ucounts *get_ucounts(struct ucounts *ucounts)
{
- if (ucounts && atomic_add_negative(1, &ucounts->count)) {
+ if (!get_ucounts_or_wrap(ucounts)) {
put_ucounts(ucounts);
ucounts = NULL;
}
@@ -163,7 +169,7 @@ struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid)
{
struct hlist_head *hashent = ucounts_hashentry(ns, uid);
struct ucounts *ucounts, *new;
- long overflow;
+ bool wrapped;
spin_lock_irq(&ucounts_lock);
ucounts = find_ucounts(ns, uid, hashent);
@@ -188,9 +194,9 @@ struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid)
return new;
}
}
- overflow = atomic_add_negative(1, &ucounts->count);
+ wrapped = !get_ucounts_or_wrap(ucounts);
spin_unlock_irq(&ucounts_lock);
- if (overflow) {
+ if (wrapped) {
put_ucounts(ucounts);
return NULL;
}
@@ -258,15 +264,16 @@ void dec_ucount(struct ucounts *ucounts, enum ucount_type type)
long inc_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v)
{
struct ucounts *iter;
+ long max = LONG_MAX;
long ret = 0;
for (iter = ucounts; iter; iter = iter->ns->ucounts) {
- long max = READ_ONCE(iter->ns->ucount_max[type]);
long new = atomic_long_add_return(v, &iter->ucount[type]);
if (new < 0 || new > max)
ret = LONG_MAX;
else if (iter == ucounts)
ret = new;
+ max = READ_ONCE(iter->ns->ucount_max[type]);
}
return ret;
}
@@ -276,7 +283,7 @@ bool dec_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v)
struct ucounts *iter;
long new = -1; /* Silence compiler warning */
for (iter = ucounts; iter; iter = iter->ns->ucounts) {
- long dec = atomic_long_add_return(-v, &iter->ucount[type]);
+ long dec = atomic_long_sub_return(v, &iter->ucount[type]);
WARN_ON_ONCE(dec < 0);
if (iter == ucounts)
new = dec;
@@ -284,15 +291,66 @@ bool dec_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v)
return (new == 0);
}
-bool is_ucounts_overlimit(struct ucounts *ucounts, enum ucount_type type, unsigned long max)
+static void do_dec_rlimit_put_ucounts(struct ucounts *ucounts,
+ struct ucounts *last, enum ucount_type type)
{
+ struct ucounts *iter, *next;
+ for (iter = ucounts; iter != last; iter = next) {
+ long dec = atomic_long_sub_return(1, &iter->ucount[type]);
+ WARN_ON_ONCE(dec < 0);
+ next = iter->ns->ucounts;
+ if (dec == 0)
+ put_ucounts(iter);
+ }
+}
+
+void dec_rlimit_put_ucounts(struct ucounts *ucounts, enum ucount_type type)
+{
+ do_dec_rlimit_put_ucounts(ucounts, NULL, type);
+}
+
+long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum ucount_type type)
+{
+ /* Caller must hold a reference to ucounts */
struct ucounts *iter;
- if (get_ucounts_value(ucounts, type) > max)
- return true;
+ long max = LONG_MAX;
+ long dec, ret = 0;
+
for (iter = ucounts; iter; iter = iter->ns->ucounts) {
+ long new = atomic_long_add_return(1, &iter->ucount[type]);
+ if (new < 0 || new > max)
+ goto unwind;
+ if (iter == ucounts)
+ ret = new;
max = READ_ONCE(iter->ns->ucount_max[type]);
+ /*
+ * Grab an extra ucount reference for the caller when
+ * the rlimit count was previously 0.
+ */
+ if (new != 1)
+ continue;
+ if (!get_ucounts(iter))
+ goto dec_unwind;
+ }
+ return ret;
+dec_unwind:
+ dec = atomic_long_sub_return(1, &iter->ucount[type]);
+ WARN_ON_ONCE(dec < 0);
+unwind:
+ do_dec_rlimit_put_ucounts(ucounts, iter, type);
+ return 0;
+}
+
+bool is_ucounts_overlimit(struct ucounts *ucounts, enum ucount_type type, unsigned long rlimit)
+{
+ struct ucounts *iter;
+ long max = rlimit;
+ if (rlimit > LONG_MAX)
+ max = LONG_MAX;
+ for (iter = ucounts; iter; iter = iter->ns->ucounts) {
if (get_ucounts_value(iter, type) > max)
return true;
+ max = READ_ONCE(iter->ns->ucount_max[type]);
}
return false;
}
diff --git a/kernel/user.c b/kernel/user.c
index c82399c1618a..e2cf8c22b539 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -129,6 +129,22 @@ static struct user_struct *uid_hash_find(kuid_t uid, struct hlist_head *hashent)
return NULL;
}
+static int user_epoll_alloc(struct user_struct *up)
+{
+#ifdef CONFIG_EPOLL
+ return percpu_counter_init(&up->epoll_watches, 0, GFP_KERNEL);
+#else
+ return 0;
+#endif
+}
+
+static void user_epoll_free(struct user_struct *up)
+{
+#ifdef CONFIG_EPOLL
+ percpu_counter_destroy(&up->epoll_watches);
+#endif
+}
+
/* IRQs are disabled and uidhash_lock is held upon function entry.
* IRQ state (as stored in flags) is restored and uidhash_lock released
* upon function exit.
@@ -138,6 +154,7 @@ static void free_user(struct user_struct *up, unsigned long flags)
{
uid_hash_remove(up);
spin_unlock_irqrestore(&uidhash_lock, flags);
+ user_epoll_free(up);
kmem_cache_free(uid_cachep, up);
}
@@ -185,6 +202,10 @@ struct user_struct *alloc_uid(kuid_t uid)
new->uid = uid;
refcount_set(&new->__count, 1);
+ if (user_epoll_alloc(new)) {
+ kmem_cache_free(uid_cachep, new);
+ return NULL;
+ }
ratelimit_state_init(&new->ratelimit, HZ, 100);
ratelimit_set_flags(&new->ratelimit, RATELIMIT_MSG_ON_RELEASE);
@@ -195,6 +216,7 @@ struct user_struct *alloc_uid(kuid_t uid)
spin_lock_irq(&uidhash_lock);
up = uid_hash_find(uid, hashent);
if (up) {
+ user_epoll_free(new);
kmem_cache_free(uid_cachep, new);
} else {
uid_hash_insert(new, hashent);
@@ -216,6 +238,9 @@ static int __init uid_cache_init(void)
for(n = 0; n < UIDHASH_SZ; ++n)
INIT_HLIST_HEAD(uidhash_table + n);
+ if (user_epoll_alloc(&root_user))
+ panic("root_user epoll percpu counter alloc failed");
+
/* Insert the root user immediately (init already runs as root) */
spin_lock_irq(&uidhash_lock);
uid_hash_insert(&root_user, uidhashentry(GLOBAL_ROOT_UID));
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 33a6b4a2443d..33f1106b4f99 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -154,6 +154,9 @@ struct worker_pool {
unsigned long watchdog_ts; /* L: watchdog timestamp */
+ /* The current concurrency level. */
+ atomic_t nr_running;
+
struct list_head worklist; /* L: list of pending works */
int nr_workers; /* L: total number of workers */
@@ -178,18 +181,11 @@ struct worker_pool {
int refcnt; /* PL: refcnt for unbound pools */
/*
- * The current concurrency level. As it's likely to be accessed
- * from other CPUs during try_to_wake_up(), put it in a separate
- * cacheline.
- */
- atomic_t nr_running ____cacheline_aligned_in_smp;
-
- /*
* Destruction of pool is RCU protected to allow dereferences
* from get_work_pool().
*/
struct rcu_head rcu;
-} ____cacheline_aligned_in_smp;
+};
/*
* The per-pool workqueue. While queued, the lower WORK_STRUCT_FLAG_BITS
@@ -375,6 +371,7 @@ EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);
static int worker_thread(void *__worker);
static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
static void show_pwq(struct pool_workqueue *pwq);
+static void show_one_worker_pool(struct worker_pool *pool);
#define CREATE_TRACE_POINTS
#include <trace/events/workqueue.h>
@@ -867,8 +864,17 @@ void wq_worker_running(struct task_struct *task)
if (!worker->sleeping)
return;
+
+ /*
+ * If preempted by unbind_workers() between the WORKER_NOT_RUNNING check
+ * and the nr_running increment below, we may ruin the nr_running reset
+ * and leave with an unexpected pool->nr_running == 1 on the newly unbound
+ * pool. Protect against such race.
+ */
+ preempt_disable();
if (!(worker->flags & WORKER_NOT_RUNNING))
atomic_inc(&worker->pool->nr_running);
+ preempt_enable();
worker->sleeping = 0;
}
@@ -877,8 +883,7 @@ void wq_worker_running(struct task_struct *task)
* @task: task going to sleep
*
* This function is called from schedule() when a busy worker is
- * going to sleep. Preemption needs to be disabled to protect ->sleeping
- * assignment.
+ * going to sleep.
*/
void wq_worker_sleeping(struct task_struct *task)
{
@@ -903,6 +908,16 @@ void wq_worker_sleeping(struct task_struct *task)
raw_spin_lock_irq(&pool->lock);
/*
+ * Recheck in case unbind_workers() preempted us. We don't
+ * want to decrement nr_running after the worker is unbound
+ * and nr_running has been reset.
+ */
+ if (worker->flags & WORKER_NOT_RUNNING) {
+ raw_spin_unlock_irq(&pool->lock);
+ return;
+ }
+
+ /*
* The counterpart of the following dec_and_test, implied mb,
* worklist not empty test sequence is in insert_work().
* Please read comment there.
@@ -1350,7 +1365,7 @@ static void insert_work(struct pool_workqueue *pwq, struct work_struct *work,
struct worker_pool *pool = pwq->pool;
/* record the work call stack in order to print it in KASAN reports */
- kasan_record_aux_stack(work);
+ kasan_record_aux_stack_noalloc(work);
/* we own @work, set data and link */
set_work_pwq(work, pwq, extra_flags);
@@ -1530,7 +1545,8 @@ out:
* @work: work to queue
*
* We queue the work to a specific CPU, the caller must ensure it
- * can't go away.
+ * can't go away. Callers that fail to ensure that the specified
+ * CPU cannot go away will execute on a randomly chosen CPU.
*
* Return: %false if @work was already on a queue, %true otherwise.
*/
@@ -1810,14 +1826,8 @@ static void worker_enter_idle(struct worker *worker)
if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
- /*
- * Sanity check nr_running. Because unbind_workers() releases
- * pool->lock between setting %WORKER_UNBOUND and zapping
- * nr_running, the warning may trigger spuriously. Check iff
- * unbind is not in progress.
- */
- WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) &&
- pool->nr_workers == pool->nr_idle &&
+ /* Sanity check nr_running. */
+ WARN_ON_ONCE(pool->nr_workers == pool->nr_idle &&
atomic_read(&pool->nr_running));
}
@@ -4447,7 +4457,7 @@ void destroy_workqueue(struct workqueue_struct *wq)
raw_spin_unlock_irq(&pwq->pool->lock);
mutex_unlock(&wq->mutex);
mutex_unlock(&wq_pool_mutex);
- show_workqueue_state();
+ show_one_workqueue(wq);
return;
}
raw_spin_unlock_irq(&pwq->pool->lock);
@@ -4797,83 +4807,116 @@ static void show_pwq(struct pool_workqueue *pwq)
}
/**
- * show_workqueue_state - dump workqueue state
- *
- * Called from a sysrq handler or try_to_freeze_tasks() and prints out
- * all busy workqueues and pools.
+ * show_one_workqueue - dump state of specified workqueue
+ * @wq: workqueue whose state will be printed
*/
-void show_workqueue_state(void)
+void show_one_workqueue(struct workqueue_struct *wq)
{
- struct workqueue_struct *wq;
- struct worker_pool *pool;
+ struct pool_workqueue *pwq;
+ bool idle = true;
unsigned long flags;
- int pi;
-
- rcu_read_lock();
-
- pr_info("Showing busy workqueues and worker pools:\n");
- list_for_each_entry_rcu(wq, &workqueues, list) {
- struct pool_workqueue *pwq;
- bool idle = true;
-
- for_each_pwq(pwq, wq) {
- if (pwq->nr_active || !list_empty(&pwq->inactive_works)) {
- idle = false;
- break;
- }
+ for_each_pwq(pwq, wq) {
+ if (pwq->nr_active || !list_empty(&pwq->inactive_works)) {
+ idle = false;
+ break;
}
- if (idle)
- continue;
+ }
+ if (idle) /* Nothing to print for idle workqueue */
+ return;
- pr_info("workqueue %s: flags=0x%x\n", wq->name, wq->flags);
+ pr_info("workqueue %s: flags=0x%x\n", wq->name, wq->flags);
- for_each_pwq(pwq, wq) {
- raw_spin_lock_irqsave(&pwq->pool->lock, flags);
- if (pwq->nr_active || !list_empty(&pwq->inactive_works))
- show_pwq(pwq);
- raw_spin_unlock_irqrestore(&pwq->pool->lock, flags);
+ for_each_pwq(pwq, wq) {
+ raw_spin_lock_irqsave(&pwq->pool->lock, flags);
+ if (pwq->nr_active || !list_empty(&pwq->inactive_works)) {
/*
- * We could be printing a lot from atomic context, e.g.
- * sysrq-t -> show_workqueue_state(). Avoid triggering
- * hard lockup.
+ * Defer printing to avoid deadlocks in console
+ * drivers that queue work while holding locks
+ * also taken in their write paths.
*/
- touch_nmi_watchdog();
- }
- }
-
- for_each_pool(pool, pi) {
- struct worker *worker;
- bool first = true;
-
- raw_spin_lock_irqsave(&pool->lock, flags);
- if (pool->nr_workers == pool->nr_idle)
- goto next_pool;
-
- pr_info("pool %d:", pool->id);
- pr_cont_pool_info(pool);
- pr_cont(" hung=%us workers=%d",
- jiffies_to_msecs(jiffies - pool->watchdog_ts) / 1000,
- pool->nr_workers);
- if (pool->manager)
- pr_cont(" manager: %d",
- task_pid_nr(pool->manager->task));
- list_for_each_entry(worker, &pool->idle_list, entry) {
- pr_cont(" %s%d", first ? "idle: " : "",
- task_pid_nr(worker->task));
- first = false;
+ printk_deferred_enter();
+ show_pwq(pwq);
+ printk_deferred_exit();
}
- pr_cont("\n");
- next_pool:
- raw_spin_unlock_irqrestore(&pool->lock, flags);
+ raw_spin_unlock_irqrestore(&pwq->pool->lock, flags);
/*
* We could be printing a lot from atomic context, e.g.
- * sysrq-t -> show_workqueue_state(). Avoid triggering
+ * sysrq-t -> show_all_workqueues(). Avoid triggering
* hard lockup.
*/
touch_nmi_watchdog();
}
+}
+
+/**
+ * show_one_worker_pool - dump state of specified worker pool
+ * @pool: worker pool whose state will be printed
+ */
+static void show_one_worker_pool(struct worker_pool *pool)
+{
+ struct worker *worker;
+ bool first = true;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&pool->lock, flags);
+ if (pool->nr_workers == pool->nr_idle)
+ goto next_pool;
+ /*
+ * Defer printing to avoid deadlocks in console drivers that
+ * queue work while holding locks also taken in their write
+ * paths.
+ */
+ printk_deferred_enter();
+ pr_info("pool %d:", pool->id);
+ pr_cont_pool_info(pool);
+ pr_cont(" hung=%us workers=%d",
+ jiffies_to_msecs(jiffies - pool->watchdog_ts) / 1000,
+ pool->nr_workers);
+ if (pool->manager)
+ pr_cont(" manager: %d",
+ task_pid_nr(pool->manager->task));
+ list_for_each_entry(worker, &pool->idle_list, entry) {
+ pr_cont(" %s%d", first ? "idle: " : "",
+ task_pid_nr(worker->task));
+ first = false;
+ }
+ pr_cont("\n");
+ printk_deferred_exit();
+next_pool:
+ raw_spin_unlock_irqrestore(&pool->lock, flags);
+ /*
+ * We could be printing a lot from atomic context, e.g.
+ * sysrq-t -> show_all_workqueues(). Avoid triggering
+ * hard lockup.
+ */
+ touch_nmi_watchdog();
+
+}
+
+/**
+ * show_all_workqueues - dump workqueue state
+ *
+ * Called from a sysrq handler or try_to_freeze_tasks() and prints out
+ * all busy workqueues and pools.
+ */
+void show_all_workqueues(void)
+{
+ struct workqueue_struct *wq;
+ struct worker_pool *pool;
+ int pi;
+
+ rcu_read_lock();
+
+ pr_info("Showing busy workqueues and worker pools:\n");
+
+ list_for_each_entry_rcu(wq, &workqueues, list)
+ show_one_workqueue(wq);
+
+ for_each_pool(pool, pi)
+ show_one_worker_pool(pool);
+
rcu_read_unlock();
}
@@ -4945,38 +4988,22 @@ static void unbind_workers(int cpu)
/*
* We've blocked all attach/detach operations. Make all workers
* unbound and set DISASSOCIATED. Before this, all workers
- * except for the ones which are still executing works from
- * before the last CPU down must be on the cpu. After
- * this, they may become diasporas.
+ * must be on the cpu. After this, they may become diasporas.
+ * And the preemption disabled section in their sched callbacks
+ * are guaranteed to see WORKER_UNBOUND since the code here
+ * is on the same cpu.
*/
for_each_pool_worker(worker, pool)
worker->flags |= WORKER_UNBOUND;
pool->flags |= POOL_DISASSOCIATED;
- raw_spin_unlock_irq(&pool->lock);
-
- for_each_pool_worker(worker, pool) {
- kthread_set_per_cpu(worker->task, -1);
- WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0);
- }
-
- mutex_unlock(&wq_pool_attach_mutex);
-
/*
- * Call schedule() so that we cross rq->lock and thus can
- * guarantee sched callbacks see the %WORKER_UNBOUND flag.
- * This is necessary as scheduler callbacks may be invoked
- * from other cpus.
- */
- schedule();
-
- /*
- * Sched callbacks are disabled now. Zap nr_running.
- * After this, nr_running stays zero and need_more_worker()
- * and keep_working() are always true as long as the
- * worklist is not empty. This pool now behaves as an
- * unbound (in terms of concurrency management) pool which
+ * The handling of nr_running in sched callbacks are disabled
+ * now. Zap nr_running. After this, nr_running stays zero and
+ * need_more_worker() and keep_working() are always true as
+ * long as the worklist is not empty. This pool now behaves as
+ * an unbound (in terms of concurrency management) pool which
* are served by workers tied to the pool.
*/
atomic_set(&pool->nr_running, 0);
@@ -4986,9 +5013,16 @@ static void unbind_workers(int cpu)
* worker blocking could lead to lengthy stalls. Kick off
* unbound chain execution of currently pending work items.
*/
- raw_spin_lock_irq(&pool->lock);
wake_up_worker(pool);
+
raw_spin_unlock_irq(&pool->lock);
+
+ for_each_pool_worker(worker, pool) {
+ kthread_set_per_cpu(worker->task, -1);
+ WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0);
+ }
+
+ mutex_unlock(&wq_pool_attach_mutex);
}
}
@@ -5025,17 +5059,6 @@ static void rebind_workers(struct worker_pool *pool)
unsigned int worker_flags = worker->flags;
/*
- * A bound idle worker should actually be on the runqueue
- * of the associated CPU for local wake-ups targeting it to
- * work. Kick all idle workers so that they migrate to the
- * associated CPU. Doing this in the same loop as
- * replacing UNBOUND with REBOUND is safe as no worker will
- * be bound before @pool->lock is released.
- */
- if (worker_flags & WORKER_IDLE)
- wake_up_process(worker->task);
-
- /*
* We want to clear UNBOUND but can't directly call
* worker_clr_flags() or adjust nr_running. Atomically
* replace UNBOUND with another NOT_RUNNING flag REBOUND.
@@ -5370,9 +5393,6 @@ int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)
int ret = -EINVAL;
cpumask_var_t saved_cpumask;
- if (!zalloc_cpumask_var(&saved_cpumask, GFP_KERNEL))
- return -ENOMEM;
-
/*
* Not excluding isolated cpus on purpose.
* If the user wishes to include them, we allow that.
@@ -5380,6 +5400,15 @@ int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)
cpumask_and(cpumask, cpumask, cpu_possible_mask);
if (!cpumask_empty(cpumask)) {
apply_wqattrs_lock();
+ if (cpumask_equal(cpumask, wq_unbound_cpumask)) {
+ ret = 0;
+ goto out_unlock;
+ }
+
+ if (!zalloc_cpumask_var(&saved_cpumask, GFP_KERNEL)) {
+ ret = -ENOMEM;
+ goto out_unlock;
+ }
/* save the old wq_unbound_cpumask. */
cpumask_copy(saved_cpumask, wq_unbound_cpumask);
@@ -5392,10 +5421,11 @@ int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)
if (ret < 0)
cpumask_copy(wq_unbound_cpumask, saved_cpumask);
+ free_cpumask_var(saved_cpumask);
+out_unlock:
apply_wqattrs_unlock();
}
- free_cpumask_var(saved_cpumask);
return ret;
}
@@ -5855,7 +5885,7 @@ static void wq_watchdog_timer_fn(struct timer_list *unused)
rcu_read_unlock();
if (lockup_detected)
- show_workqueue_state();
+ show_all_workqueues();
wq_watchdog_reset_touched();
mod_timer(&wq_watchdog_timer, jiffies + thresh);