From b48345aafb203803ccda4488cb5409b1ed435c0a Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Fri, 10 May 2019 12:21:49 -0400 Subject: audit: deliver signal_info regarless of syscall When a process signals the audit daemon (shutdown, rotate, resume, reconfig) but syscall auditing is not enabled, we still want to know the identity of the process sending the signal to the audit daemon. Move audit_signal_info() out of syscall auditing to general auditing but create a new function audit_signal_info_syscall() to take care of the syscall dependent parts for when syscall auditing is enabled. Please see the github kernel audit issue https://github.com/linux-audit/audit-kernel/issues/111 Signed-off-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/audit.c | 27 +++++++++++++++++++++++++++ kernel/audit.h | 8 ++++++-- kernel/auditsc.c | 19 +++---------------- kernel/signal.c | 2 +- 4 files changed, 37 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index b96bf69183f4..67399ff72d43 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -2273,6 +2273,33 @@ out: return rc; } +/** + * audit_signal_info - record signal info for shutting down audit subsystem + * @sig: signal value + * @t: task being signaled + * + * If the audit subsystem is being terminated, record the task (pid) + * and uid that is doing that. + */ +int audit_signal_info(int sig, struct task_struct *t) +{ + kuid_t uid = current_uid(), auid; + + if (auditd_test_task(t) && + (sig == SIGTERM || sig == SIGHUP || + sig == SIGUSR1 || sig == SIGUSR2)) { + audit_sig_pid = task_tgid_nr(current); + auid = audit_get_loginuid(current); + if (uid_valid(auid)) + audit_sig_uid = auid; + else + audit_sig_uid = uid; + security_task_getsecid(current, &audit_sig_sid); + } + + return audit_signal_info_syscall(t); +} + /** * audit_log_end - end one audit record * @ab: the audit_buffer diff --git a/kernel/audit.h b/kernel/audit.h index 2071725a999f..996d94faad43 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -299,7 +299,7 @@ extern const char *audit_tree_path(struct audit_tree *tree); extern void audit_put_tree(struct audit_tree *tree); extern void audit_kill_trees(struct audit_context *context); -extern int audit_signal_info(int sig, struct task_struct *t); +extern int audit_signal_info_syscall(struct task_struct *t); extern void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx); extern struct list_head *audit_killed_trees(void); @@ -330,7 +330,11 @@ extern struct list_head *audit_killed_trees(void); #define audit_tree_path(rule) "" /* never called */ #define audit_kill_trees(context) BUG() -#define audit_signal_info(s, t) AUDIT_DISABLED +static inline int audit_signal_info_syscall(struct task_struct *t) +{ + return 0; +} + #define audit_filter_inodes(t, c) AUDIT_DISABLED #endif /* CONFIG_AUDITSYSCALL */ diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 95ae27edd417..30aa07b0115f 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2360,30 +2360,17 @@ void __audit_ptrace(struct task_struct *t) } /** - * audit_signal_info - record signal info for shutting down audit subsystem - * @sig: signal value + * audit_signal_info_syscall - record signal info for syscalls * @t: task being signaled * * If the audit subsystem is being terminated, record the task (pid) * and uid that is doing that. */ -int audit_signal_info(int sig, struct task_struct *t) +int audit_signal_info_syscall(struct task_struct *t) { struct audit_aux_data_pids *axp; struct audit_context *ctx = audit_context(); - kuid_t uid = current_uid(), auid, t_uid = task_uid(t); - - if (auditd_test_task(t) && - (sig == SIGTERM || sig == SIGHUP || - sig == SIGUSR1 || sig == SIGUSR2)) { - audit_sig_pid = task_tgid_nr(current); - auid = audit_get_loginuid(current); - if (uid_valid(auid)) - audit_sig_uid = auid; - else - audit_sig_uid = uid; - security_task_getsecid(current, &audit_sig_sid); - } + kuid_t t_uid = task_uid(t); if (!audit_signals || audit_dummy_context()) return 0; diff --git a/kernel/signal.c b/kernel/signal.c index a1eb44dc9ff5..5cfc8611867b 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -44,6 +44,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -53,7 +54,6 @@ #include #include #include -#include "audit.h" /* audit_signal_info() */ /* * SLAB caches for signal bits. -- cgit v1.2.3 From 2e21865faf4fd7ca99eb2ace072c6d618059e342 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 22 May 2019 14:06:51 +0100 Subject: keys: sparse: Fix key_fs[ug]id_changed() Sparse warnings are incurred by key_fs[ug]id_changed() due to unprotected accesses of tsk->cred, which is marked __rcu. Fix this by passing the new cred struct to these functions from commit_creds() rather than the task pointer. Signed-off-by: David Howells Reviewed-by: James Morris --- kernel/cred.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cred.c b/kernel/cred.c index 45d77284aed0..3bd40de9e192 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -455,9 +455,9 @@ int commit_creds(struct cred *new) /* alter the thread keyring */ if (!uid_eq(new->fsuid, old->fsuid)) - key_fsuid_changed(task); + key_fsuid_changed(new); if (!gid_eq(new->fsgid, old->fsgid)) - key_fsgid_changed(task); + key_fsgid_changed(new); /* do it * RLIMIT_NPROC limits on user->processes have already been checked -- cgit v1.2.3 From 70f1b0d34bdf03065fe869e93cc17cad1ea20c4a Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 7 Feb 2019 19:44:12 -0600 Subject: signal/usb: Replace kill_pid_info_as_cred with kill_pid_usb_asyncio The usb support for asyncio encoded one of it's values in the wrong field. It should have used si_value but instead used si_addr which is not present in the _rt union member of struct siginfo. The practical result of this is that on a 64bit big endian kernel when delivering a signal to a 32bit process the si_addr field is set to NULL, instead of the expected pointer value. This issue can not be fixed in copy_siginfo_to_user32 as the usb usage of the the _sigfault (aka si_addr) member of the siginfo union when SI_ASYNCIO is set is incompatible with the POSIX and glibc usage of the _rt member of the siginfo union. Therefore replace kill_pid_info_as_cred with kill_pid_usb_asyncio a dedicated function for this one specific case. There are no other users of kill_pid_info_as_cred so this specialization should have no impact on the amount of code in the kernel. Have kill_pid_usb_asyncio take instead of a siginfo_t which is difficult and error prone, 3 arguments, a signal number, an errno value, and an address enconded as a sigval_t. The encoding of the address as a sigval_t allows the code that reads the userspace request for a signal to handle this compat issue along with all of the other compat issues. Add BUILD_BUG_ONs in kernel/signal.c to ensure that we can now place the pointer value at the in si_pid (instead of si_addr). That is the code now verifies that si_pid and si_addr always occur at the same location. Further the code veries that for native structures a value placed in si_pid and spilling into si_uid will appear in userspace in si_addr (on a byte by byte copy of siginfo or a field by field copy of siginfo). The code also verifies that for a 64bit kernel and a 32bit userspace the 32bit pointer will fit in si_pid. I have used the usbsig.c program below written by Alan Stern and slightly tweaked by me to run on a big endian machine to verify the issue exists (on sparc64) and to confirm the patch below fixes the issue. /* usbsig.c -- test USB async signal delivery */ #define _GNU_SOURCE #include #include #include #include #include #include #include #include #include static struct usbdevfs_urb urb; static struct usbdevfs_disconnectsignal ds; static volatile sig_atomic_t done = 0; void urb_handler(int sig, siginfo_t *info , void *ucontext) { printf("Got signal %d, signo %d errno %d code %d addr: %p urb: %p\n", sig, info->si_signo, info->si_errno, info->si_code, info->si_addr, &urb); printf("%s\n", (info->si_addr == &urb) ? "Good" : "Bad"); } void ds_handler(int sig, siginfo_t *info , void *ucontext) { printf("Got signal %d, signo %d errno %d code %d addr: %p ds: %p\n", sig, info->si_signo, info->si_errno, info->si_code, info->si_addr, &ds); printf("%s\n", (info->si_addr == &ds) ? "Good" : "Bad"); done = 1; } int main(int argc, char **argv) { char *devfilename; int fd; int rc; struct sigaction act; struct usb_ctrlrequest *req; void *ptr; char buf[80]; if (argc != 2) { fprintf(stderr, "Usage: usbsig device-file-name\n"); return 1; } devfilename = argv[1]; fd = open(devfilename, O_RDWR); if (fd == -1) { perror("Error opening device file"); return 1; } act.sa_sigaction = urb_handler; sigemptyset(&act.sa_mask); act.sa_flags = SA_SIGINFO; rc = sigaction(SIGUSR1, &act, NULL); if (rc == -1) { perror("Error in sigaction"); return 1; } act.sa_sigaction = ds_handler; sigemptyset(&act.sa_mask); act.sa_flags = SA_SIGINFO; rc = sigaction(SIGUSR2, &act, NULL); if (rc == -1) { perror("Error in sigaction"); return 1; } memset(&urb, 0, sizeof(urb)); urb.type = USBDEVFS_URB_TYPE_CONTROL; urb.endpoint = USB_DIR_IN | 0; urb.buffer = buf; urb.buffer_length = sizeof(buf); urb.signr = SIGUSR1; req = (struct usb_ctrlrequest *) buf; req->bRequestType = USB_DIR_IN | USB_TYPE_STANDARD | USB_RECIP_DEVICE; req->bRequest = USB_REQ_GET_DESCRIPTOR; req->wValue = htole16(USB_DT_DEVICE << 8); req->wIndex = htole16(0); req->wLength = htole16(sizeof(buf) - sizeof(*req)); rc = ioctl(fd, USBDEVFS_SUBMITURB, &urb); if (rc == -1) { perror("Error in SUBMITURB ioctl"); return 1; } rc = ioctl(fd, USBDEVFS_REAPURB, &ptr); if (rc == -1) { perror("Error in REAPURB ioctl"); return 1; } memset(&ds, 0, sizeof(ds)); ds.signr = SIGUSR2; ds.context = &ds; rc = ioctl(fd, USBDEVFS_DISCSIGNAL, &ds); if (rc == -1) { perror("Error in DISCSIGNAL ioctl"); return 1; } printf("Waiting for usb disconnect\n"); while (!done) { sleep(1); } close(fd); return 0; } Cc: Greg Kroah-Hartman Cc: linux-usb@vger.kernel.org Cc: Alan Stern Cc: Oliver Neukum Fixes: v2.3.39 Cc: stable@vger.kernel.org Acked-by: Alan Stern Signed-off-by: "Eric W. Biederman" --- kernel/signal.c | 69 ++++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 61 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index a1eb44dc9ff5..18040d6bd63a 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1439,13 +1439,44 @@ static inline bool kill_as_cred_perm(const struct cred *cred, uid_eq(cred->uid, pcred->uid); } -/* like kill_pid_info(), but doesn't use uid/euid of "current" */ -int kill_pid_info_as_cred(int sig, struct kernel_siginfo *info, struct pid *pid, - const struct cred *cred) +/* + * The usb asyncio usage of siginfo is wrong. The glibc support + * for asyncio which uses SI_ASYNCIO assumes the layout is SIL_RT. + * AKA after the generic fields: + * kernel_pid_t si_pid; + * kernel_uid32_t si_uid; + * sigval_t si_value; + * + * Unfortunately when usb generates SI_ASYNCIO it assumes the layout + * after the generic fields is: + * void __user *si_addr; + * + * This is a practical problem when there is a 64bit big endian kernel + * and a 32bit userspace. As the 32bit address will encoded in the low + * 32bits of the pointer. Those low 32bits will be stored at higher + * address than appear in a 32 bit pointer. So userspace will not + * see the address it was expecting for it's completions. + * + * There is nothing in the encoding that can allow + * copy_siginfo_to_user32 to detect this confusion of formats, so + * handle this by requiring the caller of kill_pid_usb_asyncio to + * notice when this situration takes place and to store the 32bit + * pointer in sival_int, instead of sival_addr of the sigval_t addr + * parameter. + */ +int kill_pid_usb_asyncio(int sig, int errno, sigval_t addr, + struct pid *pid, const struct cred *cred) { - int ret = -EINVAL; + struct kernel_siginfo info; struct task_struct *p; unsigned long flags; + int ret = -EINVAL; + + clear_siginfo(&info); + info.si_signo = sig; + info.si_errno = errno; + info.si_code = SI_ASYNCIO; + *((sigval_t *)&info.si_pid) = addr; if (!valid_signal(sig)) return ret; @@ -1456,17 +1487,17 @@ int kill_pid_info_as_cred(int sig, struct kernel_siginfo *info, struct pid *pid, ret = -ESRCH; goto out_unlock; } - if (si_fromuser(info) && !kill_as_cred_perm(cred, p)) { + if (!kill_as_cred_perm(cred, p)) { ret = -EPERM; goto out_unlock; } - ret = security_task_kill(p, info, sig, cred); + ret = security_task_kill(p, &info, sig, cred); if (ret) goto out_unlock; if (sig) { if (lock_task_sighand(p, &flags)) { - ret = __send_signal(sig, info, p, PIDTYPE_TGID, 0); + ret = __send_signal(sig, &info, p, PIDTYPE_TGID, 0); unlock_task_sighand(p, &flags); } else ret = -ESRCH; @@ -1475,7 +1506,7 @@ out_unlock: rcu_read_unlock(); return ret; } -EXPORT_SYMBOL_GPL(kill_pid_info_as_cred); +EXPORT_SYMBOL_GPL(kill_pid_usb_asyncio); /* * kill_something_info() interprets pid in interesting ways just like kill(2). @@ -4474,6 +4505,28 @@ static inline void siginfo_buildtime_checks(void) CHECK_OFFSET(si_syscall); CHECK_OFFSET(si_arch); #undef CHECK_OFFSET + + /* usb asyncio */ + BUILD_BUG_ON(offsetof(struct siginfo, si_pid) != + offsetof(struct siginfo, si_addr)); + if (sizeof(int) == sizeof(void __user *)) { + BUILD_BUG_ON(sizeof_field(struct siginfo, si_pid) != + sizeof(void __user *)); + } else { + BUILD_BUG_ON((sizeof_field(struct siginfo, si_pid) + + sizeof_field(struct siginfo, si_uid)) != + sizeof(void __user *)); + BUILD_BUG_ON(offsetofend(struct siginfo, si_pid) != + offsetof(struct siginfo, si_uid)); + } +#ifdef CONFIG_COMPAT + BUILD_BUG_ON(offsetof(struct compat_siginfo, si_pid) != + offsetof(struct compat_siginfo, si_addr)); + BUILD_BUG_ON(sizeof_field(struct compat_siginfo, si_pid) != + sizeof(compat_uptr_t)); + BUILD_BUG_ON(sizeof_field(struct compat_siginfo, si_pid) != + sizeof_field(struct siginfo, si_pid)); +#endif } void __init signals_init(void) -- cgit v1.2.3 From 7a0cf094944e2540758b7f957eb6846d5126f535 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 15 May 2019 22:54:56 -0500 Subject: signal: Correct namespace fixups of si_pid and si_uid The function send_signal was split from __send_signal so that it would be possible to bypass the namespace logic based upon current[1]. As it turns out the si_pid and the si_uid fixup are both inappropriate in the case of kill_pid_usb_asyncio so move that logic into send_signal. It is difficult to arrange but possible for a signal with an si_code of SI_TIMER or SI_SIGIO to be sent across namespace boundaries. In which case tests for when it is ok to change si_pid and si_uid based on SI_FROMUSER are incorrect. Replace the use of SI_FROMUSER with a new test has_si_pid_and_used based on siginfo_layout. Now that the uid fixup is no longer present after expanding SEND_SIG_NOINFO properly calculate the si_uid that the target task needs to read. [1] 7978b567d315 ("signals: add from_ancestor_ns parameter to send_signal()") Cc: stable@vger.kernel.org Fixes: 6588c1e3ff01 ("signals: SI_USER: Masquerade si_pid when crossing pid ns boundary") Fixes: 6b550f949594 ("user namespace: make signal.c respect user namespaces") Signed-off-by: "Eric W. Biederman" --- kernel/signal.c | 67 ++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 40 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 18040d6bd63a..39a3eca5ce22 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1056,27 +1056,6 @@ static inline bool legacy_queue(struct sigpending *signals, int sig) return (sig < SIGRTMIN) && sigismember(&signals->signal, sig); } -#ifdef CONFIG_USER_NS -static inline void userns_fixup_signal_uid(struct kernel_siginfo *info, struct task_struct *t) -{ - if (current_user_ns() == task_cred_xxx(t, user_ns)) - return; - - if (SI_FROMKERNEL(info)) - return; - - rcu_read_lock(); - info->si_uid = from_kuid_munged(task_cred_xxx(t, user_ns), - make_kuid(current_user_ns(), info->si_uid)); - rcu_read_unlock(); -} -#else -static inline void userns_fixup_signal_uid(struct kernel_siginfo *info, struct task_struct *t) -{ - return; -} -#endif - static int __send_signal(int sig, struct kernel_siginfo *info, struct task_struct *t, enum pid_type type, int from_ancestor_ns) { @@ -1134,7 +1113,11 @@ static int __send_signal(int sig, struct kernel_siginfo *info, struct task_struc q->info.si_code = SI_USER; q->info.si_pid = task_tgid_nr_ns(current, task_active_pid_ns(t)); - q->info.si_uid = from_kuid_munged(current_user_ns(), current_uid()); + rcu_read_lock(); + q->info.si_uid = + from_kuid_munged(task_cred_xxx(t, user_ns), + current_uid()); + rcu_read_unlock(); break; case (unsigned long) SEND_SIG_PRIV: clear_siginfo(&q->info); @@ -1146,13 +1129,8 @@ static int __send_signal(int sig, struct kernel_siginfo *info, struct task_struc break; default: copy_siginfo(&q->info, info); - if (from_ancestor_ns) - q->info.si_pid = 0; break; } - - userns_fixup_signal_uid(&q->info, t); - } else if (!is_si_special(info)) { if (sig >= SIGRTMIN && info->si_code != SI_USER) { /* @@ -1196,6 +1174,28 @@ ret: return ret; } +static inline bool has_si_pid_and_uid(struct kernel_siginfo *info) +{ + bool ret = false; + switch (siginfo_layout(info->si_signo, info->si_code)) { + case SIL_KILL: + case SIL_CHLD: + case SIL_RT: + ret = true; + break; + case SIL_TIMER: + case SIL_POLL: + case SIL_FAULT: + case SIL_FAULT_MCEERR: + case SIL_FAULT_BNDERR: + case SIL_FAULT_PKUERR: + case SIL_SYS: + ret = false; + break; + } + return ret; +} + static int send_signal(int sig, struct kernel_siginfo *info, struct task_struct *t, enum pid_type type) { @@ -1205,7 +1205,20 @@ static int send_signal(int sig, struct kernel_siginfo *info, struct task_struct from_ancestor_ns = si_fromuser(info) && !task_pid_nr_ns(current, task_active_pid_ns(t)); #endif + if (!is_si_special(info) && has_si_pid_and_uid(info)) { + struct user_namespace *t_user_ns; + rcu_read_lock(); + t_user_ns = task_cred_xxx(t, user_ns); + if (current_user_ns() != t_user_ns) { + kuid_t uid = make_kuid(current_user_ns(), info->si_uid); + info->si_uid = from_kuid_munged(t_user_ns, uid); + } + rcu_read_unlock(); + + if (!task_pid_nr_ns(current, task_active_pid_ns(t))) + info->si_pid = 0; + } return __send_signal(sig, info, t, type, from_ancestor_ns); } -- cgit v1.2.3 From b285fcb760da7aa87d6d31e6c6a4907d82d9299c Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 21 May 2019 20:14:19 -0700 Subject: bpf: bump jmp sequence limit The limit of 1024 subsequent jumps was causing otherwise valid programs to be rejected. Bump it to 8192 and make the error more verbose. Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 95f9354495ad..3f8b5443cc67 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -176,7 +176,7 @@ struct bpf_verifier_stack_elem { struct bpf_verifier_stack_elem *next; }; -#define BPF_COMPLEXITY_LIMIT_STACK 1024 +#define BPF_COMPLEXITY_LIMIT_JMP_SEQ 8192 #define BPF_COMPLEXITY_LIMIT_STATES 64 #define BPF_MAP_PTR_UNPRIV 1UL @@ -782,8 +782,9 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, if (err) goto err; elem->st.speculative |= speculative; - if (env->stack_size > BPF_COMPLEXITY_LIMIT_STACK) { - verbose(env, "BPF program is too complex\n"); + if (env->stack_size > BPF_COMPLEXITY_LIMIT_JMP_SEQ) { + verbose(env, "The sequence of %d jumps is too complex.\n", + env->stack_size); goto err; } return &elem->st; -- cgit v1.2.3 From 5d839021675a2e1b76653189cc6a90cfd8e30a69 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 21 May 2019 20:17:05 -0700 Subject: bpf: cleanup explored_states clean up explored_states to prep for introduction of hashtable No functional changes. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 3f8b5443cc67..736b5a0d4848 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5437,6 +5437,17 @@ enum { }; #define STATE_LIST_MARK ((struct bpf_verifier_state_list *) -1L) +static struct bpf_verifier_state_list **explored_state( + struct bpf_verifier_env *env, + int idx) +{ + return &env->explored_states[idx]; +} + +static void init_explored_state(struct bpf_verifier_env *env, int idx) +{ + env->explored_states[idx] = STATE_LIST_MARK; +} /* t, w, e - match pseudo-code above: * t - index of current instruction @@ -5462,7 +5473,7 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env) if (e == BRANCH) /* mark branch target for state pruning */ - env->explored_states[w] = STATE_LIST_MARK; + init_explored_state(env, w); if (insn_state[w] == 0) { /* tree-edge */ @@ -5530,9 +5541,9 @@ peek_stack: else if (ret < 0) goto err_free; if (t + 1 < insn_cnt) - env->explored_states[t + 1] = STATE_LIST_MARK; + init_explored_state(env, t + 1); if (insns[t].src_reg == BPF_PSEUDO_CALL) { - env->explored_states[t] = STATE_LIST_MARK; + init_explored_state(env, t); ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env); if (ret == 1) goto peek_stack; @@ -5555,10 +5566,10 @@ peek_stack: * after every call and jump */ if (t + 1 < insn_cnt) - env->explored_states[t + 1] = STATE_LIST_MARK; + init_explored_state(env, t + 1); } else { /* conditional jump with two edges */ - env->explored_states[t] = STATE_LIST_MARK; + init_explored_state(env, t); ret = push_insn(t, t + 1, FALLTHROUGH, env); if (ret == 1) goto peek_stack; @@ -6006,7 +6017,7 @@ static void clean_live_states(struct bpf_verifier_env *env, int insn, struct bpf_verifier_state_list *sl; int i; - sl = env->explored_states[insn]; + sl = *explored_state(env, insn); if (!sl) return; @@ -6365,7 +6376,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) struct bpf_verifier_state *cur = env->cur_state, *new; int i, j, err, states_cnt = 0; - pprev = &env->explored_states[insn_idx]; + pprev = explored_state(env, insn_idx); sl = *pprev; if (!sl) @@ -6452,8 +6463,8 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) kfree(new_sl); return err; } - new_sl->next = env->explored_states[insn_idx]; - env->explored_states[insn_idx] = new_sl; + new_sl->next = *explored_state(env, insn_idx); + *explored_state(env, insn_idx) = new_sl; /* connect new state to parentage chain. Current frame needs all * registers connected. Only r6 - r9 of the callers are alive (pushed * to the stack implicitly by JITs) so in callers' frames connect just -- cgit v1.2.3 From a8f500af0ccffc3d2aaf9018537981cb173865a1 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 21 May 2019 20:17:06 -0700 Subject: bpf: split explored_states split explored_states into prune_point boolean mark and link list of explored states. This removes STATE_LIST_MARK hack and allows marks to be separate from states. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 31 +++++++++++++------------------ 1 file changed, 13 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 736b5a0d4848..6a3e69ba891e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5436,7 +5436,6 @@ enum { BRANCH = 2, }; -#define STATE_LIST_MARK ((struct bpf_verifier_state_list *) -1L) static struct bpf_verifier_state_list **explored_state( struct bpf_verifier_env *env, int idx) @@ -5446,7 +5445,7 @@ static struct bpf_verifier_state_list **explored_state( static void init_explored_state(struct bpf_verifier_env *env, int idx) { - env->explored_states[idx] = STATE_LIST_MARK; + env->insn_aux_data[idx].prune_point = true; } /* t, w, e - match pseudo-code above: @@ -6018,10 +6017,7 @@ static void clean_live_states(struct bpf_verifier_env *env, int insn, int i; sl = *explored_state(env, insn); - if (!sl) - return; - - while (sl != STATE_LIST_MARK) { + while (sl) { if (sl->state.curframe != cur->curframe) goto next; for (i = 0; i <= cur->curframe; i++) @@ -6376,18 +6372,18 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) struct bpf_verifier_state *cur = env->cur_state, *new; int i, j, err, states_cnt = 0; - pprev = explored_state(env, insn_idx); - sl = *pprev; - - if (!sl) + if (!env->insn_aux_data[insn_idx].prune_point) /* this 'insn_idx' instruction wasn't marked, so we will not * be doing state search here */ return 0; + pprev = explored_state(env, insn_idx); + sl = *pprev; + clean_live_states(env, insn_idx, cur); - while (sl != STATE_LIST_MARK) { + while (sl) { if (states_equal(env, &sl->state, cur)) { sl->hit_cnt++; /* reached equivalent register/stack state, @@ -8145,13 +8141,12 @@ static void free_states(struct bpf_verifier_env *env) for (i = 0; i < env->prog->len; i++) { sl = env->explored_states[i]; - if (sl) - while (sl != STATE_LIST_MARK) { - sln = sl->next; - free_verifier_state(&sl->state, false); - kfree(sl); - sl = sln; - } + while (sl) { + sln = sl->next; + free_verifier_state(&sl->state, false); + kfree(sl); + sl = sln; + } } kvfree(env->explored_states); -- cgit v1.2.3 From dc2a4ebc0b44a212fcf72242210e56aa17e7317b Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 21 May 2019 20:17:07 -0700 Subject: bpf: convert explored_states to hash table All prune points inside a callee bpf function most likely will have different callsites. For example, if function foo() is called from two callsites the half of explored states in all prune points in foo() will be useless for subsequent walking of one of those callsites. Fortunately explored_states pruning heuristics keeps the number of states per prune point small, but walking these states is still a waste of cpu time when the callsite of the current state is different from the callsite of the explored state. To improve pruning logic convert explored_states into hash table and use simple insn_idx ^ callsite hash to select hash bucket. This optimization has no effect on programs without bpf2bpf calls and drastically improves programs with calls. In the later case it reduces total memory consumption in 1M scale tests by almost 3 times (peak_states drops from 5752 to 2016). Care should be taken when comparing the states for equivalency. Since the same hash bucket can now contain states with different indices the insn_idx has to be part of verifier_state and compared. Different hash table sizes and different hash functions were explored, but the results were not significantly better vs this patch. They can be improved in the future. Hit/miss heuristic is not counting index miscompare as a miss. Otherwise verifier stats become unstable when experimenting with different hash functions. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6a3e69ba891e..550091c7a46a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5436,11 +5436,19 @@ enum { BRANCH = 2, }; +static u32 state_htab_size(struct bpf_verifier_env *env) +{ + return env->prog->len; +} + static struct bpf_verifier_state_list **explored_state( struct bpf_verifier_env *env, int idx) { - return &env->explored_states[idx]; + struct bpf_verifier_state *cur = env->cur_state; + struct bpf_func_state *state = cur->frame[cur->curframe]; + + return &env->explored_states[(idx ^ state->callsite) % state_htab_size(env)]; } static void init_explored_state(struct bpf_verifier_env *env, int idx) @@ -6018,7 +6026,8 @@ static void clean_live_states(struct bpf_verifier_env *env, int insn, sl = *explored_state(env, insn); while (sl) { - if (sl->state.curframe != cur->curframe) + if (sl->state.insn_idx != insn || + sl->state.curframe != cur->curframe) goto next; for (i = 0; i <= cur->curframe; i++) if (sl->state.frame[i]->callsite != cur->frame[i]->callsite) @@ -6384,6 +6393,9 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) clean_live_states(env, insn_idx, cur); while (sl) { + states_cnt++; + if (sl->state.insn_idx != insn_idx) + goto next; if (states_equal(env, &sl->state, cur)) { sl->hit_cnt++; /* reached equivalent register/stack state, @@ -6401,7 +6413,6 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) return err; return 1; } - states_cnt++; sl->miss_cnt++; /* heuristic to determine whether this state is beneficial * to keep checking from state equivalence point of view. @@ -6428,6 +6439,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) sl = *pprev; continue; } +next: pprev = &sl->next; sl = *pprev; } @@ -6459,6 +6471,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) kfree(new_sl); return err; } + new->insn_idx = insn_idx; new_sl->next = *explored_state(env, insn_idx); *explored_state(env, insn_idx) = new_sl; /* connect new state to parentage chain. Current frame needs all @@ -8138,7 +8151,7 @@ static void free_states(struct bpf_verifier_env *env) if (!env->explored_states) return; - for (i = 0; i < env->prog->len; i++) { + for (i = 0; i < state_htab_size(env); i++) { sl = env->explored_states[i]; while (sl) { @@ -8246,7 +8259,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, goto skip_full_check; } - env->explored_states = kvcalloc(env->prog->len, + env->explored_states = kvcalloc(state_htab_size(env), sizeof(struct bpf_verifier_state_list *), GFP_USER); ret = -ENOMEM; -- cgit v1.2.3 From ecc68904a3e5efb07cb4f0b97d15c7e0e4623d13 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Wed, 22 May 2019 17:51:09 -0400 Subject: audit: re-structure audit field valid checks Multiple checks were being done in one switch case statement that started to cause some redundancies and awkward exceptions. Separate the valid field and op check from the select valid values checks. Enforce the elimination of meaningless bitwise and greater/lessthan checks on string fields and other fields with unrelated scalar values. Please see the github issue https://github.com/linux-audit/audit-kernel/issues/73 Signed-off-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/auditfilter.c | 56 +++++++++++++++++++++++++++++++--------------------- 1 file changed, 34 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 303fb04770ce..d5e54e944f72 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -335,7 +335,7 @@ static u32 audit_to_op(u32 op) /* check if an audit field is valid */ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f) { - switch(f->type) { + switch (f->type) { case AUDIT_MSGTYPE: if (entry->rule.listnr != AUDIT_FILTER_EXCLUDE && entry->rule.listnr != AUDIT_FILTER_USER) @@ -347,7 +347,7 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f) break; } - switch(entry->rule.listnr) { + switch (entry->rule.listnr) { case AUDIT_FILTER_FS: switch(f->type) { case AUDIT_FSTYPE: @@ -358,9 +358,16 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f) } } - switch(f->type) { - default: - return -EINVAL; + /* Check for valid field type and op */ + switch (f->type) { + case AUDIT_ARG0: + case AUDIT_ARG1: + case AUDIT_ARG2: + case AUDIT_ARG3: + case AUDIT_PERS: /* */ + case AUDIT_DEVMINOR: + /* all ops are valid */ + break; case AUDIT_UID: case AUDIT_EUID: case AUDIT_SUID: @@ -373,46 +380,52 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f) case AUDIT_FSGID: case AUDIT_OBJ_GID: case AUDIT_PID: - case AUDIT_PERS: case AUDIT_MSGTYPE: case AUDIT_PPID: case AUDIT_DEVMAJOR: - case AUDIT_DEVMINOR: case AUDIT_EXIT: case AUDIT_SUCCESS: case AUDIT_INODE: case AUDIT_SESSIONID: + case AUDIT_SUBJ_SEN: + case AUDIT_SUBJ_CLR: + case AUDIT_OBJ_LEV_LOW: + case AUDIT_OBJ_LEV_HIGH: /* bit ops are only useful on syscall args */ if (f->op == Audit_bitmask || f->op == Audit_bittest) return -EINVAL; break; - case AUDIT_ARG0: - case AUDIT_ARG1: - case AUDIT_ARG2: - case AUDIT_ARG3: case AUDIT_SUBJ_USER: case AUDIT_SUBJ_ROLE: case AUDIT_SUBJ_TYPE: - case AUDIT_SUBJ_SEN: - case AUDIT_SUBJ_CLR: case AUDIT_OBJ_USER: case AUDIT_OBJ_ROLE: case AUDIT_OBJ_TYPE: - case AUDIT_OBJ_LEV_LOW: - case AUDIT_OBJ_LEV_HIGH: case AUDIT_WATCH: case AUDIT_DIR: case AUDIT_FILTERKEY: - break; case AUDIT_LOGINUID_SET: - if ((f->val != 0) && (f->val != 1)) - return -EINVAL; - /* FALL THROUGH */ case AUDIT_ARCH: case AUDIT_FSTYPE: + case AUDIT_PERM: + case AUDIT_FILETYPE: + case AUDIT_FIELD_COMPARE: + case AUDIT_EXE: + /* only equal and not equal valid ops */ if (f->op != Audit_not_equal && f->op != Audit_equal) return -EINVAL; break; + default: + /* field not recognized */ + return -EINVAL; + } + + /* Check for select valid field values */ + switch (f->type) { + case AUDIT_LOGINUID_SET: + if ((f->val != 0) && (f->val != 1)) + return -EINVAL; + break; case AUDIT_PERM: if (f->val & ~15) return -EINVAL; @@ -425,11 +438,10 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f) if (f->val > AUDIT_MAX_FIELD_COMPARE) return -EINVAL; break; - case AUDIT_EXE: - if (f->op != Audit_not_equal && f->op != Audit_equal) - return -EINVAL; + default: break; } + return 0; } -- cgit v1.2.3 From bf361231c295d92a28ca283ea713f56e93e55796 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Thu, 9 May 2019 20:01:36 -0400 Subject: audit: add saddr_fam filter field Provide a method to filter out sockaddr and bind calls by network address family. Existing SOCKADDR records are listed for any network activity. Implement the AUDIT_SADDR_FAM field selector to be able to classify or limit records to specific network address families, such as AF_INET or AF_INET6. An example of a network record that is unlikely to be useful and flood the logs: type=SOCKADDR msg=audit(07/27/2017 12:18:27.019:845) : saddr={ fam=local path=/var/run/nscd/socket } type=SYSCALL msg=audit(07/27/2017 12:18:27.019:845) : arch=x86_64 syscall=connect success=no exit=ENOENT(No such file or directory) a0=0x3 a1=0x7fff229c4980 a2=0x6e a3=0x6 items=1 ppid=3301 pid=6145 auid=sgrubb uid=sgrubb gid=sgrubb euid=sgrubb suid=sgrubb fsuid=sgrubb egid=sgrubb sgid=sgrubb fsgid=sgrubb tty=pts3 ses=4 comm=bash exe=/usr/bin/bash subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 key=network-test Please see the audit-testsuite PR at https://github.com/linux-audit/audit-testsuite/pull/87 Please see the github issue https://github.com/linux-audit/audit-kernel/issues/64 Please see the github issue for the accompanying userspace support https://github.com/linux-audit/audit-userspace/issues/93 Signed-off-by: Richard Guy Briggs [PM: merge fuzz in auditfilter.c] Signed-off-by: Paul Moore --- kernel/auditfilter.c | 5 +++++ kernel/auditsc.c | 5 +++++ 2 files changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index d5e54e944f72..e69d136eeaf6 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -391,6 +391,7 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f) case AUDIT_SUBJ_CLR: case AUDIT_OBJ_LEV_LOW: case AUDIT_OBJ_LEV_HIGH: + case AUDIT_SADDR_FAM: /* bit ops are only useful on syscall args */ if (f->op == Audit_bitmask || f->op == Audit_bittest) return -EINVAL; @@ -438,6 +439,10 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f) if (f->val > AUDIT_MAX_FIELD_COMPARE) return -EINVAL; break; + case AUDIT_SADDR_FAM: + if (f->val >= AF_MAX) + return -EINVAL; + break; default: break; } diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 30aa07b0115f..9134fe11ff6c 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -615,6 +615,11 @@ static int audit_filter_rules(struct task_struct *tsk, case AUDIT_LOGINUID_SET: result = audit_comparator(audit_loginuid_set(tsk), f->op, f->val); break; + case AUDIT_SADDR_FAM: + if (ctx->sockaddr) + result = audit_comparator(ctx->sockaddr->ss_family, + f->op, f->val); + break; case AUDIT_SUBJ_USER: case AUDIT_SUBJ_ROLE: case AUDIT_SUBJ_TYPE: -- cgit v1.2.3 From 978315462d3ea3cf6cfacd34c563ec1eb02a3aa5 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 17 May 2019 23:22:34 +0200 Subject: locking/lockdep: Don't complain about incorrect name for no validate class It is possible to ignore the validation for a certain lock by using: lockdep_set_novalidate_class() on it. Each invocation will assign a new name to the class it created for created __lockdep_no_validate__. That means that once lockdep_set_novalidate_class() has been used on two locks then class->name won't match lock->name for the first lock triggering the warning. So ignore changed non-matching ->name pointer for the special __lockdep_no_validate__ class. Signed-off-by: Sebastian Andrzej Siewior Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Link: http://lkml.kernel.org/r/20190517212234.32611-1-bigeasy@linutronix.de Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index c47788fa85f9..6b283b4f87aa 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -732,7 +732,8 @@ look_up_lock_class(const struct lockdep_map *lock, unsigned int subclass) * Huh! same key, different name? Did someone trample * on some memory? We're most confused. */ - WARN_ON_ONCE(class->name != lock->name); + WARN_ON_ONCE(class->name != lock->name && + lock->key != &__lockdep_no_validate__); return class; } } -- cgit v1.2.3 From c0090c4c85c27d1fa3d785c935501b7207cd2869 Mon Sep 17 00:00:00 2001 From: Anders Roxell Date: Thu, 16 May 2019 21:13:26 +0200 Subject: locking/lockdep: Remove the unused print_lock_trace() function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit gcc warns that function print_lock_trace() is unused if CONFIG_PROVE_LOCKING isn't set: ../kernel/locking/lockdep.c:2820:13: warning: ‘print_lock_trace’ defined but not used [-Wunused-function] Rework so we remove the function if CONFIG_PROVE_LOCKING isn't set. Signed-off-by: Anders Roxell Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: will.deacon@arm.com Fixes: c120bce78065 ("lockdep: Simplify stack trace handling") Link: http://lkml.kernel.org/r/20190516191326.27003-1-anders.roxell@linaro.org Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 6b283b4f87aa..8d32ae7768a7 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -2818,10 +2818,6 @@ static inline int validate_chain(struct task_struct *curr, { return 1; } - -static void print_lock_trace(struct lock_trace *trace, unsigned int spaces) -{ -} #endif /* -- cgit v1.2.3 From 8b401f9ed2441ad9e219953927a842d24ed051fc Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 23 May 2019 14:47:45 -0700 Subject: bpf: implement bpf_send_signal() helper This patch tries to solve the following specific use case. Currently, bpf program can already collect stack traces through kernel function get_perf_callchain() when certain events happens (e.g., cache miss counter or cpu clock counter overflows). But such stack traces are not enough for jitted programs, e.g., hhvm (jited php). To get real stack trace, jit engine internal data structures need to be traversed in order to get the real user functions. bpf program itself may not be the best place to traverse the jit engine as the traversing logic could be complex and it is not a stable interface either. Instead, hhvm implements a signal handler, e.g. for SIGALARM, and a set of program locations which it can dump stack traces. When it receives a signal, it will dump the stack in next such program location. Such a mechanism can be implemented in the following way: . a perf ring buffer is created between bpf program and tracing app. . once a particular event happens, bpf program writes to the ring buffer and the tracing app gets notified. . the tracing app sends a signal SIGALARM to the hhvm. But this method could have large delays and causing profiling results skewed. This patch implements bpf_send_signal() helper to send a signal to hhvm in real time, resulting in intended stack traces. Acked-by: Andrii Nakryiko Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann --- kernel/trace/bpf_trace.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index f92d6ad5e080..70029eafc71f 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -567,6 +567,63 @@ static const struct bpf_func_proto bpf_probe_read_str_proto = { .arg3_type = ARG_ANYTHING, }; +struct send_signal_irq_work { + struct irq_work irq_work; + struct task_struct *task; + u32 sig; +}; + +static DEFINE_PER_CPU(struct send_signal_irq_work, send_signal_work); + +static void do_bpf_send_signal(struct irq_work *entry) +{ + struct send_signal_irq_work *work; + + work = container_of(entry, struct send_signal_irq_work, irq_work); + group_send_sig_info(work->sig, SEND_SIG_PRIV, work->task, PIDTYPE_TGID); +} + +BPF_CALL_1(bpf_send_signal, u32, sig) +{ + struct send_signal_irq_work *work = NULL; + + /* Similar to bpf_probe_write_user, task needs to be + * in a sound condition and kernel memory access be + * permitted in order to send signal to the current + * task. + */ + if (unlikely(current->flags & (PF_KTHREAD | PF_EXITING))) + return -EPERM; + if (unlikely(uaccess_kernel())) + return -EPERM; + if (unlikely(!nmi_uaccess_okay())) + return -EPERM; + + if (in_nmi()) { + work = this_cpu_ptr(&send_signal_work); + if (work->irq_work.flags & IRQ_WORK_BUSY) + return -EBUSY; + + /* Add the current task, which is the target of sending signal, + * to the irq_work. The current task may change when queued + * irq works get executed. + */ + work->task = current; + work->sig = sig; + irq_work_queue(&work->irq_work); + return 0; + } + + return group_send_sig_info(sig, SEND_SIG_PRIV, current, PIDTYPE_TGID); +} + +static const struct bpf_func_proto bpf_send_signal_proto = { + .func = bpf_send_signal, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, +}; + static const struct bpf_func_proto * tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -617,6 +674,8 @@ tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_get_current_cgroup_id: return &bpf_get_current_cgroup_id_proto; #endif + case BPF_FUNC_send_signal: + return &bpf_send_signal_proto; default: return NULL; } @@ -1343,5 +1402,18 @@ static int __init bpf_event_init(void) return 0; } +static int __init send_signal_irq_work_init(void) +{ + int cpu; + struct send_signal_irq_work *work; + + for_each_possible_cpu(cpu) { + work = per_cpu_ptr(&send_signal_work, cpu); + init_irq_work(&work->irq_work, do_bpf_send_signal); + } + return 0; +} + fs_initcall(bpf_event_init); +subsys_initcall(send_signal_irq_work_init); #endif /* CONFIG_MODULES */ -- cgit v1.2.3 From 5327ed3d44b754f5cc51d5b3f18e442eaebacff5 Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Fri, 24 May 2019 23:25:12 +0100 Subject: bpf: verifier: mark verified-insn with sub-register zext flag eBPF ISA specification requires high 32-bit cleared when low 32-bit sub-register is written. This applies to destination register of ALU32 etc. JIT back-ends must guarantee this semantic when doing code-gen. x86_64 and AArch64 ISA has the same semantics, so the corresponding JIT back-end doesn't need to do extra work. However, 32-bit arches (arm, x86, nfp etc.) and some other 64-bit arches (PowerPC, SPARC etc) need to do explicit zero extension to meet this requirement, otherwise code like the following will fail. u64_value = (u64) u32_value ... other uses of u64_value This is because compiler could exploit the semantic described above and save those zero extensions for extending u32_value to u64_value, these JIT back-ends are expected to guarantee this through inserting extra zero extensions which however could be a significant increase on the code size. Some benchmarks show there could be ~40% sub-register writes out of total insns, meaning at least ~40% extra code-gen. One observation is these extra zero extensions are not always necessary. Take above code snippet for example, it is possible u32_value will never be casted into a u64, the value of high 32-bit of u32_value then could be ignored and extra zero extension could be eliminated. This patch implements this idea, insns defining sub-registers will be marked when the high 32-bit of the defined sub-register matters. For those unmarked insns, it is safe to eliminate high 32-bit clearnace for them. Algo: - Split read flags into READ32 and READ64. - Record index of insn that does sub-register write. Keep the index inside reg state and update it during verifier insn walking. - A full register read on a sub-register marks its definition insn as needing zero extension on dst register. A new sub-register write overrides the old one. - When propagating read64 during path pruning, also mark any insn defining a sub-register that is read in the pruned path as full-register. Reviewed-by: Jakub Kicinski Signed-off-by: Jiong Wang Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 173 ++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 160 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 550091c7a46a..f6b4c7148c3e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -982,6 +982,7 @@ static void mark_reg_not_init(struct bpf_verifier_env *env, __mark_reg_not_init(regs + regno); } +#define DEF_NOT_SUBREG (0) static void init_reg_state(struct bpf_verifier_env *env, struct bpf_func_state *state) { @@ -992,6 +993,7 @@ static void init_reg_state(struct bpf_verifier_env *env, mark_reg_not_init(env, regs, i); regs[i].live = REG_LIVE_NONE; regs[i].parent = NULL; + regs[i].subreg_def = DEF_NOT_SUBREG; } /* frame pointer */ @@ -1137,7 +1139,7 @@ next: */ static int mark_reg_read(struct bpf_verifier_env *env, const struct bpf_reg_state *state, - struct bpf_reg_state *parent) + struct bpf_reg_state *parent, u8 flag) { bool writes = parent == state->parent; /* Observe write marks */ int cnt = 0; @@ -1152,17 +1154,26 @@ static int mark_reg_read(struct bpf_verifier_env *env, parent->var_off.value, parent->off); return -EFAULT; } - if (parent->live & REG_LIVE_READ) + /* The first condition is more likely to be true than the + * second, checked it first. + */ + if ((parent->live & REG_LIVE_READ) == flag || + parent->live & REG_LIVE_READ64) /* The parentage chain never changes and * this parent was already marked as LIVE_READ. * There is no need to keep walking the chain again and * keep re-marking all parents as LIVE_READ. * This case happens when the same register is read * multiple times without writes into it in-between. + * Also, if parent has the stronger REG_LIVE_READ64 set, + * then no need to set the weak REG_LIVE_READ32. */ break; /* ... then we depend on parent's value */ - parent->live |= REG_LIVE_READ; + parent->live |= flag; + /* REG_LIVE_READ64 overrides REG_LIVE_READ32. */ + if (flag == REG_LIVE_READ64) + parent->live &= ~REG_LIVE_READ32; state = parent; parent = state->parent; writes = true; @@ -1174,12 +1185,111 @@ static int mark_reg_read(struct bpf_verifier_env *env, return 0; } +/* This function is supposed to be used by the following 32-bit optimization + * code only. It returns TRUE if the source or destination register operates + * on 64-bit, otherwise return FALSE. + */ +static bool is_reg64(struct bpf_verifier_env *env, struct bpf_insn *insn, + u32 regno, struct bpf_reg_state *reg, enum reg_arg_type t) +{ + u8 code, class, op; + + code = insn->code; + class = BPF_CLASS(code); + op = BPF_OP(code); + if (class == BPF_JMP) { + /* BPF_EXIT for "main" will reach here. Return TRUE + * conservatively. + */ + if (op == BPF_EXIT) + return true; + if (op == BPF_CALL) { + /* BPF to BPF call will reach here because of marking + * caller saved clobber with DST_OP_NO_MARK for which we + * don't care the register def because they are anyway + * marked as NOT_INIT already. + */ + if (insn->src_reg == BPF_PSEUDO_CALL) + return false; + /* Helper call will reach here because of arg type + * check, conservatively return TRUE. + */ + if (t == SRC_OP) + return true; + + return false; + } + } + + if (class == BPF_ALU64 || class == BPF_JMP || + /* BPF_END always use BPF_ALU class. */ + (class == BPF_ALU && op == BPF_END && insn->imm == 64)) + return true; + + if (class == BPF_ALU || class == BPF_JMP32) + return false; + + if (class == BPF_LDX) { + if (t != SRC_OP) + return BPF_SIZE(code) == BPF_DW; + /* LDX source must be ptr. */ + return true; + } + + if (class == BPF_STX) { + if (reg->type != SCALAR_VALUE) + return true; + return BPF_SIZE(code) == BPF_DW; + } + + if (class == BPF_LD) { + u8 mode = BPF_MODE(code); + + /* LD_IMM64 */ + if (mode == BPF_IMM) + return true; + + /* Both LD_IND and LD_ABS return 32-bit data. */ + if (t != SRC_OP) + return false; + + /* Implicit ctx ptr. */ + if (regno == BPF_REG_6) + return true; + + /* Explicit source could be any width. */ + return true; + } + + if (class == BPF_ST) + /* The only source register for BPF_ST is a ptr. */ + return true; + + /* Conservatively return true at default. */ + return true; +} + +static void mark_insn_zext(struct bpf_verifier_env *env, + struct bpf_reg_state *reg) +{ + s32 def_idx = reg->subreg_def; + + if (def_idx == DEF_NOT_SUBREG) + return; + + env->insn_aux_data[def_idx - 1].zext_dst = true; + /* The dst will be zero extended, so won't be sub-register anymore. */ + reg->subreg_def = DEF_NOT_SUBREG; +} + static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, enum reg_arg_type t) { struct bpf_verifier_state *vstate = env->cur_state; struct bpf_func_state *state = vstate->frame[vstate->curframe]; + struct bpf_insn *insn = env->prog->insnsi + env->insn_idx; struct bpf_reg_state *reg, *regs = state->regs; + bool rw64; if (regno >= MAX_BPF_REG) { verbose(env, "R%d is invalid\n", regno); @@ -1187,6 +1297,7 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, } reg = ®s[regno]; + rw64 = is_reg64(env, insn, regno, reg, t); if (t == SRC_OP) { /* check whether register used as source operand can be read */ if (reg->type == NOT_INIT) { @@ -1197,7 +1308,11 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, if (regno == BPF_REG_FP) return 0; - return mark_reg_read(env, reg, reg->parent); + if (rw64) + mark_insn_zext(env, reg); + + return mark_reg_read(env, reg, reg->parent, + rw64 ? REG_LIVE_READ64 : REG_LIVE_READ32); } else { /* check whether register used as dest operand can be written to */ if (regno == BPF_REG_FP) { @@ -1205,6 +1320,7 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, return -EACCES; } reg->live |= REG_LIVE_WRITTEN; + reg->subreg_def = rw64 ? DEF_NOT_SUBREG : env->insn_idx + 1; if (t == DST_OP) mark_reg_unknown(env, regs, regno); } @@ -1384,7 +1500,8 @@ static int check_stack_read(struct bpf_verifier_env *env, state->regs[value_regno].live |= REG_LIVE_WRITTEN; } mark_reg_read(env, ®_state->stack[spi].spilled_ptr, - reg_state->stack[spi].spilled_ptr.parent); + reg_state->stack[spi].spilled_ptr.parent, + REG_LIVE_READ64); return 0; } else { int zeros = 0; @@ -1401,7 +1518,8 @@ static int check_stack_read(struct bpf_verifier_env *env, return -EACCES; } mark_reg_read(env, ®_state->stack[spi].spilled_ptr, - reg_state->stack[spi].spilled_ptr.parent); + reg_state->stack[spi].spilled_ptr.parent, + REG_LIVE_READ64); if (value_regno >= 0) { if (zeros == size) { /* any size read into register is zero extended, @@ -2110,6 +2228,12 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn value_regno); if (reg_type_may_be_null(reg_type)) regs[value_regno].id = ++env->id_gen; + /* A load of ctx field could have different + * actual load size with the one encoded in the + * insn. When the dst is PTR, it is for sure not + * a sub-register. + */ + regs[value_regno].subreg_def = DEF_NOT_SUBREG; } regs[value_regno].type = reg_type; } @@ -2369,7 +2493,8 @@ mark: * the whole slot to be marked as 'read' */ mark_reg_read(env, &state->stack[spi].spilled_ptr, - state->stack[spi].spilled_ptr.parent); + state->stack[spi].spilled_ptr.parent, + REG_LIVE_READ64); } return update_stack_depth(env, state, min_off); } @@ -3333,6 +3458,9 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK); } + /* helper call returns 64-bit value. */ + regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG; + /* update return register (already marked as written above) */ if (fn->ret_type == RET_INTEGER) { /* sets type to SCALAR_VALUE */ @@ -4264,6 +4392,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) */ *dst_reg = *src_reg; dst_reg->live |= REG_LIVE_WRITTEN; + dst_reg->subreg_def = DEF_NOT_SUBREG; } else { /* R1 = (u32) R2 */ if (is_pointer_value(env, insn->src_reg)) { @@ -4274,6 +4403,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } else if (src_reg->type == SCALAR_VALUE) { *dst_reg = *src_reg; dst_reg->live |= REG_LIVE_WRITTEN; + dst_reg->subreg_def = env->insn_idx + 1; } else { mark_reg_unknown(env, regs, insn->dst_reg); @@ -5353,6 +5483,8 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) * Already marked as written above. */ mark_reg_unknown(env, regs, BPF_REG_0); + /* ld_abs load up to 32-bit skb data. */ + regs[BPF_REG_0].subreg_def = env->insn_idx + 1; return 0; } @@ -6309,20 +6441,33 @@ static bool states_equal(struct bpf_verifier_env *env, return true; } +/* Return 0 if no propagation happened. Return negative error code if error + * happened. Otherwise, return the propagated bit. + */ static int propagate_liveness_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, struct bpf_reg_state *parent_reg) { + u8 parent_flag = parent_reg->live & REG_LIVE_READ; + u8 flag = reg->live & REG_LIVE_READ; int err; - if (parent_reg->live & REG_LIVE_READ || !(reg->live & REG_LIVE_READ)) + /* When comes here, read flags of PARENT_REG or REG could be any of + * REG_LIVE_READ64, REG_LIVE_READ32, REG_LIVE_NONE. There is no need + * of propagation if PARENT_REG has strongest REG_LIVE_READ64. + */ + if (parent_flag == REG_LIVE_READ64 || + /* Or if there is no read flag from REG. */ + !flag || + /* Or if the read flag from REG is the same as PARENT_REG. */ + parent_flag == flag) return 0; - err = mark_reg_read(env, reg, parent_reg); + err = mark_reg_read(env, reg, parent_reg, flag); if (err) return err; - return 0; + return flag; } /* A write screens off any subsequent reads; but write marks come from the @@ -6356,8 +6501,10 @@ static int propagate_liveness(struct bpf_verifier_env *env, for (i = frame < vstate->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++) { err = propagate_liveness_reg(env, &state_reg[i], &parent_reg[i]); - if (err) + if (err < 0) return err; + if (err == REG_LIVE_READ64) + mark_insn_zext(env, &parent_reg[i]); } /* Propagate stack slots. */ @@ -6367,11 +6514,11 @@ static int propagate_liveness(struct bpf_verifier_env *env, state_reg = &state->stack[i].spilled_ptr; err = propagate_liveness_reg(env, state_reg, parent_reg); - if (err) + if (err < 0) return err; } } - return err; + return 0; } static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) -- cgit v1.2.3 From b325fbca4b136886885e51f4d36e2adab76596e3 Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Fri, 24 May 2019 23:25:13 +0100 Subject: bpf: verifier: mark patched-insn with sub-register zext flag Patched insns do not go through generic verification, therefore doesn't has zero extension information collected during insn walking. We don't bother analyze them at the moment, for any sub-register def comes from them, just conservatively mark it as needing zero extension. Signed-off-by: Jiong Wang Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 37 +++++++++++++++++++++++++++++++++---- 1 file changed, 33 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f6b4c7148c3e..a6af3166acae 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1269,6 +1269,24 @@ static bool is_reg64(struct bpf_verifier_env *env, struct bpf_insn *insn, return true; } +/* Return TRUE if INSN doesn't have explicit value define. */ +static bool insn_no_def(struct bpf_insn *insn) +{ + u8 class = BPF_CLASS(insn->code); + + return (class == BPF_JMP || class == BPF_JMP32 || + class == BPF_STX || class == BPF_ST); +} + +/* Return TRUE if INSN has defined any 32-bit value explicitly. */ +static bool insn_has_def32(struct bpf_verifier_env *env, struct bpf_insn *insn) +{ + if (insn_no_def(insn)) + return false; + + return !is_reg64(env, insn, insn->dst_reg, NULL, DST_OP); +} + static void mark_insn_zext(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { @@ -7298,14 +7316,23 @@ static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env) * insni[off, off + cnt). Adjust corresponding insn_aux_data by copying * [0, off) and [off, end) to new locations, so the patched range stays zero */ -static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len, - u32 off, u32 cnt) +static int adjust_insn_aux_data(struct bpf_verifier_env *env, + struct bpf_prog *new_prog, u32 off, u32 cnt) { struct bpf_insn_aux_data *new_data, *old_data = env->insn_aux_data; + struct bpf_insn *insn = new_prog->insnsi; + u32 prog_len; int i; + /* aux info at OFF always needs adjustment, no matter fast path + * (cnt == 1) is taken or not. There is no guarantee INSN at OFF is the + * original insn at old prog. + */ + old_data[off].zext_dst = insn_has_def32(env, insn + off + cnt - 1); + if (cnt == 1) return 0; + prog_len = new_prog->len; new_data = vzalloc(array_size(prog_len, sizeof(struct bpf_insn_aux_data))); if (!new_data) @@ -7313,8 +7340,10 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len, memcpy(new_data, old_data, sizeof(struct bpf_insn_aux_data) * off); memcpy(new_data + off + cnt - 1, old_data + off, sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1)); - for (i = off; i < off + cnt - 1; i++) + for (i = off; i < off + cnt - 1; i++) { new_data[i].seen = true; + new_data[i].zext_dst = insn_has_def32(env, insn + i); + } env->insn_aux_data = new_data; vfree(old_data); return 0; @@ -7347,7 +7376,7 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of env->insn_aux_data[off].orig_idx); return NULL; } - if (adjust_insn_aux_data(env, new_prog->len, off, len)) + if (adjust_insn_aux_data(env, new_prog, off, len)) return NULL; adjust_subprog_starts(env, off, len); return new_prog; -- cgit v1.2.3 From a4b1d3c1ddf6cb441187b6c130a473c16a05a356 Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Fri, 24 May 2019 23:25:15 +0100 Subject: bpf: verifier: insert zero extension according to analysis result After previous patches, verifier will mark a insn if it really needs zero extension on dst_reg. It is then for back-ends to decide how to use such information to eliminate unnecessary zero extension code-gen during JIT compilation. One approach is verifier insert explicit zero extension for those insns that need zero extension in a generic way, JIT back-ends then do not generate zero extension for sub-register write at default. However, only those back-ends which do not have hardware zero extension want this optimization. Back-ends like x86_64 and AArch64 have hardware zero extension support that the insertion should be disabled. This patch introduces new target hook "bpf_jit_needs_zext" which returns false at default, meaning verifier zero extension insertion is disabled at default. A back-end could override this hook to return true if it doesn't have hardware support and want verifier insert zero extension explicitly. Offload targets do not use this native target hook, instead, they could get the optimization results using bpf_prog_offload_ops.finalize. NOTE: arches could have diversified features, it is possible for one arch to have hardware zero extension support for some sub-register write insns but not for all. For example, PowerPC, SPARC have zero extended loads, but not for alu32. So when verifier zero extension insertion enabled, these JIT back-ends need to peephole insns to remove those zero extension inserted for insn that actually has hardware zero extension support. The peephole could be as simple as looking the next insn, if it is a special zero extension insn then it is safe to eliminate it if the current insn has hardware zero extension support. Reviewed-by: Jakub Kicinski Signed-off-by: Jiong Wang Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 9 +++++++++ kernel/bpf/verifier.c | 41 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 242a643af82f..3675b19ecb90 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2090,6 +2090,15 @@ bool __weak bpf_helper_changes_pkt_data(void *func) return false; } +/* Return TRUE if the JIT backend wants verifier to enable sub-register usage + * analysis code and wants explicit zero extension inserted by verifier. + * Otherwise, return FALSE. + */ +bool __weak bpf_jit_needs_zext(void) +{ + return false; +} + /* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call * skb_copy_bits(), so provide a weak definition of it for NET-less config. */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a6af3166acae..d4394a84b9eb 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7640,6 +7640,38 @@ static int opt_remove_nops(struct bpf_verifier_env *env) return 0; } +static int opt_subreg_zext_lo32(struct bpf_verifier_env *env) +{ + struct bpf_insn_aux_data *aux = env->insn_aux_data; + struct bpf_insn *insns = env->prog->insnsi; + int i, delta = 0, len = env->prog->len; + struct bpf_insn zext_patch[2]; + struct bpf_prog *new_prog; + + zext_patch[1] = BPF_ZEXT_REG(0); + for (i = 0; i < len; i++) { + int adj_idx = i + delta; + struct bpf_insn insn; + + if (!aux[adj_idx].zext_dst) + continue; + + insn = insns[adj_idx]; + zext_patch[0] = insn; + zext_patch[1].dst_reg = insn.dst_reg; + zext_patch[1].src_reg = insn.dst_reg; + new_prog = bpf_patch_insn_data(env, adj_idx, zext_patch, 2); + if (!new_prog) + return -ENOMEM; + env->prog = new_prog; + insns = new_prog->insnsi; + aux = env->insn_aux_data; + delta += 2; + } + + return 0; +} + /* convert load instructions that access fields of a context type into a * sequence of instructions that access fields of the underlying structure: * struct __sk_buff -> struct sk_buff @@ -8490,6 +8522,15 @@ skip_full_check: if (ret == 0) ret = fixup_bpf_calls(env); + /* do 32-bit optimization after insn patching has done so those patched + * insns could be handled correctly. + */ + if (ret == 0 && bpf_jit_needs_zext() && + !bpf_prog_is_dev_bound(env->prog->aux)) { + ret = opt_subreg_zext_lo32(env); + env->prog->aux->verifier_zext = !ret; + } + if (ret == 0) ret = fixup_call_args(env); -- cgit v1.2.3 From c240eff63a1cf1c4edc768e0cfc374811c02f069 Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Fri, 24 May 2019 23:25:16 +0100 Subject: bpf: introduce new bpf prog load flags "BPF_F_TEST_RND_HI32" x86_64 and AArch64 perhaps are two arches that running bpf testsuite frequently, however the zero extension insertion pass is not enabled for them because of their hardware support. It is critical to guarantee the pass correction as it is supposed to be enabled at default for a couple of other arches, for example PowerPC, SPARC, arm, NFP etc. Therefore, it would be very useful if there is a way to test this pass on for example x86_64. The test methodology employed by this set is "poisoning" useless bits. High 32-bit of a definition is randomized if it is identified as not used by any later insn. Such randomization is only enabled under testing mode which is gated by the new bpf prog load flags "BPF_F_TEST_RND_HI32". Suggested-by: Alexei Starovoitov Signed-off-by: Jiong Wang Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index cb5440b02e82..3d546b6f4646 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1604,7 +1604,9 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) if (CHECK_ATTR(BPF_PROG_LOAD)) return -EINVAL; - if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT | BPF_F_ANY_ALIGNMENT)) + if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT | + BPF_F_ANY_ALIGNMENT | + BPF_F_TEST_RND_HI32)) return -EINVAL; if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && -- cgit v1.2.3 From d6c2308c742a655f4598364ab331959639aae166 Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Fri, 24 May 2019 23:25:18 +0100 Subject: bpf: verifier: randomize high 32-bit when BPF_F_TEST_RND_HI32 is set This patch randomizes high 32-bit of a definition when BPF_F_TEST_RND_HI32 is set. Suggested-by: Alexei Starovoitov Signed-off-by: Jiong Wang Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 68 ++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 57 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d4394a84b9eb..2778417e6e0c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7640,33 +7640,79 @@ static int opt_remove_nops(struct bpf_verifier_env *env) return 0; } -static int opt_subreg_zext_lo32(struct bpf_verifier_env *env) +static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env, + const union bpf_attr *attr) { + struct bpf_insn *patch, zext_patch[2], rnd_hi32_patch[4]; struct bpf_insn_aux_data *aux = env->insn_aux_data; + int i, patch_len, delta = 0, len = env->prog->len; struct bpf_insn *insns = env->prog->insnsi; - int i, delta = 0, len = env->prog->len; - struct bpf_insn zext_patch[2]; struct bpf_prog *new_prog; + bool rnd_hi32; + rnd_hi32 = attr->prog_flags & BPF_F_TEST_RND_HI32; zext_patch[1] = BPF_ZEXT_REG(0); + rnd_hi32_patch[1] = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, 0); + rnd_hi32_patch[2] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_AX, 32); + rnd_hi32_patch[3] = BPF_ALU64_REG(BPF_OR, 0, BPF_REG_AX); for (i = 0; i < len; i++) { int adj_idx = i + delta; struct bpf_insn insn; - if (!aux[adj_idx].zext_dst) + insn = insns[adj_idx]; + if (!aux[adj_idx].zext_dst) { + u8 code, class; + u32 imm_rnd; + + if (!rnd_hi32) + continue; + + code = insn.code; + class = BPF_CLASS(code); + if (insn_no_def(&insn)) + continue; + + /* NOTE: arg "reg" (the fourth one) is only used for + * BPF_STX which has been ruled out in above + * check, it is safe to pass NULL here. + */ + if (is_reg64(env, &insn, insn.dst_reg, NULL, DST_OP)) { + if (class == BPF_LD && + BPF_MODE(code) == BPF_IMM) + i++; + continue; + } + + /* ctx load could be transformed into wider load. */ + if (class == BPF_LDX && + aux[adj_idx].ptr_type == PTR_TO_CTX) + continue; + + imm_rnd = get_random_int(); + rnd_hi32_patch[0] = insn; + rnd_hi32_patch[1].imm = imm_rnd; + rnd_hi32_patch[3].dst_reg = insn.dst_reg; + patch = rnd_hi32_patch; + patch_len = 4; + goto apply_patch_buffer; + } + + if (!bpf_jit_needs_zext()) continue; - insn = insns[adj_idx]; zext_patch[0] = insn; zext_patch[1].dst_reg = insn.dst_reg; zext_patch[1].src_reg = insn.dst_reg; - new_prog = bpf_patch_insn_data(env, adj_idx, zext_patch, 2); + patch = zext_patch; + patch_len = 2; +apply_patch_buffer: + new_prog = bpf_patch_insn_data(env, adj_idx, patch, patch_len); if (!new_prog) return -ENOMEM; env->prog = new_prog; insns = new_prog->insnsi; aux = env->insn_aux_data; - delta += 2; + delta += patch_len - 1; } return 0; @@ -8525,10 +8571,10 @@ skip_full_check: /* do 32-bit optimization after insn patching has done so those patched * insns could be handled correctly. */ - if (ret == 0 && bpf_jit_needs_zext() && - !bpf_prog_is_dev_bound(env->prog->aux)) { - ret = opt_subreg_zext_lo32(env); - env->prog->aux->verifier_zext = !ret; + if (ret == 0 && !bpf_prog_is_dev_bound(env->prog->aux)) { + ret = opt_subreg_zext_lo32_rnd_hi32(env, attr); + env->prog->aux->verifier_zext = bpf_jit_needs_zext() ? !ret + : false; } if (ret == 0) -- cgit v1.2.3 From 48d07c04b4cc1dc1221965312f58fd84926212fe Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 20 Mar 2019 22:13:33 +0100 Subject: rcu: Enable elimination of Tree-RCU softirq processing Some workloads need to change kthread priority for RCU core processing without affecting other softirq work. This commit therefore introduces the rcutree.use_softirq kernel boot parameter, which moves the RCU core work from softirq to a per-CPU SCHED_OTHER kthread named rcuc. Use of SCHED_OTHER approach avoids the scalability problems that appeared with the earlier attempt to move RCU core processing to from softirq to kthreads. That said, kernels built with RCU_BOOST=y will run the rcuc kthreads at the RCU-boosting priority. Note that rcutree.use_softirq=0 must be specified to move RCU core processing to the rcuc kthreads: rcutree.use_softirq=1 is the default. Reported-by: Thomas Gleixner Tested-by: Mike Galbraith Signed-off-by: Sebastian Andrzej Siewior [ paulmck: Adjust for invoke_rcu_callbacks() only ever being invoked from RCU core processing, in contrast to softirq->rcuc transition in old mainline RCU priority boosting. ] [ paulmck: Avoid wakeups when scheduler might have invoked rcu_read_unlock() while holding rq or pi locks, also possibly fixing a pre-existing latent bug involving raise_softirq()-induced wakeups. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 138 ++++++++++++++++++++++++++++++++++++++++++----- kernel/rcu/tree.h | 2 +- kernel/rcu/tree_plugin.h | 134 +++++---------------------------------------- 3 files changed, 140 insertions(+), 134 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 980ca3ca643f..8e290163505a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -51,6 +51,12 @@ #include #include #include +#include +#include +#include +#include +#include +#include "../time/tick-internal.h" #include "tree.h" #include "rcu.h" @@ -92,6 +98,9 @@ struct rcu_state rcu_state = { /* Dump rcu_node combining tree at boot to verify correct setup. */ static bool dump_tree; module_param(dump_tree, bool, 0444); +/* By default, use RCU_SOFTIRQ instead of rcuc kthreads. */ +static bool use_softirq = 1; +module_param(use_softirq, bool, 0444); /* Control rcu_node-tree auto-balancing at boot time. */ static bool rcu_fanout_exact; module_param(rcu_fanout_exact, bool, 0444); @@ -2253,7 +2262,7 @@ void rcu_force_quiescent_state(void) EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); /* Perform RCU core processing work for the current CPU. */ -static __latent_entropy void rcu_core(struct softirq_action *unused) +static __latent_entropy void rcu_core(void) { unsigned long flags; struct rcu_data *rdp = raw_cpu_ptr(&rcu_data); @@ -2295,29 +2304,131 @@ static __latent_entropy void rcu_core(struct softirq_action *unused) trace_rcu_utilization(TPS("End RCU core")); } +static void rcu_core_si(struct softirq_action *h) +{ + rcu_core(); +} + +static void rcu_wake_cond(struct task_struct *t, int status) +{ + /* + * If the thread is yielding, only wake it when this + * is invoked from idle + */ + if (t && (status != RCU_KTHREAD_YIELDING || is_idle_task(current))) + wake_up_process(t); +} + +static void invoke_rcu_core_kthread(void) +{ + struct task_struct *t; + unsigned long flags; + + local_irq_save(flags); + __this_cpu_write(rcu_data.rcu_cpu_has_work, 1); + t = __this_cpu_read(rcu_data.rcu_cpu_kthread_task); + if (t != NULL && t != current) + rcu_wake_cond(t, __this_cpu_read(rcu_data.rcu_cpu_kthread_status)); + local_irq_restore(flags); +} + /* - * Schedule RCU callback invocation. If the running implementation of RCU - * does not support RCU priority boosting, just do a direct call, otherwise - * wake up the per-CPU kernel kthread. Note that because we are running - * on the current CPU with softirqs disabled, the rcu_cpu_kthread_task - * cannot disappear out from under us. + * Do RCU callback invocation. Not that if we are running !use_softirq, + * we are already in the rcuc kthread. If callbacks are offloaded, then + * ->cblist is always empty, so we don't get here. Therefore, we only + * ever need to check for the scheduler being operational (some callbacks + * do wakeups, so we do need the scheduler). */ static void invoke_rcu_callbacks(struct rcu_data *rdp) { if (unlikely(!READ_ONCE(rcu_scheduler_fully_active))) return; - if (likely(!rcu_state.boost)) { - rcu_do_batch(rdp); - return; - } - invoke_rcu_callbacks_kthread(); + rcu_do_batch(rdp); } +/* + * Wake up this CPU's rcuc kthread to do RCU core processing. + */ static void invoke_rcu_core(void) { - if (cpu_online(smp_processor_id())) + if (!cpu_online(smp_processor_id())) + return; + if (use_softirq) raise_softirq(RCU_SOFTIRQ); + else + invoke_rcu_core_kthread(); +} + +static void rcu_cpu_kthread_park(unsigned int cpu) +{ + per_cpu(rcu_data.rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU; +} + +static int rcu_cpu_kthread_should_run(unsigned int cpu) +{ + return __this_cpu_read(rcu_data.rcu_cpu_has_work); +} + +/* + * Per-CPU kernel thread that invokes RCU callbacks. This replaces + * the RCU softirq used in configurations of RCU that do not support RCU + * priority boosting. + */ +static void rcu_cpu_kthread(unsigned int cpu) +{ + unsigned int *statusp = this_cpu_ptr(&rcu_data.rcu_cpu_kthread_status); + char work, *workp = this_cpu_ptr(&rcu_data.rcu_cpu_has_work); + int spincnt; + + for (spincnt = 0; spincnt < 10; spincnt++) { + trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait")); + local_bh_disable(); + *statusp = RCU_KTHREAD_RUNNING; + local_irq_disable(); + work = *workp; + *workp = 0; + local_irq_enable(); + if (work) + rcu_core(); + local_bh_enable(); + if (*workp == 0) { + trace_rcu_utilization(TPS("End CPU kthread@rcu_wait")); + *statusp = RCU_KTHREAD_WAITING; + return; + } + } + *statusp = RCU_KTHREAD_YIELDING; + trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield")); + schedule_timeout_interruptible(2); + trace_rcu_utilization(TPS("End CPU kthread@rcu_yield")); + *statusp = RCU_KTHREAD_WAITING; +} + +static struct smp_hotplug_thread rcu_cpu_thread_spec = { + .store = &rcu_data.rcu_cpu_kthread_task, + .thread_should_run = rcu_cpu_kthread_should_run, + .thread_fn = rcu_cpu_kthread, + .thread_comm = "rcuc/%u", + .setup = rcu_cpu_kthread_setup, + .park = rcu_cpu_kthread_park, +}; + +/* + * Spawn per-CPU RCU core processing kthreads. + */ +static int __init rcu_spawn_core_kthreads(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + per_cpu(rcu_data.rcu_cpu_has_work, cpu) = 0; + if (!IS_ENABLED(CONFIG_RCU_BOOST) && use_softirq) + return 0; + WARN_ONCE(smpboot_register_percpu_thread(&rcu_cpu_thread_spec), + "%s: Could not start rcuc kthread, OOM is now expected behavior\n", __func__); + return 0; } +early_initcall(rcu_spawn_core_kthreads); /* * Handle any core-RCU processing required by a call_rcu() invocation. @@ -3355,7 +3466,8 @@ void __init rcu_init(void) rcu_init_one(); if (dump_tree) rcu_dump_rcu_node_tree(); - open_softirq(RCU_SOFTIRQ, rcu_core); + if (use_softirq) + open_softirq(RCU_SOFTIRQ, rcu_core_si); /* * We don't need protection against CPU-hotplug here because diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index e253d11af3c4..a1a72a1ecb02 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -407,8 +407,8 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func); static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck); static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); -static void invoke_rcu_callbacks_kthread(void); static bool rcu_is_callbacks_kthread(void); +static void rcu_cpu_kthread_setup(unsigned int cpu); static void __init rcu_spawn_boost_kthreads(void); static void rcu_prepare_kthreads(int cpu); static void rcu_cleanup_after_idle(void); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 1102765f91fd..21611862e083 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -11,29 +11,7 @@ * Paul E. McKenney */ -#include -#include -#include -#include -#include -#include -#include -#include "../time/tick-internal.h" - -#ifdef CONFIG_RCU_BOOST #include "../locking/rtmutex_common.h" -#else /* #ifdef CONFIG_RCU_BOOST */ - -/* - * Some architectures do not define rt_mutexes, but if !CONFIG_RCU_BOOST, - * all uses are in dead code. Provide a definition to keep the compiler - * happy, but add WARN_ON_ONCE() to complain if used in the wrong place. - * This probably needs to be excluded from -rt builds. - */ -#define rt_mutex_owner(a) ({ WARN_ON_ONCE(1); NULL; }) -#define rt_mutex_futex_unlock(x) WARN_ON_ONCE(1) - -#endif /* #else #ifdef CONFIG_RCU_BOOST */ #ifdef CONFIG_RCU_NOCB_CPU static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */ @@ -94,6 +72,8 @@ static void __init rcu_bootup_announce_oddness(void) pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_init_delay); if (gp_cleanup_delay) pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_cleanup_delay); + if (!use_softirq) + pr_info("\tRCU_SOFTIRQ processing moved to rcuc kthreads.\n"); if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG)) pr_info("\tRCU debug extended QS entry/exit.\n"); rcupdate_announce_bootup_oddness(); @@ -627,7 +607,7 @@ static void rcu_read_unlock_special(struct task_struct *t) if (preempt_bh_were_disabled || irqs_were_disabled) { WRITE_ONCE(t->rcu_read_unlock_special.b.exp_hint, false); /* Need to defer quiescent state until everything is enabled. */ - if (irqs_were_disabled) { + if (irqs_were_disabled && use_softirq) { /* Enabling irqs does not reschedule, so... */ raise_softirq_irqoff(RCU_SOFTIRQ); } else { @@ -944,18 +924,21 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck) #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ +/* + * If boosting, set rcuc kthreads to realtime priority. + */ +static void rcu_cpu_kthread_setup(unsigned int cpu) +{ #ifdef CONFIG_RCU_BOOST + struct sched_param sp; -static void rcu_wake_cond(struct task_struct *t, int status) -{ - /* - * If the thread is yielding, only wake it when this - * is invoked from idle - */ - if (status != RCU_KTHREAD_YIELDING || is_idle_task(current)) - wake_up_process(t); + sp.sched_priority = kthread_prio; + sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); +#endif /* #ifdef CONFIG_RCU_BOOST */ } +#ifdef CONFIG_RCU_BOOST + /* * Carry out RCU priority boosting on the task indicated by ->exp_tasks * or ->boost_tasks, advancing the pointer to the next task in the @@ -1090,23 +1073,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) } } -/* - * Wake up the per-CPU kthread to invoke RCU callbacks. - */ -static void invoke_rcu_callbacks_kthread(void) -{ - unsigned long flags; - - local_irq_save(flags); - __this_cpu_write(rcu_data.rcu_cpu_has_work, 1); - if (__this_cpu_read(rcu_data.rcu_cpu_kthread_task) != NULL && - current != __this_cpu_read(rcu_data.rcu_cpu_kthread_task)) { - rcu_wake_cond(__this_cpu_read(rcu_data.rcu_cpu_kthread_task), - __this_cpu_read(rcu_data.rcu_cpu_kthread_status)); - } - local_irq_restore(flags); -} - /* * Is the current CPU running the RCU-callbacks kthread? * Caller must have preemption disabled. @@ -1160,59 +1126,6 @@ static int rcu_spawn_one_boost_kthread(struct rcu_node *rnp) return 0; } -static void rcu_cpu_kthread_setup(unsigned int cpu) -{ - struct sched_param sp; - - sp.sched_priority = kthread_prio; - sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); -} - -static void rcu_cpu_kthread_park(unsigned int cpu) -{ - per_cpu(rcu_data.rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU; -} - -static int rcu_cpu_kthread_should_run(unsigned int cpu) -{ - return __this_cpu_read(rcu_data.rcu_cpu_has_work); -} - -/* - * Per-CPU kernel thread that invokes RCU callbacks. This replaces - * the RCU softirq used in configurations of RCU that do not support RCU - * priority boosting. - */ -static void rcu_cpu_kthread(unsigned int cpu) -{ - unsigned int *statusp = this_cpu_ptr(&rcu_data.rcu_cpu_kthread_status); - char work, *workp = this_cpu_ptr(&rcu_data.rcu_cpu_has_work); - int spincnt; - - for (spincnt = 0; spincnt < 10; spincnt++) { - trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait")); - local_bh_disable(); - *statusp = RCU_KTHREAD_RUNNING; - local_irq_disable(); - work = *workp; - *workp = 0; - local_irq_enable(); - if (work) - rcu_do_batch(this_cpu_ptr(&rcu_data)); - local_bh_enable(); - if (*workp == 0) { - trace_rcu_utilization(TPS("End CPU kthread@rcu_wait")); - *statusp = RCU_KTHREAD_WAITING; - return; - } - } - *statusp = RCU_KTHREAD_YIELDING; - trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield")); - schedule_timeout_interruptible(2); - trace_rcu_utilization(TPS("End CPU kthread@rcu_yield")); - *statusp = RCU_KTHREAD_WAITING; -} - /* * Set the per-rcu_node kthread's affinity to cover all CPUs that are * served by the rcu_node in question. The CPU hotplug lock is still @@ -1243,27 +1156,13 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) free_cpumask_var(cm); } -static struct smp_hotplug_thread rcu_cpu_thread_spec = { - .store = &rcu_data.rcu_cpu_kthread_task, - .thread_should_run = rcu_cpu_kthread_should_run, - .thread_fn = rcu_cpu_kthread, - .thread_comm = "rcuc/%u", - .setup = rcu_cpu_kthread_setup, - .park = rcu_cpu_kthread_park, -}; - /* * Spawn boost kthreads -- called as soon as the scheduler is running. */ static void __init rcu_spawn_boost_kthreads(void) { struct rcu_node *rnp; - int cpu; - for_each_possible_cpu(cpu) - per_cpu(rcu_data.rcu_cpu_has_work, cpu) = 0; - if (WARN_ONCE(smpboot_register_percpu_thread(&rcu_cpu_thread_spec), "%s: Could not start rcub kthread, OOM is now expected behavior\n", __func__)) - return; rcu_for_each_leaf_node(rnp) (void)rcu_spawn_one_boost_kthread(rnp); } @@ -1286,11 +1185,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } -static void invoke_rcu_callbacks_kthread(void) -{ - WARN_ON_ONCE(1); -} - static bool rcu_is_callbacks_kthread(void) { return false; -- cgit v1.2.3 From 23634ebc1d946f19eb112d4455c1d84948875e31 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 24 Mar 2019 15:25:51 -0700 Subject: rcu: Check for wakeup-safe conditions in rcu_read_unlock_special() When RCU core processing is offloaded from RCU_SOFTIRQ to the rcuc kthreads, a full and unconditional wakeup is required to initiate RCU core processing. In contrast, when RCU core processing is carried out by RCU_SOFTIRQ, a raise_softirq() suffices. Of course, there are situations where raise_softirq() does a full wakeup, but these do not occur with normal usage of rcu_read_unlock(). The reason that full wakeups can be problematic is that the scheduler sometimes invokes rcu_read_unlock() with its pi or rq locks held, which can of course result in deadlock in CONFIG_PREEMPT=y kernels when rcu_read_unlock() invokes the scheduler. Scheduler invocations can happen in the following situations: (1) The just-ended reader has been subjected to RCU priority boosting, in which case rcu_read_unlock() must deboost, (2) Interrupts were disabled across the call to rcu_read_unlock(), so the quiescent state must be deferred, requiring a wakeup of the rcuc kthread corresponding to the current CPU. Now, the scheduler may hold one of its locks across rcu_read_unlock() only if preemption has been disabled across the entire RCU read-side critical section, which in the days prior to RCU flavor consolidation meant that rcu_read_unlock() never needed to do wakeups. However, this is no longer the case for any but the first rcu_read_unlock() following a condition (e.g., preempted RCU reader) requiring special rcu_read_unlock() attention. For example, an RCU read-side critical section might be preempted, but preemption might be disabled across the rcu_read_unlock(). The rcu_read_unlock() must defer the quiescent state, and therefore leaves the task queued on its leaf rcu_node structure. If a scheduler interrupt occurs, the scheduler might well invoke rcu_read_unlock() with one of its locks held. However, the preempted task is still queued, so rcu_read_unlock() will attempt to defer the quiescent state once more. When RCU core processing is carried out by RCU_SOFTIRQ, this works just fine: The raise_softirq() function simply sets a bit in a per-CPU mask and the RCU core processing will be undertaken upon return from interrupt. Not so when RCU core processing is carried out by the rcuc kthread: In this case, the required wakeup can result in deadlock. The initial solution to this problem was to use set_tsk_need_resched() and set_preempt_need_resched() to force a future context switch, which allows rcu_preempt_note_context_switch() to report the deferred quiescent state to RCU's core processing. Unfortunately for expedited grace periods, there can be a significant delay between the call for a context switch and the actual context switch. This commit therefore introduces a ->deferred_qs flag to the task_struct structure's rcu_special structure. This flag is initially false, and is set to true by the first call to rcu_read_unlock() requiring special attention, then finally reset back to false when the quiescent state is finally reported. Then rcu_read_unlock() attempts full wakeups only when ->deferred_qs is false, that is, on the first rcu_read_unlock() requiring special attention. Note that a chain of RCU readers linked by some other sort of reader may find that a later rcu_read_unlock() is once again able to do a full wakeup, courtesy of an intervening preemption: rcu_read_lock(); /* preempted */ local_irq_disable(); rcu_read_unlock(); /* Can do full wakeup, sets ->deferred_qs. */ rcu_read_lock(); local_irq_enable(); preempt_disable() rcu_read_unlock(); /* Cannot do full wakeup, ->deferred_qs set. */ rcu_read_lock(); preempt_enable(); /* preempted, >deferred_qs reset. */ local_irq_disable(); rcu_read_unlock(); /* Can again do full wakeup, sets ->deferred_qs. */ Such linked RCU readers do not yet seem to appear in the Linux kernel, and it is probably best if they don't. However, RCU needs to handle them, and some variations on this theme could make even raise_softirq() unsafe due to the possibility of its doing a full wakeup. This commit therefore also avoids invoking raise_softirq() when the ->deferred_qs set flag is set. Signed-off-by: Paul E. McKenney Cc: Sebastian Andrzej Siewior --- kernel/rcu/tree_plugin.h | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 21611862e083..75110ea75d01 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -455,6 +455,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) local_irq_restore(flags); return; } + t->rcu_read_unlock_special.b.deferred_qs = false; if (special.b.need_qs) { rcu_qs(); t->rcu_read_unlock_special.b.need_qs = false; @@ -605,16 +606,24 @@ static void rcu_read_unlock_special(struct task_struct *t) local_irq_save(flags); irqs_were_disabled = irqs_disabled_flags(flags); if (preempt_bh_were_disabled || irqs_were_disabled) { - WRITE_ONCE(t->rcu_read_unlock_special.b.exp_hint, false); - /* Need to defer quiescent state until everything is enabled. */ - if (irqs_were_disabled && use_softirq) { - /* Enabling irqs does not reschedule, so... */ + t->rcu_read_unlock_special.b.exp_hint = false; + // Need to defer quiescent state until everything is enabled. + if (irqs_were_disabled && use_softirq && + (in_irq() || !t->rcu_read_unlock_special.b.deferred_qs)) { + // Using softirq, safe to awaken, and we get + // no help from enabling irqs, unlike bh/preempt. raise_softirq_irqoff(RCU_SOFTIRQ); + } else if (irqs_were_disabled && !use_softirq && + !t->rcu_read_unlock_special.b.deferred_qs) { + // Safe to awaken and we get no help from enabling + // irqs, unlike bh/preempt. + invoke_rcu_core(); } else { - /* Enabling BH or preempt does reschedule, so... */ + // Enabling BH or preempt does reschedule, so... set_tsk_need_resched(current); set_preempt_need_resched(); } + t->rcu_read_unlock_special.b.deferred_qs = true; local_irq_restore(flags); return; } -- cgit v1.2.3 From 25102de65fdd246eb6801114ce6dfa3a076bb678 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 1 Apr 2019 14:12:50 -0700 Subject: rcu: Only do rcu_read_unlock_special() wakeups if expedited Currently, rcu_read_unlock_special() will do wakeups whenever it is safe to do so. However, wakeups are expensive, and they are only really needed when the just-ended RCU read-side critical section is blocking an expedited grace period (in which case speed is of the essence) or on a nohz_full CPU (where it might be a good long time before an interrupt arrives). This commit therefore checks for these conditions, and does the expensive wakeups only if doing so would be useful. Note it can be rather expensive to determine whether or not the current task (as opposed to the current CPU) is blocking the current expedited grace period. Doing so requires traversing the ->blkd_tasks list, which can be quite long. This commit therefore cheats: If the current task is on a given ->blkd_tasks list, and some task on that list is blocking the current expedited grace period, the code assumes that the current task is blocking that expedited grace period. Reported-by: Peter Zijlstra Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 75110ea75d01..d15cdab6aeb4 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -606,20 +606,28 @@ static void rcu_read_unlock_special(struct task_struct *t) local_irq_save(flags); irqs_were_disabled = irqs_disabled_flags(flags); if (preempt_bh_were_disabled || irqs_were_disabled) { + bool exp; + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); + struct rcu_node *rnp = rdp->mynode; + t->rcu_read_unlock_special.b.exp_hint = false; + exp = (t->rcu_blocked_node && t->rcu_blocked_node->exp_tasks) || + (rdp->grpmask & rnp->expmask) || + tick_nohz_full_cpu(rdp->cpu); // Need to defer quiescent state until everything is enabled. - if (irqs_were_disabled && use_softirq && + if (exp && irqs_were_disabled && use_softirq && (in_irq() || !t->rcu_read_unlock_special.b.deferred_qs)) { // Using softirq, safe to awaken, and we get // no help from enabling irqs, unlike bh/preempt. raise_softirq_irqoff(RCU_SOFTIRQ); - } else if (irqs_were_disabled && !use_softirq && + } else if (exp && irqs_were_disabled && !use_softirq && !t->rcu_read_unlock_special.b.deferred_qs) { // Safe to awaken and we get no help from enabling // irqs, unlike bh/preempt. invoke_rcu_core(); } else { // Enabling BH or preempt does reschedule, so... + // Also if no expediting or NO_HZ_FULL, slow is OK. set_tsk_need_resched(current); set_preempt_need_resched(); } -- cgit v1.2.3 From 385b599e8c04fa843c4d7f785478827cc512d720 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 1 Apr 2019 15:12:47 -0700 Subject: rcu: Allow rcu_read_unlock_special() to raise_softirq() if in_irq() When running in an interrupt handler, raise_softirq() and raise_softirq_irqoff() have extremely low overhead: They simply set a bit in a per-CPU mask, which is checked upon exit from that interrupt handler. Therefore, if rcu_read_unlock_special() is invoked within an interrupt handler and RCU_SOFTIRQ is in use, this commit make use of raise_softirq_irqoff() even if there is no expedited grace period in flight and even if this is not a nohz_full CPU. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index d15cdab6aeb4..e1005f5e8094 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -615,7 +615,7 @@ static void rcu_read_unlock_special(struct task_struct *t) (rdp->grpmask & rnp->expmask) || tick_nohz_full_cpu(rdp->cpu); // Need to defer quiescent state until everything is enabled. - if (exp && irqs_were_disabled && use_softirq && + if ((exp || in_irq()) && irqs_were_disabled && use_softirq && (in_irq() || !t->rcu_read_unlock_special.b.deferred_qs)) { // Using softirq, safe to awaken, and we get // no help from enabling irqs, unlike bh/preempt. -- cgit v1.2.3 From 0864f057b050bc6dd68106b3185e02db5140012d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 4 Apr 2019 12:19:25 -0700 Subject: rcu: Use irq_work to get scheduler's attention in clean context When rcu_read_unlock_special() is invoked with interrupts disabled, is either not in an interrupt handler or is not using RCU_SOFTIRQ, is not the first RCU read-side critical section in the chain, and either there is an expedited grace period in flight or this is a NO_HZ_FULL kernel, the end of the grace period can be unduly delayed. The reason for this is that it is not safe to do wakeups in this situation. This commit fixes this problem by using the irq_work subsystem to force a later interrupt handler in a clean environment. Because set_tsk_need_resched(current) and set_preempt_need_resched() are invoked prior to this, the scheduler will force a context switch upon return from this interrupt (though perhaps at the end of any interrupted preempt-disable or BH-disable region of code), which will invoke rcu_note_context_switch() (again in a clean environment), which will in turn give RCU the chance to report the deferred quiescent state. Of course, by then this task might be within another RCU read-side critical section. But that will be detected at that time and reporting will be further deferred to the outermost rcu_read_unlock(). See rcu_preempt_need_deferred_qs() and rcu_preempt_deferred_qs() for more details on the checking. Suggested-by: Peter Zijlstra Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 2 ++ kernel/rcu/tree_plugin.h | 20 ++++++++++++++++++++ 2 files changed, 22 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index a1a72a1ecb02..21d740f0b8dc 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -161,6 +161,8 @@ struct rcu_data { /* ticks this CPU has handled */ /* during and after the last grace */ /* period it is aware of. */ + struct irq_work defer_qs_iw; /* Obtain later scheduler attention. */ + bool defer_qs_iw_pending; /* Scheduler attention pending? */ /* 2) batch handling */ struct rcu_segcblist cblist; /* Segmented callback list, with */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index e1005f5e8094..58c7853f19e7 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -587,6 +587,17 @@ static void rcu_preempt_deferred_qs(struct task_struct *t) t->rcu_read_lock_nesting += RCU_NEST_BIAS; } +/* + * Minimal handler to give the scheduler a chance to re-evaluate. + */ +static void rcu_preempt_deferred_qs_handler(struct irq_work *iwp) +{ + struct rcu_data *rdp; + + rdp = container_of(iwp, struct rcu_data, defer_qs_iw); + rdp->defer_qs_iw_pending = false; +} + /* * Handle special cases during rcu_read_unlock(), such as needing to * notify RCU core processing or task having blocked during the RCU @@ -630,6 +641,15 @@ static void rcu_read_unlock_special(struct task_struct *t) // Also if no expediting or NO_HZ_FULL, slow is OK. set_tsk_need_resched(current); set_preempt_need_resched(); + if (IS_ENABLED(CONFIG_IRQ_WORK) && + !rdp->defer_qs_iw_pending && exp) { + // Get scheduler to re-evaluate and call hooks. + // If !IRQ_WORK, FQS scan will eventually IPI. + init_irq_work(&rdp->defer_qs_iw, + rcu_preempt_deferred_qs_handler); + rdp->defer_qs_iw_pending = true; + irq_work_queue_on(&rdp->defer_qs_iw, rdp->cpu); + } } t->rcu_read_unlock_special.b.deferred_qs = true; local_irq_restore(flags); -- cgit v1.2.3 From 43e903ad3e0843d03da15d8eaffb5ada22966c76 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 25 Mar 2019 08:36:03 -0700 Subject: rcu: Inline invoke_rcu_callbacks() into its sole remaining caller This commit saves a few lines of code by inlining invoke_rcu_callbacks() into its sole remaining caller. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 8e290163505a..7822a2e1370d 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -147,7 +147,6 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf); static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf); static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); static void invoke_rcu_core(void); -static void invoke_rcu_callbacks(struct rcu_data *rdp); static void rcu_report_exp_rdp(struct rcu_data *rdp); static void sync_sched_exp_online_cleanup(int cpu); @@ -2296,8 +2295,9 @@ static __latent_entropy void rcu_core(void) rcu_check_gp_start_stall(rnp, rdp, rcu_jiffies_till_stall_check()); /* If there are callbacks ready, invoke them. */ - if (rcu_segcblist_ready_cbs(&rdp->cblist)) - invoke_rcu_callbacks(rdp); + if (rcu_segcblist_ready_cbs(&rdp->cblist) && + likely(READ_ONCE(rcu_scheduler_fully_active))) + rcu_do_batch(rdp); /* Do any needed deferred wakeups of rcuo kthreads. */ do_nocb_deferred_wakeup(rdp); @@ -2332,20 +2332,6 @@ static void invoke_rcu_core_kthread(void) local_irq_restore(flags); } -/* - * Do RCU callback invocation. Not that if we are running !use_softirq, - * we are already in the rcuc kthread. If callbacks are offloaded, then - * ->cblist is always empty, so we don't get here. Therefore, we only - * ever need to check for the scheduler being operational (some callbacks - * do wakeups, so we do need the scheduler). - */ -static void invoke_rcu_callbacks(struct rcu_data *rdp) -{ - if (unlikely(!READ_ONCE(rcu_scheduler_fully_active))) - return; - rcu_do_batch(rdp); -} - /* * Wake up this CPU's rcuc kthread to do RCU core processing. */ -- cgit v1.2.3 From b9ad4d6ed18e23b0ff6a824b925a1278625d5345 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 27 Mar 2019 09:09:47 -0700 Subject: rcu: Avoid self-IPI in sync_rcu_exp_select_node_cpus() Although sync_rcu_exp_select_node_cpus() treats the current CPU as being in a quiescent state, it might well migrate to some other CPU before reaching the smp_call_function_single(), which could then result in an unnecessary simulated self-IPI. This commit therefore instead simply refuses to invoke smp_call_function_single() on the current CPU, which causes the later rcu_report_exp_cpu_mult() to report this CPU's quiescent state with less overhead. This also reduces the rcu_exp_handler() function's state space by removing the direct call that this smp_call_function_single() uses to emulate the requested self-IPI. Signed-off-by: Paul E. McKenney [ paulmck: Use get_cpu() instead of preempt_disable() per Joel Fernandes. ] --- kernel/rcu/tree_exp.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 9c990df880d1..5390618787b6 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -384,7 +384,12 @@ retry_ipi: mask_ofl_test |= mask; continue; } + if (get_cpu() == cpu) { + put_cpu(); + continue; + } ret = smp_call_function_single(cpu, rcu_exp_handler, NULL, 0); + put_cpu(); if (!ret) { mask_ofl_ipi &= ~mask; continue; -- cgit v1.2.3 From e015a341122024198f57d1f0498a776523137e94 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 27 Mar 2019 10:03:12 -0700 Subject: rcu: Avoid self-IPI in sync_sched_exp_online_cleanup() The sync_sched_exp_online_cleanup() is invoked at online time to handle the case where the start of an expedited grace period ran concurrently with a CPU being taken offline and then immediately being placed online. It checks to see if RCU needs an expedited quiescent state from the incoming CPU, sending it an IPI if so. However, it is quite possible that sync_sched_exp_online_cleanup() is running on that CPU, in which case it is considerably less overhead to simply request the quiescent state locally instead of simulating a self-IPI. This commit therefore places the last few lines of rcu_exp_handler() into a new rcu_exp_need_qs() function, which is invoked both by rcu_exp_handler() and by sync_sched_exp_online_cleanup() in the self-IPI case. This also reduces the rcu_exp_handler() function's state space by removing the direct call that this smp_call_function_single() uses to emulate the requested self-IPI. This in turn will allow tighter error checking in rcu_is_cpu_rrupt_from_idle(). Signed-off-by: Paul E. McKenney Reviewed-by: Joel Fernandes (Google) --- kernel/rcu/tree_exp.h | 35 +++++++++++++++++++++++++++++------ 1 file changed, 29 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 5390618787b6..de1b4acf6979 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -699,6 +699,16 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp) #else /* #ifdef CONFIG_PREEMPT_RCU */ +/* Request an expedited quiescent state. */ +static void rcu_exp_need_qs(void) +{ + __this_cpu_write(rcu_data.cpu_no_qs.b.exp, true); + /* Store .exp before .rcu_urgent_qs. */ + smp_store_release(this_cpu_ptr(&rcu_data.rcu_urgent_qs), true); + set_tsk_need_resched(current); + set_preempt_need_resched(); +} + /* Invoked on each online non-idle CPU for expedited quiescent state. */ static void rcu_exp_handler(void *unused) { @@ -714,25 +724,38 @@ static void rcu_exp_handler(void *unused) rcu_report_exp_rdp(this_cpu_ptr(&rcu_data)); return; } - __this_cpu_write(rcu_data.cpu_no_qs.b.exp, true); - /* Store .exp before .rcu_urgent_qs. */ - smp_store_release(this_cpu_ptr(&rcu_data.rcu_urgent_qs), true); - set_tsk_need_resched(current); - set_preempt_need_resched(); + rcu_exp_need_qs(); } /* Send IPI for expedited cleanup if needed at end of CPU-hotplug operation. */ static void sync_sched_exp_online_cleanup(int cpu) { + unsigned long flags; + int my_cpu; struct rcu_data *rdp; int ret; struct rcu_node *rnp; rdp = per_cpu_ptr(&rcu_data, cpu); rnp = rdp->mynode; - if (!(READ_ONCE(rnp->expmask) & rdp->grpmask)) + my_cpu = get_cpu(); + /* Quiescent state either not needed or already requested, leave. */ + if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) || + __this_cpu_read(rcu_data.cpu_no_qs.b.exp)) { + put_cpu(); return; + } + /* Quiescent state needed on current CPU, so set it up locally. */ + if (my_cpu == cpu) { + local_irq_save(flags); + rcu_exp_need_qs(); + local_irq_restore(flags); + put_cpu(); + return; + } + /* Quiescent state needed on some other CPU, send IPI. */ ret = smp_call_function_single(cpu, rcu_exp_handler, NULL, 0); + put_cpu(); WARN_ON_ONCE(ret); } -- cgit v1.2.3 From f7a9945184100b531f0de3b12c617a349236dd8a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 12 May 2019 12:42:58 -0400 Subject: no need to protect against put_user_ns(NULL) it's a no-op Signed-off-by: Al Viro --- kernel/cgroup/cgroup.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 217cec4e22c6..bbcdd3457eb0 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2184,8 +2184,7 @@ static int cgroup_init_fs_context(struct fs_context *fc) fc->ops = &cgroup_fs_context_ops; else fc->ops = &cgroup1_fs_context_ops; - if (fc->user_ns) - put_user_ns(fc->user_ns); + put_user_ns(fc->user_ns); fc->user_ns = get_user_ns(ctx->ns->user_ns); fc->global = true; return 0; -- cgit v1.2.3 From d5f68d330c156774bf69059e434b5d8acea3b92e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 13 May 2019 12:33:22 -0400 Subject: cpuset: move mount -t cpuset logics into cgroup.c ... and get rid of the weird dances in ->get_tree() - that logics can be easily handled in ->init_fs_context(). Signed-off-by: Al Viro --- kernel/cgroup/cgroup.c | 47 ++++++++++++++++++++++++++++++++++++++ kernel/cgroup/cpuset.c | 61 +------------------------------------------------- 2 files changed, 48 insertions(+), 60 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index bbcdd3457eb0..4a0eb465d17e 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2225,6 +2225,50 @@ static struct file_system_type cgroup2_fs_type = { .fs_flags = FS_USERNS_MOUNT, }; +#ifdef CONFIG_CPUSETS +static const struct fs_context_operations cpuset_fs_context_ops = { + .get_tree = cgroup1_get_tree, + .free = cgroup_fs_context_free, +}; + +/* + * This is ugly, but preserves the userspace API for existing cpuset + * users. If someone tries to mount the "cpuset" filesystem, we + * silently switch it to mount "cgroup" instead + */ +static int cpuset_init_fs_context(struct fs_context *fc) +{ + char *agent = kstrdup("/sbin/cpuset_release_agent", GFP_USER); + struct cgroup_fs_context *ctx; + int err; + + err = cgroup_init_fs_context(fc); + if (err) { + kfree(agent); + return err; + } + + fc->ops = &cpuset_fs_context_ops; + + ctx = cgroup_fc2context(fc); + ctx->subsys_mask = 1 << cpuset_cgrp_id; + ctx->flags |= CGRP_ROOT_NOPREFIX; + ctx->release_agent = agent; + + get_filesystem(&cgroup_fs_type); + put_filesystem(fc->fs_type); + fc->fs_type = &cgroup_fs_type; + + return 0; +} + +static struct file_system_type cpuset_fs_type = { + .name = "cpuset", + .init_fs_context = cpuset_init_fs_context, + .fs_flags = FS_USERNS_MOUNT, +}; +#endif + int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, struct cgroup_namespace *ns) { @@ -5710,6 +5754,9 @@ int __init cgroup_init(void) WARN_ON(register_filesystem(&cgroup_fs_type)); WARN_ON(register_filesystem(&cgroup2_fs_type)); WARN_ON(!proc_create_single("cgroups", 0, NULL, proc_cgroupstats_show)); +#ifdef CONFIG_CPUSETS + WARN_ON(register_filesystem(&cpuset_fs_type)); +#endif return 0; } diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 6a1942ed781c..9806c8c8b509 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -355,59 +355,6 @@ static inline bool is_in_v2_mode(void) (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE); } -/* - * This is ugly, but preserves the userspace API for existing cpuset - * users. If someone tries to mount the "cpuset" filesystem, we - * silently switch it to mount "cgroup" instead - */ -static int cpuset_get_tree(struct fs_context *fc) -{ - struct file_system_type *cgroup_fs; - struct fs_context *new_fc; - int ret; - - cgroup_fs = get_fs_type("cgroup"); - if (!cgroup_fs) - return -ENODEV; - - new_fc = fs_context_for_mount(cgroup_fs, fc->sb_flags); - if (IS_ERR(new_fc)) { - ret = PTR_ERR(new_fc); - } else { - static const char agent_path[] = "/sbin/cpuset_release_agent"; - ret = vfs_parse_fs_string(new_fc, "cpuset", NULL, 0); - if (!ret) - ret = vfs_parse_fs_string(new_fc, "noprefix", NULL, 0); - if (!ret) - ret = vfs_parse_fs_string(new_fc, "release_agent", - agent_path, sizeof(agent_path) - 1); - if (!ret) - ret = vfs_get_tree(new_fc); - if (!ret) { /* steal the result */ - fc->root = new_fc->root; - new_fc->root = NULL; - } - put_fs_context(new_fc); - } - put_filesystem(cgroup_fs); - return ret; -} - -static const struct fs_context_operations cpuset_fs_context_ops = { - .get_tree = cpuset_get_tree, -}; - -static int cpuset_init_fs_context(struct fs_context *fc) -{ - fc->ops = &cpuset_fs_context_ops; - return 0; -} - -static struct file_system_type cpuset_fs_type = { - .name = "cpuset", - .init_fs_context = cpuset_init_fs_context, -}; - /* * Return in pmask the portion of a cpusets's cpus_allowed that * are online. If none are online, walk up the cpuset hierarchy @@ -2853,13 +2800,11 @@ struct cgroup_subsys cpuset_cgrp_subsys = { /** * cpuset_init - initialize cpusets at system boot * - * Description: Initialize top_cpuset and the cpuset internal file system, + * Description: Initialize top_cpuset **/ int __init cpuset_init(void) { - int err = 0; - BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL)); BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL)); BUG_ON(!zalloc_cpumask_var(&top_cpuset.subparts_cpus, GFP_KERNEL)); @@ -2873,10 +2818,6 @@ int __init cpuset_init(void) set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags); top_cpuset.relax_domain_level = -1; - err = register_filesystem(&cpuset_fs_type); - if (err < 0) - return err; - BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)); return 0; -- cgit v1.2.3 From 7375dca1647fa978310f2d706ddbff537f72110b Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 20 May 2019 09:26:24 -0400 Subject: ftrace: Make enable and update parameters bool when applicable The code modification functions have "enable" and "update" variables that are sometimes "int" but used as "bool". Remove the ambiguity and make them "bool" when they are only used for true or false values. Link: http://lkml.kernel.org/r/e1429923d9eda92a3cf5ee9e33c7eacce539781d.1558115654.git.naveen.n.rao@linux.vnet.ibm.com Reported-by: "Naveen N. Rao" Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index a12aff849c04..4f2c26bebe2a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1768,7 +1768,7 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops, count++; /* Must match FTRACE_UPDATE_CALLS in ftrace_modify_all_code() */ - update |= ftrace_test_record(rec, 1) != FTRACE_UPDATE_IGNORE; + update |= ftrace_test_record(rec, true) != FTRACE_UPDATE_IGNORE; /* Shortcut, if we handled all records, we are done. */ if (!all && count == hash->count) @@ -2047,7 +2047,7 @@ void ftrace_bug(int failed, struct dyn_ftrace *rec) } } -static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) +static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update) { unsigned long flag = 0UL; @@ -2146,28 +2146,28 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) /** * ftrace_update_record, set a record that now is tracing or not * @rec: the record to update - * @enable: set to 1 if the record is tracing, zero to force disable + * @enable: set to true if the record is tracing, false to force disable * * The records that represent all functions that can be traced need * to be updated when tracing has been enabled. */ -int ftrace_update_record(struct dyn_ftrace *rec, int enable) +int ftrace_update_record(struct dyn_ftrace *rec, bool enable) { - return ftrace_check_record(rec, enable, 1); + return ftrace_check_record(rec, enable, true); } /** * ftrace_test_record, check if the record has been enabled or not * @rec: the record to test - * @enable: set to 1 to check if enabled, 0 if it is disabled + * @enable: set to true to check if enabled, false if it is disabled * * The arch code may need to test if a record is already set to * tracing to determine how to modify the function code that it * represents. */ -int ftrace_test_record(struct dyn_ftrace *rec, int enable) +int ftrace_test_record(struct dyn_ftrace *rec, bool enable) { - return ftrace_check_record(rec, enable, 0); + return ftrace_check_record(rec, enable, false); } static struct ftrace_ops * @@ -2356,7 +2356,7 @@ unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec) } static int -__ftrace_replace_code(struct dyn_ftrace *rec, int enable) +__ftrace_replace_code(struct dyn_ftrace *rec, bool enable) { unsigned long ftrace_old_addr; unsigned long ftrace_addr; @@ -2395,7 +2395,7 @@ void __weak ftrace_replace_code(int mod_flags) { struct dyn_ftrace *rec; struct ftrace_page *pg; - int enable = mod_flags & FTRACE_MODIFY_ENABLE_FL; + bool enable = mod_flags & FTRACE_MODIFY_ENABLE_FL; int schedulable = mod_flags & FTRACE_MODIFY_MAY_SLEEP_FL; int failed; -- cgit v1.2.3 From 88903c464321cdbc2d473c24cbf311f576cf05bc Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 15 May 2019 14:38:30 +0900 Subject: tracing/probe: Add ustring type for user-space string Add "ustring" type for fetching user-space string from kprobe event. User can specify ustring type at uprobe event, and it is same as "string" for uprobe. Note that probe-event provides this option but it doesn't choose the correct type automatically since we have not way to decide the address is in user-space or not on some arch (and on some other arch, you can fetch the string by "string" type). So user must carefully check the target code (e.g. if you see __user on the target variable) and use this new type. Link: http://lkml.kernel.org/r/155789871009.26965.14167558859557329331.stgit@devnote2 Acked-by: Ingo Molnar Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 2 +- kernel/trace/trace_kprobe.c | 42 ++++++++++++++++++++++++++++++++++++++--- kernel/trace/trace_probe.c | 14 +++++++++++--- kernel/trace/trace_probe.h | 1 + kernel/trace/trace_probe_tmpl.h | 14 +++++++++++++- kernel/trace/trace_uprobe.c | 12 ++++++++++++ 6 files changed, 77 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1c80521fd436..d3a477a16e70 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4847,7 +4847,7 @@ static const char readme_msg[] = "\t $stack, $stack, $retval, $comm\n" #endif "\t type: s8/16/32/64, u8/16/32/64, x8/16/32/64, string, symbol,\n" - "\t b@/,\n" + "\t b@/, ustring,\n" "\t \\[\\]\n" #ifdef CONFIG_HIST_TRIGGERS "\t field: ;\n" diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 7d736248a070..439bf04d14ce 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -886,6 +886,15 @@ fetch_store_strlen(unsigned long addr) return (ret < 0) ? ret : len; } +/* Return the length of string -- including null terminal byte */ +static nokprobe_inline int +fetch_store_strlen_user(unsigned long addr) +{ + const void __user *uaddr = (__force const void __user *)addr; + + return strnlen_unsafe_user(uaddr, MAX_STRING_SIZE); +} + /* * Fetch a null-terminated string. Caller MUST set *(u32 *)buf with max * length and relative data location. @@ -894,19 +903,46 @@ static nokprobe_inline int fetch_store_string(unsigned long addr, void *dest, void *base) { int maxlen = get_loc_len(*(u32 *)dest); - u8 *dst = get_loc_data(dest, base); + void *__dest; long ret; if (unlikely(!maxlen)) return -ENOMEM; + + __dest = get_loc_data(dest, base); + /* * Try to get string again, since the string can be changed while * probing. */ - ret = strncpy_from_unsafe(dst, (void *)addr, maxlen); + ret = strncpy_from_unsafe(__dest, (void *)addr, maxlen); + if (ret >= 0) + *(u32 *)dest = make_data_loc(ret, __dest - base); + + return ret; +} +/* + * Fetch a null-terminated string from user. Caller MUST set *(u32 *)buf + * with max length and relative data location. + */ +static nokprobe_inline int +fetch_store_string_user(unsigned long addr, void *dest, void *base) +{ + const void __user *uaddr = (__force const void __user *)addr; + int maxlen = get_loc_len(*(u32 *)dest); + void *__dest; + long ret; + + if (unlikely(!maxlen)) + return -ENOMEM; + + __dest = get_loc_data(dest, base); + + ret = strncpy_from_unsafe_user(__dest, uaddr, maxlen); if (ret >= 0) - *(u32 *)dest = make_data_loc(ret, (void *)dst - base); + *(u32 *)dest = make_data_loc(ret, __dest - base); + return ret; } diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index a347faced959..5a0470f7b9de 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -78,6 +78,8 @@ static const struct fetch_type probe_fetch_types[] = { /* Special types */ __ASSIGN_FETCH_TYPE("string", string, string, sizeof(u32), 1, "__data_loc char[]"), + __ASSIGN_FETCH_TYPE("ustring", string, string, sizeof(u32), 1, + "__data_loc char[]"), /* Basic types */ ASSIGN_FETCH_TYPE(u8, u8, 0), ASSIGN_FETCH_TYPE(u16, u16, 0), @@ -569,7 +571,8 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, goto fail; /* Store operation */ - if (!strcmp(parg->type->name, "string")) { + if (!strcmp(parg->type->name, "string") || + !strcmp(parg->type->name, "ustring")) { if (code->op != FETCH_OP_DEREF && code->op != FETCH_OP_IMM && code->op != FETCH_OP_COMM) { trace_probe_log_err(offset + (t ? (t - arg) : 0), @@ -590,7 +593,11 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, goto fail; } } - code->op = FETCH_OP_ST_STRING; /* In DEREF case, replace it */ + /* If op == DEREF, replace it with STRING */ + if (!strcmp(parg->type->name, "ustring")) + code->op = FETCH_OP_ST_USTRING; + else + code->op = FETCH_OP_ST_STRING; code->size = parg->type->size; parg->dynamic = true; } else if (code->op == FETCH_OP_DEREF) { @@ -618,7 +625,8 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, /* Loop(Array) operation */ if (parg->count) { if (scode->op != FETCH_OP_ST_MEM && - scode->op != FETCH_OP_ST_STRING) { + scode->op != FETCH_OP_ST_STRING && + scode->op != FETCH_OP_ST_USTRING) { trace_probe_log_err(offset + (t ? (t - arg) : 0), BAD_STRING); ret = -EINVAL; diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index f9a8c632188b..c7546e7ff8e2 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -96,6 +96,7 @@ enum fetch_op { FETCH_OP_ST_RAW, /* Raw: .size */ FETCH_OP_ST_MEM, /* Mem: .offset, .size */ FETCH_OP_ST_STRING, /* String: .offset, .size */ + FETCH_OP_ST_USTRING, /* User String: .offset, .size */ // Stage 4 (modify) op FETCH_OP_MOD_BF, /* Bitfield: .basesize, .lshift, .rshift */ // Stage 5 (loop) op diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h index c30c61f12ddd..2e9e4dae8839 100644 --- a/kernel/trace/trace_probe_tmpl.h +++ b/kernel/trace/trace_probe_tmpl.h @@ -59,6 +59,9 @@ process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs, static nokprobe_inline int fetch_store_strlen(unsigned long addr); static nokprobe_inline int fetch_store_string(unsigned long addr, void *dest, void *base); +static nokprobe_inline int fetch_store_strlen_user(unsigned long addr); +static nokprobe_inline int +fetch_store_string_user(unsigned long addr, void *dest, void *base); static nokprobe_inline int probe_mem_read(void *dest, void *src, size_t size); @@ -91,6 +94,10 @@ stage3: ret = fetch_store_strlen(val + code->offset); code++; goto array; + } else if (code->op == FETCH_OP_ST_USTRING) { + ret += fetch_store_strlen_user(val + code->offset); + code++; + goto array; } else return -EILSEQ; } @@ -106,6 +113,10 @@ stage3: loc = *(u32 *)dest; ret = fetch_store_string(val + code->offset, dest, base); break; + case FETCH_OP_ST_USTRING: + loc = *(u32 *)dest; + ret = fetch_store_string_user(val + code->offset, dest, base); + break; default: return -EILSEQ; } @@ -123,7 +134,8 @@ array: total += ret; if (++i < code->param) { code = s3; - if (s3->op != FETCH_OP_ST_STRING) { + if (s3->op != FETCH_OP_ST_STRING && + s3->op != FETCH_OP_ST_USTRING) { dest += s3->size; val += s3->size; goto stage3; diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index eb7e06b54741..852e998051f6 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -176,6 +176,12 @@ fetch_store_string(unsigned long addr, void *dest, void *base) return ret; } +static nokprobe_inline int +fetch_store_string_user(unsigned long addr, void *dest, void *base) +{ + return fetch_store_string(addr, dest, base); +} + /* Return the length of string -- including null terminal byte */ static nokprobe_inline int fetch_store_strlen(unsigned long addr) @@ -191,6 +197,12 @@ fetch_store_strlen(unsigned long addr) return (len > MAX_STRING_SIZE) ? 0 : len; } +static nokprobe_inline int +fetch_store_strlen_user(unsigned long addr) +{ + return fetch_store_strlen(addr); +} + static unsigned long translate_user_vaddr(unsigned long file_offset) { unsigned long base_addr; -- cgit v1.2.3 From e65f7ae7f4da56622ecf8f1eaed333b9a13f9435 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 15 May 2019 14:38:42 +0900 Subject: tracing/probe: Support user-space dereference Support user-space dereference syntax for probe event arguments to dereference the data-structure or array in user-space. The syntax is just adding 'u' before an offset value. +|-u() e.g. +u8(%ax), +u0(+0(%si)) For example, if you probe do_sched_setscheduler(pid, policy, param) and record param->sched_priority, you can add new probe as below; p do_sched_setscheduler priority=+u0($arg3) Note that kprobe event provides this and it doesn't change the dereference method automatically because we do not know whether the given address is in userspace or kernel on some archs. So as same as "ustring", this is an option for user, who has to carefully choose the dereference method. Link: http://lkml.kernel.org/r/155789872187.26965.4468456816590888687.stgit@devnote2 Acked-by: Ingo Molnar Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 5 +++-- kernel/trace/trace_kprobe.c | 6 ++++++ kernel/trace/trace_probe.c | 25 ++++++++++++++++++------- kernel/trace/trace_probe.h | 2 ++ kernel/trace/trace_probe_tmpl.h | 22 +++++++++++++++++----- kernel/trace/trace_uprobe.c | 7 +++++++ 6 files changed, 53 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d3a477a16e70..6b3b5b0495a8 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4842,10 +4842,11 @@ static const char readme_msg[] = "\t args: =fetcharg[:type]\n" "\t fetcharg: %, @
, @[+|-],\n" #ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API - "\t $stack, $stack, $retval, $comm, $arg\n" + "\t $stack, $stack, $retval, $comm, $arg,\n" #else - "\t $stack, $stack, $retval, $comm\n" + "\t $stack, $stack, $retval, $comm,\n" #endif + "\t +|-[u]()\n" "\t type: s8/16/32/64, u8/16/32/64, x8/16/32/64, string, symbol,\n" "\t b@/, ustring,\n" "\t \\[\\]\n" diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 439bf04d14ce..ff14eb011c1c 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -952,6 +952,12 @@ probe_mem_read(void *dest, void *src, size_t size) return probe_kernel_read(dest, src, size); } +static nokprobe_inline int +probe_mem_read_user(void *dest, void *src, size_t size) +{ + return probe_user_read(dest, src, size); +} + /* Note that we don't verify it, since the code does not come from user space */ static int process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs, void *dest, diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 5a0470f7b9de..b6b0593844cd 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -324,6 +324,7 @@ parse_probe_arg(char *arg, const struct fetch_type *type, { struct fetch_insn *code = *pcode; unsigned long param; + int deref = FETCH_OP_DEREF; long offset = 0; char *tmp; int ret = 0; @@ -396,9 +397,14 @@ parse_probe_arg(char *arg, const struct fetch_type *type, break; case '+': /* deref memory */ - arg++; /* Skip '+', because kstrtol() rejects it. */ - /* fall through */ case '-': + if (arg[1] == 'u') { + deref = FETCH_OP_UDEREF; + arg[1] = arg[0]; + arg++; + } + if (arg[0] == '+') + arg++; /* Skip '+', because kstrtol() rejects it. */ tmp = strchr(arg, '('); if (!tmp) { trace_probe_log_err(offs, DEREF_NEED_BRACE); @@ -434,7 +440,7 @@ parse_probe_arg(char *arg, const struct fetch_type *type, } *pcode = code; - code->op = FETCH_OP_DEREF; + code->op = deref; code->offset = offset; } break; @@ -573,14 +579,15 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, /* Store operation */ if (!strcmp(parg->type->name, "string") || !strcmp(parg->type->name, "ustring")) { - if (code->op != FETCH_OP_DEREF && code->op != FETCH_OP_IMM && - code->op != FETCH_OP_COMM) { + if (code->op != FETCH_OP_DEREF && code->op != FETCH_OP_UDEREF && + code->op != FETCH_OP_IMM && code->op != FETCH_OP_COMM) { trace_probe_log_err(offset + (t ? (t - arg) : 0), BAD_STRING); ret = -EINVAL; goto fail; } - if (code->op != FETCH_OP_DEREF || parg->count) { + if ((code->op == FETCH_OP_IMM || code->op == FETCH_OP_COMM) || + parg->count) { /* * IMM and COMM is pointing actual address, those must * be kept, and if parg->count != 0, this is an array @@ -594,7 +601,8 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, } } /* If op == DEREF, replace it with STRING */ - if (!strcmp(parg->type->name, "ustring")) + if (!strcmp(parg->type->name, "ustring") || + code->op == FETCH_OP_UDEREF) code->op = FETCH_OP_ST_USTRING; else code->op = FETCH_OP_ST_STRING; @@ -603,6 +611,9 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, } else if (code->op == FETCH_OP_DEREF) { code->op = FETCH_OP_ST_MEM; code->size = parg->type->size; + } else if (code->op == FETCH_OP_UDEREF) { + code->op = FETCH_OP_ST_UMEM; + code->size = parg->type->size; } else { code++; if (code->op != FETCH_OP_NOP) { diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index c7546e7ff8e2..42816358dd48 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -92,9 +92,11 @@ enum fetch_op { FETCH_OP_FOFFS, /* File offset: .immediate */ // Stage 2 (dereference) op FETCH_OP_DEREF, /* Dereference: .offset */ + FETCH_OP_UDEREF, /* User-space Dereference: .offset */ // Stage 3 (store) ops FETCH_OP_ST_RAW, /* Raw: .size */ FETCH_OP_ST_MEM, /* Mem: .offset, .size */ + FETCH_OP_ST_UMEM, /* Mem: .offset, .size */ FETCH_OP_ST_STRING, /* String: .offset, .size */ FETCH_OP_ST_USTRING, /* User String: .offset, .size */ // Stage 4 (modify) op diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h index 2e9e4dae8839..e5282828f4a6 100644 --- a/kernel/trace/trace_probe_tmpl.h +++ b/kernel/trace/trace_probe_tmpl.h @@ -64,6 +64,8 @@ static nokprobe_inline int fetch_store_string_user(unsigned long addr, void *dest, void *base); static nokprobe_inline int probe_mem_read(void *dest, void *src, size_t size); +static nokprobe_inline int +probe_mem_read_user(void *dest, void *src, size_t size); /* From the 2nd stage, routine is same */ static nokprobe_inline int @@ -77,14 +79,21 @@ process_fetch_insn_bottom(struct fetch_insn *code, unsigned long val, stage2: /* 2nd stage: dereference memory if needed */ - while (code->op == FETCH_OP_DEREF) { - lval = val; - ret = probe_mem_read(&val, (void *)val + code->offset, - sizeof(val)); + do { + if (code->op == FETCH_OP_DEREF) { + lval = val; + ret = probe_mem_read(&val, (void *)val + code->offset, + sizeof(val)); + } else if (code->op == FETCH_OP_UDEREF) { + lval = val; + ret = probe_mem_read_user(&val, + (void *)val + code->offset, sizeof(val)); + } else + break; if (ret) return ret; code++; - } + } while (1); s3 = code; stage3: @@ -109,6 +118,9 @@ stage3: case FETCH_OP_ST_MEM: probe_mem_read(dest, (void *)val + code->offset, code->size); break; + case FETCH_OP_ST_UMEM: + probe_mem_read_user(dest, (void *)val + code->offset, code->size); + break; case FETCH_OP_ST_STRING: loc = *(u32 *)dest; ret = fetch_store_string(val + code->offset, dest, base); diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 852e998051f6..3d6b868830f3 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -140,6 +140,13 @@ probe_mem_read(void *dest, void *src, size_t size) return copy_from_user(dest, vaddr, size) ? -EFAULT : 0; } + +static nokprobe_inline int +probe_mem_read_user(void *dest, void *src, size_t size) +{ + return probe_mem_read(dest, src, size); +} + /* * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max * length and relative data location. -- cgit v1.2.3 From f08367b3643b5340f9d9ea07808ddd72b74beb30 Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Thu, 23 May 2019 12:26:28 -0700 Subject: tracing: Use correct function name in trace_filter_add_remove_task() comment The comment of trace_filter_add_remove_task() refers to the function as 'trace_pid_filter_add_remove_task', use the correct name. Link: http://lkml.kernel.org/r/20190523192628.134406-1-mka@chromium.org Signed-off-by: Matthias Kaehlcke Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 6b3b5b0495a8..77b9c4ca5faa 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -366,7 +366,7 @@ trace_ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct } /** - * trace_pid_filter_add_remove_task - Add or remove a task from a pid_list + * trace_filter_add_remove_task - Add or remove a task from a pid_list * @pid_list: The list to modify * @self: The current task for fork or NULL for exit * @task: The task to add or remove -- cgit v1.2.3 From 539b75b2b9eece3fc6da5672d4b67440d5e454a9 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 22 May 2019 17:27:52 +0900 Subject: tracing/kprobe: Cast user-space address correctly Cast user-space address correctly to pass to probe_user_read(). Reported-by: kbuild test robot Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index ff14eb011c1c..2c5357dddb92 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -955,7 +955,9 @@ probe_mem_read(void *dest, void *src, size_t size) static nokprobe_inline int probe_mem_read_user(void *dest, void *src, size_t size) { - return probe_user_read(dest, src, size); + const void __user *uaddr = (__force const void __user *)src; + + return probe_user_read(dest, uaddr, size); } /* Note that we don't verify it, since the code does not come from user space */ -- cgit v1.2.3 From b5f8b32c93b21c457a958d2a6bf938dab41bac4e Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 22 May 2019 17:32:27 +0900 Subject: kprobes: Initialize kprobes at postcore_initcall Initialize kprobes at postcore_initcall level instead of module_init since kprobes is not a module, and it depends on only subsystems initialized in core_initcall. This will allow ftrace kprobe event to add new events when it is initializing because ftrace kprobe event is initialized at later initcall level. Link: http://lkml.kernel.org/r/155851394736.15728.13626739508905120098.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/kprobes.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index b1ea30a5540e..54aaaad00a47 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2289,6 +2289,7 @@ static int __init init_kprobes(void) init_test_probes(); return err; } +postcore_initcall(init_kprobes); #ifdef CONFIG_DEBUG_FS static void report_probe(struct seq_file *pi, struct kprobe *p, @@ -2614,5 +2615,3 @@ error: late_initcall(debugfs_kprobe_init); #endif /* CONFIG_DEBUG_FS */ - -module_init(init_kprobes); -- cgit v1.2.3 From 970988e19eb0a0dc24fe14bf91972019e48336e2 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 22 May 2019 17:32:35 +0900 Subject: tracing/kprobe: Add kprobe_event= boot parameter Add kprobe_event= boot parameter to define kprobe events at boot time. The definition syntax is similar to tracefs/kprobe_events interface, but use ',' and ';' instead of ' ' and '\n' respectively. e.g. kprobe_event=p,vfs_read,$arg1,$arg2 This puts a probe on vfs_read with argument1 and 2, and enable the new event. Link: http://lkml.kernel.org/r/155851395498.15728.830529496248543583.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 54 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 2c5357dddb92..004fffd24ec1 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -12,6 +12,8 @@ #include #include +#include /* for COMMAND_LINE_SIZE */ + #include "trace_dynevent.h" #include "trace_kprobe_selftest.h" #include "trace_probe.h" @@ -19,6 +21,17 @@ #define KPROBE_EVENT_SYSTEM "kprobes" #define KRETPROBE_MAXACTIVE_MAX 4096 +#define MAX_KPROBE_CMDLINE_SIZE 1024 + +/* Kprobe early definition from command line */ +static char kprobe_boot_events_buf[COMMAND_LINE_SIZE] __initdata; + +static int __init set_kprobe_boot_events(char *str) +{ + strlcpy(kprobe_boot_events_buf, str, COMMAND_LINE_SIZE); + return 0; +} +__setup("kprobe_event=", set_kprobe_boot_events); static int trace_kprobe_create(int argc, const char **argv); static int trace_kprobe_show(struct seq_file *m, struct dyn_event *ev); @@ -1494,6 +1507,44 @@ void destroy_local_trace_kprobe(struct trace_event_call *event_call) } #endif /* CONFIG_PERF_EVENTS */ +static __init void enable_boot_kprobe_events(void) +{ + struct trace_array *tr = top_trace_array(); + struct trace_event_file *file; + struct trace_kprobe *tk; + struct dyn_event *pos; + + mutex_lock(&event_mutex); + for_each_trace_kprobe(tk, pos) { + list_for_each_entry(file, &tr->events, list) + if (file->event_call == &tk->tp.call) + trace_event_enable_disable(file, 1, 0); + } + mutex_unlock(&event_mutex); +} + +static __init void setup_boot_kprobe_events(void) +{ + char *p, *cmd = kprobe_boot_events_buf; + int ret; + + strreplace(kprobe_boot_events_buf, ',', ' '); + + while (cmd && *cmd != '\0') { + p = strchr(cmd, ';'); + if (p) + *p++ = '\0'; + + ret = trace_run_command(cmd, create_or_delete_trace_kprobe); + if (ret) + pr_warn("Failed to add event(%d): %s\n", ret, cmd); + + cmd = p; + } + + enable_boot_kprobe_events(); +} + /* Make a tracefs interface for controlling probe points */ static __init int init_kprobe_trace(void) { @@ -1525,6 +1576,9 @@ static __init int init_kprobe_trace(void) if (!entry) pr_warn("Could not create tracefs 'kprobe_profile' entry\n"); + + setup_boot_kprobe_events(); + return 0; } fs_initcall(init_kprobe_trace); -- cgit v1.2.3 From b3015fe41d9af7515a7b7b6f7f8f172d193fb3a6 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 23 May 2019 19:40:17 -0400 Subject: tracing: Make a separate config for trace event self tests The trace event self tests enable loop through *all* events, enables each one, one at a time, runs some code to trigger various events (not necessarily the same events), and checks if anything went wrong. The issue is that trace events are usually the least likely start up test to cause a problem, but they take the longest to run (because there are so many events). When one of the other tests trigger a bug, the trace event start up tests causes the bisect to take much longer, because it takes 10s of seconds to get through the trace event tests. By making them a separate config (even though they are enabled by default if start up tests are set), it is possible to turn them off and still run the other tracing start up tests much quicker. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 12 +++++++++++- kernel/trace/trace_events.c | 2 +- 2 files changed, 12 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 5d965cef6c77..07d22c61b634 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -596,9 +596,19 @@ config FTRACE_STARTUP_TEST functioning properly. It will do tests on all the configured tracers of ftrace. +config EVENT_TRACE_STARTUP_TEST + bool "Run selftest on trace events" + depends on FTRACE_STARTUP_TEST + default y + help + This option performs a test on all trace events in the system. + It basically just enables each event and runs some code that + will trigger events (not necessarily the event it enables) + This may take some time run as there are a lot of events. + config EVENT_TRACE_TEST_SYSCALLS bool "Run selftest on syscall events" - depends on FTRACE_STARTUP_TEST + depends on EVENT_TRACE_STARTUP_TEST help This option will also enable testing every syscall event. It only enables the event and disables it and runs various loads diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 0ce3db67f556..edc72f3b080c 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -3190,7 +3190,7 @@ void __init trace_event_init(void) event_trace_enable(); } -#ifdef CONFIG_FTRACE_STARTUP_TEST +#ifdef CONFIG_EVENT_TRACE_STARTUP_TEST static DEFINE_SPINLOCK(test_spinlock); static DEFINE_SPINLOCK(test_spinlock_irq); -- cgit v1.2.3 From b6399cc789341fc49a0603fb7d388a4e50aca212 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 23 May 2019 19:50:34 -0400 Subject: tracing/kprobe: Do not run kprobe boot tests if kprobe_event is on cmdline When having kprobe trace event start up tests enabled and adding a kprobe_event on the kernel command line, it produced the following: trace_kprobe: Testing kprobe tracing: WARNING: CPU: 5 PID: 1 at kernel/trace/trace_kprobe.c:1724 kprobe_trace_self_tests_init+0x32d/0x36b Modules linked in: CPU: 5 PID: 1 Comm: swapper/0 Not tainted 5.2.0-rc1-test+ #249 Hardware name: Hewlett-Packard HP Compaq Pro 6300 SFF/339A, BIOS K01 v03.03 07/14/2016 RIP: 0010:kprobe_trace_self_tests_init+0x32d/0x36b Code: b7 e8 4f 8d a2 fe 85 c0 74 10 0f 0b 48 c7 c7 c8 1b 0d b7 ff c3 e8 19 af 99 fe 48 c7 c7 40 93 27 b7 e8 7f 1a a5 fe 85 c0 74 10 <0f> 0b 48 c7 c7 f8 1b 0d b7 ff c3 e8 f9 ae 9 a0 fe 85 RSP: 0018:ffffb36e40653e08 EFLAGS: 00010286 RAX: 00000000fffffff0 RBX: 0000000000000000 RCX: ffffb36e40653d5c RDX: 0000000000000000 RSI: ffffffffb72776e0 RDI: 0000000000000246 RBP: ffff98414fe58ff8 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: ffff98415d8aa940 R12: 0000000000000000 R13: ffffffffb737c1b0 R14: 0000000000000000 R15: 0000000000000000 FS: 0000000000000000(0000) GS:ffff98415ea80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f959ce741b8 CR3: 000000011a210002 CR4: 00000000001606e0 Call Trace: ? init_kprobe_trace+0x19e/0x19e ? do_early_param+0x8e/0x8e do_one_initcall+0x6f/0x2b4 ? do_early_param+0x8e/0x8e kernel_init_freeable+0x21d/0x2c6 ? rest_init+0x146/0x146 kernel_init+0xa/0x10a ret_from_fork+0x3a/0x50 ---[ end trace 488430c083a4c956 ]--- As with the trace events, if a trace event is set on the kernel command line, the trace events start up tests are suspended. The kprobe start up tests should do the same when a kprobe is enabled on the kernel command line. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 004fffd24ec1..7958da2fd922 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -25,6 +25,7 @@ /* Kprobe early definition from command line */ static char kprobe_boot_events_buf[COMMAND_LINE_SIZE] __initdata; +static bool kprobe_boot_events_enabled __initdata; static int __init set_kprobe_boot_events(char *str) { @@ -1538,6 +1539,8 @@ static __init void setup_boot_kprobe_events(void) ret = trace_run_command(cmd, create_or_delete_trace_kprobe); if (ret) pr_warn("Failed to add event(%d): %s\n", ret, cmd); + else + kprobe_boot_events_enabled = true; cmd = p; } @@ -1611,6 +1614,11 @@ static __init int kprobe_trace_self_tests_init(void) if (tracing_is_disabled()) return -ENODEV; + if (kprobe_boot_events_enabled) { + pr_info("Skipping kprobe tests due to kprobe_event on cmdline\n"); + return 0; + } + target = kprobe_trace_selftest_target; pr_info("Testing kprobe tracing: "); -- cgit v1.2.3 From a124692b698b00026a58d89831ceda2331b2e1d0 Mon Sep 17 00:00:00 2001 From: Cheng Jian Date: Sat, 4 May 2019 19:39:39 +0800 Subject: ftrace: Enable trampoline when rec count returns back to one Custom trampolines can only be enabled if there is only a single ops attached to it. If there's only a single callback registered to a function, and the ops has a trampoline registered for it, then we can call the trampoline directly. This is very useful for improving the performance of ftrace and livepatch. If more than one callback is registered to a function, the general trampoline is used, and the custom trampoline is not restored back to the direct call even if all the other callbacks were unregistered and we are back to one callback for the function. To fix this, set FTRACE_FL_TRAMP flag if rec count is decremented to one, and the ops that left has a trampoline. Testing After this patch : insmod livepatch_unshare_files.ko cat /sys/kernel/debug/tracing/enabled_functions unshare_files (1) R I tramp: 0xffffffffc0000000(klp_ftrace_handler+0x0/0xa0) ->ftrace_ops_assist_func+0x0/0xf0 echo unshare_files > /sys/kernel/debug/tracing/set_ftrace_filter echo function > /sys/kernel/debug/tracing/current_tracer cat /sys/kernel/debug/tracing/enabled_functions unshare_files (2) R I ->ftrace_ops_list_func+0x0/0x150 echo nop > /sys/kernel/debug/tracing/current_tracer cat /sys/kernel/debug/tracing/enabled_functions unshare_files (1) R I tramp: 0xffffffffc0000000(klp_ftrace_handler+0x0/0xa0) ->ftrace_ops_assist_func+0x0/0xf0 Link: http://lkml.kernel.org/r/1556969979-111047-1-git-send-email-cj.chengjian@huawei.com Signed-off-by: Cheng Jian Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 4f2c26bebe2a..5c3eadb143ed 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1622,6 +1622,11 @@ static bool test_rec_ops_needs_regs(struct dyn_ftrace *rec) return keep_regs; } +static struct ftrace_ops * +ftrace_find_tramp_ops_any(struct dyn_ftrace *rec); +static struct ftrace_ops * +ftrace_find_tramp_ops_next(struct dyn_ftrace *rec, struct ftrace_ops *ops); + static bool __ftrace_hash_rec_update(struct ftrace_ops *ops, int filter_hash, bool inc) @@ -1750,15 +1755,17 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops, } /* - * If the rec had TRAMP enabled, then it needs to - * be cleared. As TRAMP can only be enabled iff - * there is only a single ops attached to it. - * In otherwords, always disable it on decrementing. - * In the future, we may set it if rec count is - * decremented to one, and the ops that is left - * has a trampoline. + * The TRAMP needs to be set only if rec count + * is decremented to one, and the ops that is + * left has a trampoline. As TRAMP can only be + * enabled if there is only a single ops attached + * to it. */ - rec->flags &= ~FTRACE_FL_TRAMP; + if (ftrace_rec_count(rec) == 1 && + ftrace_find_tramp_ops_any(rec)) + rec->flags |= FTRACE_FL_TRAMP; + else + rec->flags &= ~FTRACE_FL_TRAMP; /* * flags will be cleared in ftrace_check_record() @@ -1951,11 +1958,6 @@ static void print_ip_ins(const char *fmt, const unsigned char *p) printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]); } -static struct ftrace_ops * -ftrace_find_tramp_ops_any(struct dyn_ftrace *rec); -static struct ftrace_ops * -ftrace_find_tramp_ops_next(struct dyn_ftrace *rec, struct ftrace_ops *ops); - enum ftrace_bug_type ftrace_bug_type; const void *ftrace_expected; -- cgit v1.2.3 From f9070dc94542093fd516ae4ccea17ef46a4362c5 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 15 May 2019 12:29:52 -0500 Subject: signal/pid_namespace: Fix reboot_pid_ns to use send_sig not force_sig The locking in force_sig_info is not prepared to deal with a task that exits or execs (as sighand may change). The is not a locking problem in force_sig as force_sig is only built to handle synchronous exceptions. Further the function force_sig_info changes the signal state if the signal is ignored, or blocked or if SIGNAL_UNKILLABLE will prevent the delivery of the signal. The signal SIGKILL can not be ignored and can not be blocked and SIGNAL_UNKILLABLE won't prevent it from being delivered. So using force_sig rather than send_sig for SIGKILL is confusing and pointless. Because it won't impact the sending of the signal and and because using force_sig is wrong, replace force_sig with send_sig. Cc: Daniel Lezcano Cc: Serge Hallyn Cc: Oleg Nesterov Fixes: cf3f89214ef6 ("pidns: add reboot_pid_ns() to handle the reboot syscall") Signed-off-by: "Eric W. Biederman" --- kernel/pid_namespace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index aa6e72fb7c08..098233ebe589 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -325,7 +325,7 @@ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) } read_lock(&tasklist_lock); - force_sig(SIGKILL, pid_ns->child_reaper); + send_sig(SIGKILL, pid_ns->child_reaper, 1); read_unlock(&tasklist_lock); do_exit(0); -- cgit v1.2.3 From cb44c9a0ab21a9ae4dfcabac1ed8e38aa872d1af Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 21 May 2019 10:03:48 -0500 Subject: signal: Remove task parameter from force_sigsegv The function force_sigsegv is always called on the current task so passing in current is redundant and not passing in current makes this fact obvious. This also makes it clear force_sigsegv always calls force_sig on the current task. Signed-off-by: "Eric W. Biederman" --- kernel/rseq.c | 2 +- kernel/signal.c | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rseq.c b/kernel/rseq.c index 9424ee90589e..e1aa3ebee291 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -277,7 +277,7 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) error: sig = ksig ? ksig->sig : 0; - force_sigsegv(sig, t); + force_sigsegv(sig); } #ifdef CONFIG_DEBUG_RSEQ diff --git a/kernel/signal.c b/kernel/signal.c index 39a3eca5ce22..f7669d240ce4 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1607,8 +1607,10 @@ EXPORT_SYMBOL(force_sig); * the problem was already a SIGSEGV, we'll want to * make sure we don't even try to deliver the signal.. */ -void force_sigsegv(int sig, struct task_struct *p) +void force_sigsegv(int sig) { + struct task_struct *p = current; + if (sig == SIGSEGV) { unsigned long flags; spin_lock_irqsave(&p->sighand->siglock, flags); @@ -2717,7 +2719,7 @@ static void signal_delivered(struct ksignal *ksig, int stepping) void signal_setup_done(int failed, struct ksignal *ksig, int stepping) { if (failed) - force_sigsegv(ksig->sig, current); + force_sigsegv(ksig->sig); else signal_delivered(ksig, stepping); } -- cgit v1.2.3 From 3cf5d076fb4d48979f382bc9452765bf8b79e740 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 23 May 2019 10:17:27 -0500 Subject: signal: Remove task parameter from force_sig All of the remaining callers pass current into force_sig so remove the task parameter to make this obvious and to make misuse more difficult in the future. This also makes it clear force_sig passes current into force_sig_info. Signed-off-by: "Eric W. Biederman" --- kernel/events/uprobes.c | 4 ++-- kernel/rseq.c | 2 +- kernel/signal.c | 6 +++--- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 78f61bfc6b79..359122185cfb 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -2112,7 +2112,7 @@ static void handle_trampoline(struct pt_regs *regs) sigill: uprobe_warn(current, "handle uretprobe, sending SIGILL."); - force_sig(SIGILL, current); + force_sig(SIGILL); } @@ -2228,7 +2228,7 @@ static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs) if (unlikely(err)) { uprobe_warn(current, "execute the probed insn, sending SIGILL."); - force_sig(SIGILL, current); + force_sig(SIGILL); } } diff --git a/kernel/rseq.c b/kernel/rseq.c index e1aa3ebee291..27c48eb7de40 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -296,7 +296,7 @@ void rseq_syscall(struct pt_regs *regs) return; if (!access_ok(t->rseq, sizeof(*t->rseq)) || rseq_get_rseq_cs(t, &rseq_cs) || in_rseq_cs(ip, &rseq_cs)) - force_sig(SIGSEGV, t); + force_sig(SIGSEGV); } #endif diff --git a/kernel/signal.c b/kernel/signal.c index f7669d240ce4..20878c4c28c2 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1595,9 +1595,9 @@ send_sig(int sig, struct task_struct *p, int priv) } EXPORT_SYMBOL(send_sig); -void force_sig(int sig, struct task_struct *p) +void force_sig(int sig) { - force_sig_info(sig, SEND_SIG_PRIV, p); + force_sig_info(sig, SEND_SIG_PRIV, current); } EXPORT_SYMBOL(force_sig); @@ -1617,7 +1617,7 @@ void force_sigsegv(int sig) p->sighand->action[sig - 1].sa.sa_handler = SIG_DFL; spin_unlock_irqrestore(&p->sighand->siglock, flags); } - force_sig(SIGSEGV, p); + force_sig(SIGSEGV); } int force_sig_fault(int sig, int code, void __user *addr -- cgit v1.2.3 From f8eac9011b6be56acfb5d1d0dfd5ee30082a12ee Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 5 Feb 2019 18:14:19 -0600 Subject: signal: Remove task parameter from force_sig_mceerr All of the callers pass current into force_sig_mceer so remove the task parameter to make this obvious. This also makes it clear that force_sig_mceerr passes current into force_sig_info. Signed-off-by: "Eric W. Biederman" --- kernel/signal.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 20878c4c28c2..398489facf9f 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1666,7 +1666,7 @@ int send_sig_fault(int sig, int code, void __user *addr return send_sig_info(info.si_signo, &info, t); } -int force_sig_mceerr(int code, void __user *addr, short lsb, struct task_struct *t) +int force_sig_mceerr(int code, void __user *addr, short lsb) { struct kernel_siginfo info; @@ -1677,7 +1677,7 @@ int force_sig_mceerr(int code, void __user *addr, short lsb, struct task_struct info.si_code = code; info.si_addr = addr; info.si_addr_lsb = lsb; - return force_sig_info(info.si_signo, &info, t); + return force_sig_info(info.si_signo, &info, current); } int send_sig_mceerr(int code, void __user *addr, short lsb, struct task_struct *t) -- cgit v1.2.3 From e1afb70252a8614e1ef7aec05ff1b84fd324b782 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 25 May 2019 11:57:53 -0700 Subject: bpf: check signal validity in nmi for bpf_send_signal() helper Commit 8b401f9ed244 ("bpf: implement bpf_send_signal() helper") introduced bpf_send_signal() helper. If the context is nmi, the sending signal work needs to be deferred to irq_work. If the signal is invalid, the error will appear in irq_work and it won't be propagated to user. This patch did an early check in the helper itself to notify user invalid signal, as suggested by Daniel. Suggested-by: Daniel Borkmann Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann --- kernel/trace/bpf_trace.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 70029eafc71f..fe73926a07cd 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -600,6 +600,12 @@ BPF_CALL_1(bpf_send_signal, u32, sig) return -EPERM; if (in_nmi()) { + /* Do an early check on signal validity. Otherwise, + * the error is lost in deferred irq_work. + */ + if (unlikely(!valid_signal(sig))) + return -EINVAL; + work = this_cpu_ptr(&send_signal_work); if (work->irq_work.flags & IRQ_WORK_BUSY) return -EBUSY; -- cgit v1.2.3 From 86b3de60a0b634cdcef82d0a2091bc5444a00020 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 28 May 2019 09:36:19 -0400 Subject: ring-buffer: Remove HAVE_64BIT_ALIGNED_ACCESS Commit c19fa94a8fed ("Add HAVE_64BIT_ALIGNED_ACCESS") added the config for architectures that required 64bit aligned access for all 64bit words. As the ftrace ring buffer stores data on 4 byte alignment, this config option was used to force it to store data on 8 byte alignment to make sure the data being stored and written directly into the ring buffer was 8 byte aligned as it would cause issues trying to write an 8 byte word on a 4 not 8 byte aligned memory location. But with the removal of the metag architecture, which was the only architecture to use this, there is no architecture supported by Linux that requires 8 byte aligne access for all 8 byte words (4 byte alignment is good enough). Removing this config can simplify the code a bit. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 05b0b3139ebc..66358d66c933 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -128,16 +128,7 @@ int ring_buffer_print_entry_header(struct trace_seq *s) #define RB_ALIGNMENT 4U #define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) #define RB_EVNT_MIN_SIZE 8U /* two 32bit words */ - -#ifndef CONFIG_HAVE_64BIT_ALIGNED_ACCESS -# define RB_FORCE_8BYTE_ALIGNMENT 0 -# define RB_ARCH_ALIGNMENT RB_ALIGNMENT -#else -# define RB_FORCE_8BYTE_ALIGNMENT 1 -# define RB_ARCH_ALIGNMENT 8U -#endif - -#define RB_ALIGN_DATA __aligned(RB_ARCH_ALIGNMENT) +#define RB_ALIGN_DATA __aligned(RB_ALIGNMENT) /* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */ #define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX @@ -2373,7 +2364,7 @@ rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, event->time_delta = delta; length -= RB_EVNT_HDR_SIZE; - if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) { + if (length > RB_MAX_SMALL_DATA) { event->type_len = 0; event->array[0] = length; } else @@ -2388,11 +2379,11 @@ static unsigned rb_calculate_event_length(unsigned length) if (!length) length++; - if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) + if (length > RB_MAX_SMALL_DATA) length += sizeof(event.array[0]); length += RB_EVNT_HDR_SIZE; - length = ALIGN(length, RB_ARCH_ALIGNMENT); + length = ALIGN(length, RB_ALIGNMENT); /* * In case the time delta is larger than the 27 bits for it -- cgit v1.2.3 From eddded80121f2a7bda810f65bf7cb648a709ed11 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Tue, 26 Mar 2019 15:24:09 -0400 Subject: rcu: Add checks for dynticks counters in rcu_is_cpu_rrupt_from_idle() It would be good to combine the dynticks and dynticks_nesting counters in order to simplify the code. Unfortunately, there are concerns about usermode upcalls appearing to RCU as half of an interrupt, as Byungchul learned [1]. The "half" in "half interrupt" is due to an unpaired rcu_irq_enter(): Normally, each rcu_irq_enter() has a later call to rcu_irq_exit(). Out of an abundance of caution, Paul added warnings [2] in the RCU code which if not fired by 2021 will be interpreted as meaning that this half-interrupt scenario cannot happen any more, thus permitting simplification of this code. In the meantime, this commit makes the following changes: (1) Combining these two counters requires that rcu_rrupt_from_idle() is invoked only from hard-interrupt contexts as discussed here [3]. This commit therefore adds the required lockdep_assert_in_irq() to check this constraint. (2) Furthermore, rcu_rrupt_from_idle() is not explicit about how it is using the counters which can lead to weird future bugs. This commit therefore adds comments indicating the meaning and use of each counter. (3) Lastly, this commit checks for counter underflows as another check that half interrupts don't occur. (Previously, the function would simply return true upon underflow.) All these checks checks are NOOPs if PROVE_LOCKING (and thus PROVE_RCU) are disabled. [1] https://lore.kernel.org/patchwork/patch/952349/ [2] Commit e11ec65cc8d6 ("rcu: Add warning to detect half-interrupts") [3] https://lore.kernel.org/lkml/20190312150514.GB249405@google.com/ Cc: byungchul.park@lge.com Cc: kernel-team@android.com Cc: rcu@vger.kernel.org Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 7822a2e1370d..b9629cf08f94 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -376,16 +376,29 @@ static void __maybe_unused rcu_momentary_dyntick_idle(void) } /** - * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle + * rcu_is_cpu_rrupt_from_idle - see if interrupted from idle * - * If the current CPU is idle or running at a first-level (not nested) + * If the current CPU is idle and running at a first-level (not nested) * interrupt from idle, return true. The caller must have at least * disabled preemption. */ static int rcu_is_cpu_rrupt_from_idle(void) { - return __this_cpu_read(rcu_data.dynticks_nesting) <= 0 && - __this_cpu_read(rcu_data.dynticks_nmi_nesting) <= 1; + /* Called only from within the scheduling-clock interrupt */ + lockdep_assert_in_irq(); + + /* Check for counter underflows */ + RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nesting) < 0, + "RCU dynticks_nesting counter underflow!"); + RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nmi_nesting) <= 0, + "RCU dynticks_nmi_nesting counter underflow/zero!"); + + /* Are we at first interrupt nesting level? */ + if (__this_cpu_read(rcu_data.dynticks_nmi_nesting) != 1) + return false; + + /* Does CPU appear to be idle from an RCU standpoint? */ + return __this_cpu_read(rcu_data.dynticks_nesting) == 0; } #define DEFAULT_RCU_BLIMIT 10 /* Maximum callbacks per rcu_do_batch. */ -- cgit v1.2.3 From 1bb336443cde1154600bd147a45a30baa59c57db Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 27 Mar 2019 15:51:25 -0700 Subject: rcu: Rename rcu_data's ->deferred_qs to ->exp_deferred_qs The rcu_data structure's ->deferred_qs field is used to indicate that the current CPU is blocking an expedited grace period (perhaps a future one). Given that it is used only for expedited grace periods, its current name is misleading, so this commit renames it to ->exp_deferred_qs. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 2 +- kernel/rcu/tree_exp.h | 8 ++++---- kernel/rcu/tree_plugin.h | 14 +++++++------- 3 files changed, 12 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 21d740f0b8dc..7acaf3a62d39 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -154,7 +154,7 @@ struct rcu_data { bool core_needs_qs; /* Core waits for quiesc state. */ bool beenonline; /* CPU online at least once. */ bool gpwrap; /* Possible ->gp_seq wrap. */ - bool deferred_qs; /* This CPU awaiting a deferred QS? */ + bool exp_deferred_qs; /* This CPU awaiting a deferred QS? */ struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ unsigned long grpmask; /* Mask to apply to leaf qsmask. */ unsigned long ticks_this_gp; /* The number of scheduling-clock */ diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index de1b4acf6979..e0c928d04be5 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -250,7 +250,7 @@ static void rcu_report_exp_cpu_mult(struct rcu_node *rnp, */ static void rcu_report_exp_rdp(struct rcu_data *rdp) { - WRITE_ONCE(rdp->deferred_qs, false); + WRITE_ONCE(rdp->exp_deferred_qs, false); rcu_report_exp_cpu_mult(rdp->mynode, rdp->grpmask, true); } @@ -616,7 +616,7 @@ static void rcu_exp_handler(void *unused) rcu_dynticks_curr_cpu_in_eqs()) { rcu_report_exp_rdp(rdp); } else { - rdp->deferred_qs = true; + rdp->exp_deferred_qs = true; set_tsk_need_resched(t); set_preempt_need_resched(); } @@ -638,7 +638,7 @@ static void rcu_exp_handler(void *unused) if (t->rcu_read_lock_nesting > 0) { raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->expmask & rdp->grpmask) { - rdp->deferred_qs = true; + rdp->exp_deferred_qs = true; t->rcu_read_unlock_special.b.exp_hint = true; } raw_spin_unlock_irqrestore_rcu_node(rnp, flags); @@ -661,7 +661,7 @@ static void rcu_exp_handler(void *unused) * * Otherwise, force a context switch after the CPU enables everything. */ - rdp->deferred_qs = true; + rdp->exp_deferred_qs = true; if (!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK)) || WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs())) { rcu_preempt_deferred_qs(t); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 58c7853f19e7..1aeb4ae187ce 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -237,10 +237,10 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) * no need to check for a subsequent expedited GP. (Though we are * still in a quiescent state in any case.) */ - if (blkd_state & RCU_EXP_BLKD && rdp->deferred_qs) + if (blkd_state & RCU_EXP_BLKD && rdp->exp_deferred_qs) rcu_report_exp_rdp(rdp); else - WARN_ON_ONCE(rdp->deferred_qs); + WARN_ON_ONCE(rdp->exp_deferred_qs); } /* @@ -337,7 +337,7 @@ void rcu_note_context_switch(bool preempt) * means that we continue to block the current grace period. */ rcu_qs(); - if (rdp->deferred_qs) + if (rdp->exp_deferred_qs) rcu_report_exp_rdp(rdp); trace_rcu_utilization(TPS("End context switch")); barrier(); /* Avoid RCU read-side critical sections leaking up. */ @@ -451,7 +451,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) */ special = t->rcu_read_unlock_special; rdp = this_cpu_ptr(&rcu_data); - if (!special.s && !rdp->deferred_qs) { + if (!special.s && !rdp->exp_deferred_qs) { local_irq_restore(flags); return; } @@ -459,7 +459,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) if (special.b.need_qs) { rcu_qs(); t->rcu_read_unlock_special.b.need_qs = false; - if (!t->rcu_read_unlock_special.s && !rdp->deferred_qs) { + if (!t->rcu_read_unlock_special.s && !rdp->exp_deferred_qs) { local_irq_restore(flags); return; } @@ -471,7 +471,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) * tasks are handled when removing the task from the * blocked-tasks list below. */ - if (rdp->deferred_qs) { + if (rdp->exp_deferred_qs) { rcu_report_exp_rdp(rdp); if (!t->rcu_read_unlock_special.s) { local_irq_restore(flags); @@ -560,7 +560,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) */ static bool rcu_preempt_need_deferred_qs(struct task_struct *t) { - return (__this_cpu_read(rcu_data.deferred_qs) || + return (__this_cpu_read(rcu_data.exp_deferred_qs) || READ_ONCE(t->rcu_read_unlock_special.s)) && t->rcu_read_lock_nesting <= 0; } -- cgit v1.2.3 From f0b635627395223d3c60a3105372b4349e04772f Mon Sep 17 00:00:00 2001 From: Jiang Biao Date: Tue, 23 Apr 2019 09:21:55 +0800 Subject: rcu: Remove unused rdp local from synchronize_rcu_expedited() Because rdp is initialized but never used in synchronize_rcu_expedited(), this commit removes it. Signed-off-by: Jiang Biao Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index e0c928d04be5..8e539710721a 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -793,7 +793,6 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp) */ void synchronize_rcu_expedited(void) { - struct rcu_data *rdp; struct rcu_exp_work rew; struct rcu_node *rnp; unsigned long s; @@ -830,7 +829,6 @@ void synchronize_rcu_expedited(void) } /* Wait for expedited grace period to complete. */ - rdp = per_cpu_ptr(&rcu_data, raw_smp_processor_id()); rnp = rcu_get_root(); wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3], sync_exp_work_done(s)); -- cgit v1.2.3 From cd6d17b4a4646d4bf2568f3a4de13a5a13e2ed28 Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Fri, 29 Mar 2019 15:25:52 +0530 Subject: rcu: Dump specified number of blocked tasks The dump_blkd_tasks() function dumps at most 10 blocked tasks, ignoring the value of the ncheck parameter. This commit therefore substitutes the value of ncheck for the hard-coded value of 10. Because all callers currently pass 10 as the number, this patch does not change behavior, but it is clearly an accident waiting to happen. Signed-off-by: Neeraj Upadhyay Reviewed-by: Mukesh Ojha Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 1102765f91fd..3a9891a74ead 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -760,7 +760,7 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck) i = 0; list_for_each(lhp, &rnp->blkd_tasks) { pr_cont(" %p", lhp); - if (++i >= 10) + if (++i >= ncheck) break; } pr_cont("\n"); -- cgit v1.2.3 From 3ae976a7e3e87438b8439a01aeb79d4866b1c444 Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Fri, 29 Mar 2019 16:57:08 +0530 Subject: rcu: Correctly unlock root node in rcu_check_gp_start_stall() On systems whose rcu_node tree has only one node, the rcu_check_gp_start_stall() function's values of rnp and rnp_root will be identical. In this case, it clearly does not make sense to release both rnp->lock and rnp_root->lock, but that is exactly what this function does in the last early exit. This commit therefore unlocks only rnp->lock when rnp and rnp_root are equal. Signed-off-by: Neeraj Upadhyay Reviewed-by: Mukesh Ojha Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_stall.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index f65a73a97323..065183391f75 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -630,7 +630,9 @@ static void rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp, time_before(j, rcu_state.gp_req_activity + gpssdelay) || time_before(j, rcu_state.gp_activity + gpssdelay) || atomic_xchg(&warned, 1)) { - raw_spin_unlock_rcu_node(rnp_root); /* irqs remain disabled. */ + if (rnp_root != rnp) + /* irqs remain disabled. */ + raw_spin_unlock_rcu_node(rnp_root); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } -- cgit v1.2.3 From d5a9a8c3bc8068f2e5dfba30150ac09b596b461a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 10 Apr 2019 17:01:39 -0700 Subject: rcu: Set a maximum limit for back-to-back callback invocation Currently, if a CPU has more than 10,000 callbacks pending, it will increase rdp->blimit to LONG_MAX. If you are lucky, LONG_MAX is only about two billion, but this is still a bit too many callbacks to invoke back-to-back while otherwise ignoring the world. This commit therefore sets a maximum limit of DEFAULT_MAX_RCU_BLIMIT, which is set to 10,000, for rdp->blimit. Reported-by: Sebastian Andrzej Siewior Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 980ca3ca643f..f888a76673da 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -380,7 +380,8 @@ static int rcu_is_cpu_rrupt_from_idle(void) __this_cpu_read(rcu_data.dynticks_nmi_nesting) <= 1; } -#define DEFAULT_RCU_BLIMIT 10 /* Maximum callbacks per rcu_do_batch. */ +#define DEFAULT_RCU_BLIMIT 10 /* Maximum callbacks per rcu_do_batch ... */ +#define DEFAULT_MAX_RCU_BLIMIT 10000 /* ... even during callback flood. */ static long blimit = DEFAULT_RCU_BLIMIT; #define DEFAULT_RCU_QHIMARK 10000 /* If this many pending, ignore blimit. */ static long qhimark = DEFAULT_RCU_QHIMARK; @@ -2113,7 +2114,7 @@ static void rcu_do_batch(struct rcu_data *rdp) /* Reinstate batch limit if we have worked down the excess. */ count = rcu_segcblist_n_cbs(&rdp->cblist); - if (rdp->blimit == LONG_MAX && count <= qlowmark) + if (rdp->blimit >= DEFAULT_MAX_RCU_BLIMIT && count <= qlowmark) rdp->blimit = blimit; /* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */ @@ -2354,7 +2355,7 @@ static void __call_rcu_core(struct rcu_data *rdp, struct rcu_head *head, rcu_accelerate_cbs_unlocked(rdp->mynode, rdp); } else { /* Give the grace period a kick. */ - rdp->blimit = LONG_MAX; + rdp->blimit = DEFAULT_MAX_RCU_BLIMIT; if (rcu_state.n_force_qs == rdp->n_force_qs_snap && rcu_segcblist_first_pend_cb(&rdp->cblist) != head) rcu_force_quiescent_state(); -- cgit v1.2.3 From fe15b50cdeeebd9248bf27e3c31278668f08bc04 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 5 Apr 2019 16:15:00 -0700 Subject: srcu: Allocate per-CPU data for DEFINE_SRCU() in modules Adding DEFINE_SRCU() or DEFINE_STATIC_SRCU() to a loadable module requires that the size of the reserved region be increased, which is not something we want to be doing all that often. One approach would be to require that loadable modules define an srcu_struct and invoke init_srcu_struct() from their module_init function and cleanup_srcu_struct() from their module_exit function. However, this is more than a bit user unfriendly. This commit therefore creates an ___srcu_struct_ptrs linker section, and pointers to srcu_struct structures created by DEFINE_SRCU() and DEFINE_STATIC_SRCU() within a module are placed into that module's ___srcu_struct_ptrs section. The required init_srcu_struct() and cleanup_srcu_struct() functions are then automatically invoked as needed when that module is loaded and unloaded, thus allowing modules to continue to use DEFINE_SRCU() and DEFINE_STATIC_SRCU() while avoiding the need to increase the size of the reserved region. Many of the algorithms and some of the code was cheerfully cherry-picked from other code making use of linker sections, perhaps most notably from tracepoints. All bugs are nevertheless the sole property of the author. Suggested-by: Mathieu Desnoyers [ paulmck: Use __section() and use "default" in srcu_module_notify()'s "switch" statement as suggested by Joel Fernandes. ] Signed-off-by: Paul E. McKenney Tested-by: Joel Fernandes (Google) --- kernel/module.c | 5 ++++ kernel/rcu/srcutree.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 6e6712b3aaf5..c79a53b629b6 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3095,6 +3095,11 @@ static int find_module_sections(struct module *mod, struct load_info *info) sizeof(*mod->tracepoints_ptrs), &mod->num_tracepoints); #endif +#ifdef CONFIG_TREE_SRCU + mod->srcu_struct_ptrs = section_objs(info, "___srcu_struct_ptrs", + sizeof(*mod->srcu_struct_ptrs), + &mod->num_srcu_structs); +#endif #ifdef CONFIG_BPF_EVENTS mod->bpf_raw_events = section_objs(info, "__bpf_raw_tp_map", sizeof(*mod->bpf_raw_events), diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 9b761e546de8..2ded2614a2f4 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -1310,3 +1310,68 @@ void __init srcu_init(void) queue_work(rcu_gp_wq, &ssp->work.work); } } + +#ifdef CONFIG_MODULES + +/* Initialize any global-scope srcu_struct structures used by this module. */ +static int srcu_module_coming(struct module *mod) +{ + int i; + struct srcu_struct **sspp = mod->srcu_struct_ptrs; + int ret; + + for (i = 0; i < mod->num_srcu_structs; i++) { + ret = init_srcu_struct(*(sspp++)); + if (WARN_ON_ONCE(ret)) + return ret; + } + return 0; +} + +/* Clean up any global-scope srcu_struct structures used by this module. */ +static void srcu_module_going(struct module *mod) +{ + int i; + struct srcu_struct **sspp = mod->srcu_struct_ptrs; + + for (i = 0; i < mod->num_srcu_structs; i++) + cleanup_srcu_struct(*(sspp++)); +} + +/* Handle one module, either coming or going. */ +static int srcu_module_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + struct module *mod = data; + int ret = 0; + + switch (val) { + case MODULE_STATE_COMING: + ret = srcu_module_coming(mod); + break; + case MODULE_STATE_GOING: + srcu_module_going(mod); + break; + default: + break; + } + return ret; +} + +static struct notifier_block srcu_module_nb = { + .notifier_call = srcu_module_notify, + .priority = 0, +}; + +static __init int init_srcu_module_notifier(void) +{ + int ret; + + ret = register_module_notifier(&srcu_module_nb); + if (ret) + pr_warn("Failed to register srcu module notifier\n"); + return ret; +} +late_initcall(init_srcu_module_notifier); + +#endif /* #ifdef CONFIG_MODULES */ -- cgit v1.2.3 From 11b000457f4638cf2a9e6794d31636d2d3174842 Mon Sep 17 00:00:00 2001 From: Jiang Biao Date: Tue, 23 Apr 2019 09:22:56 +0800 Subject: rcu: Make __call_srcu static Because __call_srcu() is not used outside kernel/rcu/srcutree.c, this commit makes it static. Signed-off-by: Jiang Biao Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 2ded2614a2f4..cf0e886314f2 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -831,8 +831,8 @@ static void srcu_leak_callback(struct rcu_head *rhp) * srcu_read_lock(), and srcu_read_unlock() that are all passed the same * srcu_struct structure. */ -void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, - rcu_callback_t func, bool do_norm) +static void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, + rcu_callback_t func, bool do_norm) { unsigned long flags; int idx; -- cgit v1.2.3 From 95bf33b55ff4465399bad843f1d8d618c8baf1f3 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 23 Apr 2019 14:07:24 +0200 Subject: rcu/sync: Kill rcu_sync_type/gp_type Now that the RCU flavors have been consolidated, rcu_sync_type makes no sense because none of internal update functions aside from .held() depend on gp_type. This commit therefore removes this field and consolidates the relevant code. Signed-off-by: Oleg Nesterov [ paulmck: Added RCU and RCU-bh checks to rcu_sync_is_idle(). ] [ paulmck: And applied subsequent feedback from Oleg Nesterov. ] Signed-off-by: Paul E. McKenney --- kernel/locking/percpu-rwsem.c | 2 +- kernel/rcu/sync.c | 55 ++++--------------------------------------- 2 files changed, 6 insertions(+), 51 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index f17dad99eec8..48cab93a47fd 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -17,7 +17,7 @@ int __percpu_init_rwsem(struct percpu_rw_semaphore *sem, return -ENOMEM; /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */ - rcu_sync_init(&sem->rss, RCU_SCHED_SYNC); + rcu_sync_init(&sem->rss); __init_rwsem(&sem->rw_sem, name, rwsem_key); rcuwait_init(&sem->writer); sem->readers_block = 0; diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c index a8304d90573f..ee427e138dad 100644 --- a/kernel/rcu/sync.c +++ b/kernel/rcu/sync.c @@ -10,65 +10,20 @@ #include #include -#ifdef CONFIG_PROVE_RCU -#define __INIT_HELD(func) .held = func, -#else -#define __INIT_HELD(func) -#endif - -static const struct { - void (*sync)(void); - void (*call)(struct rcu_head *, void (*)(struct rcu_head *)); - void (*wait)(void); -#ifdef CONFIG_PROVE_RCU - int (*held)(void); -#endif -} gp_ops[] = { - [RCU_SYNC] = { - .sync = synchronize_rcu, - .call = call_rcu, - .wait = rcu_barrier, - __INIT_HELD(rcu_read_lock_held) - }, - [RCU_SCHED_SYNC] = { - .sync = synchronize_rcu, - .call = call_rcu, - .wait = rcu_barrier, - __INIT_HELD(rcu_read_lock_sched_held) - }, - [RCU_BH_SYNC] = { - .sync = synchronize_rcu, - .call = call_rcu, - .wait = rcu_barrier, - __INIT_HELD(rcu_read_lock_bh_held) - }, -}; - enum { GP_IDLE = 0, GP_PENDING, GP_PASSED }; enum { CB_IDLE = 0, CB_PENDING, CB_REPLAY }; #define rss_lock gp_wait.lock -#ifdef CONFIG_PROVE_RCU -void rcu_sync_lockdep_assert(struct rcu_sync *rsp) -{ - RCU_LOCKDEP_WARN(!gp_ops[rsp->gp_type].held(), - "suspicious rcu_sync_is_idle() usage"); -} - -EXPORT_SYMBOL_GPL(rcu_sync_lockdep_assert); -#endif - /** * rcu_sync_init() - Initialize an rcu_sync structure * @rsp: Pointer to rcu_sync structure to be initialized * @type: Flavor of RCU with which to synchronize rcu_sync structure */ -void rcu_sync_init(struct rcu_sync *rsp, enum rcu_sync_type type) +void rcu_sync_init(struct rcu_sync *rsp) { memset(rsp, 0, sizeof(*rsp)); init_waitqueue_head(&rsp->gp_wait); - rsp->gp_type = type; } /** @@ -114,7 +69,7 @@ void rcu_sync_enter(struct rcu_sync *rsp) WARN_ON_ONCE(need_wait && need_sync); if (need_sync) { - gp_ops[rsp->gp_type].sync(); + synchronize_rcu(); rsp->gp_state = GP_PASSED; wake_up_all(&rsp->gp_wait); } else if (need_wait) { @@ -167,7 +122,7 @@ static void rcu_sync_func(struct rcu_head *rhp) * to catch a later GP. */ rsp->cb_state = CB_PENDING; - gp_ops[rsp->gp_type].call(&rsp->cb_head, rcu_sync_func); + call_rcu(&rsp->cb_head, rcu_sync_func); } else { /* * We're at least a GP after rcu_sync_exit(); eveybody will now @@ -195,7 +150,7 @@ void rcu_sync_exit(struct rcu_sync *rsp) if (!--rsp->gp_count) { if (rsp->cb_state == CB_IDLE) { rsp->cb_state = CB_PENDING; - gp_ops[rsp->gp_type].call(&rsp->cb_head, rcu_sync_func); + call_rcu(&rsp->cb_head, rcu_sync_func); } else if (rsp->cb_state == CB_PENDING) { rsp->cb_state = CB_REPLAY; } @@ -220,7 +175,7 @@ void rcu_sync_dtor(struct rcu_sync *rsp) spin_unlock_irq(&rsp->rss_lock); if (cb_state != CB_IDLE) { - gp_ops[rsp->gp_type].wait(); + rcu_barrier(); WARN_ON_ONCE(rsp->cb_state != CB_IDLE); } } -- cgit v1.2.3 From 2bf1acc299c9757932ef8c6edfaacca6d08302b1 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 23 Apr 2019 17:21:02 +0200 Subject: uprobes: Use DEFINE_STATIC_PERCPU_RWSEM() to initialize dup_mmap_sem Use DEFINE_STATIC_PERCPU_RWSEM() to initialize dup_mmap_sem. Signed-off-by: Oleg Nesterov Reviewed-by: Ingo Molnar Signed-off-by: Paul E. McKenney --- kernel/events/uprobes.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 78f61bfc6b79..97c367f0a9aa 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -46,7 +46,7 @@ static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */ static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ]; #define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ]) -static struct percpu_rw_semaphore dup_mmap_sem; +DEFINE_STATIC_PERCPU_RWSEM(dup_mmap_sem); /* Have a copy of original instruction */ #define UPROBE_COPY_INSN 0 @@ -2302,7 +2302,5 @@ void __init uprobes_init(void) for (i = 0; i < UPROBES_HASH_SZ; i++) mutex_init(&uprobes_mmap_mutex[i]); - BUG_ON(percpu_init_rwsem(&dup_mmap_sem)); - BUG_ON(register_die_notifier(&uprobe_exception_nb)); } -- cgit v1.2.3 From 3f2947b78151ec938dc06aea4ba0e11e56becdff Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 23 Apr 2019 18:32:41 +0200 Subject: locking/percpu-rwsem: Add DEFINE_PERCPU_RWSEM(), use it to initialize cgroup_threadgroup_rwsem Turn DEFINE_STATIC_PERCPU_RWSEM() into __DEFINE_PERCPU_RWSEM() with the additional "is_static" argument to introduce DEFINE_PERCPU_RWSEM(). Change cgroup.c to use DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem). Signed-off-by: Oleg Nesterov Reviewed-by: Ingo Molnar Signed-off-by: Paul E. McKenney --- kernel/cgroup/cgroup.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 217cec4e22c6..b112e93388dc 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -101,7 +101,7 @@ static DEFINE_SPINLOCK(cgroup_idr_lock); */ static DEFINE_SPINLOCK(cgroup_file_kn_lock); -struct percpu_rw_semaphore cgroup_threadgroup_rwsem; +DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem); #define cgroup_assert_mutex_or_rcu_locked() \ RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \ @@ -5616,7 +5616,6 @@ int __init cgroup_init(void) int ssid; BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16); - BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem)); BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files)); -- cgit v1.2.3 From 89da3b94bb97417ca2c5b0ce3a28643819030247 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 25 Apr 2019 18:50:55 +0200 Subject: rcu/sync: Simplify the state machine With this patch rcu_sync has a single state variable and the transition rules become really simple: GP_IDLE - owned by the first rcu_sync_enter() which moves it to GP_ENTER - owned by rcu-callback which moves it to GP_PASSED - owned by the last rcu_sync_exit() which moves it to GP_EXIT - and this is the only "nontrivial" state. rcu-callback moves it back to GP_IDLE unless another enter() comes before a GP pass. If rcu-callback is invoked before the next rcu_sync_exit() it must see gp_count incremented by that enter() and set GP_PASSED. Otherwise, if the next rcu_sync_exit() wins the race, it will move it to GP_REPLAY - owned by rcu-callback which moves it to GP_EXIT Signed-off-by: Oleg Nesterov [ paulmck: While here, apply READ_ONCE() and WRITE_ONCE() to ->gp_state. ] [ paulmck: Tweaks to make htmldocs happy. (Reported by kbuild test robot.) ] Signed-off-by: Paul E. McKenney --- kernel/rcu/sync.c | 165 +++++++++++++++++++++++++++++++----------------------- 1 file changed, 95 insertions(+), 70 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c index ee427e138dad..d4558ab7a07d 100644 --- a/kernel/rcu/sync.c +++ b/kernel/rcu/sync.c @@ -10,15 +10,13 @@ #include #include -enum { GP_IDLE = 0, GP_PENDING, GP_PASSED }; -enum { CB_IDLE = 0, CB_PENDING, CB_REPLAY }; +enum { GP_IDLE = 0, GP_ENTER, GP_PASSED, GP_EXIT, GP_REPLAY }; #define rss_lock gp_wait.lock /** * rcu_sync_init() - Initialize an rcu_sync structure * @rsp: Pointer to rcu_sync structure to be initialized - * @type: Flavor of RCU with which to synchronize rcu_sync structure */ void rcu_sync_init(struct rcu_sync *rsp) { @@ -41,56 +39,26 @@ void rcu_sync_enter_start(struct rcu_sync *rsp) rsp->gp_state = GP_PASSED; } -/** - * rcu_sync_enter() - Force readers onto slowpath - * @rsp: Pointer to rcu_sync structure to use for synchronization - * - * This function is used by updaters who need readers to make use of - * a slowpath during the update. After this function returns, all - * subsequent calls to rcu_sync_is_idle() will return false, which - * tells readers to stay off their fastpaths. A later call to - * rcu_sync_exit() re-enables reader slowpaths. - * - * When called in isolation, rcu_sync_enter() must wait for a grace - * period, however, closely spaced calls to rcu_sync_enter() can - * optimize away the grace-period wait via a state machine implemented - * by rcu_sync_enter(), rcu_sync_exit(), and rcu_sync_func(). - */ -void rcu_sync_enter(struct rcu_sync *rsp) -{ - bool need_wait, need_sync; - spin_lock_irq(&rsp->rss_lock); - need_wait = rsp->gp_count++; - need_sync = rsp->gp_state == GP_IDLE; - if (need_sync) - rsp->gp_state = GP_PENDING; - spin_unlock_irq(&rsp->rss_lock); +static void rcu_sync_func(struct rcu_head *rhp); - WARN_ON_ONCE(need_wait && need_sync); - if (need_sync) { - synchronize_rcu(); - rsp->gp_state = GP_PASSED; - wake_up_all(&rsp->gp_wait); - } else if (need_wait) { - wait_event(rsp->gp_wait, rsp->gp_state == GP_PASSED); - } else { - /* - * Possible when there's a pending CB from a rcu_sync_exit(). - * Nobody has yet been allowed the 'fast' path and thus we can - * avoid doing any sync(). The callback will get 'dropped'. - */ - WARN_ON_ONCE(rsp->gp_state != GP_PASSED); - } +static void rcu_sync_call(struct rcu_sync *rsp) +{ + call_rcu(&rsp->cb_head, rcu_sync_func); } /** * rcu_sync_func() - Callback function managing reader access to fastpath * @rhp: Pointer to rcu_head in rcu_sync structure to use for synchronization * - * This function is passed to one of the call_rcu() functions by + * This function is passed to call_rcu() function by rcu_sync_enter() and * rcu_sync_exit(), so that it is invoked after a grace period following the - * that invocation of rcu_sync_exit(). It takes action based on events that + * that invocation of enter/exit. + * + * If it is called by rcu_sync_enter() it signals that all the readers were + * switched onto slow path. + * + * If it is called by rcu_sync_exit() it takes action based on events that * have taken place in the meantime, so that closely spaced rcu_sync_enter() * and rcu_sync_exit() pairs need not wait for a grace period. * @@ -107,35 +75,88 @@ static void rcu_sync_func(struct rcu_head *rhp) struct rcu_sync *rsp = container_of(rhp, struct rcu_sync, cb_head); unsigned long flags; - WARN_ON_ONCE(rsp->gp_state != GP_PASSED); - WARN_ON_ONCE(rsp->cb_state == CB_IDLE); + WARN_ON_ONCE(READ_ONCE(rsp->gp_state) == GP_IDLE); + WARN_ON_ONCE(READ_ONCE(rsp->gp_state) == GP_PASSED); spin_lock_irqsave(&rsp->rss_lock, flags); if (rsp->gp_count) { /* - * A new rcu_sync_begin() has happened; drop the callback. + * We're at least a GP after the GP_IDLE->GP_ENTER transition. */ - rsp->cb_state = CB_IDLE; - } else if (rsp->cb_state == CB_REPLAY) { + WRITE_ONCE(rsp->gp_state, GP_PASSED); + wake_up_locked(&rsp->gp_wait); + } else if (rsp->gp_state == GP_REPLAY) { /* - * A new rcu_sync_exit() has happened; requeue the callback - * to catch a later GP. + * A new rcu_sync_exit() has happened; requeue the callback to + * catch a later GP. */ - rsp->cb_state = CB_PENDING; - call_rcu(&rsp->cb_head, rcu_sync_func); + WRITE_ONCE(rsp->gp_state, GP_EXIT); + rcu_sync_call(rsp); } else { /* - * We're at least a GP after rcu_sync_exit(); eveybody will now - * have observed the write side critical section. Let 'em rip!. + * We're at least a GP after the last rcu_sync_exit(); eveybody + * will now have observed the write side critical section. + * Let 'em rip!. */ - rsp->cb_state = CB_IDLE; - rsp->gp_state = GP_IDLE; + WRITE_ONCE(rsp->gp_state, GP_IDLE); } spin_unlock_irqrestore(&rsp->rss_lock, flags); } /** - * rcu_sync_exit() - Allow readers back onto fast patch after grace period + * rcu_sync_enter() - Force readers onto slowpath + * @rsp: Pointer to rcu_sync structure to use for synchronization + * + * This function is used by updaters who need readers to make use of + * a slowpath during the update. After this function returns, all + * subsequent calls to rcu_sync_is_idle() will return false, which + * tells readers to stay off their fastpaths. A later call to + * rcu_sync_exit() re-enables reader slowpaths. + * + * When called in isolation, rcu_sync_enter() must wait for a grace + * period, however, closely spaced calls to rcu_sync_enter() can + * optimize away the grace-period wait via a state machine implemented + * by rcu_sync_enter(), rcu_sync_exit(), and rcu_sync_func(). + */ +void rcu_sync_enter(struct rcu_sync *rsp) +{ + int gp_state; + + spin_lock_irq(&rsp->rss_lock); + gp_state = rsp->gp_state; + if (gp_state == GP_IDLE) { + WRITE_ONCE(rsp->gp_state, GP_ENTER); + WARN_ON_ONCE(rsp->gp_count); + /* + * Note that we could simply do rcu_sync_call(rsp) here and + * avoid the "if (gp_state == GP_IDLE)" block below. + * + * However, synchronize_rcu() can be faster if rcu_expedited + * or rcu_blocking_is_gp() is true. + * + * Another reason is that we can't wait for rcu callback if + * we are called at early boot time but this shouldn't happen. + */ + } + rsp->gp_count++; + spin_unlock_irq(&rsp->rss_lock); + + if (gp_state == GP_IDLE) { + /* + * See the comment above, this simply does the "synchronous" + * call_rcu(rcu_sync_func) which does GP_ENTER -> GP_PASSED. + */ + synchronize_rcu(); + rcu_sync_func(&rsp->cb_head); + /* Not really needed, wait_event() would see GP_PASSED. */ + return; + } + + wait_event(rsp->gp_wait, READ_ONCE(rsp->gp_state) >= GP_PASSED); +} + +/** + * rcu_sync_exit() - Allow readers back onto fast path after grace period * @rsp: Pointer to rcu_sync structure to use for synchronization * * This function is used by updaters who have completed, and can therefore @@ -146,13 +167,16 @@ static void rcu_sync_func(struct rcu_head *rhp) */ void rcu_sync_exit(struct rcu_sync *rsp) { + WARN_ON_ONCE(READ_ONCE(rsp->gp_state) == GP_IDLE); + WARN_ON_ONCE(READ_ONCE(rsp->gp_count) == 0); + spin_lock_irq(&rsp->rss_lock); if (!--rsp->gp_count) { - if (rsp->cb_state == CB_IDLE) { - rsp->cb_state = CB_PENDING; - call_rcu(&rsp->cb_head, rcu_sync_func); - } else if (rsp->cb_state == CB_PENDING) { - rsp->cb_state = CB_REPLAY; + if (rsp->gp_state == GP_PASSED) { + WRITE_ONCE(rsp->gp_state, GP_EXIT); + rcu_sync_call(rsp); + } else if (rsp->gp_state == GP_EXIT) { + WRITE_ONCE(rsp->gp_state, GP_REPLAY); } } spin_unlock_irq(&rsp->rss_lock); @@ -164,18 +188,19 @@ void rcu_sync_exit(struct rcu_sync *rsp) */ void rcu_sync_dtor(struct rcu_sync *rsp) { - int cb_state; + int gp_state; - WARN_ON_ONCE(rsp->gp_count); + WARN_ON_ONCE(READ_ONCE(rsp->gp_count)); + WARN_ON_ONCE(READ_ONCE(rsp->gp_state) == GP_PASSED); spin_lock_irq(&rsp->rss_lock); - if (rsp->cb_state == CB_REPLAY) - rsp->cb_state = CB_PENDING; - cb_state = rsp->cb_state; + if (rsp->gp_state == GP_REPLAY) + WRITE_ONCE(rsp->gp_state, GP_EXIT); + gp_state = rsp->gp_state; spin_unlock_irq(&rsp->rss_lock); - if (cb_state != CB_IDLE) { + if (gp_state != GP_IDLE) { rcu_barrier(); - WARN_ON_ONCE(rsp->cb_state != CB_IDLE); + WARN_ON_ONCE(rsp->gp_state != GP_IDLE); } } -- cgit v1.2.3 From 140e53f20b159722903f0c87358bcd809aa9767e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 9 Apr 2019 10:08:18 -0700 Subject: rcutorture: Add cond_resched() to forward-progress free-up loop The rcu_torture_fwd_prog_cbfree() function frees callbacks used during rcutorture's call_rcu() forward-progress test, but does so in a tight loop. This could cause problems given a very long list of callbacks to be freed, and actual testing produces lists with as many as 25M callbacks. This commit therefore adds a cond_resched() to this loop. While in the area, this commit also rearranges the lock releases to look a bit more sane. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index efaa5b3f4d3f..7906ba2d9dad 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1674,16 +1674,18 @@ static unsigned long rcu_torture_fwd_prog_cbfree(void) for (;;) { spin_lock_irqsave(&rcu_fwd_lock, flags); rfcp = rcu_fwd_cb_head; - if (!rfcp) + if (!rfcp) { + spin_unlock_irqrestore(&rcu_fwd_lock, flags); break; + } rcu_fwd_cb_head = rfcp->rfc_next; if (!rcu_fwd_cb_head) rcu_fwd_cb_tail = &rcu_fwd_cb_head; spin_unlock_irqrestore(&rcu_fwd_lock, flags); kfree(rfcp); freed++; + cond_resched(); } - spin_unlock_irqrestore(&rcu_fwd_lock, flags); return freed; } -- cgit v1.2.3 From e8516c64fe97e27a28fd5bc65b616508ae0020cf Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 9 Apr 2019 11:06:32 -0700 Subject: rcutorture: Fix stutter_wait() return value and freelist checks The stutter_wait() function is supposed to return true if it actually waits and false otherwise, but it instead unconditionally returns false. Which hides a bug in rcu_torture_writer() that fails to account for the fact that one of the rcu_tortures[] array elements will normally be referenced by rcu_torture_current, and thus not be on the freelist. This commit therefore corrects the stutter_wait() return value and adds a check for rcu_torture_current to rcu_torture_writer()'s check that things get freed after everything goes quiescent. In addition, this commit causes torture_stutter() to give a bit more than one second (instead of only one jiffy) warning of the end of the stutter interval. Finally, this commit disables long-delay readers and aggressive update-side forward-progress checks while forward-progress testing is in flight. Reported-by: Sebastian Andrzej Siewior Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 16 ++++++++++++---- kernel/torture.c | 17 +++++++++++++---- 2 files changed, 25 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 7906ba2d9dad..954ac2b98619 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1010,10 +1010,13 @@ rcu_torture_writer(void *arg) !rcu_gp_is_normal(); } rcu_torture_writer_state = RTWS_STUTTER; - if (stutter_wait("rcu_torture_writer")) + if (stutter_wait("rcu_torture_writer") && + !READ_ONCE(rcu_fwd_cb_nodelay)) for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++) - if (list_empty(&rcu_tortures[i].rtort_free)) - WARN_ON_ONCE(1); + if (list_empty(&rcu_tortures[i].rtort_free) && + rcu_access_pointer(rcu_torture_current) != + &rcu_tortures[i]) + WARN(1, "%s: rtort_pipe_count: %d\n", __func__, rcu_tortures[i].rtort_pipe_count); } while (!torture_must_stop()); /* Reset expediting back to unexpedited. */ if (expediting > 0) @@ -1709,6 +1712,8 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries) } /* Tight loop containing cond_resched(). */ + WRITE_ONCE(rcu_fwd_cb_nodelay, true); + cur_ops->sync(); /* Later readers see above write. */ if (selfpropcb) { WRITE_ONCE(fcs.stop, 0); cur_ops->call(&fcs.rh, rcu_torture_fwd_prog_cb); @@ -1747,6 +1752,8 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries) WARN_ON(READ_ONCE(fcs.stop) != 2); destroy_rcu_head_on_stack(&fcs.rh); } + schedule_timeout_uninterruptible(HZ / 10); /* Let kthreads recover. */ + WRITE_ONCE(rcu_fwd_cb_nodelay, false); } /* Carry out call_rcu() forward-progress testing. */ @@ -1816,7 +1823,6 @@ static void rcu_torture_fwd_prog_cr(void) cur_ops->cb_barrier(); /* Wait for callbacks to be invoked. */ (void)rcu_torture_fwd_prog_cbfree(); - WRITE_ONCE(rcu_fwd_cb_nodelay, false); if (!torture_must_stop() && !READ_ONCE(rcu_fwd_emergency_stop)) { WARN_ON(n_max_gps < MIN_FWD_CBS_LAUNDERED); pr_alert("%s Duration %lu barrier: %lu pending %ld n_launders: %ld n_launders_sa: %ld n_max_gps: %ld n_max_cbs: %ld cver %ld gps %ld\n", @@ -1827,6 +1833,8 @@ static void rcu_torture_fwd_prog_cr(void) n_max_gps, n_max_cbs, cver, gps); rcu_torture_fwd_cb_hist(); } + schedule_timeout_uninterruptible(HZ); /* Let CBs drain. */ + WRITE_ONCE(rcu_fwd_cb_nodelay, false); } diff --git a/kernel/torture.c b/kernel/torture.c index 17b2be9bde12..de0e0ecf88e1 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -578,10 +578,12 @@ static int stutter; bool stutter_wait(const char *title) { int spt; + bool ret = false; cond_resched_tasks_rcu_qs(); spt = READ_ONCE(stutter_pause_test); for (; spt; spt = READ_ONCE(stutter_pause_test)) { + ret = true; if (spt == 1) { schedule_timeout_interruptible(1); } else if (spt == 2) { @@ -592,7 +594,7 @@ bool stutter_wait(const char *title) } torture_shutdown_absorb(title); } - return !!spt; + return ret; } EXPORT_SYMBOL_GPL(stutter_wait); @@ -602,13 +604,20 @@ EXPORT_SYMBOL_GPL(stutter_wait); */ static int torture_stutter(void *arg) { + int wtime; + VERBOSE_TOROUT_STRING("torture_stutter task started"); do { if (!torture_must_stop() && stutter > 1) { - WRITE_ONCE(stutter_pause_test, 1); - schedule_timeout_interruptible(stutter - 1); + wtime = stutter; + if (stutter > HZ + 1) { + WRITE_ONCE(stutter_pause_test, 1); + wtime = stutter - HZ - 1; + schedule_timeout_interruptible(wtime); + wtime = HZ + 1; + } WRITE_ONCE(stutter_pause_test, 2); - schedule_timeout_interruptible(1); + schedule_timeout_interruptible(wtime); } WRITE_ONCE(stutter_pause_test, 0); if (!torture_must_stop()) -- cgit v1.2.3 From ff3bf92d90d396e51eb78c5ecde11a994ab7a179 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 9 Apr 2019 14:44:49 -0700 Subject: torture: Allow inter-stutter interval to be specified Currently, the inter-stutter interval is the same as the stutter duration, that is, whatever number of jiffies is passed into torture_stutter_init(). This has worked well for quite some time, but the addition of forward-progress testing to rcutorture can delay processes for several seconds, which can triple the time that they are stuttered. This commit therefore adds a second argument to torture_stutter_init() that specifies the inter-stutter interval. While locktorture preserves the current behavior, rcutorture uses the RCU CPU stall warning interval to provide a wider inter-stutter interval. Signed-off-by: Paul E. McKenney --- kernel/locking/locktorture.c | 2 +- kernel/rcu/rcutorture.c | 5 ++++- kernel/torture.c | 6 ++++-- 3 files changed, 9 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 80a463d31a8d..c513031cd7e3 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -975,7 +975,7 @@ static int __init lock_torture_init(void) goto unwind; } if (stutter > 0) { - firsterr = torture_stutter_init(stutter); + firsterr = torture_stutter_init(stutter, stutter); if (firsterr) goto unwind; } diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 954ac2b98619..a16d6abe1715 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -2373,7 +2373,10 @@ rcu_torture_init(void) if (stutter < 0) stutter = 0; if (stutter) { - firsterr = torture_stutter_init(stutter * HZ); + int t; + + t = cur_ops->stall_dur ? cur_ops->stall_dur() : stutter * HZ; + firsterr = torture_stutter_init(stutter * HZ, t); if (firsterr) goto unwind; } diff --git a/kernel/torture.c b/kernel/torture.c index de0e0ecf88e1..a8d9bdfba7c3 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -570,6 +570,7 @@ static void torture_shutdown_cleanup(void) static struct task_struct *stutter_task; static int stutter_pause_test; static int stutter; +static int stutter_gap; /* * Block until the stutter interval ends. This must be called periodically @@ -621,7 +622,7 @@ static int torture_stutter(void *arg) } WRITE_ONCE(stutter_pause_test, 0); if (!torture_must_stop()) - schedule_timeout_interruptible(stutter); + schedule_timeout_interruptible(stutter_gap); torture_shutdown_absorb("torture_stutter"); } while (!torture_must_stop()); torture_kthread_stopping("torture_stutter"); @@ -631,9 +632,10 @@ static int torture_stutter(void *arg) /* * Initialize and kick off the torture_stutter kthread. */ -int torture_stutter_init(const int s) +int torture_stutter_init(const int s, const int sgap) { stutter = s; + stutter_gap = sgap; return torture_create_kthread(torture_stutter, NULL, stutter_task); } EXPORT_SYMBOL_GPL(torture_stutter_init); -- cgit v1.2.3 From 5eabea594b4ce9ba0fbd8618bd3bf01aa9f48af7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 12 Apr 2019 09:02:46 -0700 Subject: rcutorture: Exempt tasks RCU from timely draining of grace periods After the end of each stutter pause interval, the rcu_torture_writer() kthread checks to be sure that all prior callbacks have completed so that all the test structures have been freed. This works fine except for tasks RCU, in which grace periods can take one good long time. This commit therefore exempts tasks RCU from this check. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index a16d6abe1715..6a4558532eac 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -299,6 +299,7 @@ struct rcu_torture_ops { int irq_capable; int can_boost; int extendables; + int slow_gps; const char *name; }; @@ -667,6 +668,7 @@ static struct rcu_torture_ops tasks_ops = { .fqs = NULL, .stats = NULL, .irq_capable = 1, + .slow_gps = 1, .name = "tasks" }; @@ -1011,7 +1013,8 @@ rcu_torture_writer(void *arg) } rcu_torture_writer_state = RTWS_STUTTER; if (stutter_wait("rcu_torture_writer") && - !READ_ONCE(rcu_fwd_cb_nodelay)) + !READ_ONCE(rcu_fwd_cb_nodelay) && + !cur_ops->slow_gps) for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++) if (list_empty(&rcu_tortures[i].rtort_free) && rcu_access_pointer(rcu_torture_current) != -- cgit v1.2.3 From ab21f6081f7bc09a0918ef888de795d59a907c1a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 14 Apr 2019 18:30:22 -0700 Subject: rcutorture: Give the scheduler a chance on PREEMPT && NO_HZ_FULL kernels In !PREEMPT kernels, cond_resched() is a no-op. In NO_HZ_FULL kernels, in-kernel execution (such as that of rcutorture's kthreads) might extend indefinitely without the scheduler gaining the aid of a scheduling-clock interrupt. This combination can make the interaction of an rcutorture forward-progress test and a CPU-hotplug stop_machine operation make less forward progress than one might like. Additionally, Sebastian Siewior notes that NO_HZ_FULL kernels have a scheduler check upon return to userspace execution, which suggests that in-kernel emulation of tight userspace loops containing system calls doing call_rcu() might also need explicit checks in the PREEMPT && NO_HZ_FULL case. This commit therefore introduces a rcu_torture_fwd_prog_cond_resched() function that explicitly invokes schedule() in such kernels whenever need_resched() returns true, while retaining use of cond_resched() for kernels that are either !PREEMPT or !NO_HZ_FULL. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 6a4558532eac..ef6f6dedf4c4 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1667,6 +1667,17 @@ static void rcu_torture_fwd_cb_cr(struct rcu_head *rhp) spin_unlock_irqrestore(&rcu_fwd_lock, flags); } +// Give the scheduler a chance, even on nohz_full CPUs. +static void rcu_torture_fwd_prog_cond_resched(void) +{ + if (IS_ENABLED(CONFIG_PREEMPT) && IS_ENABLED(CONFIG_NO_HZ_FULL)) { + if (need_resched()) + schedule(); + } else { + cond_resched(); + } +} + /* * Free all callbacks on the rcu_fwd_cb_head list, either because the * test is over or because we hit an OOM event. @@ -1690,7 +1701,7 @@ static unsigned long rcu_torture_fwd_prog_cbfree(void) spin_unlock_irqrestore(&rcu_fwd_lock, flags); kfree(rfcp); freed++; - cond_resched(); + rcu_torture_fwd_prog_cond_resched(); } return freed; } @@ -1734,7 +1745,7 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries) udelay(10); cur_ops->readunlock(idx); if (!fwd_progress_need_resched || need_resched()) - cond_resched(); + rcu_torture_fwd_prog_cond_resched(); } (*tested_tries)++; if (!time_before(jiffies, stopat) && @@ -1817,7 +1828,7 @@ static void rcu_torture_fwd_prog_cr(void) rfcp->rfc_gps = 0; } cur_ops->call(&rfcp->rh, rcu_torture_fwd_cb_cr); - cond_resched(); + rcu_torture_fwd_prog_cond_resched(); } stoppedat = jiffies; n_launders_cb_snap = READ_ONCE(n_launders_cb); -- cgit v1.2.3 From 3432d765c59ba026de49bd4f1f0c2adeff0e7a16 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 15 Apr 2019 14:50:05 -0700 Subject: rcutorture: Halt forward-progress checks at end of run Once removed, an rcu_torture element can be deferred-freed by a chain of call_rcu() invocations, with each callback invoking another round of call_rcu() until either a fixed number of call_rcu() invocations have been chained or until the test ends. This means that if the test ends, some of the rcu_torture elements will be "stranded" partway through the deferred-free process, which results in false-positive warnings from rcu_torture_writer() due to lack of forward progress should the test end just at the end of a stutter interval. This commit therefore suppresses rcu_torture_writer()'s forward-progress checks when the test ends in order to avoid these false-positive reports.. Reported-by: Sebastian Andrzej Siewior Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index ef6f6dedf4c4..a3f5488a319a 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1014,7 +1014,8 @@ rcu_torture_writer(void *arg) rcu_torture_writer_state = RTWS_STUTTER; if (stutter_wait("rcu_torture_writer") && !READ_ONCE(rcu_fwd_cb_nodelay) && - !cur_ops->slow_gps) + !cur_ops->slow_gps && + !torture_must_stop()) for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++) if (list_empty(&rcu_tortures[i].rtort_free) && rcu_access_pointer(rcu_torture_current) != -- cgit v1.2.3 From c682db558e6eec10a711b0a6bcb8c35fd15f6a39 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 19 Apr 2019 07:38:27 -0700 Subject: rcutorture: Add trivial RCU implementation I have been showing off a trivial RCU implementation for non-preemptive environments for some time now: #define rcu_read_lock() #define rcu_read_unlock() #define rcu_dereference(p) READ_ONCE(p) #define rcu_assign_pointer(p, v) smp_store_release(&(p), (v)) void synchronize_rcu(void) { int cpu; for_each_online_cpu(cpu) sched_setaffinity(current->pid, cpumask_of(cpu)); } Trivial or not, as the old saying goes, "if it ain't tested, it don't work!". This commit therefore adds a "trivial" flavor to rcutorture and a corresponding TRIVIAL test scenario. This variant does not handle CPU hotplug, which is unconditionally enabled on x86 for post-v5.1-rc3 kernels, which is why the TRIVIAL.boot says "rcutorture.onoff_interval=0". This commit actually does handle CONFIG_PREEMPT=y kernels, but only because it turns back the Linux-kernel clock in order to provide these alternative definitions (or the moral equivalent thereof): #define rcu_read_lock() preempt_disable() #define rcu_read_unlock() preempt_enable() In CONFIG_PREEMPT=n kernels without debugging, these are equivalent to empty macros give or take a compiler barrier. However, the have been successfully tested with actual empty macros as well. Signed-off-by: Paul E. McKenney [ paulmck: Fix symbol issue reported by kbuild test robot . ] [ paulmck: Work around sched_setaffinity() issue noted by Andrea Parri. ] [ paulmck: Add rcutorture.shuffle_interval=0 to TRIVIAL.boot to fix interaction with shuffler task noted by Peter Zijlstra. ] Tested-by: Andrea Parri --- kernel/rcu/rcu.h | 5 +++++ kernel/rcu/rcutorture.c | 45 ++++++++++++++++++++++++++++++++++++++++++++- kernel/rcu/update.c | 13 +++++++++++++ 3 files changed, 62 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 390aab20115e..5290b01de534 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -446,6 +446,7 @@ void rcu_request_urgent_qs_task(struct task_struct *t); enum rcutorture_type { RCU_FLAVOR, RCU_TASKS_FLAVOR, + RCU_TRIVIAL_FLAVOR, SRCU_FLAVOR, INVALID_RCU_FLAVOR }; @@ -479,6 +480,10 @@ void do_trace_rcu_torture_read(const char *rcutorturename, #endif #endif +#if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST) +long rcutorture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask); +#endif + #ifdef CONFIG_TINY_SRCU static inline void srcutorture_get_gp_data(enum rcutorture_type test_type, diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index a3f5488a319a..6b803fb2f7ca 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -672,6 +672,47 @@ static struct rcu_torture_ops tasks_ops = { .name = "tasks" }; +/* + * Definitions for trivial CONFIG_PREEMPT=n-only torture testing. + * This implementation does not necessarily work well with CPU hotplug. + */ + +static void synchronize_rcu_trivial(void) +{ + int cpu; + + for_each_online_cpu(cpu) { + rcutorture_sched_setaffinity(current->pid, cpumask_of(cpu)); + WARN_ON_ONCE(raw_smp_processor_id() != cpu); + } +} + +static int rcu_torture_read_lock_trivial(void) __acquires(RCU) +{ + preempt_disable(); + return 0; +} + +static void rcu_torture_read_unlock_trivial(int idx) __releases(RCU) +{ + preempt_enable(); +} + +static struct rcu_torture_ops trivial_ops = { + .ttype = RCU_TRIVIAL_FLAVOR, + .init = rcu_sync_torture_init, + .readlock = rcu_torture_read_lock_trivial, + .read_delay = rcu_read_delay, /* just reuse rcu's version. */ + .readunlock = rcu_torture_read_unlock_trivial, + .get_gp_seq = rcu_no_completed, + .sync = synchronize_rcu_trivial, + .exp_sync = synchronize_rcu_trivial, + .fqs = NULL, + .stats = NULL, + .irq_capable = 1, + .name = "trivial" +}; + static unsigned long rcutorture_seq_diff(unsigned long new, unsigned long old) { if (!cur_ops->gp_diff) @@ -1789,6 +1830,8 @@ static void rcu_torture_fwd_prog_cr(void) if (READ_ONCE(rcu_fwd_emergency_stop)) return; /* Get out of the way quickly, no GP wait! */ + if (!cur_ops->call) + return; /* Can't do call_rcu() fwd prog without ->call. */ /* Loop continuously posting RCU callbacks. */ WRITE_ONCE(rcu_fwd_cb_nodelay, true); @@ -2265,7 +2308,7 @@ rcu_torture_init(void) int firsterr = 0; static struct rcu_torture_ops *torture_ops[] = { &rcu_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops, - &busted_srcud_ops, &tasks_ops, + &busted_srcud_ops, &tasks_ops, &trivial_ops, }; if (!torture_init_begin(torture_type, verbose)) diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index c3bf44ba42e5..61df2bf08563 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -423,6 +423,19 @@ EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read); do { } while (0) #endif +#if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST) +/* Get rcutorture access to sched_setaffinity(). */ +long rcutorture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask) +{ + int ret; + + ret = sched_setaffinity(pid, in_mask); + WARN_ONCE(ret, "%s: sched_setaffinity() returned %d\n", __func__, ret); + return ret; +} +EXPORT_SYMBOL_GPL(rcutorture_sched_setaffinity); +#endif + #ifdef CONFIG_RCU_STALL_COMMON int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress); -- cgit v1.2.3 From 34aa34b818407bd475786cf160f7838b7a485e87 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 16 May 2019 16:15:16 -0700 Subject: rcutorture: Dump trace buffer for callback pipe drain failures Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 6b803fb2f7ca..89be0f492f78 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1060,8 +1060,10 @@ rcu_torture_writer(void *arg) for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++) if (list_empty(&rcu_tortures[i].rtort_free) && rcu_access_pointer(rcu_torture_current) != - &rcu_tortures[i]) + &rcu_tortures[i]) { + rcu_ftrace_dump(DUMP_ALL); WARN(1, "%s: rtort_pipe_count: %d\n", __func__, rcu_tortures[i].rtort_pipe_count); + } } while (!torture_must_stop()); /* Reset expediting back to unexpedited. */ if (expediting > 0) -- cgit v1.2.3 From 354ea05d0276384045fabbfd62ccd2d985defa9e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 25 May 2019 12:36:53 -0700 Subject: rcutorture: Upper case solves the case of the vanishing NULL pointer Various security techniques can obfuscate pointer printouts on the console. Unfortunately, rcutorture relies on either "null" or all zeroes to identify the last few statistics printouts at the end of the test. These need to be identified because failing to do so will results in false-positive complaints about grace-period hangs. This commit therefore prints the "ver:" in capitals ("VER:") when the RCU-protected pointer has been set to NULL, which causes rcutorture's parse-console.sh script to correctly ignore these lines. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 89be0f492f78..fce4e7e6f502 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1408,8 +1408,9 @@ rcu_torture_stats_print(void) } pr_alert("%s%s ", torture_type, TORTURE_FLAG); - pr_cont("rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d ", + pr_cont("rtc: %p %s: %lu tfle: %d rta: %d rtaf: %d rtf: %d ", rcu_torture_current, + rcu_torture_current ? "ver" : "VER", rcu_torture_current_version, list_empty(&rcu_torture_freelist), atomic_read(&n_rcu_torture_alloc), -- cgit v1.2.3 From 4bfc0bb2c60e2f4cc8eb60f03cf8dfa72336272a Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Sat, 25 May 2019 09:37:39 -0700 Subject: bpf: decouple the lifetime of cgroup_bpf from cgroup itself Currently the lifetime of bpf programs attached to a cgroup is bound to the lifetime of the cgroup itself. It means that if a user forgets (or intentionally avoids) to detach a bpf program before removing the cgroup, it will stay attached up to the release of the cgroup. Since the cgroup can stay in the dying state (the state between being rmdir()'ed and being released) for a very long time, it leads to a waste of memory. Also, it blocks a possibility to implement the memcg-based memory accounting for bpf objects, because a circular reference dependency will occur. Charged memory pages are pinning the corresponding memory cgroup, and if the memory cgroup is pinning the attached bpf program, nothing will be ever released. A dying cgroup can not contain any processes, so the only chance for an attached bpf program to be executed is a live socket associated with the cgroup. So in order to release all bpf data early, let's count associated sockets using a new percpu refcounter. On cgroup removal the counter is transitioned to the atomic mode, and as soon as it reaches 0, all bpf programs are detached. Because cgroup_bpf_release() can block, it can't be called from the percpu ref counter callback directly, so instead an asynchronous work is scheduled. The reference counter is not socket specific, and can be used for any other types of programs, which can be executed from a cgroup-bpf hook outside of the process context, had such a need arise in the future. Signed-off-by: Roman Gushchin Cc: jolsa@redhat.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/cgroup.c | 41 +++++++++++++++++++++++++++++++++++++---- kernel/cgroup/cgroup.c | 11 ++++++++--- 2 files changed, 45 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index fcde0f7b2585..d995edbe816d 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -22,12 +22,21 @@ DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key); EXPORT_SYMBOL(cgroup_bpf_enabled_key); +void cgroup_bpf_offline(struct cgroup *cgrp) +{ + cgroup_get(cgrp); + percpu_ref_kill(&cgrp->bpf.refcnt); +} + /** - * cgroup_bpf_put() - put references of all bpf programs - * @cgrp: the cgroup to modify + * cgroup_bpf_release() - put references of all bpf programs and + * release all cgroup bpf data + * @work: work structure embedded into the cgroup to modify */ -void cgroup_bpf_put(struct cgroup *cgrp) +static void cgroup_bpf_release(struct work_struct *work) { + struct cgroup *cgrp = container_of(work, struct cgroup, + bpf.release_work); enum bpf_cgroup_storage_type stype; unsigned int type; @@ -47,6 +56,22 @@ void cgroup_bpf_put(struct cgroup *cgrp) } bpf_prog_array_free(cgrp->bpf.effective[type]); } + + percpu_ref_exit(&cgrp->bpf.refcnt); + cgroup_put(cgrp); +} + +/** + * cgroup_bpf_release_fn() - callback used to schedule releasing + * of bpf cgroup data + * @ref: percpu ref counter structure + */ +static void cgroup_bpf_release_fn(struct percpu_ref *ref) +{ + struct cgroup *cgrp = container_of(ref, struct cgroup, bpf.refcnt); + + INIT_WORK(&cgrp->bpf.release_work, cgroup_bpf_release); + queue_work(system_wq, &cgrp->bpf.release_work); } /* count number of elements in the list. @@ -167,7 +192,12 @@ int cgroup_bpf_inherit(struct cgroup *cgrp) */ #define NR ARRAY_SIZE(cgrp->bpf.effective) struct bpf_prog_array __rcu *arrays[NR] = {}; - int i; + int ret, i; + + ret = percpu_ref_init(&cgrp->bpf.refcnt, cgroup_bpf_release_fn, 0, + GFP_KERNEL); + if (ret) + return ret; for (i = 0; i < NR; i++) INIT_LIST_HEAD(&cgrp->bpf.progs[i]); @@ -183,6 +213,9 @@ int cgroup_bpf_inherit(struct cgroup *cgrp) cleanup: for (i = 0; i < NR; i++) bpf_prog_array_free(arrays[i]); + + percpu_ref_exit(&cgrp->bpf.refcnt); + return -ENOMEM; } diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 217cec4e22c6..ef9cfbfc82a9 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4955,8 +4955,6 @@ static void css_release_work_fn(struct work_struct *work) if (cgrp->kn) RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL); - - cgroup_bpf_put(cgrp); } mutex_unlock(&cgroup_mutex); @@ -5482,6 +5480,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) cgroup1_check_for_release(parent); + cgroup_bpf_offline(cgrp); + /* put the base reference */ percpu_ref_kill(&cgrp->self.refcnt); @@ -6221,6 +6221,7 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd) * Don't use cgroup_get_live(). */ cgroup_get(sock_cgroup_ptr(skcd)); + cgroup_bpf_get(sock_cgroup_ptr(skcd)); return; } @@ -6232,6 +6233,7 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd) cset = task_css_set(current); if (likely(cgroup_tryget(cset->dfl_cgrp))) { skcd->val = (unsigned long)cset->dfl_cgrp; + cgroup_bpf_get(cset->dfl_cgrp); break; } cpu_relax(); @@ -6242,7 +6244,10 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd) void cgroup_sk_free(struct sock_cgroup_data *skcd) { - cgroup_put(sock_cgroup_ptr(skcd)); + struct cgroup *cgrp = sock_cgroup_ptr(skcd); + + cgroup_bpf_put(cgrp); + cgroup_put(cgrp); } #endif /* CONFIG_SOCK_CGROUP_DATA */ -- cgit v1.2.3 From 5ca584d935c32906d114924dc0e1dbfcbb13fdb2 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 28 May 2019 12:03:45 -0400 Subject: futex: Consolidate duplicated timer setup code Add a new futex_setup_timer() helper function to consolidate all the hrtimer_sleeper setup code. Signed-off-by: Waiman Long Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Darren Hart Cc: Davidlohr Bueso Link: https://lkml.kernel.org/r/20190528160345.24017-1-longman@redhat.com --- kernel/futex.c | 69 +++++++++++++++++++++++++++++++++------------------------- 1 file changed, 39 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 2268b97d5439..49bf20a8c512 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -483,6 +483,37 @@ enum futex_access { FUTEX_WRITE }; +/** + * futex_setup_timer - set up the sleeping hrtimer. + * @time: ptr to the given timeout value + * @timeout: the hrtimer_sleeper structure to be set up + * @flags: futex flags + * @range_ns: optional range in ns + * + * Return: Initialized hrtimer_sleeper structure or NULL if no timeout + * value given + */ +static inline struct hrtimer_sleeper * +futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout, + int flags, u64 range_ns) +{ + if (!time) + return NULL; + + hrtimer_init_on_stack(&timeout->timer, (flags & FLAGS_CLOCKRT) ? + CLOCK_REALTIME : CLOCK_MONOTONIC, + HRTIMER_MODE_ABS); + hrtimer_init_sleeper(timeout, current); + + /* + * If range_ns is 0, calling hrtimer_set_expires_range_ns() is + * effectively the same as calling hrtimer_set_expires(). + */ + hrtimer_set_expires_range_ns(&timeout->timer, *time, range_ns); + + return timeout; +} + /** * get_futex_key() - Get parameters which are the keys for a futex * @uaddr: virtual address of the futex @@ -2692,7 +2723,7 @@ out: static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset) { - struct hrtimer_sleeper timeout, *to = NULL; + struct hrtimer_sleeper timeout, *to; struct restart_block *restart; struct futex_hash_bucket *hb; struct futex_q q = futex_q_init; @@ -2702,17 +2733,8 @@ static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, return -EINVAL; q.bitset = bitset; - if (abs_time) { - to = &timeout; - - hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ? - CLOCK_REALTIME : CLOCK_MONOTONIC, - HRTIMER_MODE_ABS); - hrtimer_init_sleeper(to, current); - hrtimer_set_expires_range_ns(&to->timer, *abs_time, - current->timer_slack_ns); - } - + to = futex_setup_timer(abs_time, &timeout, flags, + current->timer_slack_ns); retry: /* * Prepare to wait on uaddr. On success, holds hb lock and increments @@ -2792,7 +2814,7 @@ static long futex_wait_restart(struct restart_block *restart) static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock) { - struct hrtimer_sleeper timeout, *to = NULL; + struct hrtimer_sleeper timeout, *to; struct futex_pi_state *pi_state = NULL; struct rt_mutex_waiter rt_waiter; struct futex_hash_bucket *hb; @@ -2805,13 +2827,7 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, if (refill_pi_state_cache()) return -ENOMEM; - if (time) { - to = &timeout; - hrtimer_init_on_stack(&to->timer, CLOCK_REALTIME, - HRTIMER_MODE_ABS); - hrtimer_init_sleeper(to, current); - hrtimer_set_expires(&to->timer, *time); - } + to = futex_setup_timer(time, &timeout, FLAGS_CLOCKRT, 0); retry: ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, FUTEX_WRITE); @@ -3208,7 +3224,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset, u32 __user *uaddr2) { - struct hrtimer_sleeper timeout, *to = NULL; + struct hrtimer_sleeper timeout, *to; struct futex_pi_state *pi_state = NULL; struct rt_mutex_waiter rt_waiter; struct futex_hash_bucket *hb; @@ -3225,15 +3241,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, if (!bitset) return -EINVAL; - if (abs_time) { - to = &timeout; - hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ? - CLOCK_REALTIME : CLOCK_MONOTONIC, - HRTIMER_MODE_ABS); - hrtimer_init_sleeper(to, current); - hrtimer_set_expires_range_ns(&to->timer, *abs_time, - current->timer_slack_ns); - } + to = futex_setup_timer(abs_time, &timeout, flags, + current->timer_slack_ns); /* * The waiter is allocated on our stack, manipulated by the requeue -- cgit v1.2.3 From f560201102035ba9def2fc21827dd34690dd126e Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Tue, 28 May 2019 21:31:49 +0200 Subject: cpu/hotplug: Fix notify_cpu_starting() reference in bringup_wait_for_ap() bringup_wait_for_ap() comment references cpu_notify_starting(), but the function is actually called notify_cpu_starting(). Fix that. Signed-off-by: Jiri Kosina Signed-off-by: Thomas Gleixner Cc: Josh Poimboeuf Cc: Peter Zijlstra Link: https://lkml.kernel.org/r/nycvar.YFH.7.76.1905282128100.1962@cbobk.fhfr.pm --- kernel/cpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index f2ef10460698..be82cbc11a8a 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -522,7 +522,7 @@ static int bringup_wait_for_ap(unsigned int cpu) /* * SMT soft disabling on X86 requires to bring the CPU out of the * BIOS 'wait for SIPI' state in order to set the CR4.MCE bit. The - * CPU marked itself as booted_once in cpu_notify_starting() so the + * CPU marked itself as booted_once in notify_cpu_starting() so the * cpu_smt_allowed() check will now return false if this is not the * primary sibling. */ -- cgit v1.2.3 From 43b98d876f89dce732f50b71607b6d2bbb8d8e6a Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 27 May 2019 13:57:42 +0200 Subject: genirq/irqdomain: Remove WARN_ON() on out-of-memory condition There is no need to print a backtrace when memory allocation fails, as the memory allocation core already takes care of that. Signed-off-by: Geert Uytterhoeven Signed-off-by: Thomas Gleixner Cc: Marc Zyngier Link: https://lkml.kernel.org/r/20190527115742.2693-1-geert+renesas@glider.be --- kernel/irq/irqdomain.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index a453e229f99c..e7d17cc3a3d7 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -139,7 +139,7 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size, domain = kzalloc_node(sizeof(*domain) + (sizeof(unsigned int) * size), GFP_KERNEL, of_node_to_nid(of_node)); - if (WARN_ON(!domain)) + if (!domain) return NULL; if (fwnode && is_fwnode_irqchip(fwnode)) { -- cgit v1.2.3 From 0223fad3c98a9588c159a35dda2ef6e68ca27e3f Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Wed, 22 May 2019 17:52:02 -0400 Subject: audit: enforce op for string fields The field operator is ignored on several string fields. WATCH, DIR, PERM and FILETYPE field operators are completely ignored and meaningless since the op is not referenced in audit_filter_rules(). Range and bitwise operators are already addressed in ghak73. Honour the operator for WATCH, DIR, PERM, FILETYPE fields as is done in the EXE field. Please see github issue https://github.com/linux-audit/audit-kernel/issues/114 Signed-off-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/auditsc.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 9134fe11ff6c..4effe01ebbe2 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -601,12 +601,20 @@ static int audit_filter_rules(struct task_struct *tsk, } break; case AUDIT_WATCH: - if (name) - result = audit_watch_compare(rule->watch, name->ino, name->dev); + if (name) { + result = audit_watch_compare(rule->watch, + name->ino, + name->dev); + if (f->op == Audit_not_equal) + result = !result; + } break; case AUDIT_DIR: - if (ctx) + if (ctx) { result = match_tree_refs(ctx, rule->tree); + if (f->op == Audit_not_equal) + result = !result; + } break; case AUDIT_LOGINUID: result = audit_uid_comparator(audit_get_loginuid(tsk), @@ -689,9 +697,13 @@ static int audit_filter_rules(struct task_struct *tsk, break; case AUDIT_PERM: result = audit_match_perm(ctx, f->val); + if (f->op == Audit_not_equal) + result = !result; break; case AUDIT_FILETYPE: result = audit_match_filetype(ctx, f->val); + if (f->op == Audit_not_equal) + result = !result; break; case AUDIT_FIELD_COMPARE: result = audit_field_compare(tsk, cred, f, ctx, name); -- cgit v1.2.3 From 54e9c9d4b506b611228890752d1cfa960e0965e1 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 28 May 2019 14:14:41 -0700 Subject: bpf: remove __rcu annotations from bpf_prog_array Drop __rcu annotations and rcu read sections from bpf_prog_array helper functions. They are not needed since all existing callers call those helpers from the rcu update side while holding a mutex. This guarantees that use-after-free could not happen. In the next patches I'll fix the callers with missing rcu_dereference_protected to make sparse/lockdep happy, the proper way to use these helpers is: struct bpf_prog_array __rcu *progs = ...; struct bpf_prog_array *p; mutex_lock(&mtx); p = rcu_dereference_protected(progs, lockdep_is_held(&mtx)); bpf_prog_array_length(p); bpf_prog_array_copy_to_user(p, ...); bpf_prog_array_delete_safe(p, ...); bpf_prog_array_copy_info(p, ...); bpf_prog_array_copy(p, ...); bpf_prog_array_free(p); mutex_unlock(&mtx); No functional changes! rcu_dereference_protected with lockdep_is_held should catch any cases where we update prog array without a mutex (I've looked at existing call sites and I think we hold a mutex everywhere). Motivation is to fix sparse warnings: kernel/bpf/core.c:1803:9: warning: incorrect type in argument 1 (different address spaces) kernel/bpf/core.c:1803:9: expected struct callback_head *head kernel/bpf/core.c:1803:9: got struct callback_head [noderef] * kernel/bpf/core.c:1877:44: warning: incorrect type in initializer (different address spaces) kernel/bpf/core.c:1877:44: expected struct bpf_prog_array_item *item kernel/bpf/core.c:1877:44: got struct bpf_prog_array_item [noderef] * kernel/bpf/core.c:1901:26: warning: incorrect type in assignment (different address spaces) kernel/bpf/core.c:1901:26: expected struct bpf_prog_array_item *existing kernel/bpf/core.c:1901:26: got struct bpf_prog_array_item [noderef] * kernel/bpf/core.c:1935:26: warning: incorrect type in assignment (different address spaces) kernel/bpf/core.c:1935:26: expected struct bpf_prog_array_item *[assigned] existing kernel/bpf/core.c:1935:26: got struct bpf_prog_array_item [noderef] * v2: * remove comment about potential race; that can't happen because all callers are in rcu-update section Cc: Roman Gushchin Acked-by: Roman Gushchin Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- kernel/bpf/core.c | 37 +++++++++++++------------------------ 1 file changed, 13 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 3675b19ecb90..33fb292f2e30 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1795,38 +1795,33 @@ struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags) return &empty_prog_array.hdr; } -void bpf_prog_array_free(struct bpf_prog_array __rcu *progs) +void bpf_prog_array_free(struct bpf_prog_array *progs) { - if (!progs || - progs == (struct bpf_prog_array __rcu *)&empty_prog_array.hdr) + if (!progs || progs == &empty_prog_array.hdr) return; kfree_rcu(progs, rcu); } -int bpf_prog_array_length(struct bpf_prog_array __rcu *array) +int bpf_prog_array_length(struct bpf_prog_array *array) { struct bpf_prog_array_item *item; u32 cnt = 0; - rcu_read_lock(); - item = rcu_dereference(array)->items; - for (; item->prog; item++) + for (item = array->items; item->prog; item++) if (item->prog != &dummy_bpf_prog.prog) cnt++; - rcu_read_unlock(); return cnt; } -static bool bpf_prog_array_copy_core(struct bpf_prog_array __rcu *array, +static bool bpf_prog_array_copy_core(struct bpf_prog_array *array, u32 *prog_ids, u32 request_cnt) { struct bpf_prog_array_item *item; int i = 0; - item = rcu_dereference_check(array, 1)->items; - for (; item->prog; item++) { + for (item = array->items; item->prog; item++) { if (item->prog == &dummy_bpf_prog.prog) continue; prog_ids[i] = item->prog->aux->id; @@ -1839,7 +1834,7 @@ static bool bpf_prog_array_copy_core(struct bpf_prog_array __rcu *array, return !!(item->prog); } -int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *array, +int bpf_prog_array_copy_to_user(struct bpf_prog_array *array, __u32 __user *prog_ids, u32 cnt) { unsigned long err = 0; @@ -1850,18 +1845,12 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *array, * cnt = bpf_prog_array_length(); * if (cnt > 0) * bpf_prog_array_copy_to_user(..., cnt); - * so below kcalloc doesn't need extra cnt > 0 check, but - * bpf_prog_array_length() releases rcu lock and - * prog array could have been swapped with empty or larger array, - * so always copy 'cnt' prog_ids to the user. - * In a rare race the user will see zero prog_ids + * so below kcalloc doesn't need extra cnt > 0 check. */ ids = kcalloc(cnt, sizeof(u32), GFP_USER | __GFP_NOWARN); if (!ids) return -ENOMEM; - rcu_read_lock(); nospc = bpf_prog_array_copy_core(array, ids, cnt); - rcu_read_unlock(); err = copy_to_user(prog_ids, ids, cnt * sizeof(u32)); kfree(ids); if (err) @@ -1871,19 +1860,19 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *array, return 0; } -void bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *array, +void bpf_prog_array_delete_safe(struct bpf_prog_array *array, struct bpf_prog *old_prog) { - struct bpf_prog_array_item *item = array->items; + struct bpf_prog_array_item *item; - for (; item->prog; item++) + for (item = array->items; item->prog; item++) if (item->prog == old_prog) { WRITE_ONCE(item->prog, &dummy_bpf_prog.prog); break; } } -int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, +int bpf_prog_array_copy(struct bpf_prog_array *old_array, struct bpf_prog *exclude_prog, struct bpf_prog *include_prog, struct bpf_prog_array **new_array) @@ -1947,7 +1936,7 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, return 0; } -int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array, +int bpf_prog_array_copy_info(struct bpf_prog_array *array, u32 *prog_ids, u32 request_cnt, u32 *prog_cnt) { -- cgit v1.2.3 From dbcc1ba26e43bd32cb308e50ac4cb4a29d2f5967 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 28 May 2019 14:14:43 -0700 Subject: bpf: cgroup: properly use bpf_prog_array api Now that we don't have __rcu markers on the bpf_prog_array helpers, let's use proper rcu_dereference_protected to obtain array pointer under mutex. We also don't need __rcu annotations on cgroup_bpf.inactive since it's not read/updated concurrently. v4: * drop cgroup_rcu_xyz wrappers and use rcu APIs directly; presumably should be more clear to understand which mutex/refcount protects each particular place v3: * amend cgroup_rcu_dereference to include percpu_ref_is_dying; cgroup_bpf is now reference counted and we don't hold cgroup_mutex anymore in cgroup_bpf_release v2: * replace xchg with rcu_swap_protected Cc: Roman Gushchin Signed-off-by: Stanislav Fomichev Acked-by: Roman Gushchin Signed-off-by: Daniel Borkmann --- kernel/bpf/cgroup.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index d995edbe816d..ff594eb86fd7 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -38,6 +38,7 @@ static void cgroup_bpf_release(struct work_struct *work) struct cgroup *cgrp = container_of(work, struct cgroup, bpf.release_work); enum bpf_cgroup_storage_type stype; + struct bpf_prog_array *old_array; unsigned int type; for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) { @@ -54,7 +55,10 @@ static void cgroup_bpf_release(struct work_struct *work) kfree(pl); static_branch_dec(&cgroup_bpf_enabled_key); } - bpf_prog_array_free(cgrp->bpf.effective[type]); + old_array = rcu_dereference_protected( + cgrp->bpf.effective[type], + percpu_ref_is_dying(&cgrp->bpf.refcnt)); + bpf_prog_array_free(old_array); } percpu_ref_exit(&cgrp->bpf.refcnt); @@ -126,7 +130,7 @@ static bool hierarchy_allows_attach(struct cgroup *cgrp, */ static int compute_effective_progs(struct cgroup *cgrp, enum bpf_attach_type type, - struct bpf_prog_array __rcu **array) + struct bpf_prog_array **array) { enum bpf_cgroup_storage_type stype; struct bpf_prog_array *progs; @@ -164,17 +168,16 @@ static int compute_effective_progs(struct cgroup *cgrp, } } while ((p = cgroup_parent(p))); - rcu_assign_pointer(*array, progs); + *array = progs; return 0; } static void activate_effective_progs(struct cgroup *cgrp, enum bpf_attach_type type, - struct bpf_prog_array __rcu *array) + struct bpf_prog_array *old_array) { - struct bpf_prog_array __rcu *old_array; - - old_array = xchg(&cgrp->bpf.effective[type], array); + rcu_swap_protected(cgrp->bpf.effective[type], old_array, + lockdep_is_held(&cgroup_mutex)); /* free prog array after grace period, since __cgroup_bpf_run_*() * might be still walking the array */ @@ -191,7 +194,7 @@ int cgroup_bpf_inherit(struct cgroup *cgrp) * that array below is variable length */ #define NR ARRAY_SIZE(cgrp->bpf.effective) - struct bpf_prog_array __rcu *arrays[NR] = {}; + struct bpf_prog_array *arrays[NR] = {}; int ret, i; ret = percpu_ref_init(&cgrp->bpf.refcnt, cgroup_bpf_release_fn, 0, @@ -477,10 +480,14 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, enum bpf_attach_type type = attr->query.attach_type; struct list_head *progs = &cgrp->bpf.progs[type]; u32 flags = cgrp->bpf.flags[type]; + struct bpf_prog_array *effective; int cnt, ret = 0, i; + effective = rcu_dereference_protected(cgrp->bpf.effective[type], + lockdep_is_held(&cgroup_mutex)); + if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) - cnt = bpf_prog_array_length(cgrp->bpf.effective[type]); + cnt = bpf_prog_array_length(effective); else cnt = prog_list_length(progs); @@ -497,8 +504,7 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, } if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) { - return bpf_prog_array_copy_to_user(cgrp->bpf.effective[type], - prog_ids, cnt); + return bpf_prog_array_copy_to_user(effective, prog_ids, cnt); } else { struct bpf_prog_list *pl; u32 id; -- cgit v1.2.3 From e672db03ab0e43e41ab6f8b2156a10d6e40f243d Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 28 May 2019 14:14:44 -0700 Subject: bpf: tracing: properly use bpf_prog_array api Now that we don't have __rcu markers on the bpf_prog_array helpers, let's use proper rcu_dereference_protected to obtain array pointer under mutex. Cc: Steven Rostedt Cc: Ingo Molnar Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- kernel/trace/bpf_trace.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index fe73926a07cd..3994a231eb92 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -19,6 +19,9 @@ #include "trace_probe.h" #include "trace.h" +#define bpf_event_rcu_dereference(p) \ + rcu_dereference_protected(p, lockdep_is_held(&bpf_event_mutex)) + #ifdef CONFIG_MODULES struct bpf_trace_module { struct module *module; @@ -1099,7 +1102,7 @@ static DEFINE_MUTEX(bpf_event_mutex); int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog) { - struct bpf_prog_array __rcu *old_array; + struct bpf_prog_array *old_array; struct bpf_prog_array *new_array; int ret = -EEXIST; @@ -1117,7 +1120,7 @@ int perf_event_attach_bpf_prog(struct perf_event *event, if (event->prog) goto unlock; - old_array = event->tp_event->prog_array; + old_array = bpf_event_rcu_dereference(event->tp_event->prog_array); if (old_array && bpf_prog_array_length(old_array) >= BPF_TRACE_MAX_PROGS) { ret = -E2BIG; @@ -1140,7 +1143,7 @@ unlock: void perf_event_detach_bpf_prog(struct perf_event *event) { - struct bpf_prog_array __rcu *old_array; + struct bpf_prog_array *old_array; struct bpf_prog_array *new_array; int ret; @@ -1149,7 +1152,7 @@ void perf_event_detach_bpf_prog(struct perf_event *event) if (!event->prog) goto unlock; - old_array = event->tp_event->prog_array; + old_array = bpf_event_rcu_dereference(event->tp_event->prog_array); ret = bpf_prog_array_copy(old_array, event->prog, NULL, &new_array); if (ret == -ENOENT) goto unlock; @@ -1171,6 +1174,7 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info) { struct perf_event_query_bpf __user *uquery = info; struct perf_event_query_bpf query = {}; + struct bpf_prog_array *progs; u32 *ids, prog_cnt, ids_len; int ret; @@ -1195,10 +1199,8 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info) */ mutex_lock(&bpf_event_mutex); - ret = bpf_prog_array_copy_info(event->tp_event->prog_array, - ids, - ids_len, - &prog_cnt); + progs = bpf_event_rcu_dereference(event->tp_event->prog_array); + ret = bpf_prog_array_copy_info(progs, ids, ids_len, &prog_cnt); mutex_unlock(&bpf_event_mutex); if (copy_to_user(&uquery->prog_cnt, &prog_cnt, sizeof(prog_cnt)) || -- cgit v1.2.3 From 91ca180dbdd687d45fe4aab055b02d29c91b90df Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 6 Feb 2019 16:39:13 -0600 Subject: signal: Use force_sig_fault_to_task for the two calls that don't deliver to current In preparation for removing the task parameter from force_sig_fault introduce force_sig_fault_to_task and use it for the two cases where it matters. On mips force_fcr31_sig calls force_sig_fault and is called on either the current task, or a task that is suspended and is being switched to by the scheduler. This is safe because the task being switched to by the scheduler is guaranteed to be suspended. This ensures that task->sighand is stable while the signal is delivered to it. On parisc user_enable_single_step calls force_sig_fault and is in turn called by ptrace_request. The function ptrace_request always calls user_enable_single_step on a child that is stopped for tracing. The child being traced and not reaped ensures that child->sighand is not NULL, and that the child will not change child->sighand. Signed-off-by: "Eric W. Biederman" --- kernel/signal.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 398489facf9f..e420489ac4c9 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1620,7 +1620,7 @@ void force_sigsegv(int sig) force_sig(SIGSEGV); } -int force_sig_fault(int sig, int code, void __user *addr +int force_sig_fault_to_task(int sig, int code, void __user *addr ___ARCH_SI_TRAPNO(int trapno) ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr) , struct task_struct *t) @@ -1643,6 +1643,16 @@ int force_sig_fault(int sig, int code, void __user *addr return force_sig_info(info.si_signo, &info, t); } +int force_sig_fault(int sig, int code, void __user *addr + ___ARCH_SI_TRAPNO(int trapno) + ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr) + , struct task_struct *t) +{ + return force_sig_fault_to_task(sig, code, addr + ___ARCH_SI_TRAPNO(trapno) + ___ARCH_SI_IA64(imm, flags, isr), t); +} + int send_sig_fault(int sig, int code, void __user *addr ___ARCH_SI_TRAPNO(int trapno) ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr) -- cgit v1.2.3 From 2e1661d2673667d886cd40ad9f414cb6db48d8da Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 23 May 2019 11:04:24 -0500 Subject: signal: Remove the task parameter from force_sig_fault As synchronous exceptions really only make sense against the current task (otherwise how are you synchronous) remove the task parameter from from force_sig_fault to make it explicit that is what is going on. The two known exceptions that deliver a synchronous exception to a stopped ptraced task have already been changed to force_sig_fault_to_task. The callers have been changed with the following emacs regular expression (with obvious variations on the architectures that take more arguments) to avoid typos: force_sig_fault[(]\([^,]+\)[,]\([^,]+\)[,]\([^,]+\)[,]\W+current[)] -> force_sig_fault(\1,\2,\3) Signed-off-by: "Eric W. Biederman" --- kernel/signal.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index e420489ac4c9..d92b636b4e9d 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1645,12 +1645,11 @@ int force_sig_fault_to_task(int sig, int code, void __user *addr int force_sig_fault(int sig, int code, void __user *addr ___ARCH_SI_TRAPNO(int trapno) - ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr) - , struct task_struct *t) + ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr)) { return force_sig_fault_to_task(sig, code, addr ___ARCH_SI_TRAPNO(trapno) - ___ARCH_SI_IA64(imm, flags, isr), t); + ___ARCH_SI_IA64(imm, flags, isr), current); } int send_sig_fault(int sig, int code, void __user *addr -- cgit v1.2.3 From 8917bef336f5301edd616cfa97b97d0319fd0496 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 15 May 2019 22:56:17 -0500 Subject: signal: Properly set TRACE_SIGNAL_LOSE_INFO in __send_signal Any time siginfo is not stored in the signal queue information is lost. Therefore set TRACE_SIGNAL_LOSE_INFO every time the code does not allocate a signal queue entry, and a queue overflow abort is not triggered. Fixes: ba005e1f4172 ("tracepoint: Add signal loss events") Signed-off-by: "Eric W. Biederman" --- kernel/signal.c | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index d92b636b4e9d..b2f0cf3a68aa 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1131,23 +1131,22 @@ static int __send_signal(int sig, struct kernel_siginfo *info, struct task_struc copy_siginfo(&q->info, info); break; } - } else if (!is_si_special(info)) { - if (sig >= SIGRTMIN && info->si_code != SI_USER) { - /* - * Queue overflow, abort. We may abort if the - * signal was rt and sent by user using something - * other than kill(). - */ - result = TRACE_SIGNAL_OVERFLOW_FAIL; - ret = -EAGAIN; - goto ret; - } else { - /* - * This is a silent loss of information. We still - * send the signal, but the *info bits are lost. - */ - result = TRACE_SIGNAL_LOSE_INFO; - } + } else if (!is_si_special(info) && + sig >= SIGRTMIN && info->si_code != SI_USER) { + /* + * Queue overflow, abort. We may abort if the + * signal was rt and sent by user using something + * other than kill(). + */ + result = TRACE_SIGNAL_OVERFLOW_FAIL; + ret = -EAGAIN; + goto ret; + } else { + /* + * This is a silent loss of information. We still + * send the signal, but the *info bits are lost. + */ + result = TRACE_SIGNAL_LOSE_INFO; } out_set: -- cgit v1.2.3 From 8ad23dea808042ffb8bb1b45111af6ddc1bac5d3 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 15 May 2019 22:23:32 -0500 Subject: signal: Move the computation of force into send_signal and correct it. Forcing a signal or not allowing a pid namespace init to ignore SIGKILL or SIGSTOP is more cleanly computed in send_signal. There are two cases where we don't allow a pid namespace init to ignore SIGKILL or SIGSTOP. If the sending process is from an ancestor pid namespace and as such is effectively the god to the target process, and if the it is the kernel that is sending the signal, not another application. It is known that a process is from an ancestor pid namespace if it can see it's target but it's target does not have a pid for the sender in it's pid namespace. It is know that a signal is sent from the kernel if si_code is set to SI_KERNEL or info is SEND_SIG_PRIV (which ultimately generates a signal with si_code == SI_KERNEL). The only signals that matter are SIGKILL and SIGSTOP neither of which can really be caught, and both of which always have a siginfo layout that includes si_uid and si_pid. Therefore we never need to worry about forcing a signal when si_pid and si_uid are absent. So handle the two special cases of info and the case when si_pid and si_uid are present. Signed-off-by: "Eric W. Biederman" --- kernel/signal.c | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index b2f0cf3a68aa..0da35880261e 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1057,7 +1057,7 @@ static inline bool legacy_queue(struct sigpending *signals, int sig) } static int __send_signal(int sig, struct kernel_siginfo *info, struct task_struct *t, - enum pid_type type, int from_ancestor_ns) + enum pid_type type, bool force) { struct sigpending *pending; struct sigqueue *q; @@ -1067,8 +1067,7 @@ static int __send_signal(int sig, struct kernel_siginfo *info, struct task_struc assert_spin_locked(&t->sighand->siglock); result = TRACE_SIGNAL_IGNORED; - if (!prepare_signal(sig, t, - from_ancestor_ns || (info == SEND_SIG_PRIV))) + if (!prepare_signal(sig, t, force)) goto ret; pending = (type != PIDTYPE_PID) ? &t->signal->shared_pending : &t->pending; @@ -1198,13 +1197,17 @@ static inline bool has_si_pid_and_uid(struct kernel_siginfo *info) static int send_signal(int sig, struct kernel_siginfo *info, struct task_struct *t, enum pid_type type) { - int from_ancestor_ns = 0; - -#ifdef CONFIG_PID_NS - from_ancestor_ns = si_fromuser(info) && - !task_pid_nr_ns(current, task_active_pid_ns(t)); -#endif - if (!is_si_special(info) && has_si_pid_and_uid(info)) { + /* Should SIGKILL or SIGSTOP be received by a pid namespace init? */ + bool force = false; + + if (info == SEND_SIG_NOINFO) { + /* Force if sent from an ancestor pid namespace */ + force = !task_pid_nr_ns(current, task_active_pid_ns(t)); + } else if (info == SEND_SIG_PRIV) { + /* Don't ignore kernel generated signals */ + force = true; + } else if (has_si_pid_and_uid(info)) { + /* SIGKILL and SIGSTOP is special or has ids */ struct user_namespace *t_user_ns; rcu_read_lock(); @@ -1215,10 +1218,16 @@ static int send_signal(int sig, struct kernel_siginfo *info, struct task_struct } rcu_read_unlock(); - if (!task_pid_nr_ns(current, task_active_pid_ns(t))) + /* A kernel generated signal? */ + force = (info->si_code == SI_KERNEL); + + /* From an ancestor pid namespace? */ + if (!task_pid_nr_ns(current, task_active_pid_ns(t))) { info->si_pid = 0; + force = true; + } } - return __send_signal(sig, info, t, type, from_ancestor_ns); + return __send_signal(sig, info, t, type, force); } static void print_fatal_signal(int signr) @@ -1509,7 +1518,7 @@ int kill_pid_usb_asyncio(int sig, int errno, sigval_t addr, if (sig) { if (lock_task_sighand(p, &flags)) { - ret = __send_signal(sig, &info, p, PIDTYPE_TGID, 0); + ret = __send_signal(sig, &info, p, PIDTYPE_TGID, false); unlock_task_sighand(p, &flags); } else ret = -ESRCH; -- cgit v1.2.3 From ffafd23b2cf14093b559c3e33b427058ce8aa577 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 14 May 2019 19:17:47 -0500 Subject: signal: Generate the siginfo in force_sig In preparation for removing the special case in force_sig_info for only having a signal number generate an appropriate siginfo in force_sig the last caller of force_sig_info that does not pass a filled out siginfo. Signed-off-by: "Eric W. Biederman" --- kernel/signal.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 0da35880261e..d5f9ed5da9c5 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1605,7 +1605,15 @@ EXPORT_SYMBOL(send_sig); void force_sig(int sig) { - force_sig_info(sig, SEND_SIG_PRIV, current); + struct kernel_siginfo info; + + clear_siginfo(&info); + info.si_signo = sig; + info.si_errno = 0; + info.si_code = SI_KERNEL; + info.si_pid = 0; + info.si_uid = 0; + force_sig_info(info.si_signo, &info, current); } EXPORT_SYMBOL(force_sig); -- cgit v1.2.3 From 59c0e696a6c0fe6a8d718a43aecd72347db6a7f0 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 7 Feb 2019 11:01:20 -0600 Subject: signal: Factor force_sig_info_to_task out of force_sig_info All callers of force_sig_info pass info.si_signo in for the signal by definition as well as in practice. Further all callers of force_sig_info except force_sig_fault_to_task pass current as the target task to force_sig_info. Factor out a static force_sig_info_to_task that force_sig_fault_to_task can call. This prepares the way for force_sig_info to have it's task and signal parameters removed. Signed-off-by: "Eric W. Biederman" --- kernel/signal.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index d5f9ed5da9c5..0984158cd41a 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1294,12 +1294,13 @@ int do_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p * We don't want to have recursive SIGSEGV's etc, for example, * that is why we also clear SIGNAL_UNKILLABLE. */ -int -force_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *t) +static int +force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t) { unsigned long int flags; int ret, blocked, ignored; struct k_sigaction *action; + int sig = info->si_signo; spin_lock_irqsave(&t->sighand->siglock, flags); action = &t->sighand->action[sig-1]; @@ -1324,6 +1325,11 @@ force_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *t) return ret; } +int force_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *t) +{ + return force_sig_info_to_task(info, t); +} + /* * Nuke all other threads in the group. */ @@ -1656,7 +1662,7 @@ int force_sig_fault_to_task(int sig, int code, void __user *addr info.si_flags = flags; info.si_isr = isr; #endif - return force_sig_info(info.si_signo, &info, t); + return force_sig_info_to_task(&info, t); } int force_sig_fault(int sig, int code, void __user *addr -- cgit v1.2.3 From a89e9b8abf82725e4ac96100e07c8104dbe8a240 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 15 May 2019 10:11:09 -0500 Subject: signal: Remove the signal number and task parameters from force_sig_info force_sig_info always delivers to the current task and the signal parameter always matches info.si_signo. So remove those parameters to make it a simpler less error prone interface, and to make it clear that none of the callers are doing anything clever. This guarantees that force_sig_info will not grow any new buggy callers that attempt to call force_sig on a non-current task, or that pass an signal number that does not match info.si_signo. Signed-off-by: "Eric W. Biederman" --- kernel/seccomp.c | 2 +- kernel/signal.c | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 811b4a86cdf6..dba52a7db5e8 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -609,7 +609,7 @@ static void seccomp_send_sigsys(int syscall, int reason) { struct kernel_siginfo info; seccomp_init_siginfo(&info, syscall, reason); - force_sig_info(SIGSYS, &info, current); + force_sig_info(&info); } #endif /* CONFIG_SECCOMP_FILTER */ diff --git a/kernel/signal.c b/kernel/signal.c index 0984158cd41a..ff6944e4964e 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1325,9 +1325,9 @@ force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t) return ret; } -int force_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *t) +int force_sig_info(struct kernel_siginfo *info) { - return force_sig_info_to_task(info, t); + return force_sig_info_to_task(info, current); } /* @@ -1619,7 +1619,7 @@ void force_sig(int sig) info.si_code = SI_KERNEL; info.si_pid = 0; info.si_uid = 0; - force_sig_info(info.si_signo, &info, current); + force_sig_info(&info); } EXPORT_SYMBOL(force_sig); @@ -1708,7 +1708,7 @@ int force_sig_mceerr(int code, void __user *addr, short lsb) info.si_code = code; info.si_addr = addr; info.si_addr_lsb = lsb; - return force_sig_info(info.si_signo, &info, current); + return force_sig_info(&info); } int send_sig_mceerr(int code, void __user *addr, short lsb, struct task_struct *t) @@ -1737,7 +1737,7 @@ int force_sig_bnderr(void __user *addr, void __user *lower, void __user *upper) info.si_addr = addr; info.si_lower = lower; info.si_upper = upper; - return force_sig_info(info.si_signo, &info, current); + return force_sig_info(&info); } #ifdef SEGV_PKUERR @@ -1751,7 +1751,7 @@ int force_sig_pkuerr(void __user *addr, u32 pkey) info.si_code = SEGV_PKUERR; info.si_addr = addr; info.si_pkey = pkey; - return force_sig_info(info.si_signo, &info, current); + return force_sig_info(&info); } #endif @@ -1767,7 +1767,7 @@ int force_sig_ptrace_errno_trap(int errno, void __user *addr) info.si_errno = errno; info.si_code = TRAP_HWBKPT; info.si_addr = addr; - return force_sig_info(info.si_signo, &info, current); + return force_sig_info(&info); } int kill_pgrp(struct pid *pid, int sig, int priv) -- cgit v1.2.3 From 839d05e413856bd686a33b59294d4e8238169320 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Thu, 30 May 2019 12:53:42 -0400 Subject: audit: remove the BUG() calls in the audit rule comparison functions The audit_data_to_entry() function ensures that the operator is valid so we can get rid of these BUG() calls. We keep the "return 0" just so the system behaves in a sane-ish manner should something go horribly wrong. Signed-off-by: Paul Moore Acked-by: Richard Guy Briggs --- kernel/auditfilter.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index e69d136eeaf6..1a21b6aa50d1 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1220,7 +1220,6 @@ int audit_comparator(u32 left, u32 op, u32 right) case Audit_bittest: return ((left & right) == right); default: - BUG(); return 0; } } @@ -1243,7 +1242,6 @@ int audit_uid_comparator(kuid_t left, u32 op, kuid_t right) case Audit_bitmask: case Audit_bittest: default: - BUG(); return 0; } } @@ -1266,7 +1264,6 @@ int audit_gid_comparator(kgid_t left, u32 op, kgid_t right) case Audit_bitmask: case Audit_bittest: default: - BUG(); return 0; } } -- cgit v1.2.3 From a5e112e6424adb77d953eac20e6936b952fd6b32 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 13 May 2019 12:37:17 -0700 Subject: cgroup: add cgroup_parse_float() cgroup already uses floating point for percent[ile] numbers and there are several controllers which want to take them as input. Add a generic parse helper to handle inputs. Update the interface convention documentation about the use of percentage numbers. While at it, also clarify the default time unit. Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index a7df319c2e9a..7dffcfe17441 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -6387,4 +6387,47 @@ static int __init cgroup_sysfs_init(void) return sysfs_create_group(kernel_kobj, &cgroup_sysfs_attr_group); } subsys_initcall(cgroup_sysfs_init); + +static u64 power_of_ten(int power) +{ + u64 v = 1; + while (power--) + v *= 10; + return v; +} + +/** + * cgroup_parse_float - parse a floating number + * @input: input string + * @dec_shift: number of decimal digits to shift + * @v: output + * + * Parse a decimal floating point number in @input and store the result in + * @v with decimal point right shifted @dec_shift times. For example, if + * @input is "12.3456" and @dec_shift is 3, *@v will be set to 12345. + * Returns 0 on success, -errno otherwise. + * + * There's nothing cgroup specific about this function except that it's + * currently the only user. + */ +int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v) +{ + s64 whole, frac = 0; + int fstart = 0, fend = 0, flen; + + if (!sscanf(input, "%lld.%n%lld%n", &whole, &fstart, &frac, &fend)) + return -EINVAL; + if (frac < 0) + return -EINVAL; + + flen = fend > fstart ? fend - fstart : 0; + if (flen < dec_shift) + frac *= power_of_ten(dec_shift - flen); + else + frac = DIV_ROUND_CLOSEST_ULL(frac, power_of_ten(flen - dec_shift)); + + *v = whole * power_of_ten(dec_shift) + frac; + return 0; +} + #endif /* CONFIG_SYSFS */ -- cgit v1.2.3 From 5cf1e91456301f8c4f6bbc63ff76cff12f92f31b Mon Sep 17 00:00:00 2001 From: brakmo Date: Tue, 28 May 2019 16:59:36 -0700 Subject: bpf: cgroup inet skb programs can return 0 to 3 Allows cgroup inet skb programs to return values in the range [0, 3]. The second bit is used to deterine if congestion occurred and higher level protocol should decrease rate. E.g. TCP would call tcp_enter_cwr() The bpf_prog must set expected_attach_type to BPF_CGROUP_INET_EGRESS at load time if it uses the new return values (i.e. 2 or 3). The expected_attach_type is currently not enforced for BPF_PROG_TYPE_CGROUP_SKB. e.g Meaning the current bpf_prog with expected_attach_type setting to BPF_CGROUP_INET_EGRESS can attach to BPF_CGROUP_INET_INGRESS. Blindly enforcing expected_attach_type will break backward compatibility. This patch adds a enforce_expected_attach_type bit to only enforce the expected_attach_type when it uses the new return value. Signed-off-by: Lawrence Brakmo Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 12 ++++++++++++ kernel/bpf/verifier.c | 16 +++++++++++++--- 2 files changed, 25 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 3d546b6f4646..1539774d78c7 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1585,6 +1585,14 @@ bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type, default: return -EINVAL; } + case BPF_PROG_TYPE_CGROUP_SKB: + switch (expected_attach_type) { + case BPF_CGROUP_INET_INGRESS: + case BPF_CGROUP_INET_EGRESS: + return 0; + default: + return -EINVAL; + } default: return 0; } @@ -1836,6 +1844,10 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, case BPF_PROG_TYPE_CGROUP_SOCK: case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: return attach_type == prog->expected_attach_type ? 0 : -EINVAL; + case BPF_PROG_TYPE_CGROUP_SKB: + return prog->enforce_expected_attach_type && + prog->expected_attach_type != attach_type ? + -EINVAL : 0; default: return 0; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2778417e6e0c..5c2cb5bd84ce 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5508,11 +5508,16 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) static int check_return_code(struct bpf_verifier_env *env) { + struct tnum enforce_attach_type_range = tnum_unknown; struct bpf_reg_state *reg; struct tnum range = tnum_range(0, 1); switch (env->prog->type) { case BPF_PROG_TYPE_CGROUP_SKB: + if (env->prog->expected_attach_type == BPF_CGROUP_INET_EGRESS) { + range = tnum_range(0, 3); + enforce_attach_type_range = tnum_range(2, 3); + } case BPF_PROG_TYPE_CGROUP_SOCK: case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: case BPF_PROG_TYPE_SOCK_OPS: @@ -5531,18 +5536,23 @@ static int check_return_code(struct bpf_verifier_env *env) } if (!tnum_in(range, reg->var_off)) { + char tn_buf[48]; + verbose(env, "At program exit the register R0 "); if (!tnum_is_unknown(reg->var_off)) { - char tn_buf[48]; - tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); verbose(env, "has value %s", tn_buf); } else { verbose(env, "has unknown scalar value"); } - verbose(env, " should have been 0 or 1\n"); + tnum_strn(tn_buf, sizeof(tn_buf), range); + verbose(env, " should have been %s\n", tn_buf); return -EINVAL; } + + if (!tnum_is_unknown(enforce_attach_type_range) && + tnum_in(enforce_attach_type_range, reg->var_off)) + env->prog->enforce_expected_attach_type = 1; return 0; } -- cgit v1.2.3 From e7a3160d092aa4dd7fb2ef23335cb3a98400aec6 Mon Sep 17 00:00:00 2001 From: brakmo Date: Tue, 28 May 2019 16:59:37 -0700 Subject: bpf: Update __cgroup_bpf_run_filter_skb with cn For egress packets, __cgroup_bpf_fun_filter_skb() will now call BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY() instead of PROG_CGROUP_RUN_ARRAY() in order to propagate congestion notifications (cn) requests to TCP callers. For egress packets, this function can return: NET_XMIT_SUCCESS (0) - continue with packet output NET_XMIT_DROP (1) - drop packet and notify TCP to call cwr NET_XMIT_CN (2) - continue with packet output and notify TCP to call cwr -EPERM - drop packet For ingress packets, this function will return -EPERM if any attached program was found and if it returned != 1 during execution. Otherwise 0 is returned. Signed-off-by: Lawrence Brakmo Signed-off-by: Alexei Starovoitov --- kernel/bpf/cgroup.c | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index ff594eb86fd7..1b65ab0df457 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -587,8 +587,16 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr, * The program type passed in via @type must be suitable for network * filtering. No further check is performed to assert that. * - * This function will return %-EPERM if any if an attached program was found - * and if it returned != 1 during execution. In all other cases, 0 is returned. + * For egress packets, this function can return: + * NET_XMIT_SUCCESS (0) - continue with packet output + * NET_XMIT_DROP (1) - drop packet and notify TCP to call cwr + * NET_XMIT_CN (2) - continue with packet output and notify TCP + * to call cwr + * -EPERM - drop packet + * + * For ingress packets, this function will return -EPERM if any + * attached program was found and if it returned != 1 during execution. + * Otherwise 0 is returned. */ int __cgroup_bpf_run_filter_skb(struct sock *sk, struct sk_buff *skb, @@ -614,12 +622,19 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, /* compute pointers for the bpf prog */ bpf_compute_and_save_data_end(skb, &saved_data_end); - ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb, - __bpf_prog_run_save_cb); + if (type == BPF_CGROUP_INET_EGRESS) { + ret = BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY( + cgrp->bpf.effective[type], skb, __bpf_prog_run_save_cb); + } else { + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb, + __bpf_prog_run_save_cb); + ret = (ret == 1 ? 0 : -EPERM); + } bpf_restore_data_end(skb, saved_data_end); __skb_pull(skb, offset); skb->sk = save_sk; - return ret == 1 ? 0 : -EPERM; + + return ret; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb); -- cgit v1.2.3 From ffc8b144d5d056dd0ab8d995c7345cdd6a589fc7 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Wed, 29 May 2019 18:03:55 -0700 Subject: bpf: add memlock precharge check for cgroup_local_storage Cgroup local storage maps lack the memlock precharge check, which is performed before the memory allocation for most other bpf map types. Let's add it in order to unify all map types. Signed-off-by: Roman Gushchin Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- kernel/bpf/local_storage.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 980e8f1f6cb5..e48302ecb389 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -272,6 +272,8 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) { int numa_node = bpf_map_attr_numa_node(attr); struct bpf_cgroup_storage_map *map; + u32 pages; + int ret; if (attr->key_size != sizeof(struct bpf_cgroup_storage_key)) return ERR_PTR(-EINVAL); @@ -290,13 +292,18 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) /* max_entries is not used and enforced to be 0 */ return ERR_PTR(-EINVAL); + pages = round_up(sizeof(struct bpf_cgroup_storage_map), PAGE_SIZE) >> + PAGE_SHIFT; + ret = bpf_map_precharge_memlock(pages); + if (ret < 0) + return ERR_PTR(ret); + map = kmalloc_node(sizeof(struct bpf_cgroup_storage_map), __GFP_ZERO | GFP_USER, numa_node); if (!map) return ERR_PTR(-ENOMEM); - map->map.pages = round_up(sizeof(struct bpf_cgroup_storage_map), - PAGE_SIZE) >> PAGE_SHIFT; + map->map.pages = pages; /* copy mandatory map attributes */ bpf_map_init_from_attr(&map->map, attr); -- cgit v1.2.3 From 3539b96e041c06e4317082816d90ec09160aeb11 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Wed, 29 May 2019 18:03:57 -0700 Subject: bpf: group memory related fields in struct bpf_map_memory Group "user" and "pages" fields of bpf_map into the bpf_map_memory structure. Later it can be extended with "memcg" and other related information. The main reason for a such change (beside cosmetics) is to pass bpf_map_memory structure to charging functions before the actual allocation of bpf_map. Signed-off-by: Roman Gushchin Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- kernel/bpf/arraymap.c | 2 +- kernel/bpf/cpumap.c | 4 ++-- kernel/bpf/devmap.c | 4 ++-- kernel/bpf/hashtab.c | 4 ++-- kernel/bpf/local_storage.c | 2 +- kernel/bpf/lpm_trie.c | 4 ++-- kernel/bpf/queue_stack_maps.c | 2 +- kernel/bpf/reuseport_array.c | 2 +- kernel/bpf/stackmap.c | 4 ++-- kernel/bpf/syscall.c | 19 ++++++++++--------- kernel/bpf/xskmap.c | 4 ++-- 11 files changed, 26 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 584636c9e2eb..8fda24e78193 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -138,7 +138,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) /* copy mandatory map attributes */ bpf_map_init_from_attr(&array->map, attr); - array->map.pages = cost; + array->map.memory.pages = cost; array->elem_size = elem_size; if (percpu && bpf_array_alloc_percpu(array)) { diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index cf727d77c6c6..035268add724 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -108,10 +108,10 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) cost += cpu_map_bitmap_size(attr) * num_possible_cpus(); if (cost >= U32_MAX - PAGE_SIZE) goto free_cmap; - cmap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + cmap->map.memory.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; /* Notice returns -EPERM on if map size is larger than memlock limit */ - ret = bpf_map_precharge_memlock(cmap->map.pages); + ret = bpf_map_precharge_memlock(cmap->map.memory.pages); if (ret) { err = ret; goto free_cmap; diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 1e525d70f833..f6c57efb1d0d 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -111,10 +111,10 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) if (cost >= U32_MAX - PAGE_SIZE) goto free_dtab; - dtab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + dtab->map.memory.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; /* if map size is larger than memlock limit, reject it early */ - err = bpf_map_precharge_memlock(dtab->map.pages); + err = bpf_map_precharge_memlock(dtab->map.memory.pages); if (err) goto free_dtab; diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 0f2708fde5f7..15bf228d2e98 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -364,10 +364,10 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) /* make sure page count doesn't overflow */ goto free_htab; - htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + htab->map.memory.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; /* if map size is larger than memlock limit, reject it early */ - err = bpf_map_precharge_memlock(htab->map.pages); + err = bpf_map_precharge_memlock(htab->map.memory.pages); if (err) goto free_htab; diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index e48302ecb389..574325276650 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -303,7 +303,7 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) if (!map) return ERR_PTR(-ENOMEM); - map->map.pages = pages; + map->map.memory.pages = pages; /* copy mandatory map attributes */ bpf_map_init_from_attr(&map->map, attr); diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index e61630c2e50b..8e423a582760 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -578,9 +578,9 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) goto out_err; } - trie->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + trie->map.memory.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; - ret = bpf_map_precharge_memlock(trie->map.pages); + ret = bpf_map_precharge_memlock(trie->map.memory.pages); if (ret) goto out_err; diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index 0b140d236889..8a510e71d486 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -89,7 +89,7 @@ static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr) bpf_map_init_from_attr(&qs->map, attr); - qs->map.pages = cost; + qs->map.memory.pages = cost; qs->size = size; raw_spin_lock_init(&qs->lock); diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c index 18e225de80ff..819515242739 100644 --- a/kernel/bpf/reuseport_array.c +++ b/kernel/bpf/reuseport_array.c @@ -176,7 +176,7 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr) /* copy mandatory map attributes */ bpf_map_init_from_attr(&array->map, attr); - array->map.pages = cost; + array->map.memory.pages = cost; return &array->map; } diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 950ab2f28922..08d4efff73ac 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -131,9 +131,9 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) bpf_map_init_from_attr(&smap->map, attr); smap->map.value_size = value_size; smap->n_buckets = n_buckets; - smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + smap->map.memory.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; - err = bpf_map_precharge_memlock(smap->map.pages); + err = bpf_map_precharge_memlock(smap->map.memory.pages); if (err) goto free_smap; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 1539774d78c7..8289a2ce14fc 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -222,19 +222,20 @@ static int bpf_map_init_memlock(struct bpf_map *map) struct user_struct *user = get_current_user(); int ret; - ret = bpf_charge_memlock(user, map->pages); + ret = bpf_charge_memlock(user, map->memory.pages); if (ret) { free_uid(user); return ret; } - map->user = user; + map->memory.user = user; return ret; } static void bpf_map_release_memlock(struct bpf_map *map) { - struct user_struct *user = map->user; - bpf_uncharge_memlock(user, map->pages); + struct user_struct *user = map->memory.user; + + bpf_uncharge_memlock(user, map->memory.pages); free_uid(user); } @@ -242,17 +243,17 @@ int bpf_map_charge_memlock(struct bpf_map *map, u32 pages) { int ret; - ret = bpf_charge_memlock(map->user, pages); + ret = bpf_charge_memlock(map->memory.user, pages); if (ret) return ret; - map->pages += pages; + map->memory.pages += pages; return ret; } void bpf_map_uncharge_memlock(struct bpf_map *map, u32 pages) { - bpf_uncharge_memlock(map->user, pages); - map->pages -= pages; + bpf_uncharge_memlock(map->memory.user, pages); + map->memory.pages -= pages; } static int bpf_map_alloc_id(struct bpf_map *map) @@ -395,7 +396,7 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) map->value_size, map->max_entries, map->map_flags, - map->pages * 1ULL << PAGE_SHIFT, + map->memory.pages * 1ULL << PAGE_SHIFT, map->id, READ_ONCE(map->frozen)); diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index 686d244e798d..f816ee1a0fa0 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c @@ -40,10 +40,10 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) if (cost >= U32_MAX - PAGE_SIZE) goto free_m; - m->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + m->map.memory.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; /* Notice returns -EPERM on if map size is larger than memlock limit */ - err = bpf_map_precharge_memlock(m->map.pages); + err = bpf_map_precharge_memlock(m->map.memory.pages); if (err) goto free_m; -- cgit v1.2.3 From b936ca643ade11f265fa10e5fb71c20d9c5243f1 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Wed, 29 May 2019 18:03:58 -0700 Subject: bpf: rework memlock-based memory accounting for maps In order to unify the existing memlock charging code with the memcg-based memory accounting, which will be added later, let's rework the current scheme. Currently the following design is used: 1) .alloc() callback optionally checks if the allocation will likely succeed using bpf_map_precharge_memlock() 2) .alloc() performs actual allocations 3) .alloc() callback calculates map cost and sets map.memory.pages 4) map_create() calls bpf_map_init_memlock() which sets map.memory.user and performs actual charging; in case of failure the map is destroyed 1) bpf_map_free_deferred() calls bpf_map_release_memlock(), which performs uncharge and releases the user 2) .map_free() callback releases the memory The scheme can be simplified and made more robust: 1) .alloc() calculates map cost and calls bpf_map_charge_init() 2) bpf_map_charge_init() sets map.memory.user and performs actual charge 3) .alloc() performs actual allocations 1) .map_free() callback releases the memory 2) bpf_map_charge_finish() performs uncharge and releases the user The new scheme also allows to reuse bpf_map_charge_init()/finish() functions for memcg-based accounting. Because charges are performed before actual allocations and uncharges after freeing the memory, no bogus memory pressure can be created. In cases when the map structure is not available (e.g. it's not created yet, or is already destroyed), on-stack bpf_map_memory structure is used. The charge can be transferred with the bpf_map_charge_move() function. Signed-off-by: Roman Gushchin Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- kernel/bpf/arraymap.c | 10 +++++-- kernel/bpf/cpumap.c | 8 +++-- kernel/bpf/devmap.c | 13 ++++---- kernel/bpf/hashtab.c | 11 +++---- kernel/bpf/local_storage.c | 9 ++++-- kernel/bpf/lpm_trie.c | 5 ++-- kernel/bpf/queue_stack_maps.c | 9 ++++-- kernel/bpf/reuseport_array.c | 9 ++++-- kernel/bpf/stackmap.c | 30 +++++++++++-------- kernel/bpf/syscall.c | 69 +++++++++++++++++++++---------------------- kernel/bpf/xskmap.c | 9 +++--- 11 files changed, 100 insertions(+), 82 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 8fda24e78193..3552da4407d9 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -83,6 +83,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) u32 elem_size, index_mask, max_entries; bool unpriv = !capable(CAP_SYS_ADMIN); u64 cost, array_size, mask64; + struct bpf_map_memory mem; struct bpf_array *array; elem_size = round_up(attr->value_size, 8); @@ -125,23 +126,26 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) } cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; - ret = bpf_map_precharge_memlock(cost); + ret = bpf_map_charge_init(&mem, cost); if (ret < 0) return ERR_PTR(ret); /* allocate all map elements and zero-initialize them */ array = bpf_map_area_alloc(array_size, numa_node); - if (!array) + if (!array) { + bpf_map_charge_finish(&mem); return ERR_PTR(-ENOMEM); + } array->index_mask = index_mask; array->map.unpriv_array = unpriv; /* copy mandatory map attributes */ bpf_map_init_from_attr(&array->map, attr); - array->map.memory.pages = cost; + bpf_map_charge_move(&array->map.memory, &mem); array->elem_size = elem_size; if (percpu && bpf_array_alloc_percpu(array)) { + bpf_map_charge_finish(&array->map.memory); bpf_map_area_free(array); return ERR_PTR(-ENOMEM); } diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 035268add724..c633c8d68023 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -108,10 +108,10 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) cost += cpu_map_bitmap_size(attr) * num_possible_cpus(); if (cost >= U32_MAX - PAGE_SIZE) goto free_cmap; - cmap->map.memory.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; /* Notice returns -EPERM on if map size is larger than memlock limit */ - ret = bpf_map_precharge_memlock(cmap->map.memory.pages); + ret = bpf_map_charge_init(&cmap->map.memory, + round_up(cost, PAGE_SIZE) >> PAGE_SHIFT); if (ret) { err = ret; goto free_cmap; @@ -121,7 +121,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) cmap->flush_needed = __alloc_percpu(cpu_map_bitmap_size(attr), __alignof__(unsigned long)); if (!cmap->flush_needed) - goto free_cmap; + goto free_charge; /* Alloc array for possible remote "destination" CPUs */ cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries * @@ -133,6 +133,8 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) return &cmap->map; free_percpu: free_percpu(cmap->flush_needed); +free_charge: + bpf_map_charge_finish(&cmap->map.memory); free_cmap: kfree(cmap); return ERR_PTR(err); diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index f6c57efb1d0d..371bd880ed58 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -111,10 +111,9 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) if (cost >= U32_MAX - PAGE_SIZE) goto free_dtab; - dtab->map.memory.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; - - /* if map size is larger than memlock limit, reject it early */ - err = bpf_map_precharge_memlock(dtab->map.memory.pages); + /* if map size is larger than memlock limit, reject it */ + err = bpf_map_charge_init(&dtab->map.memory, + round_up(cost, PAGE_SIZE) >> PAGE_SHIFT); if (err) goto free_dtab; @@ -125,19 +124,21 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) __alignof__(unsigned long), GFP_KERNEL | __GFP_NOWARN); if (!dtab->flush_needed) - goto free_dtab; + goto free_charge; dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *), dtab->map.numa_node); if (!dtab->netdev_map) - goto free_dtab; + goto free_charge; spin_lock(&dev_map_lock); list_add_tail_rcu(&dtab->list, &dev_map_list); spin_unlock(&dev_map_lock); return &dtab->map; +free_charge: + bpf_map_charge_finish(&dtab->map.memory); free_dtab: free_percpu(dtab->flush_needed); kfree(dtab); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 15bf228d2e98..b0bdc7b040ad 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -364,10 +364,9 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) /* make sure page count doesn't overflow */ goto free_htab; - htab->map.memory.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; - - /* if map size is larger than memlock limit, reject it early */ - err = bpf_map_precharge_memlock(htab->map.memory.pages); + /* if map size is larger than memlock limit, reject it */ + err = bpf_map_charge_init(&htab->map.memory, + round_up(cost, PAGE_SIZE) >> PAGE_SHIFT); if (err) goto free_htab; @@ -376,7 +375,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) sizeof(struct bucket), htab->map.numa_node); if (!htab->buckets) - goto free_htab; + goto free_charge; if (htab->map.map_flags & BPF_F_ZERO_SEED) htab->hashrnd = 0; @@ -409,6 +408,8 @@ free_prealloc: prealloc_destroy(htab); free_buckets: bpf_map_area_free(htab->buckets); +free_charge: + bpf_map_charge_finish(&htab->map.memory); free_htab: kfree(htab); return ERR_PTR(err); diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 574325276650..e49bfd4f4f6d 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -272,6 +272,7 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) { int numa_node = bpf_map_attr_numa_node(attr); struct bpf_cgroup_storage_map *map; + struct bpf_map_memory mem; u32 pages; int ret; @@ -294,16 +295,18 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) pages = round_up(sizeof(struct bpf_cgroup_storage_map), PAGE_SIZE) >> PAGE_SHIFT; - ret = bpf_map_precharge_memlock(pages); + ret = bpf_map_charge_init(&mem, pages); if (ret < 0) return ERR_PTR(ret); map = kmalloc_node(sizeof(struct bpf_cgroup_storage_map), __GFP_ZERO | GFP_USER, numa_node); - if (!map) + if (!map) { + bpf_map_charge_finish(&mem); return ERR_PTR(-ENOMEM); + } - map->map.memory.pages = pages; + bpf_map_charge_move(&map->map.memory, &mem); /* copy mandatory map attributes */ bpf_map_init_from_attr(&map->map, attr); diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 8e423a582760..6345a8d2dcd0 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -578,9 +578,8 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) goto out_err; } - trie->map.memory.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; - - ret = bpf_map_precharge_memlock(trie->map.memory.pages); + ret = bpf_map_charge_init(&trie->map.memory, + round_up(cost, PAGE_SIZE) >> PAGE_SHIFT); if (ret) goto out_err; diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index 8a510e71d486..224cb0fd8f03 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -67,6 +67,7 @@ static int queue_stack_map_alloc_check(union bpf_attr *attr) static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr) { int ret, numa_node = bpf_map_attr_numa_node(attr); + struct bpf_map_memory mem = {0}; struct bpf_queue_stack *qs; u64 size, queue_size, cost; @@ -77,19 +78,21 @@ static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr) cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; - ret = bpf_map_precharge_memlock(cost); + ret = bpf_map_charge_init(&mem, cost); if (ret < 0) return ERR_PTR(ret); qs = bpf_map_area_alloc(queue_size, numa_node); - if (!qs) + if (!qs) { + bpf_map_charge_finish(&mem); return ERR_PTR(-ENOMEM); + } memset(qs, 0, sizeof(*qs)); bpf_map_init_from_attr(&qs->map, attr); - qs->map.memory.pages = cost; + bpf_map_charge_move(&qs->map.memory, &mem); qs->size = size; raw_spin_lock_init(&qs->lock); diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c index 819515242739..5c6e25b1b9b1 100644 --- a/kernel/bpf/reuseport_array.c +++ b/kernel/bpf/reuseport_array.c @@ -151,6 +151,7 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr) { int err, numa_node = bpf_map_attr_numa_node(attr); struct reuseport_array *array; + struct bpf_map_memory mem; u64 cost, array_size; if (!capable(CAP_SYS_ADMIN)) @@ -165,18 +166,20 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr) return ERR_PTR(-ENOMEM); cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; - err = bpf_map_precharge_memlock(cost); + err = bpf_map_charge_init(&mem, cost); if (err) return ERR_PTR(err); /* allocate all map elements and zero-initialize them */ array = bpf_map_area_alloc(array_size, numa_node); - if (!array) + if (!array) { + bpf_map_charge_finish(&mem); return ERR_PTR(-ENOMEM); + } /* copy mandatory map attributes */ bpf_map_init_from_attr(&array->map, attr); - array->map.memory.pages = cost; + bpf_map_charge_move(&array->map.memory, &mem); return &array->map; } diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 08d4efff73ac..8da24ca65d97 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -89,6 +89,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) { u32 value_size = attr->value_size; struct bpf_stack_map *smap; + struct bpf_map_memory mem; u64 cost, n_buckets; int err; @@ -116,40 +117,43 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) n_buckets = roundup_pow_of_two(attr->max_entries); cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap); + if (cost >= U32_MAX - PAGE_SIZE) + return ERR_PTR(-E2BIG); + cost += n_buckets * (value_size + sizeof(struct stack_map_bucket)); if (cost >= U32_MAX - PAGE_SIZE) return ERR_PTR(-E2BIG); + err = bpf_map_charge_init(&mem, + round_up(cost, PAGE_SIZE) >> PAGE_SHIFT); + if (err) + return ERR_PTR(err); + smap = bpf_map_area_alloc(cost, bpf_map_attr_numa_node(attr)); - if (!smap) + if (!smap) { + bpf_map_charge_finish(&mem); return ERR_PTR(-ENOMEM); - - err = -E2BIG; - cost += n_buckets * (value_size + sizeof(struct stack_map_bucket)); - if (cost >= U32_MAX - PAGE_SIZE) - goto free_smap; + } bpf_map_init_from_attr(&smap->map, attr); smap->map.value_size = value_size; smap->n_buckets = n_buckets; - smap->map.memory.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; - - err = bpf_map_precharge_memlock(smap->map.memory.pages); - if (err) - goto free_smap; err = get_callchain_buffers(sysctl_perf_event_max_stack); if (err) - goto free_smap; + goto free_charge; err = prealloc_elems_and_freelist(smap); if (err) goto put_buffers; + bpf_map_charge_move(&smap->map.memory, &mem); + return &smap->map; put_buffers: put_callchain_buffers(); -free_smap: +free_charge: + bpf_map_charge_finish(&mem); bpf_map_area_free(smap); return ERR_PTR(err); } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 8289a2ce14fc..4a5ebad99154 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -188,19 +188,6 @@ void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr) map->numa_node = bpf_map_attr_numa_node(attr); } -int bpf_map_precharge_memlock(u32 pages) -{ - struct user_struct *user = get_current_user(); - unsigned long memlock_limit, cur; - - memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - cur = atomic_long_read(&user->locked_vm); - free_uid(user); - if (cur + pages > memlock_limit) - return -EPERM; - return 0; -} - static int bpf_charge_memlock(struct user_struct *user, u32 pages) { unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; @@ -214,29 +201,40 @@ static int bpf_charge_memlock(struct user_struct *user, u32 pages) static void bpf_uncharge_memlock(struct user_struct *user, u32 pages) { - atomic_long_sub(pages, &user->locked_vm); + if (user) + atomic_long_sub(pages, &user->locked_vm); } -static int bpf_map_init_memlock(struct bpf_map *map) +int bpf_map_charge_init(struct bpf_map_memory *mem, u32 pages) { struct user_struct *user = get_current_user(); int ret; - ret = bpf_charge_memlock(user, map->memory.pages); + ret = bpf_charge_memlock(user, pages); if (ret) { free_uid(user); return ret; } - map->memory.user = user; - return ret; + + mem->pages = pages; + mem->user = user; + + return 0; } -static void bpf_map_release_memlock(struct bpf_map *map) +void bpf_map_charge_finish(struct bpf_map_memory *mem) { - struct user_struct *user = map->memory.user; + bpf_uncharge_memlock(mem->user, mem->pages); + free_uid(mem->user); +} - bpf_uncharge_memlock(user, map->memory.pages); - free_uid(user); +void bpf_map_charge_move(struct bpf_map_memory *dst, + struct bpf_map_memory *src) +{ + *dst = *src; + + /* Make sure src will not be used for the redundant uncharging. */ + memset(src, 0, sizeof(struct bpf_map_memory)); } int bpf_map_charge_memlock(struct bpf_map *map, u32 pages) @@ -304,11 +302,13 @@ void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock) static void bpf_map_free_deferred(struct work_struct *work) { struct bpf_map *map = container_of(work, struct bpf_map, work); + struct bpf_map_memory mem; - bpf_map_release_memlock(map); + bpf_map_charge_move(&mem, &map->memory); security_bpf_map_free(map); /* implementation dependent freeing */ map->ops->map_free(map); + bpf_map_charge_finish(&mem); } static void bpf_map_put_uref(struct bpf_map *map) @@ -550,6 +550,7 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, static int map_create(union bpf_attr *attr) { int numa_node = bpf_map_attr_numa_node(attr); + struct bpf_map_memory mem; struct bpf_map *map; int f_flags; int err; @@ -574,7 +575,7 @@ static int map_create(union bpf_attr *attr) err = bpf_obj_name_cpy(map->name, attr->map_name); if (err) - goto free_map_nouncharge; + goto free_map; atomic_set(&map->refcnt, 1); atomic_set(&map->usercnt, 1); @@ -584,20 +585,20 @@ static int map_create(union bpf_attr *attr) if (!attr->btf_value_type_id) { err = -EINVAL; - goto free_map_nouncharge; + goto free_map; } btf = btf_get_by_fd(attr->btf_fd); if (IS_ERR(btf)) { err = PTR_ERR(btf); - goto free_map_nouncharge; + goto free_map; } err = map_check_btf(map, btf, attr->btf_key_type_id, attr->btf_value_type_id); if (err) { btf_put(btf); - goto free_map_nouncharge; + goto free_map; } map->btf = btf; @@ -609,15 +610,11 @@ static int map_create(union bpf_attr *attr) err = security_bpf_map_alloc(map); if (err) - goto free_map_nouncharge; - - err = bpf_map_init_memlock(map); - if (err) - goto free_map_sec; + goto free_map; err = bpf_map_alloc_id(map); if (err) - goto free_map; + goto free_map_sec; err = bpf_map_new_fd(map, f_flags); if (err < 0) { @@ -633,13 +630,13 @@ static int map_create(union bpf_attr *attr) return err; -free_map: - bpf_map_release_memlock(map); free_map_sec: security_bpf_map_free(map); -free_map_nouncharge: +free_map: btf_put(map->btf); + bpf_map_charge_move(&mem, &map->memory); map->ops->map_free(map); + bpf_map_charge_finish(&mem); return err; } diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index f816ee1a0fa0..a329dab7c7a4 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c @@ -40,10 +40,9 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) if (cost >= U32_MAX - PAGE_SIZE) goto free_m; - m->map.memory.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; - /* Notice returns -EPERM on if map size is larger than memlock limit */ - err = bpf_map_precharge_memlock(m->map.memory.pages); + err = bpf_map_charge_init(&m->map.memory, + round_up(cost, PAGE_SIZE) >> PAGE_SHIFT); if (err) goto free_m; @@ -51,7 +50,7 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) m->flush_list = alloc_percpu(struct list_head); if (!m->flush_list) - goto free_m; + goto free_charge; for_each_possible_cpu(cpu) INIT_LIST_HEAD(per_cpu_ptr(m->flush_list, cpu)); @@ -65,6 +64,8 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) free_percpu: free_percpu(m->flush_list); +free_charge: + bpf_map_charge_finish(&m->map.memory); free_m: kfree(m); return ERR_PTR(err); -- cgit v1.2.3 From c85d69135a9175c50a823d04d62d932312d037b3 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Wed, 29 May 2019 18:03:59 -0700 Subject: bpf: move memory size checks to bpf_map_charge_init() Most bpf map types doing similar checks and bytes to pages conversion during memory allocation and charging. Let's unify these checks by moving them into bpf_map_charge_init(). Signed-off-by: Roman Gushchin Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- kernel/bpf/arraymap.c | 8 +------- kernel/bpf/cpumap.c | 5 +---- kernel/bpf/devmap.c | 5 +---- kernel/bpf/hashtab.c | 7 +------ kernel/bpf/local_storage.c | 5 +---- kernel/bpf/lpm_trie.c | 7 +------ kernel/bpf/queue_stack_maps.c | 4 ---- kernel/bpf/reuseport_array.c | 10 ++-------- kernel/bpf/stackmap.c | 8 +------- kernel/bpf/syscall.c | 9 +++++++-- kernel/bpf/xskmap.c | 5 +---- 11 files changed, 17 insertions(+), 56 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 3552da4407d9..0349cbf23cdb 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -117,14 +117,8 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) /* make sure there is no u32 overflow later in round_up() */ cost = array_size; - if (cost >= U32_MAX - PAGE_SIZE) - return ERR_PTR(-ENOMEM); - if (percpu) { + if (percpu) cost += (u64)attr->max_entries * elem_size * num_possible_cpus(); - if (cost >= U32_MAX - PAGE_SIZE) - return ERR_PTR(-ENOMEM); - } - cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; ret = bpf_map_charge_init(&mem, cost); if (ret < 0) diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index c633c8d68023..b31a71909307 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -106,12 +106,9 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) /* make sure page count doesn't overflow */ cost = (u64) cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *); cost += cpu_map_bitmap_size(attr) * num_possible_cpus(); - if (cost >= U32_MAX - PAGE_SIZE) - goto free_cmap; /* Notice returns -EPERM on if map size is larger than memlock limit */ - ret = bpf_map_charge_init(&cmap->map.memory, - round_up(cost, PAGE_SIZE) >> PAGE_SHIFT); + ret = bpf_map_charge_init(&cmap->map.memory, cost); if (ret) { err = ret; goto free_cmap; diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 371bd880ed58..5ae7cce5ef16 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -108,12 +108,9 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) /* make sure page count doesn't overflow */ cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *); cost += dev_map_bitmap_size(attr) * num_possible_cpus(); - if (cost >= U32_MAX - PAGE_SIZE) - goto free_dtab; /* if map size is larger than memlock limit, reject it */ - err = bpf_map_charge_init(&dtab->map.memory, - round_up(cost, PAGE_SIZE) >> PAGE_SHIFT); + err = bpf_map_charge_init(&dtab->map.memory, cost); if (err) goto free_dtab; diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index b0bdc7b040ad..d92e05d9979b 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -360,13 +360,8 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) else cost += (u64) htab->elem_size * num_possible_cpus(); - if (cost >= U32_MAX - PAGE_SIZE) - /* make sure page count doesn't overflow */ - goto free_htab; - /* if map size is larger than memlock limit, reject it */ - err = bpf_map_charge_init(&htab->map.memory, - round_up(cost, PAGE_SIZE) >> PAGE_SHIFT); + err = bpf_map_charge_init(&htab->map.memory, cost); if (err) goto free_htab; diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index e49bfd4f4f6d..addd6fdceec8 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -273,7 +273,6 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) int numa_node = bpf_map_attr_numa_node(attr); struct bpf_cgroup_storage_map *map; struct bpf_map_memory mem; - u32 pages; int ret; if (attr->key_size != sizeof(struct bpf_cgroup_storage_key)) @@ -293,9 +292,7 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) /* max_entries is not used and enforced to be 0 */ return ERR_PTR(-EINVAL); - pages = round_up(sizeof(struct bpf_cgroup_storage_map), PAGE_SIZE) >> - PAGE_SHIFT; - ret = bpf_map_charge_init(&mem, pages); + ret = bpf_map_charge_init(&mem, sizeof(struct bpf_cgroup_storage_map)); if (ret < 0) return ERR_PTR(ret); diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 6345a8d2dcd0..09334f13a8a0 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -573,13 +573,8 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) cost_per_node = sizeof(struct lpm_trie_node) + attr->value_size + trie->data_size; cost += (u64) attr->max_entries * cost_per_node; - if (cost >= U32_MAX - PAGE_SIZE) { - ret = -E2BIG; - goto out_err; - } - ret = bpf_map_charge_init(&trie->map.memory, - round_up(cost, PAGE_SIZE) >> PAGE_SHIFT); + ret = bpf_map_charge_init(&trie->map.memory, cost); if (ret) goto out_err; diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index 224cb0fd8f03..f697647ceb54 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -73,10 +73,6 @@ static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr) size = (u64) attr->max_entries + 1; cost = queue_size = sizeof(*qs) + size * attr->value_size; - if (cost >= U32_MAX - PAGE_SIZE) - return ERR_PTR(-E2BIG); - - cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; ret = bpf_map_charge_init(&mem, cost); if (ret < 0) diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c index 5c6e25b1b9b1..50c083ba978c 100644 --- a/kernel/bpf/reuseport_array.c +++ b/kernel/bpf/reuseport_array.c @@ -152,7 +152,7 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr) int err, numa_node = bpf_map_attr_numa_node(attr); struct reuseport_array *array; struct bpf_map_memory mem; - u64 cost, array_size; + u64 array_size; if (!capable(CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); @@ -160,13 +160,7 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr) array_size = sizeof(*array); array_size += (u64)attr->max_entries * sizeof(struct sock *); - /* make sure there is no u32 overflow later in round_up() */ - cost = array_size; - if (cost >= U32_MAX - PAGE_SIZE) - return ERR_PTR(-ENOMEM); - cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; - - err = bpf_map_charge_init(&mem, cost); + err = bpf_map_charge_init(&mem, array_size); if (err) return ERR_PTR(err); diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 8da24ca65d97..3d86072d8e32 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -117,14 +117,8 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) n_buckets = roundup_pow_of_two(attr->max_entries); cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap); - if (cost >= U32_MAX - PAGE_SIZE) - return ERR_PTR(-E2BIG); cost += n_buckets * (value_size + sizeof(struct stack_map_bucket)); - if (cost >= U32_MAX - PAGE_SIZE) - return ERR_PTR(-E2BIG); - - err = bpf_map_charge_init(&mem, - round_up(cost, PAGE_SIZE) >> PAGE_SHIFT); + err = bpf_map_charge_init(&mem, cost); if (err) return ERR_PTR(err); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 4a5ebad99154..4c53cbd3329d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -205,11 +205,16 @@ static void bpf_uncharge_memlock(struct user_struct *user, u32 pages) atomic_long_sub(pages, &user->locked_vm); } -int bpf_map_charge_init(struct bpf_map_memory *mem, u32 pages) +int bpf_map_charge_init(struct bpf_map_memory *mem, size_t size) { - struct user_struct *user = get_current_user(); + u32 pages = round_up(size, PAGE_SIZE) >> PAGE_SHIFT; + struct user_struct *user; int ret; + if (size >= U32_MAX - PAGE_SIZE) + return -E2BIG; + + user = get_current_user(); ret = bpf_charge_memlock(user, pages); if (ret) { free_uid(user); diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index a329dab7c7a4..22066c28ba61 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c @@ -37,12 +37,9 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) cost = (u64)m->map.max_entries * sizeof(struct xdp_sock *); cost += sizeof(struct list_head) * num_possible_cpus(); - if (cost >= U32_MAX - PAGE_SIZE) - goto free_m; /* Notice returns -EPERM on if map size is larger than memlock limit */ - err = bpf_map_charge_init(&m->map.memory, - round_up(cost, PAGE_SIZE) >> PAGE_SHIFT); + err = bpf_map_charge_init(&m->map.memory, cost); if (err) goto free_m; -- cgit v1.2.3 From 3bd3706251ee8ab67e69d9340ac2abdca217e733 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 23 Apr 2019 16:26:36 +0200 Subject: sched/core: Provide a pointer to the valid CPU mask In commit: 4b53a3412d66 ("sched/core: Remove the tsk_nr_cpus_allowed() wrapper") the tsk_nr_cpus_allowed() wrapper was removed. There was not much difference in !RT but in RT we used this to implement migrate_disable(). Within a migrate_disable() section the CPU mask is restricted to single CPU while the "normal" CPU mask remains untouched. As an alternative implementation Ingo suggested to use: struct task_struct { const cpumask_t *cpus_ptr; cpumask_t cpus_mask; }; with t->cpus_ptr = &t->cpus_mask; In -RT we then can switch the cpus_ptr to: t->cpus_ptr = &cpumask_of(task_cpu(p)); in a migration disabled region. The rules are simple: - Code that 'uses' ->cpus_allowed would use the pointer. - Code that 'modifies' ->cpus_allowed would use the direct mask. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Thomas Gleixner Cc: Linus Torvalds Cc: Peter Zijlstra Link: https://lkml.kernel.org/r/20190423142636.14347-1-bigeasy@linutronix.de Signed-off-by: Ingo Molnar --- kernel/cgroup/cpuset.c | 2 +- kernel/fork.c | 2 ++ kernel/sched/core.c | 40 ++++++++++++++++++++-------------------- kernel/sched/cpudeadline.c | 4 ++-- kernel/sched/cpupri.c | 4 ++-- kernel/sched/deadline.c | 6 +++--- kernel/sched/fair.c | 34 +++++++++++++++++----------------- kernel/sched/rt.c | 4 ++-- kernel/trace/trace_hwlat.c | 2 +- 9 files changed, 50 insertions(+), 48 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 6a1942ed781c..fe90fa1899e6 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -2829,7 +2829,7 @@ static void cpuset_fork(struct task_struct *task) if (task_css_is_root(task, cpuset_cgrp_id)) return; - set_cpus_allowed_ptr(task, ¤t->cpus_allowed); + set_cpus_allowed_ptr(task, current->cpus_ptr); task->mems_allowed = current->mems_allowed; } diff --git a/kernel/fork.c b/kernel/fork.c index 75675b9bf6df..6be686283e55 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -894,6 +894,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) #ifdef CONFIG_STACKPROTECTOR tsk->stack_canary = get_random_canary(); #endif + if (orig->cpus_ptr == &orig->cpus_mask) + tsk->cpus_ptr = &tsk->cpus_mask; /* * One for us, one for whoever does the "release_task()" (usually diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 874c427742a9..93ab85f0d076 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -930,7 +930,7 @@ static inline bool is_per_cpu_kthread(struct task_struct *p) */ static inline bool is_cpu_allowed(struct task_struct *p, int cpu) { - if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) + if (!cpumask_test_cpu(cpu, p->cpus_ptr)) return false; if (is_per_cpu_kthread(p)) @@ -1025,7 +1025,7 @@ static int migration_cpu_stop(void *data) local_irq_disable(); /* * We need to explicitly wake pending tasks before running - * __migrate_task() such that we will not miss enforcing cpus_allowed + * __migrate_task() such that we will not miss enforcing cpus_ptr * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test. */ sched_ttwu_pending(); @@ -1056,7 +1056,7 @@ static int migration_cpu_stop(void *data) */ void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) { - cpumask_copy(&p->cpus_allowed, new_mask); + cpumask_copy(&p->cpus_mask, new_mask); p->nr_cpus_allowed = cpumask_weight(new_mask); } @@ -1126,7 +1126,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, goto out; } - if (cpumask_equal(&p->cpus_allowed, new_mask)) + if (cpumask_equal(p->cpus_ptr, new_mask)) goto out; if (!cpumask_intersects(new_mask, cpu_valid_mask)) { @@ -1286,10 +1286,10 @@ static int migrate_swap_stop(void *data) if (task_cpu(arg->src_task) != arg->src_cpu) goto unlock; - if (!cpumask_test_cpu(arg->dst_cpu, &arg->src_task->cpus_allowed)) + if (!cpumask_test_cpu(arg->dst_cpu, arg->src_task->cpus_ptr)) goto unlock; - if (!cpumask_test_cpu(arg->src_cpu, &arg->dst_task->cpus_allowed)) + if (!cpumask_test_cpu(arg->src_cpu, arg->dst_task->cpus_ptr)) goto unlock; __migrate_swap_task(arg->src_task, arg->dst_cpu); @@ -1331,10 +1331,10 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p, if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu)) goto out; - if (!cpumask_test_cpu(arg.dst_cpu, &arg.src_task->cpus_allowed)) + if (!cpumask_test_cpu(arg.dst_cpu, arg.src_task->cpus_ptr)) goto out; - if (!cpumask_test_cpu(arg.src_cpu, &arg.dst_task->cpus_allowed)) + if (!cpumask_test_cpu(arg.src_cpu, arg.dst_task->cpus_ptr)) goto out; trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu); @@ -1479,7 +1479,7 @@ void kick_process(struct task_struct *p) EXPORT_SYMBOL_GPL(kick_process); /* - * ->cpus_allowed is protected by both rq->lock and p->pi_lock + * ->cpus_ptr is protected by both rq->lock and p->pi_lock * * A few notes on cpu_active vs cpu_online: * @@ -1519,14 +1519,14 @@ static int select_fallback_rq(int cpu, struct task_struct *p) for_each_cpu(dest_cpu, nodemask) { if (!cpu_active(dest_cpu)) continue; - if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) + if (cpumask_test_cpu(dest_cpu, p->cpus_ptr)) return dest_cpu; } } for (;;) { /* Any allowed, online CPU? */ - for_each_cpu(dest_cpu, &p->cpus_allowed) { + for_each_cpu(dest_cpu, p->cpus_ptr) { if (!is_cpu_allowed(p, dest_cpu)) continue; @@ -1570,7 +1570,7 @@ out: } /* - * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. + * The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable. */ static inline int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) @@ -1580,11 +1580,11 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) if (p->nr_cpus_allowed > 1) cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); else - cpu = cpumask_any(&p->cpus_allowed); + cpu = cpumask_any(p->cpus_ptr); /* * In order not to call set_task_cpu() on a blocking task we need - * to rely on ttwu() to place the task on a valid ->cpus_allowed + * to rely on ttwu() to place the task on a valid ->cpus_ptr * CPU. * * Since this is common to all placement strategies, this lives here. @@ -2395,7 +2395,7 @@ void wake_up_new_task(struct task_struct *p) #ifdef CONFIG_SMP /* * Fork balancing, do it here and not earlier because: - * - cpus_allowed can change in the fork path + * - cpus_ptr can change in the fork path * - any previously selected CPU might disappear through hotplug * * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq, @@ -4267,7 +4267,7 @@ change: * the entire root_domain to become SCHED_DEADLINE. We * will also fail if there's no bandwidth available. */ - if (!cpumask_subset(span, &p->cpus_allowed) || + if (!cpumask_subset(span, p->cpus_ptr) || rq->rd->dl_bw.bw == 0) { task_rq_unlock(rq, p, &rf); return -EPERM; @@ -4866,7 +4866,7 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) goto out_unlock; raw_spin_lock_irqsave(&p->pi_lock, flags); - cpumask_and(mask, &p->cpus_allowed, cpu_active_mask); + cpumask_and(mask, &p->cpus_mask, cpu_active_mask); raw_spin_unlock_irqrestore(&p->pi_lock, flags); out_unlock: @@ -5443,7 +5443,7 @@ int task_can_attach(struct task_struct *p, * allowed nodes is unnecessary. Thus, cpusets are not * applicable for such threads. This prevents checking for * success of set_cpus_allowed_ptr() on all attached tasks - * before cpus_allowed may be changed. + * before cpus_mask may be changed. */ if (p->flags & PF_NO_SETAFFINITY) { ret = -EINVAL; @@ -5470,7 +5470,7 @@ int migrate_task_to(struct task_struct *p, int target_cpu) if (curr_cpu == target_cpu) return 0; - if (!cpumask_test_cpu(target_cpu, &p->cpus_allowed)) + if (!cpumask_test_cpu(target_cpu, p->cpus_ptr)) return -EINVAL; /* TODO: This is not properly updating schedstats */ @@ -5608,7 +5608,7 @@ static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf) put_prev_task(rq, next); /* - * Rules for changing task_struct::cpus_allowed are holding + * Rules for changing task_struct::cpus_mask are holding * both pi_lock and rq->lock, such that holding either * stabilizes the mask. * diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 50316455ea66..d57fb2f8ae67 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -124,14 +124,14 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, const struct sched_dl_entity *dl_se = &p->dl; if (later_mask && - cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) { + cpumask_and(later_mask, cp->free_cpus, p->cpus_ptr)) { return 1; } else { int best_cpu = cpudl_maximum(cp); WARN_ON(best_cpu != -1 && !cpu_present(best_cpu)); - if (cpumask_test_cpu(best_cpu, &p->cpus_allowed) && + if (cpumask_test_cpu(best_cpu, p->cpus_ptr) && dl_time_before(dl_se->deadline, cp->elements[0].dl)) { if (later_mask) cpumask_set_cpu(best_cpu, later_mask); diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index daaadf939ccb..f7d2c10b4c92 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -98,11 +98,11 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, if (skip) continue; - if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids) + if (cpumask_any_and(p->cpus_ptr, vec->mask) >= nr_cpu_ids) continue; if (lowest_mask) { - cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask); + cpumask_and(lowest_mask, p->cpus_ptr, vec->mask); /* * We have to ensure that we have at least one bit diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 43901fa3f269..c1ef30861068 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -538,7 +538,7 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p * If we cannot preempt any rq, fall back to pick any * online CPU: */ - cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed); + cpu = cpumask_any_and(cpu_active_mask, p->cpus_ptr); if (cpu >= nr_cpu_ids) { /* * Failed to find any suitable CPU. @@ -1824,7 +1824,7 @@ static void set_curr_task_dl(struct rq *rq) static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) { if (!task_running(rq, p) && - cpumask_test_cpu(cpu, &p->cpus_allowed)) + cpumask_test_cpu(cpu, p->cpus_ptr)) return 1; return 0; } @@ -1974,7 +1974,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) /* Retry if something changed. */ if (double_lock_balance(rq, later_rq)) { if (unlikely(task_rq(task) != rq || - !cpumask_test_cpu(later_rq->cpu, &task->cpus_allowed) || + !cpumask_test_cpu(later_rq->cpu, task->cpus_ptr) || task_running(rq, task) || !dl_task(task) || !task_on_rq_queued(task))) { diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f35930f5e528..8691a8fffe40 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1621,7 +1621,7 @@ static void task_numa_compare(struct task_numa_env *env, * be incurred if the tasks were swapped. */ /* Skip this swap candidate if cannot move to the source cpu */ - if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed)) + if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr)) goto unlock; /* @@ -1718,7 +1718,7 @@ static void task_numa_find_cpu(struct task_numa_env *env, for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) { /* Skip this CPU if the source task cannot migrate */ - if (!cpumask_test_cpu(cpu, &env->p->cpus_allowed)) + if (!cpumask_test_cpu(cpu, env->p->cpus_ptr)) continue; env->dst_cpu = cpu; @@ -5831,7 +5831,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, /* Skip over this group if it has no CPUs allowed */ if (!cpumask_intersects(sched_group_span(group), - &p->cpus_allowed)) + p->cpus_ptr)) continue; local_group = cpumask_test_cpu(this_cpu, @@ -5963,7 +5963,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this return cpumask_first(sched_group_span(group)); /* Traverse only the allowed CPUs */ - for_each_cpu_and(i, sched_group_span(group), &p->cpus_allowed) { + for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) { if (available_idle_cpu(i)) { struct rq *rq = cpu_rq(i); struct cpuidle_state *idle = idle_get_state(rq); @@ -6003,7 +6003,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p { int new_cpu = cpu; - if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed)) + if (!cpumask_intersects(sched_domain_span(sd), p->cpus_ptr)) return prev_cpu; /* @@ -6120,7 +6120,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int if (!test_idle_cores(target, false)) return -1; - cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed); + cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); for_each_cpu_wrap(core, cpus, target) { bool idle = true; @@ -6154,7 +6154,7 @@ static int select_idle_smt(struct task_struct *p, int target) return -1; for_each_cpu(cpu, cpu_smt_mask(target)) { - if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) + if (!cpumask_test_cpu(cpu, p->cpus_ptr)) continue; if (available_idle_cpu(cpu)) return cpu; @@ -6217,7 +6217,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t for_each_cpu_wrap(cpu, sched_domain_span(sd), target) { if (!--nr) return -1; - if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) + if (!cpumask_test_cpu(cpu, p->cpus_ptr)) continue; if (available_idle_cpu(cpu)) break; @@ -6254,7 +6254,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) recent_used_cpu != target && cpus_share_cache(recent_used_cpu, target) && available_idle_cpu(recent_used_cpu) && - cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) { + cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr)) { /* * Replace recent_used_cpu with prev as it is a potential * candidate for the next wake: @@ -6600,7 +6600,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) int max_spare_cap_cpu = -1; for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) { - if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) + if (!cpumask_test_cpu(cpu, p->cpus_ptr)) continue; /* Skip CPUs that will be overutilized. */ @@ -6689,7 +6689,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f } want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) && - cpumask_test_cpu(cpu, &p->cpus_allowed); + cpumask_test_cpu(cpu, p->cpus_ptr); } rcu_read_lock(); @@ -7445,14 +7445,14 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) /* * We do not migrate tasks that are: * 1) throttled_lb_pair, or - * 2) cannot be migrated to this CPU due to cpus_allowed, or + * 2) cannot be migrated to this CPU due to cpus_ptr, or * 3) running (obviously), or * 4) are cache-hot on their current CPU. */ if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) return 0; - if (!cpumask_test_cpu(env->dst_cpu, &p->cpus_allowed)) { + if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) { int cpu; schedstat_inc(p->se.statistics.nr_failed_migrations_affine); @@ -7472,7 +7472,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) /* Prevent to re-select dst_cpu via env's CPUs: */ for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { - if (cpumask_test_cpu(cpu, &p->cpus_allowed)) { + if (cpumask_test_cpu(cpu, p->cpus_ptr)) { env->flags |= LBF_DST_PINNED; env->new_dst_cpu = cpu; break; @@ -8099,7 +8099,7 @@ static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd) /* * Group imbalance indicates (and tries to solve) the problem where balancing - * groups is inadequate due to ->cpus_allowed constraints. + * groups is inadequate due to ->cpus_ptr constraints. * * Imagine a situation of two groups of 4 CPUs each and 4 tasks each with a * cpumask covering 1 CPU of the first group and 3 CPUs of the second group. @@ -8768,7 +8768,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) /* * If the busiest group is imbalanced the below checks don't * work because they assume all things are equal, which typically - * isn't true due to cpus_allowed constraints and the like. + * isn't true due to cpus_ptr constraints and the like. */ if (busiest->group_type == group_imbalanced) goto force_balance; @@ -9210,7 +9210,7 @@ more_balance: * if the curr task on busiest CPU can't be * moved to this_cpu: */ - if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) { + if (!cpumask_test_cpu(this_cpu, busiest->curr->cpus_ptr)) { raw_spin_unlock_irqrestore(&busiest->lock, flags); env.flags |= LBF_ALL_PINNED; diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 1e6b909dca36..63ad7c90822c 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1614,7 +1614,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) { if (!task_running(rq, p) && - cpumask_test_cpu(cpu, &p->cpus_allowed)) + cpumask_test_cpu(cpu, p->cpus_ptr)) return 1; return 0; @@ -1751,7 +1751,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) * Also make sure that it wasn't scheduled on its rq. */ if (unlikely(task_rq(task) != rq || - !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_allowed) || + !cpumask_test_cpu(lowest_rq->cpu, task->cpus_ptr) || task_running(rq, task) || !rt_task(task) || !task_on_rq_queued(task))) { diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index 1e6db9cbe4dc..fa95139445b2 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -277,7 +277,7 @@ static void move_to_next_cpu(void) * of this thread, than stop migrating for the duration * of the current test. */ - if (!cpumask_equal(current_mask, ¤t->cpus_allowed)) + if (!cpumask_equal(current_mask, current->cpus_ptr)) goto disable; get_online_cpus(); -- cgit v1.2.3 From f2bedc4705659216bd60948029ad8dfedf923ad9 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Wed, 24 Apr 2019 09:45:56 +0100 Subject: sched/fair: Remove rq->load The CFS class is the only one maintaining and using the CPU wide load (rq->load(.weight)). The last use case of the CPU wide load in CFS's set_next_entity() can be replaced by using the load of the CFS class (rq->cfs.load(.weight)) instead. Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20190424084556.604-1-dietmar.eggemann@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 2 -- kernel/sched/fair.c | 7 ++----- kernel/sched/sched.h | 2 -- 3 files changed, 2 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 678bfb9bd87f..150043e1d716 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -656,8 +656,6 @@ do { \ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x)) P(nr_running); - SEQ_printf(m, " .%-30s: %lu\n", "load", - rq->load.weight); P(nr_switches); P(nr_load_updates); P(nr_uninterruptible); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8691a8fffe40..08b1cb06f968 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2686,8 +2686,6 @@ static void account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) { update_load_add(&cfs_rq->load, se->load.weight); - if (!parent_entity(se)) - update_load_add(&rq_of(cfs_rq)->load, se->load.weight); #ifdef CONFIG_SMP if (entity_is_task(se)) { struct rq *rq = rq_of(cfs_rq); @@ -2703,8 +2701,6 @@ static void account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) { update_load_sub(&cfs_rq->load, se->load.weight); - if (!parent_entity(se)) - update_load_sub(&rq_of(cfs_rq)->load, se->load.weight); #ifdef CONFIG_SMP if (entity_is_task(se)) { account_numa_dequeue(rq_of(cfs_rq), task_of(se)); @@ -4100,7 +4096,8 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) * least twice that of our own weight (i.e. dont track it * when there are only lesser-weight tasks around): */ - if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { + if (schedstat_enabled() && + rq_of(cfs_rq)->cfs.load.weight >= 2*se->load.weight) { schedstat_set(se->statistics.slice_max, max((u64)schedstat_val(se->statistics.slice_max), se->sum_exec_runtime - se->prev_sum_exec_runtime)); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b52ed1ada0be..c308410675ed 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -830,8 +830,6 @@ struct rq { atomic_t nohz_flags; #endif /* CONFIG_NO_HZ_COMMON */ - /* capture load from *all* tasks on this CPU: */ - struct load_weight load; unsigned long nr_load_updates; u64 nr_switches; -- cgit v1.2.3 From 5e83eafbfd3b351537c0d74467fc43e8a88f4ae4 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Mon, 27 May 2019 07:21:10 +0100 Subject: sched/fair: Remove the rq->cpu_load[] update code With LB_BIAS disabled, there is no need to update the rq->cpu_load[idx] any more. Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Acked-by: Rik van Riel Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Patrick Bellasi Cc: Peter Zijlstra Cc: Quentin Perret Cc: Thomas Gleixner Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lkml.kernel.org/r/20190527062116.11512-2-dietmar.eggemann@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 1 - kernel/sched/fair.c | 255 ----------------------------------------------- kernel/sched/sched.h | 6 -- kernel/time/tick-sched.c | 2 - 4 files changed, 264 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 93ab85f0d076..00b8966802a8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3033,7 +3033,6 @@ void scheduler_tick(void) update_rq_clock(rq); curr->sched_class->task_tick(rq, curr, 0); - cpu_load_update_active(rq); calc_global_load_tick(rq); psi_task_tick(rq); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 08b1cb06f968..1aab323f1b4b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5322,71 +5322,6 @@ DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); DEFINE_PER_CPU(cpumask_var_t, select_idle_mask); #ifdef CONFIG_NO_HZ_COMMON -/* - * per rq 'load' arrray crap; XXX kill this. - */ - -/* - * The exact cpuload calculated at every tick would be: - * - * load' = (1 - 1/2^i) * load + (1/2^i) * cur_load - * - * If a CPU misses updates for n ticks (as it was idle) and update gets - * called on the n+1-th tick when CPU may be busy, then we have: - * - * load_n = (1 - 1/2^i)^n * load_0 - * load_n+1 = (1 - 1/2^i) * load_n + (1/2^i) * cur_load - * - * decay_load_missed() below does efficient calculation of - * - * load' = (1 - 1/2^i)^n * load - * - * Because x^(n+m) := x^n * x^m we can decompose any x^n in power-of-2 factors. - * This allows us to precompute the above in said factors, thereby allowing the - * reduction of an arbitrary n in O(log_2 n) steps. (See also - * fixed_power_int()) - * - * The calculation is approximated on a 128 point scale. - */ -#define DEGRADE_SHIFT 7 - -static const u8 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; -static const u8 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { - { 0, 0, 0, 0, 0, 0, 0, 0 }, - { 64, 32, 8, 0, 0, 0, 0, 0 }, - { 96, 72, 40, 12, 1, 0, 0, 0 }, - { 112, 98, 75, 43, 15, 1, 0, 0 }, - { 120, 112, 98, 76, 45, 16, 2, 0 } -}; - -/* - * Update cpu_load for any missed ticks, due to tickless idle. The backlog - * would be when CPU is idle and so we just decay the old load without - * adding any new load. - */ -static unsigned long -decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) -{ - int j = 0; - - if (!missed_updates) - return load; - - if (missed_updates >= degrade_zero_ticks[idx]) - return 0; - - if (idx == 1) - return load >> missed_updates; - - while (missed_updates) { - if (missed_updates % 2) - load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT; - - missed_updates >>= 1; - j++; - } - return load; -} static struct { cpumask_var_t idle_cpus_mask; @@ -5398,201 +5333,12 @@ static struct { #endif /* CONFIG_NO_HZ_COMMON */ -/** - * __cpu_load_update - update the rq->cpu_load[] statistics - * @this_rq: The rq to update statistics for - * @this_load: The current load - * @pending_updates: The number of missed updates - * - * Update rq->cpu_load[] statistics. This function is usually called every - * scheduler tick (TICK_NSEC). - * - * This function computes a decaying average: - * - * load[i]' = (1 - 1/2^i) * load[i] + (1/2^i) * load - * - * Because of NOHZ it might not get called on every tick which gives need for - * the @pending_updates argument. - * - * load[i]_n = (1 - 1/2^i) * load[i]_n-1 + (1/2^i) * load_n-1 - * = A * load[i]_n-1 + B ; A := (1 - 1/2^i), B := (1/2^i) * load - * = A * (A * load[i]_n-2 + B) + B - * = A * (A * (A * load[i]_n-3 + B) + B) + B - * = A^3 * load[i]_n-3 + (A^2 + A + 1) * B - * = A^n * load[i]_0 + (A^(n-1) + A^(n-2) + ... + 1) * B - * = A^n * load[i]_0 + ((1 - A^n) / (1 - A)) * B - * = (1 - 1/2^i)^n * (load[i]_0 - load) + load - * - * In the above we've assumed load_n := load, which is true for NOHZ_FULL as - * any change in load would have resulted in the tick being turned back on. - * - * For regular NOHZ, this reduces to: - * - * load[i]_n = (1 - 1/2^i)^n * load[i]_0 - * - * see decay_load_misses(). For NOHZ_FULL we get to subtract and add the extra - * term. - */ -static void cpu_load_update(struct rq *this_rq, unsigned long this_load, - unsigned long pending_updates) -{ - unsigned long __maybe_unused tickless_load = this_rq->cpu_load[0]; - int i, scale; - - this_rq->nr_load_updates++; - - /* Update our load: */ - this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ - for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { - unsigned long old_load, new_load; - - /* scale is effectively 1 << i now, and >> i divides by scale */ - - old_load = this_rq->cpu_load[i]; -#ifdef CONFIG_NO_HZ_COMMON - old_load = decay_load_missed(old_load, pending_updates - 1, i); - if (tickless_load) { - old_load -= decay_load_missed(tickless_load, pending_updates - 1, i); - /* - * old_load can never be a negative value because a - * decayed tickless_load cannot be greater than the - * original tickless_load. - */ - old_load += tickless_load; - } -#endif - new_load = this_load; - /* - * Round up the averaging division if load is increasing. This - * prevents us from getting stuck on 9 if the load is 10, for - * example. - */ - if (new_load > old_load) - new_load += scale - 1; - - this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; - } -} - /* Used instead of source_load when we know the type == 0 */ static unsigned long weighted_cpuload(struct rq *rq) { return cfs_rq_runnable_load_avg(&rq->cfs); } -#ifdef CONFIG_NO_HZ_COMMON -/* - * There is no sane way to deal with nohz on smp when using jiffies because the - * CPU doing the jiffies update might drift wrt the CPU doing the jiffy reading - * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. - * - * Therefore we need to avoid the delta approach from the regular tick when - * possible since that would seriously skew the load calculation. This is why we - * use cpu_load_update_periodic() for CPUs out of nohz. However we'll rely on - * jiffies deltas for updates happening while in nohz mode (idle ticks, idle - * loop exit, nohz_idle_balance, nohz full exit...) - * - * This means we might still be one tick off for nohz periods. - */ - -static void cpu_load_update_nohz(struct rq *this_rq, - unsigned long curr_jiffies, - unsigned long load) -{ - unsigned long pending_updates; - - pending_updates = curr_jiffies - this_rq->last_load_update_tick; - if (pending_updates) { - this_rq->last_load_update_tick = curr_jiffies; - /* - * In the regular NOHZ case, we were idle, this means load 0. - * In the NOHZ_FULL case, we were non-idle, we should consider - * its weighted load. - */ - cpu_load_update(this_rq, load, pending_updates); - } -} - -/* - * Called from nohz_idle_balance() to update the load ratings before doing the - * idle balance. - */ -static void cpu_load_update_idle(struct rq *this_rq) -{ - /* - * bail if there's load or we're actually up-to-date. - */ - if (weighted_cpuload(this_rq)) - return; - - cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0); -} - -/* - * Record CPU load on nohz entry so we know the tickless load to account - * on nohz exit. cpu_load[0] happens then to be updated more frequently - * than other cpu_load[idx] but it should be fine as cpu_load readers - * shouldn't rely into synchronized cpu_load[*] updates. - */ -void cpu_load_update_nohz_start(void) -{ - struct rq *this_rq = this_rq(); - - /* - * This is all lockless but should be fine. If weighted_cpuload changes - * concurrently we'll exit nohz. And cpu_load write can race with - * cpu_load_update_idle() but both updater would be writing the same. - */ - this_rq->cpu_load[0] = weighted_cpuload(this_rq); -} - -/* - * Account the tickless load in the end of a nohz frame. - */ -void cpu_load_update_nohz_stop(void) -{ - unsigned long curr_jiffies = READ_ONCE(jiffies); - struct rq *this_rq = this_rq(); - unsigned long load; - struct rq_flags rf; - - if (curr_jiffies == this_rq->last_load_update_tick) - return; - - load = weighted_cpuload(this_rq); - rq_lock(this_rq, &rf); - update_rq_clock(this_rq); - cpu_load_update_nohz(this_rq, curr_jiffies, load); - rq_unlock(this_rq, &rf); -} -#else /* !CONFIG_NO_HZ_COMMON */ -static inline void cpu_load_update_nohz(struct rq *this_rq, - unsigned long curr_jiffies, - unsigned long load) { } -#endif /* CONFIG_NO_HZ_COMMON */ - -static void cpu_load_update_periodic(struct rq *this_rq, unsigned long load) -{ -#ifdef CONFIG_NO_HZ_COMMON - /* See the mess around cpu_load_update_nohz(). */ - this_rq->last_load_update_tick = READ_ONCE(jiffies); -#endif - cpu_load_update(this_rq, load, 1); -} - -/* - * Called from scheduler_tick() - */ -void cpu_load_update_active(struct rq *this_rq) -{ - unsigned long load = weighted_cpuload(this_rq); - - if (tick_nohz_tick_stopped()) - cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load); - else - cpu_load_update_periodic(this_rq, load); -} - /* * Return a low guess at the load of a migration-source CPU weighted * according to the scheduling class and "nice" value. @@ -9876,7 +9622,6 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, rq_lock_irqsave(rq, &rf); update_rq_clock(rq); - cpu_load_update_idle(rq); rq_unlock_irqrestore(rq, &rf); if (flags & NOHZ_BALANCE_KICK) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c308410675ed..3750b5e53792 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -96,12 +96,6 @@ extern atomic_long_t calc_load_tasks; extern void calc_global_load_tick(struct rq *this_rq); extern long calc_load_fold_active(struct rq *this_rq, long adjust); -#ifdef CONFIG_SMP -extern void cpu_load_update_active(struct rq *this_rq); -#else -static inline void cpu_load_update_active(struct rq *this_rq) { } -#endif - /* * Helpers for converting nanosecond timing to jiffy resolution */ diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index f4ee1a3428ae..be9707f68024 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -782,7 +782,6 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu) */ if (!ts->tick_stopped) { calc_load_nohz_start(); - cpu_load_update_nohz_start(); quiet_vmstat(); ts->last_tick = hrtimer_get_expires(&ts->sched_timer); @@ -829,7 +828,6 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) { /* Update jiffies first */ tick_do_update_jiffies64(now); - cpu_load_update_nohz_stop(); /* * Clear the timer idle flag, so we avoid IPIs on remote queueing and -- cgit v1.2.3 From 1c1b8a7b03ef50f80f5d0c871ee261c04a6c967e Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Mon, 27 May 2019 07:21:11 +0100 Subject: sched/fair: Replace source_load() & target_load() with weighted_cpuload() With LB_BIAS disabled, source_load() & target_load() return weighted_cpuload(). Replace both with calls to weighted_cpuload(). The function to obtain the load index (sd->*_idx) for an sd, get_sd_load_idx(), can be removed as well. Finally, get rid of the sched feature LB_BIAS. Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Acked-by: Rik van Riel Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Patrick Bellasi Cc: Peter Zijlstra Cc: Quentin Perret Cc: Thomas Gleixner Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lkml.kernel.org/r/20190527062116.11512-3-dietmar.eggemann@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 90 +++---------------------------------------------- kernel/sched/features.h | 1 - 2 files changed, 4 insertions(+), 87 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1aab323f1b4b..5b9691e5ea59 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1467,8 +1467,6 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, } static unsigned long weighted_cpuload(struct rq *rq); -static unsigned long source_load(int cpu, int type); -static unsigned long target_load(int cpu, int type); /* Cached statistics for all CPUs within a node */ struct numa_stats { @@ -5333,45 +5331,11 @@ static struct { #endif /* CONFIG_NO_HZ_COMMON */ -/* Used instead of source_load when we know the type == 0 */ static unsigned long weighted_cpuload(struct rq *rq) { return cfs_rq_runnable_load_avg(&rq->cfs); } -/* - * Return a low guess at the load of a migration-source CPU weighted - * according to the scheduling class and "nice" value. - * - * We want to under-estimate the load of migration sources, to - * balance conservatively. - */ -static unsigned long source_load(int cpu, int type) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long total = weighted_cpuload(rq); - - if (type == 0 || !sched_feat(LB_BIAS)) - return total; - - return min(rq->cpu_load[type-1], total); -} - -/* - * Return a high guess at the load of a migration-target CPU weighted - * according to the scheduling class and "nice" value. - */ -static unsigned long target_load(int cpu, int type) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long total = weighted_cpuload(rq); - - if (type == 0 || !sched_feat(LB_BIAS)) - return total; - - return max(rq->cpu_load[type-1], total); -} - static unsigned long capacity_of(int cpu) { return cpu_rq(cpu)->cpu_capacity; @@ -5479,7 +5443,7 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p, s64 this_eff_load, prev_eff_load; unsigned long task_load; - this_eff_load = target_load(this_cpu, sd->wake_idx); + this_eff_load = weighted_cpuload(cpu_rq(this_cpu)); if (sync) { unsigned long current_load = task_h_load(current); @@ -5497,7 +5461,7 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p, this_eff_load *= 100; this_eff_load *= capacity_of(prev_cpu); - prev_eff_load = source_load(prev_cpu, sd->wake_idx); + prev_eff_load = weighted_cpuload(cpu_rq(prev_cpu)); prev_eff_load -= task_load; if (sched_feat(WA_BIAS)) prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2; @@ -5558,14 +5522,10 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, unsigned long this_runnable_load = ULONG_MAX; unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX; unsigned long most_spare = 0, this_spare = 0; - int load_idx = sd->forkexec_idx; int imbalance_scale = 100 + (sd->imbalance_pct-100)/2; unsigned long imbalance = scale_load_down(NICE_0_LOAD) * (sd->imbalance_pct-100) / 100; - if (sd_flag & SD_BALANCE_WAKE) - load_idx = sd->wake_idx; - do { unsigned long load, avg_load, runnable_load; unsigned long spare_cap, max_spare_cap; @@ -5589,12 +5549,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, max_spare_cap = 0; for_each_cpu(i, sched_group_span(group)) { - /* Bias balancing toward CPUs of our domain */ - if (local_group) - load = source_load(i, load_idx); - else - load = target_load(i, load_idx); - + load = weighted_cpuload(cpu_rq(i)); runnable_load += load; avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs); @@ -7676,34 +7631,6 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds) }; } -/** - * get_sd_load_idx - Obtain the load index for a given sched domain. - * @sd: The sched_domain whose load_idx is to be obtained. - * @idle: The idle status of the CPU for whose sd load_idx is obtained. - * - * Return: The load index. - */ -static inline int get_sd_load_idx(struct sched_domain *sd, - enum cpu_idle_type idle) -{ - int load_idx; - - switch (idle) { - case CPU_NOT_IDLE: - load_idx = sd->busy_idx; - break; - - case CPU_NEWLY_IDLE: - load_idx = sd->newidle_idx; - break; - default: - load_idx = sd->idle_idx; - break; - } - - return load_idx; -} - static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu) { struct rq *rq = cpu_rq(cpu); @@ -7992,9 +7919,6 @@ static inline void update_sg_lb_stats(struct lb_env *env, struct sg_lb_stats *sgs, int *sg_status) { - int local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(group)); - int load_idx = get_sd_load_idx(env->sd, env->idle); - unsigned long load; int i, nr_running; memset(sgs, 0, sizeof(*sgs)); @@ -8005,13 +7929,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false)) env->flags |= LBF_NOHZ_AGAIN; - /* Bias balancing toward CPUs of our domain: */ - if (local_group) - load = target_load(i, load_idx); - else - load = source_load(i, load_idx); - - sgs->group_load += load; + sgs->group_load += weighted_cpuload(rq); sgs->group_util += cpu_util(i); sgs->sum_nr_running += rq->cfs.h_nr_running; diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 858589b83377..2410db5e9a35 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -39,7 +39,6 @@ SCHED_FEAT(WAKEUP_PREEMPTION, true) SCHED_FEAT(HRTICK, false) SCHED_FEAT(DOUBLE_TICK, false) -SCHED_FEAT(LB_BIAS, false) /* * Decrement CPU capacity based on time not spent running tasks -- cgit v1.2.3 From 3d8d53554405952993bb0279ef3ebebc51740074 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Mon, 27 May 2019 07:21:12 +0100 Subject: sched/debug: Remove sd->*_idx range on sysctl This reverts: commit 201c373e8e48 ("sched/debug: Limit sd->*_idx range on sysctl") Load indexes (sd->*_idx) are no longer needed without rq->cpu_load[]. The range check for load indexes can be removed as well. Get rid of it before the rq->cpu_load[] since it uses CPU_LOAD_IDX_MAX. At the same time, fix the following coding style issues detected by scripts/checkpatch.pl: ERROR: space prohibited before that ',' ERROR: space prohibited before that close parenthesis ')' Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Acked-by: Rik van Riel Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Patrick Bellasi Cc: Peter Zijlstra Cc: Quentin Perret Cc: Thomas Gleixner Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lkml.kernel.org/r/20190527062116.11512-4-dietmar.eggemann@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 37 ++++++++++++++----------------------- 1 file changed, 14 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 150043e1d716..5c7b066d7de6 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -236,25 +236,16 @@ static void sd_free_ctl_entry(struct ctl_table **tablep) *tablep = NULL; } -static int min_load_idx = 0; -static int max_load_idx = CPU_LOAD_IDX_MAX-1; - static void set_table_entry(struct ctl_table *entry, const char *procname, void *data, int maxlen, - umode_t mode, proc_handler *proc_handler, - bool load_idx) + umode_t mode, proc_handler *proc_handler) { entry->procname = procname; entry->data = data; entry->maxlen = maxlen; entry->mode = mode; entry->proc_handler = proc_handler; - - if (load_idx) { - entry->extra1 = &min_load_idx; - entry->extra2 = &max_load_idx; - } } static struct ctl_table * @@ -265,19 +256,19 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd) if (table == NULL) return NULL; - set_table_entry(&table[0] , "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax, false); - set_table_entry(&table[1] , "max_interval", &sd->max_interval, sizeof(long), 0644, proc_doulongvec_minmax, false); - set_table_entry(&table[2] , "busy_idx", &sd->busy_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); - set_table_entry(&table[3] , "idle_idx", &sd->idle_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); - set_table_entry(&table[4] , "newidle_idx", &sd->newidle_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); - set_table_entry(&table[5] , "wake_idx", &sd->wake_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); - set_table_entry(&table[6] , "forkexec_idx", &sd->forkexec_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); - set_table_entry(&table[7] , "busy_factor", &sd->busy_factor, sizeof(int) , 0644, proc_dointvec_minmax, false); - set_table_entry(&table[8] , "imbalance_pct", &sd->imbalance_pct, sizeof(int) , 0644, proc_dointvec_minmax, false); - set_table_entry(&table[9] , "cache_nice_tries", &sd->cache_nice_tries, sizeof(int) , 0644, proc_dointvec_minmax, false); - set_table_entry(&table[10], "flags", &sd->flags, sizeof(int) , 0644, proc_dointvec_minmax, false); - set_table_entry(&table[11], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax, false); - set_table_entry(&table[12], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring, false); + set_table_entry(&table[0], "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax); + set_table_entry(&table[1], "max_interval", &sd->max_interval, sizeof(long), 0644, proc_doulongvec_minmax); + set_table_entry(&table[2], "busy_idx", &sd->busy_idx, sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[3], "idle_idx", &sd->idle_idx, sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[5], "wake_idx", &sd->wake_idx, sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[7], "busy_factor", &sd->busy_factor, sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[9], "cache_nice_tries", &sd->cache_nice_tries, sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[10], "flags", &sd->flags, sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[11], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax); + set_table_entry(&table[12], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring); /* &table[13] is terminator */ return table; -- cgit v1.2.3 From 55627e3cd22c315c4a02fe3bbbb7234ec439cb1d Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Mon, 27 May 2019 07:21:13 +0100 Subject: sched/core: Remove rq->cpu_load[] The per rq load array values also disappear from the cpu#X sections in /proc/sched_debug. Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Acked-by: Rik van Riel Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Patrick Bellasi Cc: Peter Zijlstra Cc: Quentin Perret Cc: Thomas Gleixner Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lkml.kernel.org/r/20190527062116.11512-5-dietmar.eggemann@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 6 +----- kernel/sched/debug.c | 5 ----- kernel/sched/sched.h | 2 -- 3 files changed, 1 insertion(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 00b8966802a8..29984d8c41f0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5901,8 +5901,8 @@ DECLARE_PER_CPU(cpumask_var_t, select_idle_mask); void __init sched_init(void) { - int i, j; unsigned long alloc_size = 0, ptr; + int i; wait_bit_init(); @@ -6004,10 +6004,6 @@ void __init sched_init(void) #ifdef CONFIG_RT_GROUP_SCHED init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); #endif - - for (j = 0; j < CPU_LOAD_IDX_MAX; j++) - rq->cpu_load[j] = 0; - #ifdef CONFIG_SMP rq->sd = NULL; rq->rd = NULL; diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 5c7b066d7de6..a0b0d6e21e5b 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -654,11 +654,6 @@ do { \ SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr))); PN(clock); PN(clock_task); - P(cpu_load[0]); - P(cpu_load[1]); - P(cpu_load[2]); - P(cpu_load[3]); - P(cpu_load[4]); #undef P #undef PN diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 3750b5e53792..607859a18b2a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -812,8 +812,6 @@ struct rq { unsigned int nr_preferred_running; unsigned int numa_migrate_on; #endif - #define CPU_LOAD_IDX_MAX 5 - unsigned long cpu_load[CPU_LOAD_IDX_MAX]; #ifdef CONFIG_NO_HZ_COMMON #ifdef CONFIG_SMP unsigned long last_load_update_tick; -- cgit v1.2.3 From 0e1fef63d92d61ed561e504c3a078a827a0f9bfe Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Mon, 27 May 2019 07:21:14 +0100 Subject: sched/core: Remove sd->*_idx The sched domain per rq load index files also disappear from the /proc/sys/kernel/sched_domain/cpuX/domainY directories. Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Acked-by: Rik van Riel Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Patrick Bellasi Cc: Peter Zijlstra Cc: Quentin Perret Cc: Thomas Gleixner Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lkml.kernel.org/r/20190527062116.11512-6-dietmar.eggemann@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 25 ++++++++++--------------- kernel/sched/topology.c | 10 ---------- 2 files changed, 10 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index a0b0d6e21e5b..7ffde8ce82fd 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -251,25 +251,20 @@ set_table_entry(struct ctl_table *entry, static struct ctl_table * sd_alloc_ctl_domain_table(struct sched_domain *sd) { - struct ctl_table *table = sd_alloc_ctl_entry(14); + struct ctl_table *table = sd_alloc_ctl_entry(9); if (table == NULL) return NULL; - set_table_entry(&table[0], "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax); - set_table_entry(&table[1], "max_interval", &sd->max_interval, sizeof(long), 0644, proc_doulongvec_minmax); - set_table_entry(&table[2], "busy_idx", &sd->busy_idx, sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[3], "idle_idx", &sd->idle_idx, sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[5], "wake_idx", &sd->wake_idx, sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[7], "busy_factor", &sd->busy_factor, sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[9], "cache_nice_tries", &sd->cache_nice_tries, sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[10], "flags", &sd->flags, sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[11], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax); - set_table_entry(&table[12], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring); - /* &table[13] is terminator */ + set_table_entry(&table[0], "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax); + set_table_entry(&table[1], "max_interval", &sd->max_interval, sizeof(long), 0644, proc_doulongvec_minmax); + set_table_entry(&table[2], "busy_factor", &sd->busy_factor, sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[3], "imbalance_pct", &sd->imbalance_pct, sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[4], "cache_nice_tries", &sd->cache_nice_tries, sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[6], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax); + set_table_entry(&table[7], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring); + /* &table[8] is terminator */ return table; } diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index f53f89df837d..63184cf0d0d7 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1344,11 +1344,6 @@ sd_init(struct sched_domain_topology_level *tl, .imbalance_pct = 125, .cache_nice_tries = 0, - .busy_idx = 0, - .idle_idx = 0, - .newidle_idx = 0, - .wake_idx = 0, - .forkexec_idx = 0, .flags = 1*SD_LOAD_BALANCE | 1*SD_BALANCE_NEWIDLE @@ -1400,13 +1395,10 @@ sd_init(struct sched_domain_topology_level *tl, } else if (sd->flags & SD_SHARE_PKG_RESOURCES) { sd->imbalance_pct = 117; sd->cache_nice_tries = 1; - sd->busy_idx = 2; #ifdef CONFIG_NUMA } else if (sd->flags & SD_NUMA) { sd->cache_nice_tries = 2; - sd->busy_idx = 3; - sd->idle_idx = 2; sd->flags &= ~SD_PREFER_SIBLING; sd->flags |= SD_SERIALIZE; @@ -1419,8 +1411,6 @@ sd_init(struct sched_domain_topology_level *tl, #endif } else { sd->cache_nice_tries = 1; - sd->busy_idx = 2; - sd->idle_idx = 1; } /* -- cgit v1.2.3 From af75d1a9a9f75bf030c2f35705f1ff6d226f96fe Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Mon, 27 May 2019 07:21:15 +0100 Subject: sched/fair: Remove sgs->sum_weighted_load Since sg_lb_stats::sum_weighted_load is now identical with sg_lb_stats::group_load remove it and replace its use case (calculating load per task) with the latter. Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Acked-by: Rik van Riel Acked-by: Vincent Guittot Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Patrick Bellasi Cc: Peter Zijlstra Cc: Quentin Perret Cc: Thomas Gleixner Cc: Valentin Schneider Link: https://lkml.kernel.org/r/20190527062116.11512-7-dietmar.eggemann@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5b9691e5ea59..7f8d477f90fe 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7577,7 +7577,6 @@ static unsigned long task_h_load(struct task_struct *p) struct sg_lb_stats { unsigned long avg_load; /*Avg load across the CPUs of the group */ unsigned long group_load; /* Total load over the CPUs of the group */ - unsigned long sum_weighted_load; /* Weighted load of group's tasks */ unsigned long load_per_task; unsigned long group_capacity; unsigned long group_util; /* Total utilization of the group */ @@ -7944,7 +7943,6 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->nr_numa_running += rq->nr_numa_running; sgs->nr_preferred_running += rq->nr_preferred_running; #endif - sgs->sum_weighted_load += weighted_cpuload(rq); /* * No need to call idle_cpu() if nr_running is not 0 */ @@ -7963,7 +7961,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity; if (sgs->sum_nr_running) - sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; + sgs->load_per_task = sgs->group_load / sgs->sum_nr_running; sgs->group_weight = group->group_weight; -- cgit v1.2.3 From f7c1c6b36a3874d3a7987fb3af829d5b0d75bda7 Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Mon, 6 May 2019 16:19:17 +0800 Subject: locking/lockdep: Change all print_*() return type to void Since none of the print_*() function's return value is necessary, change their return type to void. No functional change. In cases where an invariable return value is used, this change slightly improves readability, i.e.: print_x(); return 0; is definitely better than: return print_x(); /* where print_x() always returns 0 */ Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bvanassche@acm.org Cc: frederic@kernel.org Cc: ming.lei@redhat.com Cc: will.deacon@arm.com Link: https://lkml.kernel.org/r/20190506081939.74287-2-duyuyang@gmail.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 209 ++++++++++++++++++++++++----------------------- 1 file changed, 108 insertions(+), 101 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 8d32ae7768a7..109b56267c8f 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1427,16 +1427,15 @@ static void print_lock_trace(struct lock_trace *trace, unsigned int spaces) * Print a dependency chain entry (this is only done when a deadlock * has been detected): */ -static noinline int +static noinline void print_circular_bug_entry(struct lock_list *target, int depth) { if (debug_locks_silent) - return 0; + return; printk("\n-> #%u", depth); print_lock_name(target->class); printk(KERN_CONT ":\n"); print_lock_trace(&target->trace, 6); - return 0; } static void @@ -1493,7 +1492,7 @@ print_circular_lock_scenario(struct held_lock *src, * When a circular dependency is detected, print the * header first: */ -static noinline int +static noinline void print_circular_bug_header(struct lock_list *entry, unsigned int depth, struct held_lock *check_src, struct held_lock *check_tgt) @@ -1501,7 +1500,7 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth, struct task_struct *curr = current; if (debug_locks_silent) - return 0; + return; pr_warn("\n"); pr_warn("======================================================\n"); @@ -1519,8 +1518,6 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth, pr_warn("\nthe existing dependency chain (in reverse order) is:\n"); print_circular_bug_entry(entry, depth); - - return 0; } static inline int class_equal(struct lock_list *entry, void *data) @@ -1528,10 +1525,10 @@ static inline int class_equal(struct lock_list *entry, void *data) return entry->class == data; } -static noinline int print_circular_bug(struct lock_list *this, - struct lock_list *target, - struct held_lock *check_src, - struct held_lock *check_tgt) +static noinline void print_circular_bug(struct lock_list *this, + struct lock_list *target, + struct held_lock *check_src, + struct held_lock *check_tgt) { struct task_struct *curr = current; struct lock_list *parent; @@ -1539,10 +1536,10 @@ static noinline int print_circular_bug(struct lock_list *this, int depth; if (!debug_locks_off_graph_unlock() || debug_locks_silent) - return 0; + return; if (!save_trace(&this->trace)) - return 0; + return; depth = get_lock_depth(target); @@ -1564,21 +1561,17 @@ static noinline int print_circular_bug(struct lock_list *this, printk("\nstack backtrace:\n"); dump_stack(); - - return 0; } -static noinline int print_bfs_bug(int ret) +static noinline void print_bfs_bug(int ret) { if (!debug_locks_off_graph_unlock()) - return 0; + return; /* * Breadth-first-search failed, graph got corrupted? */ WARN(1, "lockdep bfs error:%d\n", ret); - - return 0; } static int noop_count(struct lock_list *entry, void *data) @@ -1767,7 +1760,7 @@ static void print_lock_class_header(struct lock_class *class, int depth) */ static void __used print_shortest_lock_dependencies(struct lock_list *leaf, - struct lock_list *root) + struct lock_list *root) { struct lock_list *entry = leaf; int depth; @@ -1789,8 +1782,6 @@ print_shortest_lock_dependencies(struct lock_list *leaf, entry = get_lock_parent(entry); depth--; } while (entry && (depth >= 0)); - - return; } static void @@ -1849,7 +1840,7 @@ print_irq_lock_scenario(struct lock_list *safe_entry, printk("\n *** DEADLOCK ***\n\n"); } -static int +static void print_bad_irq_dependency(struct task_struct *curr, struct lock_list *prev_root, struct lock_list *next_root, @@ -1862,7 +1853,7 @@ print_bad_irq_dependency(struct task_struct *curr, const char *irqclass) { if (!debug_locks_off_graph_unlock() || debug_locks_silent) - return 0; + return; pr_warn("\n"); pr_warn("=====================================================\n"); @@ -1908,19 +1899,17 @@ print_bad_irq_dependency(struct task_struct *curr, pr_warn("\nthe dependencies between %s-irq-safe lock and the holding lock:\n", irqclass); if (!save_trace(&prev_root->trace)) - return 0; + return; print_shortest_lock_dependencies(backwards_entry, prev_root); pr_warn("\nthe dependencies between the lock to be acquired"); pr_warn(" and %s-irq-unsafe lock:\n", irqclass); if (!save_trace(&next_root->trace)) - return 0; + return; print_shortest_lock_dependencies(forwards_entry, next_root); pr_warn("\nstack backtrace:\n"); dump_stack(); - - return 0; } static const char *state_names[] = { @@ -2067,8 +2056,10 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev, this.class = hlock_class(prev); ret = __bfs_backwards(&this, &usage_mask, usage_accumulate, NULL); - if (ret < 0) - return print_bfs_bug(ret); + if (ret < 0) { + print_bfs_bug(ret); + return 0; + } usage_mask &= LOCKF_USED_IN_IRQ_ALL; if (!usage_mask) @@ -2084,8 +2075,10 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev, that.class = hlock_class(next); ret = find_usage_forwards(&that, forward_mask, &target_entry1); - if (ret < 0) - return print_bfs_bug(ret); + if (ret < 0) { + print_bfs_bug(ret); + return 0; + } if (ret == 1) return ret; @@ -2097,8 +2090,10 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev, backward_mask = original_mask(target_entry1->class->usage_mask); ret = find_usage_backwards(&this, backward_mask, &target_entry); - if (ret < 0) - return print_bfs_bug(ret); + if (ret < 0) { + print_bfs_bug(ret); + return 0; + } if (DEBUG_LOCKS_WARN_ON(ret == 1)) return 1; @@ -2112,11 +2107,13 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev, if (DEBUG_LOCKS_WARN_ON(ret == -1)) return 1; - return print_bad_irq_dependency(curr, &this, &that, - target_entry, target_entry1, - prev, next, - backward_bit, forward_bit, - state_name(backward_bit)); + print_bad_irq_dependency(curr, &this, &that, + target_entry, target_entry1, + prev, next, + backward_bit, forward_bit, + state_name(backward_bit)); + + return 0; } static void inc_chains(void) @@ -2147,8 +2144,7 @@ static inline void inc_chains(void) #endif static void -print_deadlock_scenario(struct held_lock *nxt, - struct held_lock *prv) +print_deadlock_scenario(struct held_lock *nxt, struct held_lock *prv) { struct lock_class *next = hlock_class(nxt); struct lock_class *prev = hlock_class(prv); @@ -2166,12 +2162,12 @@ print_deadlock_scenario(struct held_lock *nxt, printk(" May be due to missing lock nesting notation\n\n"); } -static int +static void print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, struct held_lock *next) { if (!debug_locks_off_graph_unlock() || debug_locks_silent) - return 0; + return; pr_warn("\n"); pr_warn("============================================\n"); @@ -2190,8 +2186,6 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, pr_warn("\nstack backtrace:\n"); dump_stack(); - - return 0; } /* @@ -2233,7 +2227,8 @@ check_deadlock(struct task_struct *curr, struct held_lock *next, if (nest) return 2; - return print_deadlock_bug(curr, prev, next); + print_deadlock_bug(curr, prev, next); + return 0; } return 1; } @@ -2308,10 +2303,13 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, */ save_trace(trace); } - return print_circular_bug(&this, target_entry, next, prev); + print_circular_bug(&this, target_entry, next, prev); + return 0; + } + else if (unlikely(ret < 0)) { + print_bfs_bug(ret); + return 0; } - else if (unlikely(ret < 0)) - return print_bfs_bug(ret); if (!check_irq_usage(curr, prev, next)) return 0; @@ -2352,8 +2350,10 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, debug_atomic_inc(nr_redundant); return 2; } - if (ret < 0) - return print_bfs_bug(ret); + if (ret < 0) { + print_bfs_bug(ret); + return 0; + } if (!trace->nr_entries && !save_trace(trace)) @@ -2877,8 +2877,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, #if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) -static void -print_usage_bug_scenario(struct held_lock *lock) +static void print_usage_bug_scenario(struct held_lock *lock) { struct lock_class *class = hlock_class(lock); @@ -2895,12 +2894,12 @@ print_usage_bug_scenario(struct held_lock *lock) printk("\n *** DEADLOCK ***\n\n"); } -static int +static void print_usage_bug(struct task_struct *curr, struct held_lock *this, enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit) { if (!debug_locks_off_graph_unlock() || debug_locks_silent) - return 0; + return; pr_warn("\n"); pr_warn("================================\n"); @@ -2930,8 +2929,6 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, pr_warn("\nstack backtrace:\n"); dump_stack(); - - return 0; } /* @@ -2941,8 +2938,10 @@ static inline int valid_state(struct task_struct *curr, struct held_lock *this, enum lock_usage_bit new_bit, enum lock_usage_bit bad_bit) { - if (unlikely(hlock_class(this)->usage_mask & (1 << bad_bit))) - return print_usage_bug(curr, this, bad_bit, new_bit); + if (unlikely(hlock_class(this)->usage_mask & (1 << bad_bit))) { + print_usage_bug(curr, this, bad_bit, new_bit); + return 0; + } return 1; } @@ -2950,7 +2949,7 @@ valid_state(struct task_struct *curr, struct held_lock *this, /* * print irq inversion bug: */ -static int +static void print_irq_inversion_bug(struct task_struct *curr, struct lock_list *root, struct lock_list *other, struct held_lock *this, int forwards, @@ -2961,7 +2960,7 @@ print_irq_inversion_bug(struct task_struct *curr, int depth; if (!debug_locks_off_graph_unlock() || debug_locks_silent) - return 0; + return; pr_warn("\n"); pr_warn("========================================================\n"); @@ -3002,13 +3001,11 @@ print_irq_inversion_bug(struct task_struct *curr, pr_warn("\nthe shortest dependencies between 2nd lock and 1st lock:\n"); if (!save_trace(&root->trace)) - return 0; + return; print_shortest_lock_dependencies(other, root); pr_warn("\nstack backtrace:\n"); dump_stack(); - - return 0; } /* @@ -3026,13 +3023,16 @@ check_usage_forwards(struct task_struct *curr, struct held_lock *this, root.parent = NULL; root.class = hlock_class(this); ret = find_usage_forwards(&root, lock_flag(bit), &target_entry); - if (ret < 0) - return print_bfs_bug(ret); + if (ret < 0) { + print_bfs_bug(ret); + return 0; + } if (ret == 1) return ret; - return print_irq_inversion_bug(curr, &root, target_entry, - this, 1, irqclass); + print_irq_inversion_bug(curr, &root, target_entry, + this, 1, irqclass); + return 0; } /* @@ -3050,13 +3050,16 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this, root.parent = NULL; root.class = hlock_class(this); ret = find_usage_backwards(&root, lock_flag(bit), &target_entry); - if (ret < 0) - return print_bfs_bug(ret); + if (ret < 0) { + print_bfs_bug(ret); + return 0; + } if (ret == 1) return ret; - return print_irq_inversion_bug(curr, &root, target_entry, - this, 0, irqclass); + print_irq_inversion_bug(curr, &root, target_entry, + this, 0, irqclass); + return 0; } void print_irqtrace_events(struct task_struct *curr) @@ -3599,15 +3602,15 @@ EXPORT_SYMBOL_GPL(lockdep_init_map); struct lock_class_key __lockdep_no_validate__; EXPORT_SYMBOL_GPL(__lockdep_no_validate__); -static int +static void print_lock_nested_lock_not_held(struct task_struct *curr, struct held_lock *hlock, unsigned long ip) { if (!debug_locks_off()) - return 0; + return; if (debug_locks_silent) - return 0; + return; pr_warn("\n"); pr_warn("==================================\n"); @@ -3629,8 +3632,6 @@ print_lock_nested_lock_not_held(struct task_struct *curr, pr_warn("\nstack backtrace:\n"); dump_stack(); - - return 0; } static int __lock_is_held(const struct lockdep_map *lock, int read); @@ -3779,8 +3780,10 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, } chain_key = iterate_chain_key(chain_key, class_idx); - if (nest_lock && !__lock_is_held(nest_lock, -1)) - return print_lock_nested_lock_not_held(curr, hlock, ip); + if (nest_lock && !__lock_is_held(nest_lock, -1)) { + print_lock_nested_lock_not_held(curr, hlock, ip); + return 0; + } if (!debug_locks_silent) { WARN_ON_ONCE(depth && !hlock_class(hlock - 1)->key); @@ -3816,14 +3819,14 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, return 1; } -static int -print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock, - unsigned long ip) +static void print_unlock_imbalance_bug(struct task_struct *curr, + struct lockdep_map *lock, + unsigned long ip) { if (!debug_locks_off()) - return 0; + return; if (debug_locks_silent) - return 0; + return; pr_warn("\n"); pr_warn("=====================================\n"); @@ -3841,8 +3844,6 @@ print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock, pr_warn("\nstack backtrace:\n"); dump_stack(); - - return 0; } static int match_held_lock(const struct held_lock *hlock, @@ -3961,8 +3962,10 @@ __lock_set_class(struct lockdep_map *lock, const char *name, return 0; hlock = find_held_lock(curr, lock, depth, &i); - if (!hlock) - return print_unlock_imbalance_bug(curr, lock, ip); + if (!hlock) { + print_unlock_imbalance_bug(curr, lock, ip); + return 0; + } lockdep_init_map(lock, name, key, 0); class = register_lock_class(lock, subclass, 0); @@ -4002,8 +4005,10 @@ static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip) return 0; hlock = find_held_lock(curr, lock, depth, &i); - if (!hlock) - return print_unlock_imbalance_bug(curr, lock, ip); + if (!hlock) { + print_unlock_imbalance_bug(curr, lock, ip); + return 0; + } curr->lockdep_depth = i; curr->curr_chain_key = hlock->prev_chain_key; @@ -4047,16 +4052,20 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip) * So we're all set to release this lock.. wait what lock? We don't * own any locks, you've been drinking again? */ - if (DEBUG_LOCKS_WARN_ON(depth <= 0)) - return print_unlock_imbalance_bug(curr, lock, ip); + if (DEBUG_LOCKS_WARN_ON(depth <= 0)) { + print_unlock_imbalance_bug(curr, lock, ip); + return 0; + } /* * Check whether the lock exists in the current stack * of held locks: */ hlock = find_held_lock(curr, lock, depth, &i); - if (!hlock) - return print_unlock_imbalance_bug(curr, lock, ip); + if (!hlock) { + print_unlock_imbalance_bug(curr, lock, ip); + return 0; + } if (hlock->instance == lock) lock_release_holdtime(hlock); @@ -4399,14 +4408,14 @@ void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie cookie) EXPORT_SYMBOL_GPL(lock_unpin_lock); #ifdef CONFIG_LOCK_STAT -static int -print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock, - unsigned long ip) +static void print_lock_contention_bug(struct task_struct *curr, + struct lockdep_map *lock, + unsigned long ip) { if (!debug_locks_off()) - return 0; + return; if (debug_locks_silent) - return 0; + return; pr_warn("\n"); pr_warn("=================================\n"); @@ -4424,8 +4433,6 @@ print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock, pr_warn("\nstack backtrace:\n"); dump_stack(); - - return 0; } static void -- cgit v1.2.3 From c52478f4f38ace598475413a08dba9b9fd827eaf Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Mon, 6 May 2019 16:19:19 +0800 Subject: locking/lockdep: Adjust lock usage bit character checks The lock usage bit characters are defined and determined with tricks. Add some explanation to make it a bit clearer, then adjust the logic to check the usage, which optimizes the code a bit. No functional change. Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bvanassche@acm.org Cc: frederic@kernel.org Cc: ming.lei@redhat.com Cc: will.deacon@arm.com Link: https://lkml.kernel.org/r/20190506081939.74287-4-duyuyang@gmail.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 109b56267c8f..a033df00fd1d 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -500,15 +500,26 @@ static inline unsigned long lock_flag(enum lock_usage_bit bit) static char get_usage_char(struct lock_class *class, enum lock_usage_bit bit) { + /* + * The usage character defaults to '.' (i.e., irqs disabled and not in + * irq context), which is the safest usage category. + */ char c = '.'; - if (class->usage_mask & lock_flag(bit + LOCK_USAGE_DIR_MASK)) + /* + * The order of the following usage checks matters, which will + * result in the outcome character as follows: + * + * - '+': irq is enabled and not in irq context + * - '-': in irq context and irq is disabled + * - '?': in irq context and irq is enabled + */ + if (class->usage_mask & lock_flag(bit + LOCK_USAGE_DIR_MASK)) { c = '+'; - if (class->usage_mask & lock_flag(bit)) { - c = '-'; - if (class->usage_mask & lock_flag(bit + LOCK_USAGE_DIR_MASK)) + if (class->usage_mask & lock_flag(bit)) c = '?'; - } + } else if (class->usage_mask & lock_flag(bit)) + c = '-'; return c; } -- cgit v1.2.3 From e7a38f63ba50dc95426dd50c43383dfecaa35d7f Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Mon, 6 May 2019 16:19:20 +0800 Subject: locking/lockdep: Remove useless conditional macro Since #defined(CONFIG_PROVE_LOCKING) is used in the scope of #ifdef CONFIG_PROVE_LOCKING, it can be removed. Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bvanassche@acm.org Cc: frederic@kernel.org Cc: ming.lei@redhat.com Cc: will.deacon@arm.com Link: https://lkml.kernel.org/r/20190506081939.74287-5-duyuyang@gmail.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index a033df00fd1d..3c477018e184 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1674,7 +1674,7 @@ check_redundant(struct lock_list *root, struct lock_class *target, return result; } -#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) +#ifdef CONFIG_TRACE_IRQFLAGS static inline int usage_accumulate(struct lock_list *entry, void *mask) { @@ -2152,7 +2152,7 @@ static inline void inc_chains(void) nr_process_chains++; } -#endif +#endif /* CONFIG_TRACE_IRQFLAGS */ static void print_deadlock_scenario(struct held_lock *nxt, struct held_lock *prv) @@ -2829,7 +2829,7 @@ static inline int validate_chain(struct task_struct *curr, { return 1; } -#endif +#endif /* CONFIG_PROVE_LOCKING */ /* * We are building curr_chain_key incrementally, so double-check -- cgit v1.2.3 From 834494b28024b39d45aea6bcc642b0fe94fe2503 Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Mon, 6 May 2019 16:19:21 +0800 Subject: locking/lockdep: Print the right depth for chain key collision Since chains are separated by IRQ context, so when printing a chain the depth should be consistent with it. Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bvanassche@acm.org Cc: frederic@kernel.org Cc: ming.lei@redhat.com Cc: will.deacon@arm.com Link: https://lkml.kernel.org/r/20190506081939.74287-6-duyuyang@gmail.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 3c477018e184..bc1efc12a8c5 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -2519,10 +2519,11 @@ print_chain_keys_held_locks(struct task_struct *curr, struct held_lock *hlock_ne struct held_lock *hlock; u64 chain_key = 0; int depth = curr->lockdep_depth; - int i; + int i = get_first_held_lock(curr, hlock_next); - printk("depth: %u\n", depth + 1); - for (i = get_first_held_lock(curr, hlock_next); i < depth; i++) { + printk("depth: %u (irq_context %u)\n", depth - i + 1, + hlock_next->irq_context); + for (; i < depth; i++) { hlock = curr->held_locks + i; chain_key = print_chain_key_iteration(hlock->class_idx, chain_key); -- cgit v1.2.3 From e196e479a3b844da6e6e71e0d2a8694040cb4e52 Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Mon, 6 May 2019 16:19:23 +0800 Subject: locking/lockdep: Use lockdep_init_task for task initiation consistently Despite that there is a lockdep_init_task() which does nothing, lockdep initiates tasks by assigning lockdep fields and does so inconsistently. Fix this by using lockdep_init_task(). Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bvanassche@acm.org Cc: frederic@kernel.org Cc: ming.lei@redhat.com Cc: will.deacon@arm.com Link: https://lkml.kernel.org/r/20190506081939.74287-8-duyuyang@gmail.com Signed-off-by: Ingo Molnar --- kernel/fork.c | 3 --- kernel/locking/lockdep.c | 11 ++++++++--- 2 files changed, 8 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 75675b9bf6df..735d0b4a89e2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1984,9 +1984,6 @@ static __latent_entropy struct task_struct *copy_process( p->pagefault_disabled = 0; #ifdef CONFIG_LOCKDEP - p->lockdep_depth = 0; /* no locks held yet */ - p->curr_chain_key = 0; - p->lockdep_recursion = 0; lockdep_init_task(p); #endif diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index bc1efc12a8c5..b7d9c28ecf3b 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -359,6 +359,13 @@ static inline u64 iterate_chain_key(u64 key, u32 idx) return k0 | (u64)k1 << 32; } +void lockdep_init_task(struct task_struct *task) +{ + task->lockdep_depth = 0; /* no locks held yet */ + task->curr_chain_key = 0; + task->lockdep_recursion = 0; +} + void lockdep_off(void) { current->lockdep_recursion++; @@ -4589,9 +4596,7 @@ void lockdep_reset(void) int i; raw_local_irq_save(flags); - current->curr_chain_key = 0; - current->lockdep_depth = 0; - current->lockdep_recursion = 0; + lockdep_init_task(current); memset(current->held_locks, 0, MAX_LOCK_DEPTH*sizeof(struct held_lock)); nr_hardirq_chains = 0; nr_softirq_chains = 0; -- cgit v1.2.3 From f6ec8829ac9d59b637366c13038f15d6f6156fe1 Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Mon, 6 May 2019 16:19:24 +0800 Subject: locking/lockdep: Define INITIAL_CHAIN_KEY for chain keys to start with Chain keys are computed using Jenkins hash function, which needs an initial hash to start with. Dedicate a macro to make this clear and configurable. A later patch changes this initial chain key. Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bvanassche@acm.org Cc: frederic@kernel.org Cc: ming.lei@redhat.com Cc: will.deacon@arm.com Link: https://lkml.kernel.org/r/20190506081939.74287-9-duyuyang@gmail.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index b7d9c28ecf3b..9edf6f12b711 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -362,7 +362,7 @@ static inline u64 iterate_chain_key(u64 key, u32 idx) void lockdep_init_task(struct task_struct *task) { task->lockdep_depth = 0; /* no locks held yet */ - task->curr_chain_key = 0; + task->curr_chain_key = INITIAL_CHAIN_KEY; task->lockdep_recursion = 0; } @@ -857,7 +857,7 @@ static u16 chain_hlocks[MAX_LOCKDEP_CHAIN_HLOCKS]; static bool check_lock_chain_key(struct lock_chain *chain) { #ifdef CONFIG_PROVE_LOCKING - u64 chain_key = 0; + u64 chain_key = INITIAL_CHAIN_KEY; int i; for (i = chain->base; i < chain->base + chain->depth; i++) @@ -2524,7 +2524,7 @@ static void print_chain_keys_held_locks(struct task_struct *curr, struct held_lock *hlock_next) { struct held_lock *hlock; - u64 chain_key = 0; + u64 chain_key = INITIAL_CHAIN_KEY; int depth = curr->lockdep_depth; int i = get_first_held_lock(curr, hlock_next); @@ -2544,7 +2544,7 @@ print_chain_keys_held_locks(struct task_struct *curr, struct held_lock *hlock_ne static void print_chain_keys_chain(struct lock_chain *chain) { int i; - u64 chain_key = 0; + u64 chain_key = INITIAL_CHAIN_KEY; int class_id; printk("depth: %u\n", chain->depth); @@ -2848,7 +2848,7 @@ static void check_chain_key(struct task_struct *curr) #ifdef CONFIG_DEBUG_LOCKDEP struct held_lock *hlock, *prev_hlock = NULL; unsigned int i; - u64 chain_key = 0; + u64 chain_key = INITIAL_CHAIN_KEY; for (i = 0; i < curr->lockdep_depth; i++) { hlock = curr->held_locks + i; @@ -2872,7 +2872,7 @@ static void check_chain_key(struct task_struct *curr) if (prev_hlock && (prev_hlock->irq_context != hlock->irq_context)) - chain_key = 0; + chain_key = INITIAL_CHAIN_KEY; chain_key = iterate_chain_key(chain_key, hlock->class_idx); prev_hlock = hlock; } @@ -3787,14 +3787,14 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, /* * How can we have a chain hash when we ain't got no keys?! */ - if (DEBUG_LOCKS_WARN_ON(chain_key != 0)) + if (DEBUG_LOCKS_WARN_ON(chain_key != INITIAL_CHAIN_KEY)) return 0; chain_head = 1; } hlock->prev_chain_key = chain_key; if (separate_irq_context(curr, hlock)) { - chain_key = 0; + chain_key = INITIAL_CHAIN_KEY; chain_head = 1; } chain_key = iterate_chain_key(chain_key, class_idx); @@ -4636,7 +4636,7 @@ static void remove_class_from_lock_chain(struct pending_free *pf, return; recalc: - chain_key = 0; + chain_key = INITIAL_CHAIN_KEY; for (i = chain->base; i < chain->base + chain->depth; i++) chain_key = iterate_chain_key(chain_key, chain_hlocks[i] + 1); if (chain->depth && chain->chain_key == chain_key) -- cgit v1.2.3 From 01bb6f0af992a1e6b7797d92fd31a7864872e347 Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Mon, 6 May 2019 16:19:25 +0800 Subject: locking/lockdep: Change the range of class_idx in held_lock struct held_lock->class_idx is used to point to the class of the held lock. The index is shifted by 1 to make index 0 mean no class, which results in class index shifting back and forth but is not worth doing so. The reason is: (1) there will be no "no-class" held_lock to begin with, and (2) index 0 seems to be used for error checking, but if something wrong indeed happened, the index can't be counted on to distinguish it as that something won't set the class_idx to 0 on purpose to tell us it is wrong. Therefore, change the index to start from 0. This saves a lot of back-and-forth shifts and a class slot back to lock_classes. Since index 0 is now used for lock class, we change the initial chain key to -1 to avoid key collision, which is due to the fact that __jhash_mix(0, 0, 0) = 0. Actually, the initial chain key can be any arbitrary value other than 0. In addition, a bitmap is maintained to keep track of the used lock classes, and we check the validity of the held lock against that bitmap. Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bvanassche@acm.org Cc: frederic@kernel.org Cc: ming.lei@redhat.com Cc: will.deacon@arm.com Link: https://lkml.kernel.org/r/20190506081939.74287-10-duyuyang@gmail.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 59 ++++++++++++++++++++++++++++++++---------------- 1 file changed, 39 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 9edf6f12b711..3eecae315885 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -151,17 +151,28 @@ unsigned long nr_lock_classes; static #endif struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; +static DECLARE_BITMAP(lock_classes_in_use, MAX_LOCKDEP_KEYS); static inline struct lock_class *hlock_class(struct held_lock *hlock) { - if (!hlock->class_idx) { + unsigned int class_idx = hlock->class_idx; + + /* Don't re-read hlock->class_idx, can't use READ_ONCE() on bitfield */ + barrier(); + + if (!test_bit(class_idx, lock_classes_in_use)) { /* * Someone passed in garbage, we give up. */ DEBUG_LOCKS_WARN_ON(1); return NULL; } - return lock_classes + hlock->class_idx - 1; + + /* + * At this point, if the passed hlock->class_idx is still garbage, + * we just have to live with it + */ + return lock_classes + class_idx; } #ifdef CONFIG_LOCK_STAT @@ -590,19 +601,22 @@ static void print_lock(struct held_lock *hlock) /* * We can be called locklessly through debug_show_all_locks() so be * extra careful, the hlock might have been released and cleared. + * + * If this indeed happens, lets pretend it does not hurt to continue + * to print the lock unless the hlock class_idx does not point to a + * registered class. The rationale here is: since we don't attempt + * to distinguish whether we are in this situation, if it just + * happened we can't count on class_idx to tell either. */ - unsigned int class_idx = hlock->class_idx; + struct lock_class *lock = hlock_class(hlock); - /* Don't re-read hlock->class_idx, can't use READ_ONCE() on bitfields: */ - barrier(); - - if (!class_idx || (class_idx - 1) >= MAX_LOCKDEP_KEYS) { + if (!lock) { printk(KERN_CONT "\n"); return; } printk(KERN_CONT "%p", hlock->instance); - print_lock_name(lock_classes + class_idx - 1); + print_lock_name(lock); printk(KERN_CONT ", at: %pS\n", (void *)hlock->acquire_ip); } @@ -861,7 +875,7 @@ static bool check_lock_chain_key(struct lock_chain *chain) int i; for (i = chain->base; i < chain->base + chain->depth; i++) - chain_key = iterate_chain_key(chain_key, chain_hlocks[i] + 1); + chain_key = iterate_chain_key(chain_key, chain_hlocks[i]); /* * The 'unsigned long long' casts avoid that a compiler warning * is reported when building tools/lib/lockdep. @@ -1136,6 +1150,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) return NULL; } nr_lock_classes++; + __set_bit(class - lock_classes, lock_classes_in_use); debug_atomic_inc(nr_unused_locks); class->key = key; class->name = lock->name; @@ -2550,7 +2565,7 @@ static void print_chain_keys_chain(struct lock_chain *chain) printk("depth: %u\n", chain->depth); for (i = 0; i < chain->depth; i++) { class_id = chain_hlocks[chain->base + i]; - chain_key = print_chain_key_iteration(class_id + 1, chain_key); + chain_key = print_chain_key_iteration(class_id, chain_key); print_lock_name(lock_classes + class_id); printk("\n"); @@ -2601,7 +2616,7 @@ static int check_no_collision(struct task_struct *curr, } for (j = 0; j < chain->depth - 1; j++, i++) { - id = curr->held_locks[i].class_idx - 1; + id = curr->held_locks[i].class_idx; if (DEBUG_LOCKS_WARN_ON(chain_hlocks[chain->base + j] != id)) { print_collision(curr, hlock, chain); @@ -2684,7 +2699,7 @@ static inline int add_chain_cache(struct task_struct *curr, if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) { chain->base = nr_chain_hlocks; for (j = 0; j < chain->depth - 1; j++, i++) { - int lock_id = curr->held_locks[i].class_idx - 1; + int lock_id = curr->held_locks[i].class_idx; chain_hlocks[chain->base + j] = lock_id; } chain_hlocks[chain->base + j] = class - lock_classes; @@ -2864,10 +2879,12 @@ static void check_chain_key(struct task_struct *curr) (unsigned long long)hlock->prev_chain_key); return; } + /* - * Whoops ran out of static storage again? + * hlock->class_idx can't go beyond MAX_LOCKDEP_KEYS, but is + * it registered lock class index? */ - if (DEBUG_LOCKS_WARN_ON(hlock->class_idx > MAX_LOCKDEP_KEYS)) + if (DEBUG_LOCKS_WARN_ON(!test_bit(hlock->class_idx, lock_classes_in_use))) return; if (prev_hlock && (prev_hlock->irq_context != @@ -3715,7 +3732,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH)) return 0; - class_idx = class - lock_classes + 1; + class_idx = class - lock_classes; if (depth) { hlock = curr->held_locks + depth - 1; @@ -3777,9 +3794,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, * the hash, not class->key. */ /* - * Whoops, we did it again.. ran straight out of our static allocation. + * Whoops, we did it again.. class_idx is invalid. */ - if (DEBUG_LOCKS_WARN_ON(class_idx > MAX_LOCKDEP_KEYS)) + if (DEBUG_LOCKS_WARN_ON(!test_bit(class_idx, lock_classes_in_use))) return 0; chain_key = curr->curr_chain_key; @@ -3894,7 +3911,7 @@ static int match_held_lock(const struct held_lock *hlock, if (DEBUG_LOCKS_WARN_ON(!hlock->nest_lock)) return 0; - if (hlock->class_idx == class - lock_classes + 1) + if (hlock->class_idx == class - lock_classes) return 1; } @@ -3988,7 +4005,7 @@ __lock_set_class(struct lockdep_map *lock, const char *name, lockdep_init_map(lock, name, key, 0); class = register_lock_class(lock, subclass, 0); - hlock->class_idx = class - lock_classes + 1; + hlock->class_idx = class - lock_classes; curr->lockdep_depth = i; curr->curr_chain_key = hlock->prev_chain_key; @@ -4638,7 +4655,7 @@ static void remove_class_from_lock_chain(struct pending_free *pf, recalc: chain_key = INITIAL_CHAIN_KEY; for (i = chain->base; i < chain->base + chain->depth; i++) - chain_key = iterate_chain_key(chain_key, chain_hlocks[i] + 1); + chain_key = iterate_chain_key(chain_key, chain_hlocks[i]); if (chain->depth && chain->chain_key == chain_key) return; /* Overwrite the chain key for concurrent RCU readers. */ @@ -4712,6 +4729,7 @@ static void zap_class(struct pending_free *pf, struct lock_class *class) WRITE_ONCE(class->key, NULL); WRITE_ONCE(class->name, NULL); nr_lock_classes--; + __clear_bit(class - lock_classes, lock_classes_in_use); } else { WARN_ONCE(true, "%s() failed for class %s\n", __func__, class->name); @@ -5057,6 +5075,7 @@ void __init lockdep_init(void) printk(" memory used by lock dependency info: %zu kB\n", (sizeof(lock_classes) + + sizeof(lock_classes_in_use) + sizeof(classhash_table) + sizeof(list_entries) + sizeof(list_entries_in_use) + -- cgit v1.2.3 From 0b9fc8ecfa30000c8900da7adbbef23438de9ec0 Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Mon, 6 May 2019 16:19:26 +0800 Subject: locking/lockdep: Remove unused argument in validate_chain() and check_deadlock() The lockdep_map argument in them is not used, remove it. Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Bart Van Assche Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: frederic@kernel.org Cc: ming.lei@redhat.com Cc: will.deacon@arm.com Link: https://lkml.kernel.org/r/20190506081939.74287-11-duyuyang@gmail.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 3eecae315885..6cf14c84eb6d 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -2230,8 +2230,7 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, * Returns: 0 on deadlock detected, 1 on OK, 2 on recursive read */ static int -check_deadlock(struct task_struct *curr, struct held_lock *next, - struct lockdep_map *next_instance, int read) +check_deadlock(struct task_struct *curr, struct held_lock *next, int read) { struct held_lock *prev; struct held_lock *nest = NULL; @@ -2789,8 +2788,9 @@ cache_hit: return 1; } -static int validate_chain(struct task_struct *curr, struct lockdep_map *lock, - struct held_lock *hlock, int chain_head, u64 chain_key) +static int validate_chain(struct task_struct *curr, + struct held_lock *hlock, + int chain_head, u64 chain_key) { /* * Trylock needs to maintain the stack of held locks, but it @@ -2816,7 +2816,7 @@ static int validate_chain(struct task_struct *curr, struct lockdep_map *lock, * any of these scenarios could lead to a deadlock. If * All validations */ - int ret = check_deadlock(curr, hlock, lock, hlock->read); + int ret = check_deadlock(curr, hlock, hlock->read); if (!ret) return 0; @@ -2847,8 +2847,8 @@ static int validate_chain(struct task_struct *curr, struct lockdep_map *lock, } #else static inline int validate_chain(struct task_struct *curr, - struct lockdep_map *lock, struct held_lock *hlock, - int chain_head, u64 chain_key) + struct held_lock *hlock, + int chain_head, u64 chain_key) { return 1; } @@ -3826,7 +3826,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, WARN_ON_ONCE(!hlock_class(hlock)->key); } - if (!validate_chain(curr, lock, hlock, chain_head, chain_key)) + if (!validate_chain(curr, hlock, chain_head, chain_key)) return 0; curr->curr_chain_key = chain_key; -- cgit v1.2.3 From 31a490e5c54f5499aa744f8524611e2a4b19f8ba Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Mon, 6 May 2019 16:19:27 +0800 Subject: locking/lockdep: Update comment A leftover comment is removed. While at it, add more explanatory comments. Such a trivial patch! Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bvanassche@acm.org Cc: frederic@kernel.org Cc: ming.lei@redhat.com Cc: will.deacon@arm.com Link: https://lkml.kernel.org/r/20190506081939.74287-12-duyuyang@gmail.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 6cf14c84eb6d..a9799f9ed093 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -2811,10 +2811,16 @@ static int validate_chain(struct task_struct *curr, * - is softirq-safe, if this lock is hardirq-unsafe * * And check whether the new lock's dependency graph - * could lead back to the previous lock. + * could lead back to the previous lock: * - * any of these scenarios could lead to a deadlock. If - * All validations + * - within the current held-lock stack + * - across our accumulated lock dependency records + * + * any of these scenarios could lead to a deadlock. + */ + /* + * The simple case: does the current hold the same lock + * already? */ int ret = check_deadlock(curr, hlock, hlock->read); -- cgit v1.2.3 From aa4807719e076bfb2dee9c96adf2c648e47d472f Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Mon, 6 May 2019 16:19:28 +0800 Subject: locking/lockdep: Change type of the element field in circular_queue The element field is an array in struct circular_queue to keep track of locks in the search. Making it the same type as the locks avoids type cast. Also fix a typo and elaborate the comment above struct circular_queue. No functional change. Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Bart Van Assche Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: frederic@kernel.org Cc: ming.lei@redhat.com Cc: will.deacon@arm.com Link: https://lkml.kernel.org/r/20190506081939.74287-13-duyuyang@gmail.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index a9799f9ed093..d467ba825dca 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1262,13 +1262,17 @@ static int add_lock_to_list(struct lock_class *this, #define CQ_MASK (MAX_CIRCULAR_QUEUE_SIZE-1) /* - * The circular_queue and helpers is used to implement the - * breadth-first search(BFS)algorithem, by which we can build - * the shortest path from the next lock to be acquired to the - * previous held lock if there is a circular between them. + * The circular_queue and helpers are used to implement graph + * breadth-first search (BFS) algorithm, by which we can determine + * whether there is a path from a lock to another. In deadlock checks, + * a path from the next lock to be acquired to a previous held lock + * indicates that adding the -> lock dependency will + * produce a circle in the graph. Breadth-first search instead of + * depth-first search is used in order to find the shortest (circular) + * path. */ struct circular_queue { - unsigned long element[MAX_CIRCULAR_QUEUE_SIZE]; + struct lock_list *element[MAX_CIRCULAR_QUEUE_SIZE]; unsigned int front, rear; }; @@ -1294,7 +1298,7 @@ static inline int __cq_full(struct circular_queue *cq) return ((cq->rear + 1) & CQ_MASK) == cq->front; } -static inline int __cq_enqueue(struct circular_queue *cq, unsigned long elem) +static inline int __cq_enqueue(struct circular_queue *cq, struct lock_list *elem) { if (__cq_full(cq)) return -1; @@ -1304,7 +1308,7 @@ static inline int __cq_enqueue(struct circular_queue *cq, unsigned long elem) return 0; } -static inline int __cq_dequeue(struct circular_queue *cq, unsigned long *elem) +static inline int __cq_dequeue(struct circular_queue *cq, struct lock_list **elem) { if (__cq_empty(cq)) return -1; @@ -1382,12 +1386,12 @@ static int __bfs(struct lock_list *source_entry, goto exit; __cq_init(cq); - __cq_enqueue(cq, (unsigned long)source_entry); + __cq_enqueue(cq, source_entry); while (!__cq_empty(cq)) { struct lock_list *lock; - __cq_dequeue(cq, (unsigned long *)&lock); + __cq_dequeue(cq, &lock); if (!lock->class) { ret = -2; @@ -1411,7 +1415,7 @@ static int __bfs(struct lock_list *source_entry, goto exit; } - if (__cq_enqueue(cq, (unsigned long)entry)) { + if (__cq_enqueue(cq, entry)) { ret = -1; goto exit; } -- cgit v1.2.3 From c1661325597f68bc9e632c4fa9c86983d56fba4f Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Mon, 6 May 2019 16:19:29 +0800 Subject: locking/lockdep: Change the return type of __cq_dequeue() With the change, we can slightly adjust the code to iterate the queue in BFS search, which simplifies the code. No functional change. Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bvanassche@acm.org Cc: frederic@kernel.org Cc: ming.lei@redhat.com Cc: will.deacon@arm.com Link: https://lkml.kernel.org/r/20190506081939.74287-14-duyuyang@gmail.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index d467ba825dca..d23dcb47389e 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1308,14 +1308,21 @@ static inline int __cq_enqueue(struct circular_queue *cq, struct lock_list *elem return 0; } -static inline int __cq_dequeue(struct circular_queue *cq, struct lock_list **elem) +/* + * Dequeue an element from the circular_queue, return a lock_list if + * the queue is not empty, or NULL if otherwise. + */ +static inline struct lock_list * __cq_dequeue(struct circular_queue *cq) { + struct lock_list * lock; + if (__cq_empty(cq)) - return -1; + return NULL; - *elem = cq->element[cq->front]; + lock = cq->element[cq->front]; cq->front = (cq->front + 1) & CQ_MASK; - return 0; + + return lock; } static inline unsigned int __cq_get_elem_count(struct circular_queue *cq) @@ -1367,6 +1374,7 @@ static int __bfs(struct lock_list *source_entry, int forward) { struct lock_list *entry; + struct lock_list *lock; struct list_head *head; struct circular_queue *cq = &lock_cq; int ret = 1; @@ -1388,10 +1396,7 @@ static int __bfs(struct lock_list *source_entry, __cq_init(cq); __cq_enqueue(cq, source_entry); - while (!__cq_empty(cq)) { - struct lock_list *lock; - - __cq_dequeue(cq, &lock); + while ((lock = __cq_dequeue(cq))) { if (!lock->class) { ret = -2; -- cgit v1.2.3 From 77a806922cfdebcf3ae89d31a8b592a7f7fbe537 Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Mon, 6 May 2019 16:19:30 +0800 Subject: locking/lockdep: Avoid constant checks in __bfs by using offset reference In search of a dependency in the lock graph, there is contant checks for forward or backward search. Directly reference the field offset of the struct that differentiates the type of search to avoid those checks. No functional change. Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bvanassche@acm.org Cc: frederic@kernel.org Cc: ming.lei@redhat.com Cc: will.deacon@arm.com Link: https://lkml.kernel.org/r/20190506081939.74287-15-duyuyang@gmail.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index d23dcb47389e..2e8ef6082f72 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1367,11 +1367,25 @@ static inline int get_lock_depth(struct lock_list *child) return depth; } +/* + * Return the forward or backward dependency list. + * + * @lock: the lock_list to get its class's dependency list + * @offset: the offset to struct lock_class to determine whether it is + * locks_after or locks_before + */ +static inline struct list_head *get_dep_list(struct lock_list *lock, int offset) +{ + void *lock_class = lock->class; + + return lock_class + offset; +} + static int __bfs(struct lock_list *source_entry, void *data, int (*match)(struct lock_list *entry, void *data), struct lock_list **target_entry, - int forward) + int offset) { struct lock_list *entry; struct lock_list *lock; @@ -1385,11 +1399,7 @@ static int __bfs(struct lock_list *source_entry, goto exit; } - if (forward) - head = &source_entry->class->locks_after; - else - head = &source_entry->class->locks_before; - + head = get_dep_list(source_entry, offset); if (list_empty(head)) goto exit; @@ -1403,10 +1413,7 @@ static int __bfs(struct lock_list *source_entry, goto exit; } - if (forward) - head = &lock->class->locks_after; - else - head = &lock->class->locks_before; + head = get_dep_list(lock, offset); DEBUG_LOCKS_WARN_ON(!irqs_disabled()); @@ -1439,7 +1446,8 @@ static inline int __bfs_forwards(struct lock_list *src_entry, int (*match)(struct lock_list *entry, void *data), struct lock_list **target_entry) { - return __bfs(src_entry, data, match, target_entry, 1); + return __bfs(src_entry, data, match, target_entry, + offsetof(struct lock_class, locks_after)); } @@ -1448,7 +1456,8 @@ static inline int __bfs_backwards(struct lock_list *src_entry, int (*match)(struct lock_list *entry, void *data), struct lock_list **target_entry) { - return __bfs(src_entry, data, match, target_entry, 0); + return __bfs(src_entry, data, match, target_entry, + offsetof(struct lock_class, locks_before)); } -- cgit v1.2.3 From 154f185e9c0f6c50ac8e901630e14aa5b36f9414 Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Mon, 6 May 2019 16:19:31 +0800 Subject: locking/lockdep: Update comments on dependency search The breadth-first search is implemented as flat-out non-recursive now, but the comments are still describing it as recursive, update the comments in that regard. Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bvanassche@acm.org Cc: frederic@kernel.org Cc: ming.lei@redhat.com Cc: will.deacon@arm.com Link: https://lkml.kernel.org/r/20190506081939.74287-16-duyuyang@gmail.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 2e8ef6082f72..b2ca20aa69aa 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1381,6 +1381,10 @@ static inline struct list_head *get_dep_list(struct lock_list *lock, int offset) return lock_class + offset; } +/* + * Forward- or backward-dependency search, used for both circular dependency + * checking and hardirq-unsafe/softirq-unsafe checking. + */ static int __bfs(struct lock_list *source_entry, void *data, int (*match)(struct lock_list *entry, void *data), @@ -1461,12 +1465,6 @@ static inline int __bfs_backwards(struct lock_list *src_entry, } -/* - * Recursive, forwards-direction lock-dependency checking, used for - * both noncyclic checking and for hardirq-unsafe/softirq-unsafe - * checking. - */ - static void print_lock_trace(struct lock_trace *trace, unsigned int spaces) { unsigned long *entries = stack_trace + trace->offset; @@ -2285,7 +2283,7 @@ check_deadlock(struct task_struct *curr, struct held_lock *next, int read) /* * There was a chain-cache miss, and we are about to add a new dependency - * to a previous lock. We recursively validate the following rules: + * to a previous lock. We validate the following rules: * * - would the adding of the -> dependency create a * circular dependency in the graph? [== circular deadlock] @@ -2335,11 +2333,12 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, /* * Prove that the new -> dependency would not * create a circular dependency in the graph. (We do this by - * forward-recursing into the graph starting at , and - * checking whether we can reach .) + * a breadth-first search into the graph starting at , + * and check whether we can reach .) * - * We are using global variables to control the recursion, to - * keep the stackframe size of the recursive functions low: + * The search is limited by the size of the circular queue (i.e., + * MAX_CIRCULAR_QUEUE_SIZE) which keeps track of a breadth of nodes + * in the graph whose neighbours are to be checked. */ this.class = hlock_class(next); this.parent = NULL; -- cgit v1.2.3 From 4609c4f963f353613812f999bb027aac795bcde8 Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Mon, 6 May 2019 16:19:33 +0800 Subject: locking/lockdep: Remove redundant argument in check_deadlock In check_deadlock(), the third argument read comes from the second argument hlock so that it can be removed. No functional change. Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bvanassche@acm.org Cc: frederic@kernel.org Cc: ming.lei@redhat.com Cc: will.deacon@arm.com Link: https://lkml.kernel.org/r/20190506081939.74287-18-duyuyang@gmail.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index b2ca20aa69aa..be4c1348ddcd 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -2246,7 +2246,7 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, * Returns: 0 on deadlock detected, 1 on OK, 2 on recursive read */ static int -check_deadlock(struct task_struct *curr, struct held_lock *next, int read) +check_deadlock(struct task_struct *curr, struct held_lock *next) { struct held_lock *prev; struct held_lock *nest = NULL; @@ -2265,7 +2265,7 @@ check_deadlock(struct task_struct *curr, struct held_lock *next, int read) * Allow read-after-read recursion of the same * lock class (i.e. read_lock(lock)+read_lock(lock)): */ - if ((read == 2) && prev->read) + if ((next->read == 2) && prev->read) return 2; /* @@ -2839,7 +2839,7 @@ static int validate_chain(struct task_struct *curr, * The simple case: does the current hold the same lock * already? */ - int ret = check_deadlock(curr, hlock, hlock->read); + int ret = check_deadlock(curr, hlock); if (!ret) return 0; -- cgit v1.2.3 From b4adfe8e05f15d7e73309c93c2c337df7eb5278f Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Mon, 6 May 2019 16:19:34 +0800 Subject: locking/lockdep: Remove unused argument in __lock_release The @nested is not used in __release_lock so remove it despite that it is not used in lock_release in the first place. Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bvanassche@acm.org Cc: frederic@kernel.org Cc: ming.lei@redhat.com Cc: will.deacon@arm.com Link: https://lkml.kernel.org/r/20190506081939.74287-19-duyuyang@gmail.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index be4c1348ddcd..8169706df767 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -4096,7 +4096,7 @@ static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip) * @nested is an hysterical artifact, needs a tree wide cleanup. */ static int -__lock_release(struct lockdep_map *lock, int nested, unsigned long ip) +__lock_release(struct lockdep_map *lock, unsigned long ip) { struct task_struct *curr = current; struct held_lock *hlock; @@ -4384,7 +4384,7 @@ void lock_release(struct lockdep_map *lock, int nested, check_flags(flags); current->lockdep_recursion = 1; trace_lock_release(lock, ip); - if (__lock_release(lock, nested, ip)) + if (__lock_release(lock, ip)) check_chain_key(current); current->lockdep_recursion = 0; raw_local_irq_restore(flags); -- cgit v1.2.3 From 8c2c2b449aa50463ba4cc1f33cdfc98750ed03ab Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Mon, 6 May 2019 16:19:35 +0800 Subject: locking/lockdep: Refactorize check_noncircular and check_redundant These two functions now handle different check results themselves. A new check_path function is added to check whether there is a path in the dependency graph. No functional change. Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bvanassche@acm.org Cc: frederic@kernel.org Cc: ming.lei@redhat.com Cc: will.deacon@arm.com Link: https://lkml.kernel.org/r/20190506081939.74287-20-duyuyang@gmail.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 118 +++++++++++++++++++++++++++++------------------ 1 file changed, 74 insertions(+), 44 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 8169706df767..30a1c0e32573 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1683,33 +1683,90 @@ unsigned long lockdep_count_backward_deps(struct lock_class *class) } /* - * Prove that the dependency graph starting at can not - * lead to . Print an error and return 0 if it does. + * Check that the dependency graph starting at can lead to + * or not. Print an error and return 0 if it does. */ static noinline int -check_noncircular(struct lock_list *root, struct lock_class *target, - struct lock_list **target_entry) +check_path(struct lock_class *target, struct lock_list *src_entry, + struct lock_list **target_entry) { - int result; + int ret; + + ret = __bfs_forwards(src_entry, (void *)target, class_equal, + target_entry); + + if (unlikely(ret < 0)) + print_bfs_bug(ret); + + return ret; +} + +/* + * Prove that the dependency graph starting at can not + * lead to . If it can, there is a circle when adding + * -> dependency. + * + * Print an error and return 0 if it does. + */ +static noinline int +check_noncircular(struct held_lock *src, struct held_lock *target, + struct lock_trace *trace) +{ + int ret; + struct lock_list *uninitialized_var(target_entry); + struct lock_list src_entry = { + .class = hlock_class(src), + .parent = NULL, + }; debug_atomic_inc(nr_cyclic_checks); - result = __bfs_forwards(root, target, class_equal, target_entry); + ret = check_path(hlock_class(target), &src_entry, &target_entry); - return result; + if (unlikely(!ret)) { + if (!trace->nr_entries) { + /* + * If save_trace fails here, the printing might + * trigger a WARN but because of the !nr_entries it + * should not do bad things. + */ + save_trace(trace); + } + + print_circular_bug(&src_entry, target_entry, src, target); + } + + return ret; } +/* + * Check that the dependency graph starting at can lead to + * or not. If it can, -> dependency is already + * in the graph. + * + * Print an error and return 2 if it does or 1 if it does not. + */ static noinline int -check_redundant(struct lock_list *root, struct lock_class *target, - struct lock_list **target_entry) +check_redundant(struct held_lock *src, struct held_lock *target) { - int result; + int ret; + struct lock_list *uninitialized_var(target_entry); + struct lock_list src_entry = { + .class = hlock_class(src), + .parent = NULL, + }; debug_atomic_inc(nr_redundant_checks); - result = __bfs_forwards(root, target, class_equal, target_entry); + ret = check_path(hlock_class(target), &src_entry, &target_entry); - return result; + if (!ret) { + debug_atomic_inc(nr_redundant); + ret = 2; + } else if (ret < 0) + ret = 0; + + return ret; } #ifdef CONFIG_TRACE_IRQFLAGS @@ -2307,9 +2364,7 @@ static int check_prev_add(struct task_struct *curr, struct held_lock *prev, struct held_lock *next, int distance, struct lock_trace *trace) { - struct lock_list *uninitialized_var(target_entry); struct lock_list *entry; - struct lock_list this; int ret; if (!hlock_class(prev)->key || !hlock_class(next)->key) { @@ -2340,25 +2395,9 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, * MAX_CIRCULAR_QUEUE_SIZE) which keeps track of a breadth of nodes * in the graph whose neighbours are to be checked. */ - this.class = hlock_class(next); - this.parent = NULL; - ret = check_noncircular(&this, hlock_class(prev), &target_entry); - if (unlikely(!ret)) { - if (!trace->nr_entries) { - /* - * If save_trace fails here, the printing might - * trigger a WARN but because of the !nr_entries it - * should not do bad things. - */ - save_trace(trace); - } - print_circular_bug(&this, target_entry, next, prev); + ret = check_noncircular(next, prev, trace); + if (unlikely(ret <= 0)) return 0; - } - else if (unlikely(ret < 0)) { - print_bfs_bug(ret); - return 0; - } if (!check_irq_usage(curr, prev, next)) return 0; @@ -2392,18 +2431,9 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, /* * Is the -> link redundant? */ - this.class = hlock_class(prev); - this.parent = NULL; - ret = check_redundant(&this, hlock_class(next), &target_entry); - if (!ret) { - debug_atomic_inc(nr_redundant); - return 2; - } - if (ret < 0) { - print_bfs_bug(ret); - return 0; - } - + ret = check_redundant(prev, next); + if (ret != 1) + return ret; if (!trace->nr_entries && !save_trace(trace)) return 0; -- cgit v1.2.3 From 68e9dc29f8f42c79d2a3755223ed910ce36b4ae2 Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Mon, 6 May 2019 16:19:36 +0800 Subject: locking/lockdep: Check redundant dependency only when CONFIG_LOCKDEP_SMALL As Peter has put it all sound and complete for the cause, I simply quote: "It (check_redundant) was added for cross-release (which has since been reverted) which would generate a lot of redundant links (IIRC) but having it makes the reports more convoluted -- basically, if we had an A-B-C relation, then A-C will not be added to the graph because it is already covered. This then means any report will include B, even though a shorter cycle might have been possible." This would increase the number of direct dependencies. For a simple workload (make clean; reboot; make vmlinux -j8), the data looks like this: CONFIG_LOCKDEP_SMALL: direct dependencies: 6926 !CONFIG_LOCKDEP_SMALL: direct dependencies: 9052 (+30.7%) Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bvanassche@acm.org Cc: frederic@kernel.org Cc: ming.lei@redhat.com Cc: will.deacon@arm.com Link: https://lkml.kernel.org/r/20190506081939.74287-21-duyuyang@gmail.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 30a1c0e32573..63b82921698d 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1739,6 +1739,7 @@ check_noncircular(struct held_lock *src, struct held_lock *target, return ret; } +#ifdef CONFIG_LOCKDEP_SMALL /* * Check that the dependency graph starting at can lead to * or not. If it can, -> dependency is already @@ -1768,6 +1769,7 @@ check_redundant(struct held_lock *src, struct held_lock *target) return ret; } +#endif #ifdef CONFIG_TRACE_IRQFLAGS @@ -2428,12 +2430,14 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, } } +#ifdef CONFIG_LOCKDEP_SMALL /* * Is the -> link redundant? */ ret = check_redundant(prev, next); if (ret != 1) return ret; +#endif if (!trace->nr_entries && !save_trace(trace)) return 0; -- cgit v1.2.3 From 091806515124b20f8cff7927b4b7ff399483b109 Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Mon, 6 May 2019 16:19:37 +0800 Subject: locking/lockdep: Consolidate lock usage bit initialization Lock usage bit initialization is consolidated into one function mark_usage(). Trivial readability improvement. No functional change. Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bvanassche@acm.org Cc: frederic@kernel.org Cc: ming.lei@redhat.com Cc: will.deacon@arm.com Link: https://lkml.kernel.org/r/20190506081939.74287-22-duyuyang@gmail.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 63b82921698d..1123e7e6c78d 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -3460,8 +3460,12 @@ void trace_softirqs_off(unsigned long ip) debug_atomic_inc(redundant_softirqs_off); } -static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock) +static int +mark_usage(struct task_struct *curr, struct held_lock *hlock, int check) { + if (!check) + goto lock_used; + /* * If non-trylock use in a hardirq or softirq context, then * mark the lock as used in these contexts: @@ -3505,6 +3509,11 @@ static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock) } } +lock_used: + /* mark it as used: */ + if (!mark_lock(curr, hlock, LOCK_USED)) + return 0; + return 1; } @@ -3546,8 +3555,8 @@ int mark_lock_irq(struct task_struct *curr, struct held_lock *this, return 1; } -static inline int mark_irqflags(struct task_struct *curr, - struct held_lock *hlock) +static inline int +mark_usage(struct task_struct *curr, struct held_lock *hlock, int check) { return 1; } @@ -3833,11 +3842,8 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, #endif hlock->pin_count = pin_count; - if (check && !mark_irqflags(curr, hlock)) - return 0; - - /* mark it as used: */ - if (!mark_lock(curr, hlock, LOCK_USED)) + /* Initialize the lock usage bit */ + if (!mark_usage(curr, hlock, check)) return 0; /* -- cgit v1.2.3 From 4d56330df22dd9dd9a24f147014f60ee4c914fb8 Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Mon, 6 May 2019 16:19:38 +0800 Subject: locking/lockdep: Adjust new bit cases in mark_lock The new bit can be any possible lock usage except it is garbage, so the cases in switch can be made simpler. Warn early on if wrong usage bit is passed without taking locks. No functional change. Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bvanassche@acm.org Cc: frederic@kernel.org Cc: ming.lei@redhat.com Cc: will.deacon@arm.com Link: https://lkml.kernel.org/r/20190506081939.74287-23-duyuyang@gmail.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 1123e7e6c78d..9c4e2a7547d3 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -3582,6 +3582,11 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, { unsigned int new_mask = 1 << new_bit, ret = 1; + if (new_bit >= LOCK_USAGE_STATES) { + DEBUG_LOCKS_WARN_ON(1); + return 0; + } + /* * If already set then do not dirty the cacheline, * nor do any checks: @@ -3605,25 +3610,13 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, return 0; switch (new_bit) { -#define LOCKDEP_STATE(__STATE) \ - case LOCK_USED_IN_##__STATE: \ - case LOCK_USED_IN_##__STATE##_READ: \ - case LOCK_ENABLED_##__STATE: \ - case LOCK_ENABLED_##__STATE##_READ: -#include "lockdep_states.h" -#undef LOCKDEP_STATE - ret = mark_lock_irq(curr, this, new_bit); - if (!ret) - return 0; - break; case LOCK_USED: debug_atomic_dec(nr_unused_locks); break; default: - if (!debug_locks_off_graph_unlock()) + ret = mark_lock_irq(curr, this, new_bit); + if (!ret) return 0; - WARN_ON(1); - return 0; } graph_unlock(); -- cgit v1.2.3 From bf998b98f5bce4ebc97b3980016f54fabb7a4958 Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Mon, 6 May 2019 16:19:39 +0800 Subject: locking/lockdep: Remove !dir in lock irq usage check In mark_lock_irq(), the following checks are performed: ---------------------------------- | -> | unsafe | read unsafe | |----------------------------------| | safe | F B | F* B* | |----------------------------------| | read safe | F? B* | - | ---------------------------------- Where: F: check_usage_forwards B: check_usage_backwards *: check enabled by STRICT_READ_CHECKS ?: check enabled by the !dir condition From checking point of view, the special F? case does not make sense, whereas it perhaps is made for peroformance concern. As later patch will address this issue, remove this exception, which makes the checks consistent later. With STRICT_READ_CHECKS = 1 which is default, there is no functional change. Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bvanassche@acm.org Cc: frederic@kernel.org Cc: ming.lei@redhat.com Cc: will.deacon@arm.com Link: https://lkml.kernel.org/r/20190506081939.74287-24-duyuyang@gmail.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 9c4e2a7547d3..2168e94715b9 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -3235,7 +3235,7 @@ mark_lock_irq(struct task_struct *curr, struct held_lock *this, * Validate that the lock dependencies don't have conflicting usage * states. */ - if ((!read || !dir || STRICT_READ_CHECKS) && + if ((!read || STRICT_READ_CHECKS) && !usage(curr, this, excl_bit, state_name(new_bit & ~LOCK_USAGE_READ_MASK))) return 0; -- cgit v1.2.3 From 9fd2e48b9ae17978b2c2a98c055c774d5d90bce8 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Tue, 7 May 2019 09:15:45 -0700 Subject: perf/core: Allow non-privileged uprobe for user processes Currently, non-privileged user could only use uprobe with kernel.perf_event_paranoid = -1 However, setting perf_event_paranoid to -1 leaks other users' processes to non-privileged uprobes. To introduce proper permission control of uprobes, we are building the following system: A daemon with CAP_SYS_ADMIN is in charge to create uprobes via tracefs; Users asks the daemon to create uprobes; Then user can attach uprobe only to processes owned by the user. This patch allows non-privileged user to attach uprobe to processes owned by the user. The following example shows how to use uprobe with non-privileged user. This is based on Brendan's blog post [1] 1. Create uprobe with root: sudo perf probe -x 'readline%return +0($retval):string' 2. Then non-root user can use the uprobe as: perf record -vvv -e probe_bash:readline__return -p sleep 20 perf script [1] http://www.brendangregg.com/blog/2015-06-28/linux-ftrace-uprobe.html Signed-off-by: Song Liu Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20190507161545.788381-1-songliubraving@fb.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 4 ++-- kernel/trace/trace_uprobe.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index abbd4b3b96c2..3005c80f621d 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8532,9 +8532,9 @@ static int perf_tp_event_match(struct perf_event *event, if (event->hw.state & PERF_HES_STOPPED) return 0; /* - * All tracepoints are from kernel-space. + * If exclude_kernel, only trace user-space tracepoints (uprobes) */ - if (event->attr.exclude_kernel) + if (event->attr.exclude_kernel && !user_mode(regs)) return 0; if (!perf_tp_filter_match(event, data)) diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index eb7e06b54741..0d60d6856de5 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1331,7 +1331,7 @@ static inline void init_trace_event_call(struct trace_uprobe *tu, call->event.funcs = &uprobe_funcs; call->class->define_fields = uprobe_event_define_fields; - call->flags = TRACE_EVENT_FL_UPROBE; + call->flags = TRACE_EVENT_FL_UPROBE | TRACE_EVENT_FL_CAP_ANY; call->class->reg = trace_uprobe_register; call->data = tu; } -- cgit v1.2.3 From f3a3a8257e5a1a5e67cbb1afdbc4c1c6a26f1b22 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 12 May 2019 17:55:11 +0200 Subject: perf/core: Add attr_groups_update into struct pmu Adding attr_update attribute group into pmu, to allow having multiple attribute groups for same group name. This will allow us to update "events" or "format" directories with attributes that depend on various HW conditions. For example having group_format_extra group that updates "format" directory only if pmu version is 2 and higher: static umode_t exra_is_visible(struct kobject *kobj, struct attribute *attr, int i) { return x86_pmu.version >= 2 ? attr->mode : 0; } static struct attribute_group group_format_extra = { .name = "format", .is_visible = exra_is_visible, }; Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Greg Kroah-Hartman Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20190512155518.21468-3-jolsa@kernel.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 3005c80f621d..118ad1aef6af 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9874,6 +9874,12 @@ static int pmu_dev_alloc(struct pmu *pmu) if (ret) goto del_dev; + if (pmu->attr_update) + ret = sysfs_update_groups(&pmu->dev->kobj, pmu->attr_update); + + if (ret) + goto del_dev; + out: return ret; -- cgit v1.2.3 From 8c8889d8eaf4501ae4aaf870b6f8f55db5d5109a Mon Sep 17 00:00:00 2001 From: Imre Deak Date: Fri, 24 May 2019 23:15:08 +0300 Subject: locking/lockdep: Fix OOO unlock when hlocks need merging The sequence static DEFINE_WW_CLASS(test_ww_class); struct ww_acquire_ctx ww_ctx; struct ww_mutex ww_lock_a; struct ww_mutex ww_lock_b; struct mutex lock_c; struct mutex lock_d; ww_acquire_init(&ww_ctx, &test_ww_class); ww_mutex_init(&ww_lock_a, &test_ww_class); ww_mutex_init(&ww_lock_b, &test_ww_class); mutex_init(&lock_c); ww_mutex_lock(&ww_lock_a, &ww_ctx); mutex_lock(&lock_c); ww_mutex_lock(&ww_lock_b, &ww_ctx); mutex_unlock(&lock_c); (*) ww_mutex_unlock(&ww_lock_b); ww_mutex_unlock(&ww_lock_a); ww_acquire_fini(&ww_ctx); triggers the following WARN in __lock_release() when doing the unlock at *: DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - 1); The problem is that the WARN check doesn't take into account the merging of ww_lock_a and ww_lock_b which results in decreasing curr->lockdep_depth by 2 not only 1. Note that the following sequence doesn't trigger the WARN, since there won't be any hlock merging. ww_acquire_init(&ww_ctx, &test_ww_class); ww_mutex_init(&ww_lock_a, &test_ww_class); ww_mutex_init(&ww_lock_b, &test_ww_class); mutex_init(&lock_c); mutex_init(&lock_d); ww_mutex_lock(&ww_lock_a, &ww_ctx); mutex_lock(&lock_c); mutex_lock(&lock_d); ww_mutex_lock(&ww_lock_b, &ww_ctx); mutex_unlock(&lock_d); ww_mutex_unlock(&ww_lock_b); ww_mutex_unlock(&ww_lock_a); mutex_unlock(&lock_c); ww_acquire_fini(&ww_ctx); In general both of the above two sequences are valid and shouldn't trigger any lockdep warning. Fix this by taking the decrement due to the hlock merging into account during lock release and hlock class re-setting. Merging can't happen during lock downgrading since there won't be a new possibility to merge hlocks in that case, so add a WARN if merging still happens then. Signed-off-by: Imre Deak Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Cc: ville.syrjala@linux.intel.com Link: https://lkml.kernel.org/r/20190524201509.9199-1-imre.deak@intel.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 41 +++++++++++++++++++++++++++++------------ 1 file changed, 29 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 2168e94715b9..6c97f67ec321 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -3808,7 +3808,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, hlock->references = 2; } - return 1; + return 2; } } @@ -4011,22 +4011,33 @@ out: } static int reacquire_held_locks(struct task_struct *curr, unsigned int depth, - int idx) + int idx, unsigned int *merged) { struct held_lock *hlock; + int first_idx = idx; if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return 0; for (hlock = curr->held_locks + idx; idx < depth; idx++, hlock++) { - if (!__lock_acquire(hlock->instance, + switch (__lock_acquire(hlock->instance, hlock_class(hlock)->subclass, hlock->trylock, hlock->read, hlock->check, hlock->hardirqs_off, hlock->nest_lock, hlock->acquire_ip, - hlock->references, hlock->pin_count)) + hlock->references, hlock->pin_count)) { + case 0: return 1; + case 1: + break; + case 2: + *merged += (idx == first_idx); + break; + default: + WARN_ON(1); + return 0; + } } return 0; } @@ -4037,9 +4048,9 @@ __lock_set_class(struct lockdep_map *lock, const char *name, unsigned long ip) { struct task_struct *curr = current; + unsigned int depth, merged = 0; struct held_lock *hlock; struct lock_class *class; - unsigned int depth; int i; if (unlikely(!debug_locks)) @@ -4066,14 +4077,14 @@ __lock_set_class(struct lockdep_map *lock, const char *name, curr->lockdep_depth = i; curr->curr_chain_key = hlock->prev_chain_key; - if (reacquire_held_locks(curr, depth, i)) + if (reacquire_held_locks(curr, depth, i, &merged)) return 0; /* * I took it apart and put it back together again, except now I have * these 'spare' parts.. where shall I put them. */ - if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth)) + if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - merged)) return 0; return 1; } @@ -4081,8 +4092,8 @@ __lock_set_class(struct lockdep_map *lock, const char *name, static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip) { struct task_struct *curr = current; + unsigned int depth, merged = 0; struct held_lock *hlock; - unsigned int depth; int i; if (unlikely(!debug_locks)) @@ -4109,7 +4120,11 @@ static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip) hlock->read = 1; hlock->acquire_ip = ip; - if (reacquire_held_locks(curr, depth, i)) + if (reacquire_held_locks(curr, depth, i, &merged)) + return 0; + + /* Merging can't happen with unchanged classes.. */ + if (DEBUG_LOCKS_WARN_ON(merged)) return 0; /* @@ -4118,6 +4133,7 @@ static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip) */ if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth)) return 0; + return 1; } @@ -4132,8 +4148,8 @@ static int __lock_release(struct lockdep_map *lock, unsigned long ip) { struct task_struct *curr = current; + unsigned int depth, merged = 1; struct held_lock *hlock; - unsigned int depth; int i; if (unlikely(!debug_locks)) @@ -4192,14 +4208,15 @@ __lock_release(struct lockdep_map *lock, unsigned long ip) if (i == depth-1) return 1; - if (reacquire_held_locks(curr, depth, i + 1)) + if (reacquire_held_locks(curr, depth, i + 1, &merged)) return 0; /* * We had N bottles of beer on the wall, we drank one, but now * there's not N-1 bottles of beer left on the wall... + * Pouring two of the bottles together is acceptable. */ - DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth-1); + DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - merged); /* * Since reacquire_held_locks() would have called check_chain_key() -- cgit v1.2.3 From d9349850e188b8b59e5322fda17ff389a1c0cd7d Mon Sep 17 00:00:00 2001 From: Imre Deak Date: Fri, 24 May 2019 23:15:09 +0300 Subject: locking/lockdep: Fix merging of hlocks with non-zero references The sequence static DEFINE_WW_CLASS(test_ww_class); struct ww_acquire_ctx ww_ctx; struct ww_mutex ww_lock_a; struct ww_mutex ww_lock_b; struct ww_mutex ww_lock_c; struct mutex lock_c; ww_acquire_init(&ww_ctx, &test_ww_class); ww_mutex_init(&ww_lock_a, &test_ww_class); ww_mutex_init(&ww_lock_b, &test_ww_class); ww_mutex_init(&ww_lock_c, &test_ww_class); mutex_init(&lock_c); ww_mutex_lock(&ww_lock_a, &ww_ctx); mutex_lock(&lock_c); ww_mutex_lock(&ww_lock_b, &ww_ctx); ww_mutex_lock(&ww_lock_c, &ww_ctx); mutex_unlock(&lock_c); (*) ww_mutex_unlock(&ww_lock_c); ww_mutex_unlock(&ww_lock_b); ww_mutex_unlock(&ww_lock_a); ww_acquire_fini(&ww_ctx); (**) will trigger the following error in __lock_release() when calling mutex_release() at **: DEBUG_LOCKS_WARN_ON(depth <= 0) The problem is that the hlock merging happening at * updates the references for test_ww_class incorrectly to 3 whereas it should've updated it to 4 (representing all the instances for ww_ctx and ww_lock_[abc]). Fix this by updating the references during merging correctly taking into account that we can have non-zero references (both for the hlock that we merge into another hlock or for the hlock we are merging into). Signed-off-by: Imre Deak Signed-off-by: Peter Zijlstra (Intel) Cc: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Link: https://lkml.kernel.org/r/20190524201509.9199-2-imre.deak@intel.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 6c97f67ec321..48a840adb281 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -3796,17 +3796,17 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (depth) { hlock = curr->held_locks + depth - 1; if (hlock->class_idx == class_idx && nest_lock) { - if (hlock->references) { - /* - * Check: unsigned int references:12, overflow. - */ - if (DEBUG_LOCKS_WARN_ON(hlock->references == (1 << 12)-1)) - return 0; + if (!references) + references++; + if (!hlock->references) hlock->references++; - } else { - hlock->references = 2; - } + + hlock->references += references; + + /* Overflow */ + if (DEBUG_LOCKS_WARN_ON(hlock->references < references)) + return 0; return 2; } -- cgit v1.2.3 From 24811637dbfd07c69da7e9db586d35d17e6afca3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 27 May 2019 10:23:26 +0200 Subject: locking/lock_events: Use raw_cpu_{add,inc}() for stats Instead of playing silly games with CONFIG_DEBUG_PREEMPT toggling between this_cpu_*() and __this_cpu_*() use raw_cpu_*(), which is exactly what we want here. Signed-off-by: Peter Zijlstra (Intel) Acked-by: Linus Torvalds Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: H. Peter Anvin Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: Waiman Long Cc: Will Deacon Cc: huang ying Link: https://lkml.kernel.org/r/20190527082326.GP2623@hirez.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/locking/lock_events.h | 45 ++++---------------------------------------- 1 file changed, 4 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lock_events.h b/kernel/locking/lock_events.h index 46b71af8eef2..8c7e7d25f09c 100644 --- a/kernel/locking/lock_events.h +++ b/kernel/locking/lock_events.h @@ -31,50 +31,13 @@ enum lock_events { DECLARE_PER_CPU(unsigned long, lockevents[lockevent_num]); /* - * The purpose of the lock event counting subsystem is to provide a low - * overhead way to record the number of specific locking events by using - * percpu counters. It is the percpu sum that matters, not specifically - * how many of them happens in each cpu. - * - * It is possible that the same percpu counter may be modified in both - * the process and interrupt contexts. For architectures that perform - * percpu operation with multiple instructions, it is possible to lose - * count if a process context percpu update is interrupted in the middle - * and the same counter is updated in the interrupt context. Therefore, - * the generated percpu sum may not be precise. The error, if any, should - * be small and insignificant. - * - * For those architectures that do multi-instruction percpu operation, - * preemption in the middle and moving the task to another cpu may cause - * a larger error in the count. Again, this will be few and far between. - * Given the imprecise nature of the count and the possibility of resetting - * the count and doing the measurement again, this is not really a big - * problem. - * - * To get a better picture of what is happening under the hood, it is - * suggested that a few measurements should be taken with the counts - * reset in between to stamp out outliner because of these possible - * error conditions. - * - * To minimize overhead, we use __this_cpu_*() in all cases except when - * CONFIG_DEBUG_PREEMPT is defined. In this particular case, this_cpu_*() - * will be used to avoid the appearance of unwanted BUG messages. - */ -#ifdef CONFIG_DEBUG_PREEMPT -#define lockevent_percpu_inc(x) this_cpu_inc(x) -#define lockevent_percpu_add(x, v) this_cpu_add(x, v) -#else -#define lockevent_percpu_inc(x) __this_cpu_inc(x) -#define lockevent_percpu_add(x, v) __this_cpu_add(x, v) -#endif - -/* - * Increment the PV qspinlock statistical counters + * Increment the statistical counters. use raw_cpu_inc() because of lower + * overhead and we don't care if we loose the occasional update. */ static inline void __lockevent_inc(enum lock_events event, bool cond) { if (cond) - lockevent_percpu_inc(lockevents[event]); + raw_cpu_inc(lockevents[event]); } #define lockevent_inc(ev) __lockevent_inc(LOCKEVENT_ ##ev, true) @@ -82,7 +45,7 @@ static inline void __lockevent_inc(enum lock_events event, bool cond) static inline void __lockevent_add(enum lock_events event, int inc) { - lockevent_percpu_add(lockevents[event], inc); + raw_cpu_add(lockevents[event], inc); } #define lockevent_add(ev, c) __lockevent_add(LOCKEVENT_ ##ev, c) -- cgit v1.2.3 From 6a54cd872f50ef3b090fdd53d1b67f5b43e97315 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 22 Jan 2019 16:21:40 +0100 Subject: trace: no need to check return value of debugfs_create functions When calling debugfs functions, there is no need to ever check the return value. The function can work or not, but the code logic should never do something different based on this. Cc: Steven Rostedt Cc: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- kernel/trace/trace.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1c80521fd436..a95c5cd28135 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -8604,10 +8604,6 @@ struct dentry *tracing_init_dentry(void) */ tr->dir = debugfs_create_automount("tracing", NULL, trace_automount, NULL); - if (!tr->dir) { - pr_warn_once("Could not create debugfs directory 'tracing'\n"); - return ERR_PTR(-ENOMEM); - } return NULL; } -- cgit v1.2.3 From 3e6f176f304ee8effec698ebaff6b7baecb5e1e6 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 22 Jan 2019 16:21:41 +0100 Subject: blktrace: no need to check return value of debugfs_create functions When calling debugfs functions, there is no need to ever check the return value. The function can work or not, but the code logic should never do something different based on this. Cc: Jens Axboe Cc: Steven Rostedt Cc: Ingo Molnar Cc: linux-block@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- kernel/trace/blktrace.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index e1c6d79fb4cc..2d6e93ab0478 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -512,8 +512,6 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, dir = debugfs_lookup(buts->name, blk_debugfs_root); if (!dir) bt->dir = dir = debugfs_create_dir(buts->name, blk_debugfs_root); - if (!dir) - goto err; bt->dev = dev; atomic_set(&bt->dropped, 0); @@ -522,12 +520,8 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, ret = -EIO; bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt, &blk_dropped_fops); - if (!bt->dropped_file) - goto err; bt->msg_file = debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops); - if (!bt->msg_file) - goto err; bt->rchan = relay_open("trace", dir, buts->buf_size, buts->buf_nr, &blk_relay_callbacks, bt); -- cgit v1.2.3 From 4aa3b1f67d3dbac6864095273bc3466d2099ad05 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 22 Jan 2019 16:21:44 +0100 Subject: fail_function: no need to check return value of debugfs_create functions When calling debugfs functions, there is no need to ever check the return value. The function can work or not, but the code logic should never do something different based on this. Cc: Kees Cook Cc: Josef Bacik Cc: Thomas Gleixner Cc: "Naveen N. Rao" Cc: zhong jiang Acked-by: Masami Hiramatsu Signed-off-by: Greg Kroah-Hartman --- kernel/fail_function.c | 23 +++++------------------ 1 file changed, 5 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/fail_function.c b/kernel/fail_function.c index feb80712b913..63b349168da7 100644 --- a/kernel/fail_function.c +++ b/kernel/fail_function.c @@ -152,20 +152,13 @@ static int fei_retval_get(void *data, u64 *val) DEFINE_DEBUGFS_ATTRIBUTE(fei_retval_ops, fei_retval_get, fei_retval_set, "%llx\n"); -static int fei_debugfs_add_attr(struct fei_attr *attr) +static void fei_debugfs_add_attr(struct fei_attr *attr) { struct dentry *dir; dir = debugfs_create_dir(attr->kp.symbol_name, fei_debugfs_dir); - if (!dir) - return -ENOMEM; - - if (!debugfs_create_file("retval", 0600, dir, attr, &fei_retval_ops)) { - debugfs_remove_recursive(dir); - return -ENOMEM; - } - return 0; + debugfs_create_file("retval", 0600, dir, attr, &fei_retval_ops); } static void fei_debugfs_remove_attr(struct fei_attr *attr) @@ -306,7 +299,7 @@ static ssize_t fei_write(struct file *file, const char __user *buffer, ret = register_kprobe(&attr->kp); if (!ret) - ret = fei_debugfs_add_attr(attr); + fei_debugfs_add_attr(attr); if (ret < 0) fei_attr_remove(attr); else { @@ -337,19 +330,13 @@ static int __init fei_debugfs_init(void) return PTR_ERR(dir); /* injectable attribute is just a symlink of error_inject/list */ - if (!debugfs_create_symlink("injectable", dir, - "../error_injection/list")) - goto error; + debugfs_create_symlink("injectable", dir, "../error_injection/list"); - if (!debugfs_create_file("inject", 0600, dir, NULL, &fei_ops)) - goto error; + debugfs_create_file("inject", 0600, dir, NULL, &fei_ops); fei_debugfs_dir = dir; return 0; -error: - debugfs_remove_recursive(dir); - return -ENOMEM; } late_initcall(fei_debugfs_init); -- cgit v1.2.3 From 8c0fd1fa64c6120f3f02d4e8d8949e7599530286 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 22 Jan 2019 16:21:46 +0100 Subject: kprobes: no need to check return value of debugfs_create functions When calling debugfs functions, there is no need to ever check the return value. The function can work or not, but the code logic should never do something different based on this. Cc: "Naveen N. Rao" Cc: Anil S Keshavamurthy Cc: "David S. Miller" Acked-by: Masami Hiramatsu Signed-off-by: Greg Kroah-Hartman --- kernel/kprobes.c | 25 ++++++------------------- 1 file changed, 6 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 445337c107e0..9f5433a52488 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2570,33 +2570,20 @@ static const struct file_operations fops_kp = { static int __init debugfs_kprobe_init(void) { - struct dentry *dir, *file; + struct dentry *dir; unsigned int value = 1; dir = debugfs_create_dir("kprobes", NULL); - if (!dir) - return -ENOMEM; - file = debugfs_create_file("list", 0400, dir, NULL, - &debugfs_kprobes_operations); - if (!file) - goto error; + debugfs_create_file("list", 0400, dir, NULL, + &debugfs_kprobes_operations); - file = debugfs_create_file("enabled", 0600, dir, - &value, &fops_kp); - if (!file) - goto error; + debugfs_create_file("enabled", 0600, dir, &value, &fops_kp); - file = debugfs_create_file("blacklist", 0400, dir, NULL, - &debugfs_kprobe_blacklist_ops); - if (!file) - goto error; + debugfs_create_file("blacklist", 0400, dir, NULL, + &debugfs_kprobe_blacklist_ops); return 0; - -error: - debugfs_remove(dir); - return -ENOMEM; } late_initcall(debugfs_kprobe_init); -- cgit v1.2.3 From b1d2dc009dece4cd7e629419b52266ba51960a6b Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Thu, 23 May 2019 21:06:32 -0700 Subject: dma-contiguous: add dma_{alloc,free}_contiguous() helpers Both dma_alloc_from_contiguous() and dma_release_from_contiguous() are very simply implemented, but requiring callers to pass certain parameters like count and align, and taking a boolean parameter to check __GFP_NOWARN in the allocation flags. So every function call duplicates similar work: unsigned long order = get_order(size); size_t count = size >> PAGE_SHIFT; page = dma_alloc_from_contiguous(dev, count, order, gfp & __GFP_NOWARN); [...] dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT); Additionally, as CMA can be used only in the context which permits sleeping, most of callers do a gfpflags_allow_blocking() check and a corresponding fallback allocation of normal pages upon any false result: if (gfpflags_allow_blocking(flag)) page = dma_alloc_from_contiguous(); if (!page) page = alloc_pages(); [...] if (!dma_release_from_contiguous(dev, page, count)) __free_pages(page, get_order(size)); So this patch simplifies those function calls by abstracting these operations into the two new functions: dma_{alloc,free}_contiguous. As some callers of dma_{alloc,release}_from_contiguous() might be complicated, this patch just implements these two new functions to kernel/dma/direct.c only as an initial step. Suggested-by: Christoph Hellwig Signed-off-by: Nicolin Chen Tested-by: dann frazier Signed-off-by: Christoph Hellwig --- kernel/dma/contiguous.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++ kernel/dma/direct.c | 24 ++++-------------------- 2 files changed, 51 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index b2a87905846d..637b120d647b 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -214,6 +214,53 @@ bool dma_release_from_contiguous(struct device *dev, struct page *pages, return cma_release(dev_get_cma_area(dev), pages, count); } +/** + * dma_alloc_contiguous() - allocate contiguous pages + * @dev: Pointer to device for which the allocation is performed. + * @size: Requested allocation size. + * @gfp: Allocation flags. + * + * This function allocates contiguous memory buffer for specified device. It + * first tries to use device specific contiguous memory area if available or + * the default global one, then tries a fallback allocation of normal pages. + */ +struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp) +{ + int node = dev ? dev_to_node(dev) : NUMA_NO_NODE; + size_t count = PAGE_ALIGN(size) >> PAGE_SHIFT; + size_t align = get_order(PAGE_ALIGN(size)); + struct cma *cma = dev_get_cma_area(dev); + struct page *page = NULL; + + /* CMA can be used only in the context which permits sleeping */ + if (cma && gfpflags_allow_blocking(gfp)) { + align = min_t(size_t, align, CONFIG_CMA_ALIGNMENT); + page = cma_alloc(cma, count, align, gfp & __GFP_NOWARN); + } + + /* Fallback allocation of normal pages */ + if (!page) + page = alloc_pages_node(node, gfp, align); + return page; +} + +/** + * dma_free_contiguous() - release allocated pages + * @dev: Pointer to device for which the pages were allocated. + * @page: Pointer to the allocated pages. + * @size: Size of allocated pages. + * + * This function releases memory allocated by dma_alloc_contiguous(). As the + * cma_release returns false when provided pages do not belong to contiguous + * area and true otherwise, this function then does a fallback __free_pages() + * upon a false-return. + */ +void dma_free_contiguous(struct device *dev, struct page *page, size_t size) +{ + if (!cma_release(dev_get_cma_area(dev), page, size >> PAGE_SHIFT)) + __free_pages(page, get_order(size)); +} + /* * Support for reserved memory regions defined in device tree */ diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 2c2772e9702a..0816c1e8b05a 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -96,8 +96,6 @@ static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size) struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) { - unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT; - int page_order = get_order(size); struct page *page = NULL; u64 phys_mask; @@ -109,20 +107,9 @@ struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, gfp |= __dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, &phys_mask); again: - /* CMA can be used only in the context which permits sleeping */ - if (gfpflags_allow_blocking(gfp)) { - page = dma_alloc_from_contiguous(dev, count, page_order, - gfp & __GFP_NOWARN); - if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { - dma_release_from_contiguous(dev, page, count); - page = NULL; - } - } - if (!page) - page = alloc_pages_node(dev_to_node(dev), gfp, page_order); - + page = dma_alloc_contiguous(dev, size, gfp); if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { - __free_pages(page, page_order); + dma_free_contiguous(dev, page, size); page = NULL; if (IS_ENABLED(CONFIG_ZONE_DMA32) && @@ -154,7 +141,7 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, if (PageHighMem(page)) { /* * Depending on the cma= arguments and per-arch setup - * dma_alloc_from_contiguous could return highmem pages. + * dma_alloc_contiguous could return highmem pages. * Without remapping there is no way to return them here, * so log an error and fail. */ @@ -176,10 +163,7 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, void __dma_direct_free_pages(struct device *dev, size_t size, struct page *page) { - unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT; - - if (!dma_release_from_contiguous(dev, page, count)) - __free_pages(page, get_order(size)); + dma_free_contiguous(dev, page, size); } void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, -- cgit v1.2.3 From bd2e75633c8012fc8a7431c82fda66237133bf7e Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Thu, 23 May 2019 21:06:33 -0700 Subject: dma-contiguous: use fallback alloc_pages for single pages The addresses within a single page are always contiguous, so it's not so necessary to always allocate one single page from CMA area. Since the CMA area has a limited predefined size of space, it may run out of space in heavy use cases, where there might be quite a lot CMA pages being allocated for single pages. However, there is also a concern that a device might care where a page comes from -- it might expect the page from CMA area and act differently if the page doesn't. This patch tries to use the fallback alloc_pages path, instead of one-page size allocations from the global CMA area in case that a device does not have its own CMA area. This'd save resources from the CMA global area for more CMA allocations, and also reduce CMA fragmentations resulted from trivial allocations. Signed-off-by: Nicolin Chen Tested-by: dann frazier Signed-off-by: Christoph Hellwig --- kernel/dma/contiguous.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index 637b120d647b..bfc0c17f2a3d 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -223,14 +223,23 @@ bool dma_release_from_contiguous(struct device *dev, struct page *pages, * This function allocates contiguous memory buffer for specified device. It * first tries to use device specific contiguous memory area if available or * the default global one, then tries a fallback allocation of normal pages. + * + * Note that it byapss one-page size of allocations from the global area as + * the addresses within one page are always contiguous, so there is no need + * to waste CMA pages for that kind; it also helps reduce fragmentations. */ struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp) { int node = dev ? dev_to_node(dev) : NUMA_NO_NODE; size_t count = PAGE_ALIGN(size) >> PAGE_SHIFT; size_t align = get_order(PAGE_ALIGN(size)); - struct cma *cma = dev_get_cma_area(dev); struct page *page = NULL; + struct cma *cma = NULL; + + if (dev && dev->cma_area) + cma = dev->cma_area; + else if (count > 1) + cma = dma_contiguous_default_area; /* CMA can be used only in the context which permits sleeping */ if (cma && gfpflags_allow_blocking(gfp)) { -- cgit v1.2.3 From c30700db9eaabb35e0b123301df35a6846e6b6b4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 3 Jun 2019 08:43:51 +0200 Subject: dma-direct: provide generic support for uncached kernel segments A few architectures support uncached kernel segments. In that case we get an uncached mapping for a given physica address by using an offset in the uncached segement. Implement support for this scheme in the generic dma-direct code instead of duplicating it in arch hooks. Signed-off-by: Christoph Hellwig --- kernel/dma/direct.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 0816c1e8b05a..b67f0aa08aa3 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -158,6 +158,13 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, *dma_handle = phys_to_dma(dev, page_to_phys(page)); } memset(ret, 0, size); + + if (IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) && + !dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_NON_CONSISTENT)) { + arch_dma_prep_coherent(page, size); + ret = uncached_kernel_address(ret); + } + return ret; } @@ -173,13 +180,18 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, if (force_dma_unencrypted()) set_memory_encrypted((unsigned long)cpu_addr, 1 << page_order); + + if (IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) && + !dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_NON_CONSISTENT)) + cpu_addr = cached_kernel_address(cpu_addr); __dma_direct_free_pages(dev, size, virt_to_page(cpu_addr)); } void *dma_direct_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) { - if (!dev_is_dma_coherent(dev)) + if (!IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) && + !dev_is_dma_coherent(dev)) return arch_dma_alloc(dev, size, dma_handle, gfp, attrs); return dma_direct_alloc_pages(dev, size, dma_handle, gfp, attrs); } @@ -187,7 +199,8 @@ void *dma_direct_alloc(struct device *dev, size_t size, void dma_direct_free(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs) { - if (!dev_is_dma_coherent(dev)) + if (!IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) && + !dev_is_dma_coherent(dev)) arch_dma_free(dev, size, cpu_addr, dma_addr, attrs); else dma_direct_free_pages(dev, size, cpu_addr, dma_addr, attrs); -- cgit v1.2.3 From 1c769fc41ac574e4957a6a874334eed1631e5f59 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 22 Jan 2019 16:21:48 +0100 Subject: gcov: no need to check return value of debugfs_create functions When calling debugfs functions, there is no need to ever check the return value. The function can work or not, but the code logic should never do something different based on this. Also delete the dentry variable as it is never needed. Acked-by: Peter Oberparleiter Signed-off-by: Greg Kroah-Hartman --- kernel/gcov/fs.c | 24 ++---------------------- 1 file changed, 2 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c index 6e40ff6be083..e5eb5ea7ea59 100644 --- a/kernel/gcov/fs.c +++ b/kernel/gcov/fs.c @@ -64,7 +64,6 @@ struct gcov_node { static const char objtree[] = OBJTREE; static const char srctree[] = SRCTREE; static struct gcov_node root_node; -static struct dentry *reset_dentry; static LIST_HEAD(all_head); static DEFINE_MUTEX(node_lock); @@ -387,8 +386,6 @@ static void add_links(struct gcov_node *node, struct dentry *parent) goto out_err; node->links[i] = debugfs_create_symlink(deskew(basename), parent, target); - if (!node->links[i]) - goto out_err; kfree(target); } @@ -450,11 +447,6 @@ static struct gcov_node *new_node(struct gcov_node *parent, parent->dentry, node, &gcov_data_fops); } else node->dentry = debugfs_create_dir(node->name, parent->dentry); - if (!node->dentry) { - pr_warn("could not create file\n"); - kfree(node); - return NULL; - } if (info) add_links(node, parent->dentry); list_add(&node->list, &parent->children); @@ -761,32 +753,20 @@ void gcov_event(enum gcov_action action, struct gcov_info *info) /* Create debugfs entries. */ static __init int gcov_fs_init(void) { - int rc = -EIO; - init_node(&root_node, NULL, NULL, NULL); /* * /sys/kernel/debug/gcov will be parent for the reset control file * and all profiling files. */ root_node.dentry = debugfs_create_dir("gcov", NULL); - if (!root_node.dentry) - goto err_remove; /* * Create reset file which resets all profiling counts when written * to. */ - reset_dentry = debugfs_create_file("reset", 0600, root_node.dentry, - NULL, &gcov_reset_fops); - if (!reset_dentry) - goto err_remove; + debugfs_create_file("reset", 0600, root_node.dentry, NULL, + &gcov_reset_fops); /* Replay previous events to get our fs hierarchy up-to-date. */ gcov_enable_events(); return 0; - -err_remove: - pr_err("init failed\n"); - debugfs_remove(root_node.dentry); - - return rc; } device_initcall(gcov_fs_init); -- cgit v1.2.3 From 6685699e4ef5e9903d5c8bc6c2e6e13b931c98e1 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 4 Jun 2019 09:21:46 +0100 Subject: bpf: remove redundant assignment to err The variable err is assigned with the value -EINVAL that is never read and it is re-assigned a new value later on. The assignment is redundant and can be removed. Addresses-Coverity: ("Unused value") Signed-off-by: Colin Ian King Acked-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann --- kernel/bpf/devmap.c | 2 +- kernel/bpf/xskmap.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 5ae7cce5ef16..b58a33ca8a27 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -88,8 +88,8 @@ static u64 dev_map_bitmap_size(const union bpf_attr *attr) static struct bpf_map *dev_map_alloc(union bpf_attr *attr) { struct bpf_dtab *dtab; - int err = -EINVAL; u64 cost; + int err; if (!capable(CAP_NET_ADMIN)) return ERR_PTR(-EPERM); diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index 22066c28ba61..413d75f4fc72 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c @@ -17,8 +17,8 @@ struct xsk_map { static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) { - int cpu, err = -EINVAL; struct xsk_map *m; + int cpu, err; u64 cost; if (!capable(CAP_NET_ADMIN)) -- cgit v1.2.3 From 6e6de3dee51a439f76eb73c22ae2ffd2c9384712 Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Wed, 29 May 2019 07:26:25 -0400 Subject: kernel/module.c: Only return -EEXIST for modules that have finished loading Microsoft HyperV disables the X86_FEATURE_SMCA bit on AMD systems, and linux guests boot with repeated errors: amd64_edac_mod: Unknown symbol amd_unregister_ecc_decoder (err -2) amd64_edac_mod: Unknown symbol amd_register_ecc_decoder (err -2) amd64_edac_mod: Unknown symbol amd_report_gart_errors (err -2) amd64_edac_mod: Unknown symbol amd_unregister_ecc_decoder (err -2) amd64_edac_mod: Unknown symbol amd_register_ecc_decoder (err -2) amd64_edac_mod: Unknown symbol amd_report_gart_errors (err -2) The warnings occur because the module code erroneously returns -EEXIST for modules that have failed to load and are in the process of being removed from the module list. module amd64_edac_mod has a dependency on module edac_mce_amd. Using modules.dep, systemd will load edac_mce_amd for every request of amd64_edac_mod. When the edac_mce_amd module loads, the module has state MODULE_STATE_UNFORMED and once the module load fails and the state becomes MODULE_STATE_GOING. Another request for edac_mce_amd module executes and add_unformed_module() will erroneously return -EEXIST even though the previous instance of edac_mce_amd has MODULE_STATE_GOING. Upon receiving -EEXIST, systemd attempts to load amd64_edac_mod, which fails because of unknown symbols from edac_mce_amd. add_unformed_module() must wait to return for any case other than MODULE_STATE_LIVE to prevent a race between multiple loads of dependent modules. Signed-off-by: Prarit Bhargava Signed-off-by: Barret Rhoden Cc: David Arcari Cc: Jessica Yu Cc: Heiko Carstens Signed-off-by: Jessica Yu --- kernel/module.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 6e6712b3aaf5..1e7dcbe527af 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3397,8 +3397,7 @@ static bool finished_loading(const char *name) sched_annotate_sleep(); mutex_lock(&module_mutex); mod = find_module_all(name, strlen(name), true); - ret = !mod || mod->state == MODULE_STATE_LIVE - || mod->state == MODULE_STATE_GOING; + ret = !mod || mod->state == MODULE_STATE_LIVE; mutex_unlock(&module_mutex); return ret; @@ -3588,8 +3587,7 @@ again: mutex_lock(&module_mutex); old = find_module_all(mod->name, strlen(mod->name), true); if (old != NULL) { - if (old->state == MODULE_STATE_COMING - || old->state == MODULE_STATE_UNFORMED) { + if (old->state != MODULE_STATE_LIVE) { /* Wait in case it fails to load. */ mutex_unlock(&module_mutex); err = wait_event_interruptible(module_wq, -- cgit v1.2.3 From f36e664516b02c7f54bbd3094bab047d54bb5488 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Fri, 31 May 2019 09:41:47 +0200 Subject: livepatch: Use static buffer for debugging messages under rq lock The err_buf array uses 128 bytes of stack space. Move it off the stack by making it static. It's safe to use a shared buffer because klp_try_switch_task() is called under klp_mutex. Acked-by: Miroslav Benes Acked-by: Josh Poimboeuf Reviewed-by: Kamalesh Babulal Signed-off-by: Petr Mladek --- kernel/livepatch/transition.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c index c53370d596be..0a3889c4f617 100644 --- a/kernel/livepatch/transition.c +++ b/kernel/livepatch/transition.c @@ -293,11 +293,11 @@ static int klp_check_stack(struct task_struct *task, char *err_buf) */ static bool klp_try_switch_task(struct task_struct *task) { + static char err_buf[STACK_ERR_BUF_SIZE]; struct rq *rq; struct rq_flags flags; int ret; bool success = false; - char err_buf[STACK_ERR_BUF_SIZE]; err_buf[0] = '\0'; @@ -340,7 +340,6 @@ done: pr_debug("%s", err_buf); return success; - } /* -- cgit v1.2.3 From 15532fd6f57c297c45ef3f5c17d2fbcdcc8092e4 Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Thu, 23 May 2019 10:06:15 +0100 Subject: ptrace: move clearing of TIF_SYSCALL_EMU flag to core While the TIF_SYSCALL_EMU is set in ptrace_resume independent of any architecture, currently only powerpc and x86 unset the TIF_SYSCALL_EMU flag in ptrace_disable which gets called from ptrace_detach. Let's move the clearing of TIF_SYSCALL_EMU flag to __ptrace_unlink which gets executed from ptrace_detach and also keep it along with or close to clearing of TIF_SYSCALL_TRACE. Cc: Paul Mackerras Cc: Michael Ellerman Cc: Thomas Gleixner Cc: Ingo Molnar Acked-by: Oleg Nesterov Signed-off-by: Sudeep Holla Signed-off-by: Catalin Marinas --- kernel/ptrace.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 5710d07e67cf..ab14654b2436 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -118,6 +118,9 @@ void __ptrace_unlink(struct task_struct *child) BUG_ON(!child->ptrace); clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); +#ifdef TIF_SYSCALL_EMU + clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); +#endif child->parent = child->real_parent; list_del_init(&child->ptrace_entry); -- cgit v1.2.3 From 8d1b73dd25ff92c3fa9807a20c22fa2b44c07336 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 6 Jun 2019 13:18:53 -0500 Subject: kernel: module: Use struct_size() helper One of the more common cases of allocation size calculations is finding the size of a structure that has a zero-sized array at the end, along with memory for some number of elements for that array. For example: struct module_sect_attrs { ... struct module_sect_attr attrs[0]; }; Make use of the struct_size() helper instead of an open-coded version in order to avoid any potential type mistakes. So, replace the following form: sizeof(*sect_attrs) + nloaded * sizeof(sect_attrs->attrs[0] with: struct_size(sect_attrs, attrs, nloaded) This code was detected with the help of Coccinelle. Signed-off-by: Gustavo A. R. Silva Signed-off-by: Jessica Yu --- kernel/module.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 1e7dcbe527af..1a0d8cab9eb7 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1504,8 +1504,7 @@ static void add_sect_attrs(struct module *mod, const struct load_info *info) for (i = 0; i < info->hdr->e_shnum; i++) if (!sect_empty(&info->sechdrs[i])) nloaded++; - size[0] = ALIGN(sizeof(*sect_attrs) - + nloaded * sizeof(sect_attrs->attrs[0]), + size[0] = ALIGN(struct_size(sect_attrs, attrs, nloaded), sizeof(sect_attrs->grp.attrs[0])); size[1] = (nloaded + 1) * sizeof(sect_attrs->grp.attrs[0]); sect_attrs = kzalloc(size[0] + size[1], GFP_KERNEL); -- cgit v1.2.3 From 7f192e3cd316ba58c88dfa26796cf77789dd9872 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sat, 25 May 2019 11:36:41 +0200 Subject: fork: add clone3 This adds the clone3 system call. As mentioned several times already (cf. [7], [8]) here's the promised patchset for clone3(). We recently merged the CLONE_PIDFD patchset (cf. [1]). It took the last free flag from clone(). Independent of the CLONE_PIDFD patchset a time namespace has been discussed at Linux Plumber Conference last year and has been sent out and reviewed (cf. [5]). It is expected that it will go upstream in the not too distant future. However, it relies on the addition of the CLONE_NEWTIME flag to clone(). The only other good candidate - CLONE_DETACHED - is currently not recyclable as we have identified at least two large or widely used codebases that currently pass this flag (cf. [2], [3], and [4]). Given that CLONE_PIDFD grabbed the last clone() flag the time namespace is effectively blocked. clone3() has the advantage that it will unblock this patchset again. In general, clone3() is extensible and allows for the implementation of new features. The idea is to keep clone3() very simple and close to the original clone(), specifically, to keep on supporting old clone()-based workloads. We know there have been various creative proposals how a new process creation syscall or even api is supposed to look like. Some people even going so far as to argue that the traditional fork()+exec() split should be abandoned in favor of an in-kernel version of spawn(). Independent of whether or not we personally think spawn() is a good idea this patchset has and does not want to have anything to do with this. One stance we take is that there's no real good alternative to clone()+exec() and we need and want to support this model going forward; independent of spawn(). The following requirements guided clone3(): - bump the number of available flags - move arguments that are currently passed as separate arguments in clone() into a dedicated struct clone_args - choose a struct layout that is easy to handle on 32 and on 64 bit - choose a struct layout that is extensible - give new flags that currently need to abuse another flag's dedicated return argument in clone() their own dedicated return argument (e.g. CLONE_PIDFD) - use a separate kernel internal struct kernel_clone_args that is properly typed according to current kernel conventions in fork.c and is different from the uapi struct clone_args - port _do_fork() to use kernel_clone_args so that all process creation syscalls such as fork(), vfork(), clone(), and clone3() behave identical (Arnd suggested, that we can probably also port do_fork() itself in a separate patchset.) - ease of transition for userspace from clone() to clone3() This very much means that we do *not* remove functionality that userspace currently relies on as the latter is a good way of creating a syscall that won't be adopted. - do not try to be clever or complex: keep clone3() as dumb as possible In accordance with Linus suggestions (cf. [11]), clone3() has the following signature: /* uapi */ struct clone_args { __aligned_u64 flags; __aligned_u64 pidfd; __aligned_u64 child_tid; __aligned_u64 parent_tid; __aligned_u64 exit_signal; __aligned_u64 stack; __aligned_u64 stack_size; __aligned_u64 tls; }; /* kernel internal */ struct kernel_clone_args { u64 flags; int __user *pidfd; int __user *child_tid; int __user *parent_tid; int exit_signal; unsigned long stack; unsigned long stack_size; unsigned long tls; }; long sys_clone3(struct clone_args __user *uargs, size_t size) clone3() cleanly supports all of the supported flags from clone() and thus all legacy workloads. The advantage of sticking close to the old clone() is the low cost for userspace to switch to this new api. Quite a lot of userspace apis (e.g. pthreads) are based on the clone() syscall. With the new clone3() syscall supporting all of the old workloads and opening up the ability to add new features should make switching to it for userspace more appealing. In essence, glibc can just write a simple wrapper to switch from clone() to clone3(). There has been some interest in this patchset already. We have received a patch from the CRIU corner for clone3() that would set the PID/TID of a restored process without /proc/sys/kernel/ns_last_pid to eliminate a race. /* User visible differences to legacy clone() */ - CLONE_DETACHED will cause EINVAL with clone3() - CSIGNAL is deprecated It is superseeded by a dedicated "exit_signal" argument in struct clone_args freeing up space for additional flags. This is based on a suggestion from Andrei and Linus (cf. [9] and [10]) /* References */ [1]: b3e5838252665ee4cfa76b82bdf1198dca81e5be [2]: https://dxr.mozilla.org/mozilla-central/source/security/sandbox/linux/SandboxFilter.cpp#343 [3]: https://git.musl-libc.org/cgit/musl/tree/src/thread/pthread_create.c#n233 [4]: https://sources.debian.org/src/blcr/0.8.5-2.3/cr_module/cr_dump_self.c/?hl=740#L740 [5]: https://lore.kernel.org/lkml/20190425161416.26600-1-dima@arista.com/ [6]: https://lore.kernel.org/lkml/20190425161416.26600-2-dima@arista.com/ [7]: https://lore.kernel.org/lkml/CAHrFyr5HxpGXA2YrKza-oB-GGwJCqwPfyhD-Y5wbktWZdt0sGQ@mail.gmail.com/ [8]: https://lore.kernel.org/lkml/20190524102756.qjsjxukuq2f4t6bo@brauner.io/ [9]: https://lore.kernel.org/lkml/20190529222414.GA6492@gmail.com/ [10]: https://lore.kernel.org/lkml/CAHk-=whQP-Ykxi=zSYaV9iXsHsENa+2fdj-zYKwyeyed63Lsfw@mail.gmail.com/ [11]: https://lore.kernel.org/lkml/CAHk-=wieuV4hGwznPsX-8E0G2FKhx3NjZ9X3dTKh5zKd+iqOBw@mail.gmail.com/ Suggested-by: Linus Torvalds Signed-off-by: Christian Brauner Acked-by: Arnd Bergmann Acked-by: Serge Hallyn Cc: Kees Cook Cc: Pavel Emelyanov Cc: Jann Horn Cc: David Howells Cc: Andrew Morton Cc: Oleg Nesterov Cc: Adrian Reber Cc: Linus Torvalds Cc: Andrei Vagin Cc: Al Viro Cc: Florian Weimer Cc: linux-api@vger.kernel.org --- kernel/fork.c | 201 ++++++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 153 insertions(+), 48 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index b4cba953040a..08ff131f26b4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1760,19 +1760,15 @@ static __always_inline void delayed_free_task(struct task_struct *tsk) * flags). The actual kick-off is left to the caller. */ static __latent_entropy struct task_struct *copy_process( - unsigned long clone_flags, - unsigned long stack_start, - unsigned long stack_size, - int __user *parent_tidptr, - int __user *child_tidptr, struct pid *pid, int trace, - unsigned long tls, - int node) + int node, + struct kernel_clone_args *args) { int pidfd = -1, retval; struct task_struct *p; struct multiprocess_signals delayed; + u64 clone_flags = args->flags; /* * Don't allow sharing the root directory with processes in a different @@ -1821,27 +1817,12 @@ static __latent_entropy struct task_struct *copy_process( } if (clone_flags & CLONE_PIDFD) { - int reserved; - /* - * - CLONE_PARENT_SETTID is useless for pidfds and also - * parent_tidptr is used to return pidfds. * - CLONE_DETACHED is blocked so that we can potentially * reuse it later for CLONE_PIDFD. * - CLONE_THREAD is blocked until someone really needs it. */ - if (clone_flags & - (CLONE_DETACHED | CLONE_PARENT_SETTID | CLONE_THREAD)) - return ERR_PTR(-EINVAL); - - /* - * Verify that parent_tidptr is sane so we can potentially - * reuse it later. - */ - if (get_user(reserved, parent_tidptr)) - return ERR_PTR(-EFAULT); - - if (reserved != 0) + if (clone_flags & (CLONE_DETACHED | CLONE_THREAD)) return ERR_PTR(-EINVAL); } @@ -1874,11 +1855,11 @@ static __latent_entropy struct task_struct *copy_process( * p->set_child_tid which is (ab)used as a kthread's data pointer for * kernel threads (PF_KTHREAD). */ - p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; + p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL; /* * Clear TID on mm_release()? */ - p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL; + p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? args->child_tid : NULL; ftrace_graph_init_task(p); @@ -2037,7 +2018,8 @@ static __latent_entropy struct task_struct *copy_process( retval = copy_io(clone_flags, p); if (retval) goto bad_fork_cleanup_namespaces; - retval = copy_thread_tls(clone_flags, stack_start, stack_size, p, tls); + retval = copy_thread_tls(clone_flags, args->stack, args->stack_size, p, + args->tls); if (retval) goto bad_fork_cleanup_io; @@ -2062,7 +2044,7 @@ static __latent_entropy struct task_struct *copy_process( goto bad_fork_free_pid; pidfd = retval; - retval = put_user(pidfd, parent_tidptr); + retval = put_user(pidfd, args->pidfd); if (retval) goto bad_fork_put_pidfd; } @@ -2105,7 +2087,7 @@ static __latent_entropy struct task_struct *copy_process( if (clone_flags & CLONE_PARENT) p->exit_signal = current->group_leader->exit_signal; else - p->exit_signal = (clone_flags & CSIGNAL); + p->exit_signal = args->exit_signal; p->group_leader = p; p->tgid = p->pid; } @@ -2313,8 +2295,11 @@ static inline void init_idle_pids(struct task_struct *idle) struct task_struct *fork_idle(int cpu) { struct task_struct *task; - task = copy_process(CLONE_VM, 0, 0, NULL, NULL, &init_struct_pid, 0, 0, - cpu_to_node(cpu)); + struct kernel_clone_args args = { + .flags = CLONE_VM, + }; + + task = copy_process(&init_struct_pid, 0, cpu_to_node(cpu), &args); if (!IS_ERR(task)) { init_idle_pids(task); init_idle(task, cpu); @@ -2334,13 +2319,9 @@ struct mm_struct *copy_init_mm(void) * It copies the process, and if successful kick-starts * it and waits for it to finish using the VM if required. */ -long _do_fork(unsigned long clone_flags, - unsigned long stack_start, - unsigned long stack_size, - int __user *parent_tidptr, - int __user *child_tidptr, - unsigned long tls) +long _do_fork(struct kernel_clone_args *args) { + u64 clone_flags = args->flags; struct completion vfork; struct pid *pid; struct task_struct *p; @@ -2356,7 +2337,7 @@ long _do_fork(unsigned long clone_flags, if (!(clone_flags & CLONE_UNTRACED)) { if (clone_flags & CLONE_VFORK) trace = PTRACE_EVENT_VFORK; - else if ((clone_flags & CSIGNAL) != SIGCHLD) + else if (args->exit_signal != SIGCHLD) trace = PTRACE_EVENT_CLONE; else trace = PTRACE_EVENT_FORK; @@ -2365,8 +2346,7 @@ long _do_fork(unsigned long clone_flags, trace = 0; } - p = copy_process(clone_flags, stack_start, stack_size, parent_tidptr, - child_tidptr, NULL, trace, tls, NUMA_NO_NODE); + p = copy_process(NULL, trace, NUMA_NO_NODE, args); add_latent_entropy(); if (IS_ERR(p)) @@ -2382,7 +2362,7 @@ long _do_fork(unsigned long clone_flags, nr = pid_vnr(pid); if (clone_flags & CLONE_PARENT_SETTID) - put_user(nr, parent_tidptr); + put_user(nr, args->parent_tid); if (clone_flags & CLONE_VFORK) { p->vfork_done = &vfork; @@ -2414,8 +2394,16 @@ long do_fork(unsigned long clone_flags, int __user *parent_tidptr, int __user *child_tidptr) { - return _do_fork(clone_flags, stack_start, stack_size, - parent_tidptr, child_tidptr, 0); + struct kernel_clone_args args = { + .flags = (clone_flags & ~CSIGNAL), + .child_tid = child_tidptr, + .parent_tid = parent_tidptr, + .exit_signal = (clone_flags & CSIGNAL), + .stack = stack_start, + .stack_size = stack_size, + }; + + return _do_fork(&args); } #endif @@ -2424,15 +2412,25 @@ long do_fork(unsigned long clone_flags, */ pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) { - return _do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn, - (unsigned long)arg, NULL, NULL, 0); + struct kernel_clone_args args = { + .flags = ((flags | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL), + .exit_signal = (flags & CSIGNAL), + .stack = (unsigned long)fn, + .stack_size = (unsigned long)arg, + }; + + return _do_fork(&args); } #ifdef __ARCH_WANT_SYS_FORK SYSCALL_DEFINE0(fork) { #ifdef CONFIG_MMU - return _do_fork(SIGCHLD, 0, 0, NULL, NULL, 0); + struct kernel_clone_args args = { + .exit_signal = SIGCHLD, + }; + + return _do_fork(&args); #else /* can not support in nommu mode */ return -EINVAL; @@ -2443,8 +2441,12 @@ SYSCALL_DEFINE0(fork) #ifdef __ARCH_WANT_SYS_VFORK SYSCALL_DEFINE0(vfork) { - return _do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0, - 0, NULL, NULL, 0); + struct kernel_clone_args args = { + .flags = CLONE_VFORK | CLONE_VM, + .exit_signal = SIGCHLD, + }; + + return _do_fork(&args); } #endif @@ -2472,7 +2474,110 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, unsigned long, tls) #endif { - return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr, tls); + struct kernel_clone_args args = { + .flags = (clone_flags & ~CSIGNAL), + .pidfd = parent_tidptr, + .child_tid = child_tidptr, + .parent_tid = parent_tidptr, + .exit_signal = (clone_flags & CSIGNAL), + .stack = newsp, + .tls = tls, + }; + + /* clone(CLONE_PIDFD) uses parent_tidptr to return a pidfd */ + if ((clone_flags & CLONE_PIDFD) && (clone_flags & CLONE_PARENT_SETTID)) + return -EINVAL; + + return _do_fork(&args); +} + +noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs, + struct clone_args __user *uargs, + size_t size) +{ + struct clone_args args; + + if (unlikely(size > PAGE_SIZE)) + return -E2BIG; + + if (unlikely(size < sizeof(struct clone_args))) + return -EINVAL; + + if (unlikely(!access_ok(uargs, size))) + return -EFAULT; + + if (size > sizeof(struct clone_args)) { + unsigned char __user *addr; + unsigned char __user *end; + unsigned char val; + + addr = (void __user *)uargs + sizeof(struct clone_args); + end = (void __user *)uargs + size; + + for (; addr < end; addr++) { + if (get_user(val, addr)) + return -EFAULT; + if (val) + return -E2BIG; + } + + size = sizeof(struct clone_args); + } + + if (copy_from_user(&args, uargs, size)) + return -EFAULT; + + *kargs = (struct kernel_clone_args){ + .flags = args.flags, + .pidfd = u64_to_user_ptr(args.pidfd), + .child_tid = u64_to_user_ptr(args.child_tid), + .parent_tid = u64_to_user_ptr(args.parent_tid), + .exit_signal = args.exit_signal, + .stack = args.stack, + .stack_size = args.stack_size, + .tls = args.tls, + }; + + return 0; +} + +static bool clone3_args_valid(const struct kernel_clone_args *kargs) +{ + /* + * All lower bits of the flag word are taken. + * Verify that no other unknown flags are passed along. + */ + if (kargs->flags & ~CLONE_LEGACY_FLAGS) + return false; + + /* + * - make the CLONE_DETACHED bit reuseable for clone3 + * - make the CSIGNAL bits reuseable for clone3 + */ + if (kargs->flags & (CLONE_DETACHED | CSIGNAL)) + return false; + + if ((kargs->flags & (CLONE_THREAD | CLONE_PARENT)) && + kargs->exit_signal) + return false; + + return true; +} + +SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size) +{ + int err; + + struct kernel_clone_args kargs; + + err = copy_clone_args_from_user(&kargs, uargs, size); + if (err) + return err; + + if (!clone3_args_valid(&kargs)) + return -EINVAL; + + return _do_fork(&kargs); } #endif -- cgit v1.2.3 From c8a53b2db0aec40d8b217936e1b7f3d840c50390 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 23 May 2019 10:36:46 -0300 Subject: mm/hmm: Hold a mmgrab from hmm to mm MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit So long as a struct hmm pointer exists, so should the struct mm it is linked too. Hold the mmgrab() as soon as a hmm is created, and mmdrop() it once the hmm refcount goes to zero. Since mmdrop() (ie a 0 kref on struct mm) is now impossible with a !NULL mm->hmm delete the hmm_hmm_destroy(). Signed-off-by: Jason Gunthorpe Reviewed-by: Jérôme Glisse Reviewed-by: John Hubbard Reviewed-by: Ralph Campbell Reviewed-by: Ira Weiny Reviewed-by: Christoph Hellwig Tested-by: Philip Yang --- kernel/fork.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 75675b9bf6df..c704c3cedee7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -673,7 +673,6 @@ void __mmdrop(struct mm_struct *mm) WARN_ON_ONCE(mm == current->active_mm); mm_free_pgd(mm); destroy_context(mm); - hmm_mm_destroy(mm); mmu_notifier_mm_destroy(mm); check_mm(mm); put_user_ns(mm->user_ns); -- cgit v1.2.3 From fada7fdc83c0bf8755956bff707c42b609223301 Mon Sep 17 00:00:00 2001 From: Jonathan Lemon Date: Thu, 6 Jun 2019 13:59:40 -0700 Subject: bpf: Allow bpf_map_lookup_elem() on an xskmap Currently, the AF_XDP code uses a separate map in order to determine if an xsk is bound to a queue. Instead of doing this, have bpf_map_lookup_elem() return a xdp_sock. Rearrange some xdp_sock members to eliminate structure holes. Remove selftest - will be added back in later patch. Signed-off-by: Jonathan Lemon Acked-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 26 ++++++++++++++++++++++++-- kernel/bpf/xskmap.c | 7 +++++++ 2 files changed, 31 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5c2cb5bd84ce..8d1786357a09 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -334,7 +334,8 @@ static bool type_is_sk_pointer(enum bpf_reg_type type) { return type == PTR_TO_SOCKET || type == PTR_TO_SOCK_COMMON || - type == PTR_TO_TCP_SOCK; + type == PTR_TO_TCP_SOCK || + type == PTR_TO_XDP_SOCK; } static bool reg_type_may_be_null(enum bpf_reg_type type) @@ -406,6 +407,7 @@ static const char * const reg_type_str[] = { [PTR_TO_TCP_SOCK] = "tcp_sock", [PTR_TO_TCP_SOCK_OR_NULL] = "tcp_sock_or_null", [PTR_TO_TP_BUFFER] = "tp_buffer", + [PTR_TO_XDP_SOCK] = "xdp_sock", }; static char slot_type_char[] = { @@ -1363,6 +1365,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_SOCK_COMMON_OR_NULL: case PTR_TO_TCP_SOCK: case PTR_TO_TCP_SOCK_OR_NULL: + case PTR_TO_XDP_SOCK: return true; default: return false; @@ -1843,6 +1846,9 @@ static int check_sock_access(struct bpf_verifier_env *env, int insn_idx, case PTR_TO_TCP_SOCK: valid = bpf_tcp_sock_is_valid_access(off, size, t, &info); break; + case PTR_TO_XDP_SOCK: + valid = bpf_xdp_sock_is_valid_access(off, size, t, &info); + break; default: valid = false; } @@ -2007,6 +2013,9 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, case PTR_TO_TCP_SOCK: pointer_desc = "tcp_sock "; break; + case PTR_TO_XDP_SOCK: + pointer_desc = "xdp_sock "; + break; default: break; } @@ -2905,10 +2914,14 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, * appear. */ case BPF_MAP_TYPE_CPUMAP: - case BPF_MAP_TYPE_XSKMAP: if (func_id != BPF_FUNC_redirect_map) goto error; break; + case BPF_MAP_TYPE_XSKMAP: + if (func_id != BPF_FUNC_redirect_map && + func_id != BPF_FUNC_map_lookup_elem) + goto error; + break; case BPF_MAP_TYPE_ARRAY_OF_MAPS: case BPF_MAP_TYPE_HASH_OF_MAPS: if (func_id != BPF_FUNC_map_lookup_elem) @@ -3799,6 +3812,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, case PTR_TO_SOCK_COMMON_OR_NULL: case PTR_TO_TCP_SOCK: case PTR_TO_TCP_SOCK_OR_NULL: + case PTR_TO_XDP_SOCK: verbose(env, "R%d pointer arithmetic on %s prohibited\n", dst, reg_type_str[ptr_reg->type]); return -EACCES; @@ -5038,6 +5052,9 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, if (reg->map_ptr->inner_map_meta) { reg->type = CONST_PTR_TO_MAP; reg->map_ptr = reg->map_ptr->inner_map_meta; + } else if (reg->map_ptr->map_type == + BPF_MAP_TYPE_XSKMAP) { + reg->type = PTR_TO_XDP_SOCK; } else { reg->type = PTR_TO_MAP_VALUE; } @@ -6299,6 +6316,7 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, case PTR_TO_SOCK_COMMON_OR_NULL: case PTR_TO_TCP_SOCK: case PTR_TO_TCP_SOCK_OR_NULL: + case PTR_TO_XDP_SOCK: /* Only valid matches are exact, which memcmp() above * would have accepted */ @@ -6693,6 +6711,7 @@ static bool reg_type_mismatch_ok(enum bpf_reg_type type) case PTR_TO_SOCK_COMMON_OR_NULL: case PTR_TO_TCP_SOCK: case PTR_TO_TCP_SOCK_OR_NULL: + case PTR_TO_XDP_SOCK: return false; default: return true; @@ -7826,6 +7845,9 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) case PTR_TO_TCP_SOCK: convert_ctx_access = bpf_tcp_sock_convert_ctx_access; break; + case PTR_TO_XDP_SOCK: + convert_ctx_access = bpf_xdp_sock_convert_ctx_access; + break; default: continue; } diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index 413d75f4fc72..ef7338cebd18 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c @@ -151,6 +151,12 @@ void __xsk_map_flush(struct bpf_map *map) } static void *xsk_map_lookup_elem(struct bpf_map *map, void *key) +{ + WARN_ON_ONCE(!rcu_read_lock_held()); + return __xsk_map_lookup_elem(map, *(u32 *)key); +} + +static void *xsk_map_lookup_elem_sys_only(struct bpf_map *map, void *key) { return ERR_PTR(-EOPNOTSUPP); } @@ -218,6 +224,7 @@ const struct bpf_map_ops xsk_map_ops = { .map_free = xsk_map_free, .map_get_next_key = xsk_map_get_next_key, .map_lookup_elem = xsk_map_lookup_elem, + .map_lookup_elem_sys_only = xsk_map_lookup_elem_sys_only, .map_update_elem = xsk_map_update_elem, .map_delete_elem = xsk_map_delete_elem, .map_check_btf = map_check_no_btf, -- cgit v1.2.3 From 0bfaffbf4cc6f380a83eabc679c0d6eb307cf6e4 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Tue, 11 Jun 2019 10:58:24 -0700 Subject: swiotlb: Group identical cleanup in swiotlb_cleanup() Avoid repeating the zeroing of global swiotlb variables in two locations and introduce swiotlb_cleanup() to do that. Signed-off-by: Florian Fainelli Signed-off-by: Konrad Rzeszutek Wilk --- kernel/dma/swiotlb.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 38d57218809c..20f414f713e5 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -309,6 +309,14 @@ swiotlb_late_init_with_default_size(size_t default_size) return rc; } +static void swiotlb_cleanup(void) +{ + io_tlb_end = 0; + io_tlb_start = 0; + io_tlb_nslabs = 0; + max_segment = 0; +} + int swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) { @@ -359,10 +367,7 @@ cleanup4: sizeof(int))); io_tlb_list = NULL; cleanup3: - io_tlb_end = 0; - io_tlb_start = 0; - io_tlb_nslabs = 0; - max_segment = 0; + swiotlb_cleanup(); return -ENOMEM; } @@ -386,10 +391,7 @@ void __init swiotlb_exit(void) memblock_free_late(io_tlb_start, PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT)); } - io_tlb_start = 0; - io_tlb_end = 0; - io_tlb_nslabs = 0; - max_segment = 0; + swiotlb_cleanup(); } /* -- cgit v1.2.3 From 4aa095ea329d76aeb2ae043e0d1ed96049d47940 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Tue, 11 Jun 2019 10:58:25 -0700 Subject: swiotlb: Return consistent SWIOTLB segments/nr_tbl With a specifically contrived memory layout where there is no physical memory available to the kernel below the 4GB boundary, we will fail to perform the initial swiotlb_init() call and set no_iotlb_memory to true. There are drivers out there that call into swiotlb_nr_tbl() to determine whether they can use the SWIOTLB. With the right DMA_BIT_MASK() value for these drivers (say 64-bit), they won't ever need to hit swiotlb_tbl_map_single() so this can go unoticed and we would be possibly lying about those drivers. Signed-off-by: Florian Fainelli Signed-off-by: Konrad Rzeszutek Wilk --- kernel/dma/swiotlb.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 20f414f713e5..72506085795c 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -128,15 +128,17 @@ setup_io_tlb_npages(char *str) } early_param("swiotlb", setup_io_tlb_npages); +static bool no_iotlb_memory; + unsigned long swiotlb_nr_tbl(void) { - return io_tlb_nslabs; + return unlikely(no_iotlb_memory) ? 0 : io_tlb_nslabs; } EXPORT_SYMBOL_GPL(swiotlb_nr_tbl); unsigned int swiotlb_max_segment(void) { - return max_segment; + return unlikely(no_iotlb_memory) ? 0 : max_segment; } EXPORT_SYMBOL_GPL(swiotlb_max_segment); @@ -159,8 +161,6 @@ unsigned long swiotlb_size_or_default(void) return size ? size : (IO_TLB_DEFAULT_SIZE); } -static bool no_iotlb_memory; - void swiotlb_print_info(void) { unsigned long bytes = io_tlb_nslabs << IO_TLB_SHIFT; -- cgit v1.2.3 From 0e5aa23282f8e6ee38c18f67ddfdaaa32d3df86b Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Fri, 7 Jun 2019 13:42:53 -0400 Subject: hrtimer: Remove unused header include seq_file.h does not need to be included, so remove it. Signed-off-by: Yangtao Li Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20190607174253.27403-1-tiny.windzz@gmail.com --- kernel/time/hrtimer.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 41dfff23c1f9..edb230aba3d1 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -30,7 +30,6 @@ #include #include #include -#include #include #include #include -- cgit v1.2.3 From 619c1baa91b2820eae9ff5d89eb525df81ea7a5a Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Mon, 27 May 2019 22:55:14 +0200 Subject: genirq/timings: Fix next event index function The current code is luckily working with most of the interval samples testing but actually it fails to correctly detect pattern repetition breaking at the end of the buffer. Narrowing down the bug has been a real pain because of the pointers, so the routine is rewrittne by using indexes instead. Fixes: bbba0e7c5cda "genirq/timings: Add array suffix computation code" Signed-off-by: Daniel Lezcano Signed-off-by: Thomas Gleixner Cc: andriy.shevchenko@linux.intel.com Link: https://lkml.kernel.org/r/20190527205521.12091-2-daniel.lezcano@linaro.org --- kernel/irq/timings.c | 51 ++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 42 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c index 90c735da15d0..4f5daf3db13b 100644 --- a/kernel/irq/timings.c +++ b/kernel/irq/timings.c @@ -297,7 +297,16 @@ static u64 irq_timings_ema_new(u64 value, u64 ema_old) static int irq_timings_next_event_index(int *buffer, size_t len, int period_max) { - int i; + int period; + + /* + * Move the beginning pointer to the end minus the max period x 3. + * We are at the point we can begin searching the pattern + */ + buffer = &buffer[len - (period_max * 3)]; + + /* Adjust the length to the maximum allowed period x 3 */ + len = period_max * 3; /* * The buffer contains the suite of intervals, in a ilog2 @@ -306,21 +315,45 @@ static int irq_timings_next_event_index(int *buffer, size_t len, int period_max) * period beginning at the end of the buffer. We do that for * each suffix. */ - for (i = period_max; i >= PREDICTION_PERIOD_MIN ; i--) { + for (period = period_max; period >= PREDICTION_PERIOD_MIN; period--) { - int *begin = &buffer[len - (i * 3)]; - int *ptr = begin; + /* + * The first comparison always succeed because the + * suffix is deduced from the first n-period bytes of + * the buffer and we compare the initial suffix with + * itself, so we can skip the first iteration. + */ + int idx = period; + size_t size = period; /* * We look if the suite with period 'i' repeat * itself. If it is truncated at the end, as it * repeats we can use the period to find out the next - * element. + * element with the modulo. */ - while (!memcmp(ptr, begin, i * sizeof(*ptr))) { - ptr += i; - if (ptr >= &buffer[len]) - return begin[((i * 3) % i)]; + while (!memcmp(buffer, &buffer[idx], size * sizeof(int))) { + + /* + * Move the index in a period basis + */ + idx += size; + + /* + * If this condition is reached, all previous + * memcmp were successful, so the period is + * found. + */ + if (idx == len) + return buffer[len % period]; + + /* + * If the remaining elements to compare are + * smaller than the period, readjust the size + * of the comparison for the last iteration. + */ + if (len - idx < period) + size = len - idx; } } -- cgit v1.2.3 From 2840eef0513c518faeb8a0ab8d07268c6285cdd0 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Mon, 27 May 2019 22:55:15 +0200 Subject: genirq/timings: Fix timings buffer inspection It appears the index beginning computation is not correct, the current code does: i = (irqts->count & IRQ_TIMINGS_MASK) - 1 If irqts->count is equal to zero, we end up with an index equal to -1, but that does not happen because the function checks against zero before and returns in such case. However, if irqts->count is a multiple of IRQ_TIMINGS_SIZE, the resulting & bit op will be zero and leads also to a -1 index. Re-introduce the iteration loop belonging to the previous variance code which was correct. Fixes: bbba0e7c5cda "genirq/timings: Add array suffix computation code" Signed-off-by: Daniel Lezcano Signed-off-by: Thomas Gleixner Cc: andriy.shevchenko@linux.intel.com Link: https://lkml.kernel.org/r/20190527205521.12091-3-daniel.lezcano@linaro.org --- kernel/irq/timings.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c index 4f5daf3db13b..19d2fad379ee 100644 --- a/kernel/irq/timings.c +++ b/kernel/irq/timings.c @@ -267,6 +267,23 @@ void irq_timings_disable(void) #define PREDICTION_MAX 10 /* 2 ^ PREDICTION_MAX useconds */ #define PREDICTION_BUFFER_SIZE 16 /* slots for EMAs, hardly more than 16 */ +/* + * Number of elements in the circular buffer: If it happens it was + * flushed before, then the number of elements could be smaller than + * IRQ_TIMINGS_SIZE, so the count is used, otherwise the array size is + * used as we wrapped. The index begins from zero when we did not + * wrap. That could be done in a nicer way with the proper circular + * array structure type but with the cost of extra computation in the + * interrupt handler hot path. We choose efficiency. + */ +#define for_each_irqts(i, irqts) \ + for (i = irqts->count < IRQ_TIMINGS_SIZE ? \ + 0 : irqts->count & IRQ_TIMINGS_MASK, \ + irqts->count = min(IRQ_TIMINGS_SIZE, \ + irqts->count); \ + irqts->count > 0; irqts->count--, \ + i = (i + 1) & IRQ_TIMINGS_MASK) + struct irqt_stat { u64 last_ts; u64 ema_time[PREDICTION_BUFFER_SIZE]; @@ -526,11 +543,7 @@ u64 irq_timings_next_event(u64 now) * model while decrementing the counter because we consume the * data from our circular buffer. */ - - i = (irqts->count & IRQ_TIMINGS_MASK) - 1; - irqts->count = min(IRQ_TIMINGS_SIZE, irqts->count); - - for (; irqts->count > 0; irqts->count--, i = (i + 1) & IRQ_TIMINGS_MASK) { + for_each_irqts(i, irqts) { irq = irq_timing_decode(irqts->values[i], &ts); s = idr_find(&irqt_stats, irq); if (s) -- cgit v1.2.3 From 3c2e79f4cef7938125b356e7f5c8fd038212619a Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Mon, 27 May 2019 22:55:16 +0200 Subject: genirq/timings: Optimize the period detection speed With a minimal period and if there is a period which is a multiple of it but lesser than the max period then it will be detected before and the minimal period will be never reached. 1 2 1 2 1 2 1 2 1 2 1 2 <-----> <-----> <-----> <-> <-> <-> <-> <-> <-> In that case, the minimum period is 2 and the maximum period is 5. That means all repeating pattern of 2 will be detected as repeating pattern of 4, it is pointless to go up to 2 when searching for the period as it will always fail. Remove one loop iteration by increasing the minimal period to 3. Signed-off-by: Daniel Lezcano Signed-off-by: Thomas Gleixner Cc: andriy.shevchenko@linux.intel.com Link: https://lkml.kernel.org/r/20190527205521.12091-4-daniel.lezcano@linaro.org --- kernel/irq/timings.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c index 19d2fad379ee..1d1c411d4cae 100644 --- a/kernel/irq/timings.c +++ b/kernel/irq/timings.c @@ -261,7 +261,7 @@ void irq_timings_disable(void) #define EMA_ALPHA_VAL 64 #define EMA_ALPHA_SHIFT 7 -#define PREDICTION_PERIOD_MIN 2 +#define PREDICTION_PERIOD_MIN 3 #define PREDICTION_PERIOD_MAX 5 #define PREDICTION_FACTOR 4 #define PREDICTION_MAX 10 /* 2 ^ PREDICTION_MAX useconds */ -- cgit v1.2.3 From df025e47e4e34b779af2cc72c350877be7104ef3 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Mon, 27 May 2019 22:55:17 +0200 Subject: genirq/timings: Encapsulate timings push For the next patches providing the selftest, it is required to artificially insert timings value in the circular buffer in order to check the correctness of the code. Encapsulate the common code between the future test code and the current code with an always-inline tag. No functional change. Signed-off-by: Daniel Lezcano Signed-off-by: Thomas Gleixner Cc: andriy.shevchenko@linux.intel.com Link: https://lkml.kernel.org/r/20190527205521.12091-5-daniel.lezcano@linaro.org --- kernel/irq/internals.h | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 70c3053bc1f6..21f9927ff5ad 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -354,6 +354,16 @@ static inline int irq_timing_decode(u64 value, u64 *timestamp) return value & U16_MAX; } +static __always_inline void irq_timings_push(u64 ts, int irq) +{ + struct irq_timings *timings = this_cpu_ptr(&irq_timings); + + timings->values[timings->count & IRQ_TIMINGS_MASK] = + irq_timing_encode(ts, irq); + + timings->count++; +} + /* * The function record_irq_time is only called in one place in the * interrupts handler. We want this function always inline so the code @@ -367,15 +377,8 @@ static __always_inline void record_irq_time(struct irq_desc *desc) if (!static_branch_likely(&irq_timing_enabled)) return; - if (desc->istate & IRQS_TIMINGS) { - struct irq_timings *timings = this_cpu_ptr(&irq_timings); - - timings->values[timings->count & IRQ_TIMINGS_MASK] = - irq_timing_encode(local_clock(), - irq_desc_get_irq(desc)); - - timings->count++; - } + if (desc->istate & IRQS_TIMINGS) + irq_timings_push(local_clock(), irq_desc_get_irq(desc)); } #else static inline void irq_remove_timings(struct irq_desc *desc) {} -- cgit v1.2.3 From 23aa3b9a6b7d5029c1f124426bc5ba4430dcc29c Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Mon, 27 May 2019 22:55:18 +0200 Subject: genirq/timings: Encapsulate storing function For the next patches providing the selftest, it is required to insert interval values directly in the buffer in order to check the correctness of the code. Encapsulate the code doing that in a always inline function in order to reuse it in the test code. No functional changes. Signed-off-by: Daniel Lezcano Signed-off-by: Thomas Gleixner Cc: andriy.shevchenko@linux.intel.com Link: https://lkml.kernel.org/r/20190527205521.12091-6-daniel.lezcano@linaro.org --- kernel/irq/timings.c | 53 +++++++++++++++++++++++++++++++++------------------- 1 file changed, 34 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c index 1d1c411d4cae..bc04eca6ef84 100644 --- a/kernel/irq/timings.c +++ b/kernel/irq/timings.c @@ -430,11 +430,43 @@ static u64 __irq_timings_next_event(struct irqt_stat *irqs, int irq, u64 now) return irqs->last_ts + irqs->ema_time[index]; } +static __always_inline int irq_timings_interval_index(u64 interval) +{ + /* + * The PREDICTION_FACTOR increase the interval size for the + * array of exponential average. + */ + u64 interval_us = (interval >> 10) / PREDICTION_FACTOR; + + return likely(interval_us) ? ilog2(interval_us) : 0; +} + +static __always_inline void __irq_timings_store(int irq, struct irqt_stat *irqs, + u64 interval) +{ + int index; + + /* + * Get the index in the ema table for this interrupt. + */ + index = irq_timings_interval_index(interval); + + /* + * Store the index as an element of the pattern in another + * circular array. + */ + irqs->circ_timings[irqs->count & IRQ_TIMINGS_MASK] = index; + + irqs->ema_time[index] = irq_timings_ema_new(interval, + irqs->ema_time[index]); + + irqs->count++; +} + static inline void irq_timings_store(int irq, struct irqt_stat *irqs, u64 ts) { u64 old_ts = irqs->last_ts; u64 interval; - int index; /* * The timestamps are absolute time values, we need to compute @@ -465,24 +497,7 @@ static inline void irq_timings_store(int irq, struct irqt_stat *irqs, u64 ts) return; } - /* - * Get the index in the ema table for this interrupt. The - * PREDICTION_FACTOR increase the interval size for the array - * of exponential average. - */ - index = likely(interval) ? - ilog2((interval >> 10) / PREDICTION_FACTOR) : 0; - - /* - * Store the index as an element of the pattern in another - * circular array. - */ - irqs->circ_timings[irqs->count & IRQ_TIMINGS_MASK] = index; - - irqs->ema_time[index] = irq_timings_ema_new(interval, - irqs->ema_time[index]); - - irqs->count++; + __irq_timings_store(irq, irqs, interval); } /** -- cgit v1.2.3 From 6aed82de719b424bd5548aa4179e95f34fd779ab Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Mon, 27 May 2019 22:55:19 +0200 Subject: genirq/timings: Add selftest for circular array Due to the complexity of the code and the difficulty to debug it, add some selftests to the framework in order to spot issues or regression at boot time when the runtime testing is enabled for this subsystem. This tests the circular buffer at the limits and validates: - the encoding / decoding of the values - the macro to browse the irq timings circular buffer - the function to push data in the circular buffer Signed-off-by: Daniel Lezcano Signed-off-by: Thomas Gleixner Cc: andriy.shevchenko@linux.intel.com Link: https://lkml.kernel.org/r/20190527205521.12091-7-daniel.lezcano@linaro.org --- kernel/irq/Makefile | 3 ++ kernel/irq/timings.c | 119 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 122 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index ff6e352e3a6c..b4f53717d143 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -2,6 +2,9 @@ obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o obj-$(CONFIG_IRQ_TIMINGS) += timings.o +ifeq ($(CONFIG_TEST_IRQ_TIMINGS),y) + CFLAGS_timings.o += -DDEBUG +endif obj-$(CONFIG_GENERIC_IRQ_CHIP) += generic-chip.o obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o obj-$(CONFIG_IRQ_DOMAIN) += irqdomain.o diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c index bc04eca6ef84..95b63bdea156 100644 --- a/kernel/irq/timings.c +++ b/kernel/irq/timings.c @@ -1,10 +1,12 @@ // SPDX-License-Identifier: GPL-2.0 // Copyright (C) 2016, Linaro Ltd - Daniel Lezcano +#define pr_fmt(fmt) "irq_timings: " fmt #include #include #include #include +#include #include #include #include @@ -625,3 +627,120 @@ int irq_timings_alloc(int irq) return 0; } + +#ifdef CONFIG_TEST_IRQ_TIMINGS +static int __init irq_timings_test_irqts(struct irq_timings *irqts, + unsigned count) +{ + int start = count >= IRQ_TIMINGS_SIZE ? count - IRQ_TIMINGS_SIZE : 0; + int i, irq, oirq = 0xBEEF; + u64 ots = 0xDEAD, ts; + + /* + * Fill the circular buffer by using the dedicated function. + */ + for (i = 0; i < count; i++) { + pr_debug("%d: index=%d, ts=%llX irq=%X\n", + i, i & IRQ_TIMINGS_MASK, ots + i, oirq + i); + + irq_timings_push(ots + i, oirq + i); + } + + /* + * Compute the first elements values after the index wrapped + * up or not. + */ + ots += start; + oirq += start; + + /* + * Test the circular buffer count is correct. + */ + pr_debug("---> Checking timings array count (%d) is right\n", count); + if (WARN_ON(irqts->count != count)) + return -EINVAL; + + /* + * Test the macro allowing to browse all the irqts. + */ + pr_debug("---> Checking the for_each_irqts() macro\n"); + for_each_irqts(i, irqts) { + + irq = irq_timing_decode(irqts->values[i], &ts); + + pr_debug("index=%d, ts=%llX / %llX, irq=%X / %X\n", + i, ts, ots, irq, oirq); + + if (WARN_ON(ts != ots || irq != oirq)) + return -EINVAL; + + ots++; oirq++; + } + + /* + * The circular buffer should have be flushed when browsed + * with for_each_irqts + */ + pr_debug("---> Checking timings array is empty after browsing it\n"); + if (WARN_ON(irqts->count)) + return -EINVAL; + + return 0; +} + +static int __init irq_timings_irqts_selftest(void) +{ + struct irq_timings *irqts = this_cpu_ptr(&irq_timings); + int i, ret; + + /* + * Test the circular buffer with different number of + * elements. The purpose is to test at the limits (empty, half + * full, full, wrapped with the cursor at the boundaries, + * wrapped several times, etc ... + */ + int count[] = { 0, + IRQ_TIMINGS_SIZE >> 1, + IRQ_TIMINGS_SIZE, + IRQ_TIMINGS_SIZE + (IRQ_TIMINGS_SIZE >> 1), + 2 * IRQ_TIMINGS_SIZE, + (2 * IRQ_TIMINGS_SIZE) + 3, + }; + + for (i = 0; i < ARRAY_SIZE(count); i++) { + + pr_info("---> Checking the timings with %d/%d values\n", + count[i], IRQ_TIMINGS_SIZE); + + ret = irq_timings_test_irqts(irqts, count[i]); + if (ret) + break; + } + + return ret; +} + +static int __init irq_timings_selftest(void) +{ + int ret; + + pr_info("------------------- selftest start -----------------\n"); + + /* + * At this point, we don't except any subsystem to use the irq + * timings but us, so it should not be enabled. + */ + if (static_branch_unlikely(&irq_timing_enabled)) { + pr_warn("irq timings already initialized, skipping selftest\n"); + return 0; + } + + ret = irq_timings_irqts_selftest(); + + pr_info("---------- selftest end with %s -----------\n", + ret ? "failure" : "success"); + + return ret; +} +early_initcall(irq_timings_selftest); +#endif -- cgit v1.2.3 From f52da98d900e18a250cb14ca426d4041ea7002db Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Mon, 27 May 2019 22:55:20 +0200 Subject: genirq/timings: Add selftest for irqs circular buffer After testing the per cpu interrupt circular event, make sure the per interrupt circular buffer usage is correct. Add tests to validate the interrupt circular buffer. Signed-off-by: Daniel Lezcano Signed-off-by: Thomas Gleixner Cc: andriy.shevchenko@linux.intel.com Link: https://lkml.kernel.org/r/20190527205521.12091-8-daniel.lezcano@linaro.org --- kernel/irq/timings.c | 139 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 139 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c index 95b63bdea156..5b13c2231d4f 100644 --- a/kernel/irq/timings.c +++ b/kernel/irq/timings.c @@ -629,6 +629,141 @@ int irq_timings_alloc(int irq) } #ifdef CONFIG_TEST_IRQ_TIMINGS +struct timings_intervals { + u64 *intervals; + size_t count; +}; + +/* + * Intervals are given in nanosecond base + */ +static u64 intervals0[] __initdata = { + 10000, 50000, 200000, 500000, + 10000, 50000, 200000, 500000, + 10000, 50000, 200000, 500000, + 10000, 50000, 200000, 500000, + 10000, 50000, 200000, 500000, + 10000, 50000, 200000, 500000, + 10000, 50000, 200000, 500000, + 10000, 50000, 200000, 500000, + 10000, 50000, 200000, +}; + +static u64 intervals1[] __initdata = { + 223947000, 1240000, 1384000, 1386000, 1386000, + 217416000, 1236000, 1384000, 1386000, 1387000, + 214719000, 1241000, 1386000, 1387000, 1384000, + 213696000, 1234000, 1384000, 1386000, 1388000, + 219904000, 1240000, 1385000, 1389000, 1385000, + 212240000, 1240000, 1386000, 1386000, 1386000, + 214415000, 1236000, 1384000, 1386000, 1387000, + 214276000, 1234000, +}; + +static u64 intervals2[] __initdata = { + 4000, 3000, 5000, 100000, + 3000, 3000, 5000, 117000, + 4000, 4000, 5000, 112000, + 4000, 3000, 4000, 110000, + 3000, 5000, 3000, 117000, + 4000, 4000, 5000, 112000, + 4000, 3000, 4000, 110000, + 3000, 4000, 5000, 112000, + 4000, +}; + +static u64 intervals3[] __initdata = { + 1385000, 212240000, 1240000, + 1386000, 214415000, 1236000, + 1384000, 214276000, 1234000, + 1386000, 214415000, 1236000, + 1385000, 212240000, 1240000, + 1386000, 214415000, 1236000, + 1384000, 214276000, 1234000, + 1386000, 214415000, 1236000, + 1385000, 212240000, 1240000, +}; + +static u64 intervals4[] __initdata = { + 10000, 50000, 10000, 50000, + 10000, 50000, 10000, 50000, + 10000, 50000, 10000, 50000, + 10000, 50000, 10000, 50000, + 10000, 50000, 10000, 50000, + 10000, 50000, 10000, 50000, + 10000, 50000, 10000, 50000, + 10000, 50000, 10000, 50000, + 10000, +}; + +static struct timings_intervals tis[] __initdata = { + { intervals0, ARRAY_SIZE(intervals0) }, + { intervals1, ARRAY_SIZE(intervals1) }, + { intervals2, ARRAY_SIZE(intervals2) }, + { intervals3, ARRAY_SIZE(intervals3) }, + { intervals4, ARRAY_SIZE(intervals4) }, +}; + +static int __init irq_timings_test_irqs(struct timings_intervals *ti) +{ + struct irqt_stat __percpu *s; + struct irqt_stat *irqs; + int i, index, ret, irq = 0xACE5; + + ret = irq_timings_alloc(irq); + if (ret) { + pr_err("Failed to allocate irq timings\n"); + return ret; + } + + s = idr_find(&irqt_stats, irq); + if (!s) { + ret = -EIDRM; + goto out; + } + + irqs = this_cpu_ptr(s); + + for (i = 0; i < ti->count; i++) { + + index = irq_timings_interval_index(ti->intervals[i]); + pr_debug("%d: interval=%llu ema_index=%d\n", + i, ti->intervals[i], index); + + __irq_timings_store(irq, irqs, ti->intervals[i]); + if (irqs->circ_timings[i & IRQ_TIMINGS_MASK] != index) { + pr_err("Failed to store in the circular buffer\n"); + goto out; + } + } + + if (irqs->count != ti->count) { + pr_err("Count differs\n"); + goto out; + } + + ret = 0; +out: + irq_timings_free(irq); + + return ret; +} + +static int __init irq_timings_irqs_selftest(void) +{ + int i, ret; + + for (i = 0; i < ARRAY_SIZE(tis); i++) { + pr_info("---> Injecting intervals number #%d (count=%zd)\n", + i, tis[i].count); + ret = irq_timings_test_irqs(&tis[i]); + if (ret) + break; + } + + return ret; +} + static int __init irq_timings_test_irqts(struct irq_timings *irqts, unsigned count) { @@ -736,7 +871,11 @@ static int __init irq_timings_selftest(void) } ret = irq_timings_irqts_selftest(); + if (ret) + goto out; + ret = irq_timings_irqs_selftest(); +out: pr_info("---------- selftest end with %s -----------\n", ret ? "failure" : "success"); -- cgit v1.2.3 From 699785f5d898965408430e841d10cd1cb2c02a77 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Mon, 27 May 2019 22:55:21 +0200 Subject: genirq/timings: Add selftest for next event computation The circular buffers are now validated with selftests. The next interrupt index algorithm which is the hardest part to validate needs extra coverage. Add a selftest which uses the intervals stored in the arrays and insert all the values except the last one. The next event computation must return the same value as the last element which was not inserted. Signed-off-by: Daniel Lezcano Signed-off-by: Thomas Gleixner Cc: andriy.shevchenko@linux.intel.com Link: https://lkml.kernel.org/r/20190527205521.12091-9-daniel.lezcano@linaro.org --- kernel/irq/timings.c | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c index 5b13c2231d4f..e960d7ce7bcc 100644 --- a/kernel/irq/timings.c +++ b/kernel/irq/timings.c @@ -704,6 +704,68 @@ static struct timings_intervals tis[] __initdata = { { intervals4, ARRAY_SIZE(intervals4) }, }; +static int __init irq_timings_test_next_index(struct timings_intervals *ti) +{ + int _buffer[IRQ_TIMINGS_SIZE]; + int buffer[IRQ_TIMINGS_SIZE]; + int index, start, i, count, period_max; + + count = ti->count - 1; + + period_max = count > (3 * PREDICTION_PERIOD_MAX) ? + PREDICTION_PERIOD_MAX : count / 3; + + /* + * Inject all values except the last one which will be used + * to compare with the next index result. + */ + pr_debug("index suite: "); + + for (i = 0; i < count; i++) { + index = irq_timings_interval_index(ti->intervals[i]); + _buffer[i & IRQ_TIMINGS_MASK] = index; + pr_cont("%d ", index); + } + + start = count < IRQ_TIMINGS_SIZE ? 0 : + count & IRQ_TIMINGS_MASK; + + count = min_t(int, count, IRQ_TIMINGS_SIZE); + + for (i = 0; i < count; i++) { + int index = (start + i) & IRQ_TIMINGS_MASK; + buffer[i] = _buffer[index]; + } + + index = irq_timings_next_event_index(buffer, count, period_max); + i = irq_timings_interval_index(ti->intervals[ti->count - 1]); + + if (index != i) { + pr_err("Expected (%d) and computed (%d) next indexes differ\n", + i, index); + return -EINVAL; + } + + return 0; +} + +static int __init irq_timings_next_index_selftest(void) +{ + int i, ret; + + for (i = 0; i < ARRAY_SIZE(tis); i++) { + + pr_info("---> Injecting intervals number #%d (count=%zd)\n", + i, tis[i].count); + + ret = irq_timings_test_next_index(&tis[i]); + if (ret) + break; + } + + return ret; +} + static int __init irq_timings_test_irqs(struct timings_intervals *ti) { struct irqt_stat __percpu *s; @@ -875,6 +937,10 @@ static int __init irq_timings_selftest(void) goto out; ret = irq_timings_irqs_selftest(); + if (ret) + goto out; + + ret = irq_timings_next_index_selftest(); out: pr_info("---------- selftest end with %s -----------\n", ret ? "failure" : "success"); -- cgit v1.2.3 From 0e51833042fccfe882ef3e85a346252550d26c22 Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Sun, 2 Jun 2019 20:21:17 +0900 Subject: genirq/affinity: Remove unused argument from [__]irq_build_affinity_masks() The *affd argument is neither used in irq_build_affinity_masks() nor __irq_build_affinity_masks(). Remove it. Signed-off-by: Minwoo Im Signed-off-by: Thomas Gleixner Reviewed-by: Ming Lei Cc: Minwoo Im Cc: linux-block@vger.kernel.org Link: https://lkml.kernel.org/r/20190602112117.31839-1-minwoo.im.dev@gmail.com --- kernel/irq/affinity.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index f18cd5aa33e8..4352b08ae48d 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -94,8 +94,7 @@ static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask, return nodes; } -static int __irq_build_affinity_masks(const struct irq_affinity *affd, - unsigned int startvec, +static int __irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs, unsigned int firstvec, cpumask_var_t *node_to_cpumask, @@ -171,8 +170,7 @@ static int __irq_build_affinity_masks(const struct irq_affinity *affd, * 1) spread present CPU on these vectors * 2) spread other possible CPUs on these vectors */ -static int irq_build_affinity_masks(const struct irq_affinity *affd, - unsigned int startvec, unsigned int numvecs, +static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs, unsigned int firstvec, struct irq_affinity_desc *masks) { @@ -197,7 +195,7 @@ static int irq_build_affinity_masks(const struct irq_affinity *affd, build_node_to_cpumask(node_to_cpumask); /* Spread on present CPUs starting from affd->pre_vectors */ - nr_present = __irq_build_affinity_masks(affd, curvec, numvecs, + nr_present = __irq_build_affinity_masks(curvec, numvecs, firstvec, node_to_cpumask, cpu_present_mask, nmsk, masks); @@ -212,7 +210,7 @@ static int irq_build_affinity_masks(const struct irq_affinity *affd, else curvec = firstvec + nr_present; cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask); - nr_others = __irq_build_affinity_masks(affd, curvec, numvecs, + nr_others = __irq_build_affinity_masks(curvec, numvecs, firstvec, node_to_cpumask, npresmsk, nmsk, masks); put_online_cpus(); @@ -295,7 +293,7 @@ irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd) unsigned int this_vecs = affd->set_size[i]; int ret; - ret = irq_build_affinity_masks(affd, curvec, this_vecs, + ret = irq_build_affinity_masks(curvec, this_vecs, curvec, masks); if (ret) { kfree(masks); -- cgit v1.2.3 From a66d955e910ab0e598d7a7450cbe6139f52befe7 Mon Sep 17 00:00:00 2001 From: Pavankumar Kondeti Date: Mon, 3 Jun 2019 10:01:03 +0530 Subject: cpu/hotplug: Abort disabling secondary CPUs if wakeup is pending When "deep" suspend is enabled, all CPUs except the primary CPU are frozen via CPU hotplug one by one. After all secondary CPUs are unplugged the wakeup pending condition is evaluated and if pending the suspend operation is aborted and the secondary CPUs are brought up again. CPU hotplug is a slow operation, so it makes sense to check for wakeup pending in the freezer loop before bringing down the next CPU. This improves the system suspend abort latency significantly. [ tglx: Massaged changelog and improved printk message ] Signed-off-by: Pavankumar Kondeti Signed-off-by: Thomas Gleixner Cc: "Rafael J. Wysocki" Cc: Len Brown Cc: Pavel Machek Cc: Josh Poimboeuf Cc: Peter Zijlstra Cc: Konrad Rzeszutek Wilk Cc: iri Kosina Cc: Mukesh Ojha Cc: linux-pm@vger.kernel.org Link: https://lkml.kernel.org/r/1559536263-16472-1-git-send-email-pkondeti@codeaurora.org --- kernel/cpu.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index be82cbc11a8a..0778249cd49d 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1221,6 +1221,13 @@ int freeze_secondary_cpus(int primary) for_each_online_cpu(cpu) { if (cpu == primary) continue; + + if (pm_wakeup_pending()) { + pr_info("Wakeup pending. Abort CPU freeze\n"); + error = -EBUSY; + break; + } + trace_suspend_resume(TPS("CPU_OFF"), cpu, true); error = _cpu_down(cpu, 1, CPUHP_OFFLINE); trace_suspend_resume(TPS("CPU_OFF"), cpu, false); -- cgit v1.2.3 From aee450cbe482a8c2f6fa5b05b178ef8b8ff107ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valdis=20Kl=C4=93tnieks?= Date: Thu, 6 Jun 2019 22:39:27 -0400 Subject: bpf: silence warning messages in core Compiling kernel/bpf/core.c with W=1 causes a flood of warnings: kernel/bpf/core.c:1198:65: warning: initialized field overwritten [-Woverride-init] 1198 | #define BPF_INSN_3_TBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = true | ^~~~ kernel/bpf/core.c:1087:2: note: in expansion of macro 'BPF_INSN_3_TBL' 1087 | INSN_3(ALU, ADD, X), \ | ^~~~~~ kernel/bpf/core.c:1202:3: note: in expansion of macro 'BPF_INSN_MAP' 1202 | BPF_INSN_MAP(BPF_INSN_2_TBL, BPF_INSN_3_TBL), | ^~~~~~~~~~~~ kernel/bpf/core.c:1198:65: note: (near initialization for 'public_insntable[12]') 1198 | #define BPF_INSN_3_TBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = true | ^~~~ kernel/bpf/core.c:1087:2: note: in expansion of macro 'BPF_INSN_3_TBL' 1087 | INSN_3(ALU, ADD, X), \ | ^~~~~~ kernel/bpf/core.c:1202:3: note: in expansion of macro 'BPF_INSN_MAP' 1202 | BPF_INSN_MAP(BPF_INSN_2_TBL, BPF_INSN_3_TBL), | ^~~~~~~~~~~~ 98 copies of the above. The attached patch silences the warnings, because we *know* we're overwriting the default initializer. That leaves bpf/core.c with only 6 other warnings, which become more visible in comparison. Signed-off-by: Valdis Kletnieks Acked-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann --- kernel/bpf/Makefile | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 4c2fa3ac56f6..29d781061cd5 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -1,5 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 obj-y := core.o +CFLAGS_core.o += $(call cc-disable-warning, override-init) obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o -- cgit v1.2.3 From 96050c68be33edef18800ad6748f61f81db81a20 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 20 Apr 2019 01:40:54 -0700 Subject: rcu: Upgrade sync_exp_work_done() to smp_mb() The sync_exp_work_done() function uses smp_mb__before_atomic(), but there is no obvious atomic in the ensuing code. The ordering is absolutely required for grace periods to work correctly, so this commit upgrades the smp_mb__before_atomic() to smp_mb(). Fixes: 6fba2b3767ea ("rcu: Remove deprecated RCU debugfs tracing code") Reported-by: Andrea Parri Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 9c990df880d1..d969650a72c6 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -259,8 +259,7 @@ static bool sync_exp_work_done(unsigned long s) { if (rcu_exp_gp_seq_done(s)) { trace_rcu_exp_grace_period(rcu_state.name, s, TPS("done")); - /* Ensure test happens before caller kfree(). */ - smp_mb__before_atomic(); /* ^^^ */ + smp_mb(); /* Ensure test happens before caller kfree(). */ return true; } return false; -- cgit v1.2.3 From bc6f2a757d525e001268c3658bd88822e768f8db Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Tue, 11 Jun 2019 23:00:07 +0800 Subject: kernel/module: Fix mem leak in module_add_modinfo_attrs In module_add_modinfo_attrs if sysfs_create_file fails, we forget to free allocated modinfo_attrs and roll back the sysfs files. Fixes: 03e88ae1b13d ("[PATCH] fix module sysfs files reference counting") Reviewed-by: Miroslav Benes Signed-off-by: YueHaibing Signed-off-by: Jessica Yu --- kernel/module.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 1a0d8cab9eb7..c1517053e9d6 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1708,6 +1708,8 @@ static int add_usage_links(struct module *mod) return ret; } +static void module_remove_modinfo_attrs(struct module *mod, int end); + static int module_add_modinfo_attrs(struct module *mod) { struct module_attribute *attr; @@ -1722,24 +1724,34 @@ static int module_add_modinfo_attrs(struct module *mod) return -ENOMEM; temp_attr = mod->modinfo_attrs; - for (i = 0; (attr = modinfo_attrs[i]) && !error; i++) { + for (i = 0; (attr = modinfo_attrs[i]); i++) { if (!attr->test || attr->test(mod)) { memcpy(temp_attr, attr, sizeof(*temp_attr)); sysfs_attr_init(&temp_attr->attr); error = sysfs_create_file(&mod->mkobj.kobj, &temp_attr->attr); + if (error) + goto error_out; ++temp_attr; } } + + return 0; + +error_out: + if (i > 0) + module_remove_modinfo_attrs(mod, --i); return error; } -static void module_remove_modinfo_attrs(struct module *mod) +static void module_remove_modinfo_attrs(struct module *mod, int end) { struct module_attribute *attr; int i; for (i = 0; (attr = &mod->modinfo_attrs[i]); i++) { + if (end >= 0 && i > end) + break; /* pick a field to test for end of list */ if (!attr->attr.name) break; @@ -1827,7 +1839,7 @@ static int mod_sysfs_setup(struct module *mod, return 0; out_unreg_modinfo_attrs: - module_remove_modinfo_attrs(mod); + module_remove_modinfo_attrs(mod, -1); out_unreg_param: module_param_sysfs_remove(mod); out_unreg_holders: @@ -1863,7 +1875,7 @@ static void mod_sysfs_fini(struct module *mod) { } -static void module_remove_modinfo_attrs(struct module *mod) +static void module_remove_modinfo_attrs(struct module *mod, int end) { } @@ -1879,7 +1891,7 @@ static void init_param_lock(struct module *mod) static void mod_sysfs_teardown(struct module *mod) { del_usage_links(mod); - module_remove_modinfo_attrs(mod); + module_remove_modinfo_attrs(mod, -1); module_param_sysfs_remove(mod); kobject_put(mod->mkobj.drivers_dir); kobject_put(mod->holders_dir); -- cgit v1.2.3 From 1ec0cd8286f35988134e05367ab5e66213b84e7c Mon Sep 17 00:00:00 2001 From: Mathieu Malaterre Date: Fri, 24 May 2019 12:44:18 +0200 Subject: PM: hibernate: powerpc: Expose pfn_is_nosave() prototype The declaration for pfn_is_nosave is only available in kernel/power/power.h. Since this function can be override in arch, expose it globally. Having a prototype will make sure to avoid warning (sometime treated as error with W=1) such as: arch/powerpc/kernel/suspend.c:18:5: error: no previous prototype for 'pfn_is_nosave' [-Werror=missing-prototypes] This moves the declaration into a globally visible header file and add missing include to avoid a warning on powerpc. Also remove the duplicated prototypes since not required anymore. Signed-off-by: Mathieu Malaterre Acked-by: Michael Ellerman (powerpc) Signed-off-by: Rafael J. Wysocki --- kernel/power/power.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/power.h b/kernel/power/power.h index 9e58bdc8a562..44bee462ff57 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -75,8 +75,6 @@ static inline void hibernate_reserved_size_init(void) {} static inline void hibernate_image_size_init(void) {} #endif /* !CONFIG_HIBERNATION */ -extern int pfn_is_nosave(unsigned long); - #define power_attr(_name) \ static struct kobj_attribute _name##_attr = { \ .attr = { \ -- cgit v1.2.3 From 4b4b077cbd0a998aebaa72c199e06b8a4c8dcfee Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Mon, 10 Jun 2019 15:54:37 -0700 Subject: dma-remap: Avoid de-referencing NULL atomic_pool With architectures allowing the kernel to be placed almost arbitrarily in memory (e.g.: ARM64), it is possible to have the kernel resides at physical addresses above 4GB, resulting in neither the default CMA area, nor the atomic pool from successfully allocating. This does not prevent specific peripherals from working though, one example is XHCI, which still operates correctly. Trouble comes when the XHCI driver gets suspended and resumed, since we can now trigger the following NPD: [ 12.664170] usb usb1: root hub lost power or was reset [ 12.669387] usb usb2: root hub lost power or was reset [ 12.674662] Unable to handle kernel NULL pointer dereference at virtual address 00000008 [ 12.682896] pgd = ffffffc1365a7000 [ 12.686386] [00000008] *pgd=0000000136500003, *pud=0000000136500003, *pmd=0000000000000000 [ 12.694897] Internal error: Oops: 96000006 [#1] SMP [ 12.699843] Modules linked in: [ 12.702980] CPU: 0 PID: 1499 Comm: pml Not tainted 4.9.135-1.13pre #51 [ 12.709577] Hardware name: BCM97268DV (DT) [ 12.713736] task: ffffffc136bb6540 task.stack: ffffffc1366cc000 [ 12.719740] PC is at addr_in_gen_pool+0x4/0x48 [ 12.724253] LR is at __dma_free+0x64/0xbc [ 12.728325] pc : [] lr : [] pstate: 60000145 [ 12.735825] sp : ffffffc1366cf990 [ 12.739196] x29: ffffffc1366cf990 x28: ffffffc1366cc000 [ 12.744608] x27: 0000000000000000 x26: ffffffc13a8568c8 [ 12.750020] x25: 0000000000000000 x24: ffffff80098f9000 [ 12.755433] x23: 000000013a5ff000 x22: ffffff8009c57000 [ 12.760844] x21: ffffffc13a856810 x20: 0000000000000000 [ 12.766255] x19: 0000000000001000 x18: 000000000000000a [ 12.771667] x17: 0000007f917553e0 x16: 0000000000001002 [ 12.777078] x15: 00000000000a36cb x14: ffffff80898feb77 [ 12.782490] x13: ffffffffffffffff x12: 0000000000000030 [ 12.787899] x11: 00000000fffffffe x10: ffffff80098feb7f [ 12.793311] x9 : 0000000005f5e0ff x8 : 65776f702074736f [ 12.798723] x7 : 6c2062756820746f x6 : ffffff80098febb1 [ 12.804134] x5 : ffffff800809797c x4 : 0000000000000000 [ 12.809545] x3 : 000000013a5ff000 x2 : 0000000000000fff [ 12.814955] x1 : ffffff8009c57000 x0 : 0000000000000000 [ 12.820363] [ 12.821907] Process pml (pid: 1499, stack limit = 0xffffffc1366cc020) [ 12.828421] Stack: (0xffffffc1366cf990 to 0xffffffc1366d0000) [ 12.834240] f980: ffffffc1366cf9e0 ffffff80086004d0 [ 12.842186] f9a0: ffffffc13ab08238 0000000000000010 ffffff80097c2218 ffffffc13a856810 [ 12.850131] f9c0: ffffff8009c57000 000000013a5ff000 0000000000000008 000000013a5ff000 [ 12.858076] f9e0: ffffffc1366cfa50 ffffff80085f9250 ffffffc13ab08238 0000000000000004 [ 12.866021] fa00: ffffffc13ab08000 ffffff80097b6000 ffffffc13ab08130 0000000000000001 [ 12.873966] fa20: 0000000000000008 ffffffc13a8568c8 0000000000000000 ffffffc1366cc000 [ 12.881911] fa40: ffffffc13ab08130 0000000000000001 ffffffc1366cfa90 ffffff80085e3de8 [ 12.889856] fa60: ffffffc13ab08238 0000000000000000 ffffffc136b75b00 0000000000000000 [ 12.897801] fa80: 0000000000000010 ffffff80089ccb92 ffffffc1366cfac0 ffffff80084ad040 [ 12.905746] faa0: ffffffc13a856810 0000000000000000 ffffff80084ad004 ffffff80084b91a8 [ 12.913691] fac0: ffffffc1366cfae0 ffffff80084b91b4 ffffffc13a856810 ffffff80080db5cc [ 12.921636] fae0: ffffffc1366cfb20 ffffff80084b96bc ffffffc13a856810 0000000000000010 [ 12.929581] fb00: ffffffc13a856870 0000000000000000 ffffffc13a856810 ffffff800984d2b8 [ 12.937526] fb20: ffffffc1366cfb50 ffffff80084baa70 ffffff8009932ad0 ffffff800984d260 [ 12.945471] fb40: 0000000000000010 00000002eff0a065 ffffffc1366cfbb0 ffffff80084bafbc [ 12.953415] fb60: 0000000000000010 0000000000000003 ffffff80098fe000 0000000000000000 [ 12.961360] fb80: ffffff80097b6000 ffffff80097b6dc8 ffffff80098c12b8 ffffff80098c12f8 [ 12.969306] fba0: ffffff8008842000 ffffff80097b6dc8 ffffffc1366cfbd0 ffffff80080e0d88 [ 12.977251] fbc0: 00000000fffffffb ffffff80080e10bc ffffffc1366cfc60 ffffff80080e16a8 [ 12.985196] fbe0: 0000000000000000 0000000000000003 ffffff80097b6000 ffffff80098fe9f0 [ 12.993140] fc00: ffffff80097d4000 ffffff8008983802 0000000000000123 0000000000000040 [ 13.001085] fc20: ffffff8008842000 ffffffc1366cc000 ffffff80089803c2 00000000ffffffff [ 13.009029] fc40: 0000000000000000 0000000000000000 ffffffc1366cfc60 0000000000040987 [ 13.016974] fc60: ffffffc1366cfcc0 ffffff80080dfd08 0000000000000003 0000000000000004 [ 13.024919] fc80: 0000000000000003 ffffff80098fea08 ffffffc136577ec0 ffffff80089803c2 [ 13.032864] fca0: 0000000000000123 0000000000000001 0000000500000002 0000000000040987 [ 13.040809] fcc0: ffffffc1366cfd00 ffffff80083a89d4 0000000000000004 ffffffc136577ec0 [ 13.048754] fce0: ffffffc136610cc0 ffffffffffffffea ffffffc1366cfeb0 ffffffc136610cd8 [ 13.056700] fd00: ffffffc1366cfd10 ffffff800822a614 ffffffc1366cfd40 ffffff80082295d4 [ 13.064645] fd20: 0000000000000004 ffffffc136577ec0 ffffffc136610cc0 0000000021670570 [ 13.072590] fd40: ffffffc1366cfd80 ffffff80081b5d10 ffffff80097b6000 ffffffc13aae4200 [ 13.080536] fd60: ffffffc1366cfeb0 0000000000000004 0000000021670570 0000000000000004 [ 13.088481] fd80: ffffffc1366cfe30 ffffff80081b6b20 ffffffc13aae4200 0000000000000000 [ 13.096427] fda0: 0000000000000004 0000000021670570 ffffffc1366cfeb0 ffffffc13a838200 [ 13.104371] fdc0: 0000000000000000 000000000000000a ffffff80097b6000 0000000000040987 [ 13.112316] fde0: ffffffc1366cfe20 ffffff80081b3af0 ffffffc13a838200 0000000000000000 [ 13.120261] fe00: ffffffc1366cfe30 ffffff80081b6b0c ffffffc13aae4200 0000000000000000 [ 13.128206] fe20: 0000000000000004 0000000000040987 ffffffc1366cfe70 ffffff80081b7dd8 [ 13.136151] fe40: ffffff80097b6000 ffffffc13aae4200 ffffffc13aae4200 fffffffffffffff7 [ 13.144096] fe60: 0000000021670570 ffffffc13a8c63c0 0000000000000000 ffffff8008083180 [ 13.152042] fe80: ffffffffffffff1d 0000000021670570 ffffffffffffffff 0000007f917ad9b8 [ 13.159986] fea0: 0000000020000000 0000000000000015 0000000000000000 0000000000040987 [ 13.167930] fec0: 0000000000000001 0000000021670570 0000000000000004 0000000000000000 [ 13.175874] fee0: 0000000000000888 0000440110000000 000000000000006d 0000000000000003 [ 13.183819] ff00: 0000000000000040 ffffff80ffffffc8 0000000000000000 0000000000000020 [ 13.191762] ff20: 0000000000000000 0000000000000000 0000000000000001 0000000000000000 [ 13.199707] ff40: 0000000000000000 0000007f917553e0 0000000000000000 0000000000000004 [ 13.207651] ff60: 0000000021670570 0000007f91835480 0000000000000004 0000007f91831638 [ 13.215595] ff80: 0000000000000004 00000000004b0de0 00000000004b0000 0000000000000000 [ 13.223539] ffa0: 0000000000000000 0000007fc92ac8c0 0000007f9175d178 0000007fc92ac8c0 [ 13.231483] ffc0: 0000007f917ad9b8 0000000020000000 0000000000000001 0000000000000040 [ 13.239427] ffe0: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 [ 13.247360] Call trace: [ 13.249866] Exception stack(0xffffffc1366cf7a0 to 0xffffffc1366cf8d0) [ 13.256386] f7a0: 0000000000001000 0000007fffffffff ffffffc1366cf990 ffffff80083c0df8 [ 13.264331] f7c0: 0000000060000145 ffffff80089b5001 ffffffc13ab08130 0000000000000001 [ 13.272275] f7e0: 0000000000000008 ffffffc13a8568c8 0000000000000000 0000000000000000 [ 13.280220] f800: ffffffc1366cf960 ffffffc1366cf960 ffffffc1366cf930 00000000ffffffd8 [ 13.288165] f820: ffffff8009931ac0 4554535953425553 4544006273753d4d 3831633d45434956 [ 13.296110] f840: ffff003832313a39 ffffff800845926c ffffffc1366cf880 0000000000040987 [ 13.304054] f860: 0000000000000000 ffffff8009c57000 0000000000000fff 000000013a5ff000 [ 13.311999] f880: 0000000000000000 ffffff800809797c ffffff80098febb1 6c2062756820746f [ 13.319944] f8a0: 65776f702074736f 0000000005f5e0ff ffffff80098feb7f 00000000fffffffe [ 13.327884] f8c0: 0000000000000030 ffffffffffffffff [ 13.332835] [] addr_in_gen_pool+0x4/0x48 [ 13.338398] [] xhci_mem_cleanup+0xc8/0x51c [ 13.344137] [] xhci_resume+0x308/0x65c [ 13.349524] [] xhci_brcm_resume+0x84/0x8c [ 13.355174] [] platform_pm_resume+0x3c/0x64 [ 13.360997] [] dpm_run_callback+0x5c/0x15c [ 13.366732] [] device_resume+0xc0/0x190 [ 13.372205] [] dpm_resume+0x144/0x2cc [ 13.377504] [] dpm_resume_end+0x20/0x34 [ 13.382980] [] suspend_devices_and_enter+0x104/0x704 [ 13.389585] [] pm_suspend+0x320/0x53c [ 13.394881] [] state_store+0xbc/0xe0 [ 13.400094] [] kobj_attr_store+0x14/0x24 [ 13.405655] [] sysfs_kf_write+0x60/0x70 [ 13.411128] [] kernfs_fop_write+0x130/0x194 [ 13.416954] [] __vfs_write+0x60/0x150 [ 13.422254] [] vfs_write+0xc8/0x164 [ 13.427376] [] SyS_write+0x70/0xc8 [ 13.432412] [] el0_svc_naked+0x34/0x38 [ 13.437800] Code: 92800173 97f6fb9e 17fffff5 d1000442 (f8408c03) [ 13.444033] ---[ end trace 2effe12f909ce205 ]--- The call path leading to this problem is xhci_mem_cleanup() -> dma_free_coherent() -> dma_free_from_pool() -> addr_in_gen_pool. If the atomic_pool is NULL, we can't possibly have the address in the atomic pool anyway, so guard against that. Signed-off-by: Florian Fainelli Signed-off-by: Christoph Hellwig --- kernel/dma/remap.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index 7a723194ecbe..0207e3764d52 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -158,6 +158,9 @@ out: bool dma_in_atomic_pool(void *start, size_t size) { + if (unlikely(!atomic_pool)) + return false; + return addr_in_gen_pool(atomic_pool, (unsigned long)start, size); } -- cgit v1.2.3 From 0f48b41f597e3b62b649abbf796e1e72901f9df3 Mon Sep 17 00:00:00 2001 From: Mathieu Malaterre Date: Fri, 24 May 2019 12:33:39 +0200 Subject: clocksource: Move inline keyword to the beginning of function declarations The inline keyword was not at the beginning of the function declarations. Fix the following warnings triggered when using W=1: kernel/time/clocksource.c:108:1: warning: 'inline' is not at beginning of declaration [-Wold-style-declaration] kernel/time/clocksource.c:113:1: warning: 'inline' is not at beginning of declaration [-Wold-style-declaration] Signed-off-by: Mathieu Malaterre Signed-off-by: Thomas Gleixner Cc: trivial@kernel.org Cc: kernel-janitors@vger.kernel.org Cc: John Stultz Cc: Stephen Boyd Link: https://lkml.kernel.org/r/20190524103339.28787-1-malat@debian.org --- kernel/time/clocksource.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 3bcc19ceb073..fff5f64981c6 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -105,12 +105,12 @@ static DEFINE_SPINLOCK(watchdog_lock); static int watchdog_running; static atomic_t watchdog_reset_pending; -static void inline clocksource_watchdog_lock(unsigned long *flags) +static inline void clocksource_watchdog_lock(unsigned long *flags) { spin_lock_irqsave(&watchdog_lock, *flags); } -static void inline clocksource_watchdog_unlock(unsigned long *flags) +static inline void clocksource_watchdog_unlock(unsigned long *flags) { spin_unlock_irqrestore(&watchdog_lock, *flags); } -- cgit v1.2.3 From 141e1ecda356bb0034027a9acb949e97a963ba16 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Sat, 25 May 2019 14:39:25 -0400 Subject: alarmtimer: Fix kerneldoc comment for alarmtimer_suspend() This brings the kernel doc in line with the function signature. Signed-off-by: Yangtao Li Signed-off-by: Thomas Gleixner Cc: john.stultz@linaro.org Cc: sboyd@kernel.org Link: https://lkml.kernel.org/r/20190525183925.18963-1-tiny.windzz@gmail.com --- kernel/time/alarmtimer.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 0519a8805aab..57518efc3810 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -233,7 +233,6 @@ EXPORT_SYMBOL_GPL(alarm_expires_remaining); /** * alarmtimer_suspend - Suspend time callback * @dev: unused - * @state: unused * * When we are going into suspend, we look through the bases * to see which is the soonest timer to expire. We then -- cgit v1.2.3 From 38cf3a687f5827fcfc81cbc433ef5822693a49c1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 14 Jun 2019 10:12:45 -0700 Subject: cgroup: Move cgroup_parse_float() implementation out of CONFIG_SYSFS a5e112e6424a ("cgroup: add cgroup_parse_float()") accidentally added cgroup_parse_float() inside CONFIG_SYSFS block. Move it outside so that it doesn't cause failures on !CONFIG_SYSFS builds. Signed-off-by: Tejun Heo Fixes: a5e112e6424a ("cgroup: add cgroup_parse_float()") --- kernel/cgroup/cgroup.c | 84 +++++++++++++++++++++++++------------------------- 1 file changed, 42 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 9e3dffb09489..f582414e15ba 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -6229,6 +6229,48 @@ struct cgroup *cgroup_get_from_fd(int fd) } EXPORT_SYMBOL_GPL(cgroup_get_from_fd); +static u64 power_of_ten(int power) +{ + u64 v = 1; + while (power--) + v *= 10; + return v; +} + +/** + * cgroup_parse_float - parse a floating number + * @input: input string + * @dec_shift: number of decimal digits to shift + * @v: output + * + * Parse a decimal floating point number in @input and store the result in + * @v with decimal point right shifted @dec_shift times. For example, if + * @input is "12.3456" and @dec_shift is 3, *@v will be set to 12345. + * Returns 0 on success, -errno otherwise. + * + * There's nothing cgroup specific about this function except that it's + * currently the only user. + */ +int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v) +{ + s64 whole, frac = 0; + int fstart = 0, fend = 0, flen; + + if (!sscanf(input, "%lld.%n%lld%n", &whole, &fstart, &frac, &fend)) + return -EINVAL; + if (frac < 0) + return -EINVAL; + + flen = fend > fstart ? fend - fstart : 0; + if (flen < dec_shift) + frac *= power_of_ten(dec_shift - flen); + else + frac = DIV_ROUND_CLOSEST_ULL(frac, power_of_ten(flen - dec_shift)); + + *v = whole * power_of_ten(dec_shift) + frac; + return 0; +} + /* * sock->sk_cgrp_data handling. For more info, see sock_cgroup_data * definition in cgroup-defs.h. @@ -6392,46 +6434,4 @@ static int __init cgroup_sysfs_init(void) } subsys_initcall(cgroup_sysfs_init); -static u64 power_of_ten(int power) -{ - u64 v = 1; - while (power--) - v *= 10; - return v; -} - -/** - * cgroup_parse_float - parse a floating number - * @input: input string - * @dec_shift: number of decimal digits to shift - * @v: output - * - * Parse a decimal floating point number in @input and store the result in - * @v with decimal point right shifted @dec_shift times. For example, if - * @input is "12.3456" and @dec_shift is 3, *@v will be set to 12345. - * Returns 0 on success, -errno otherwise. - * - * There's nothing cgroup specific about this function except that it's - * currently the only user. - */ -int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v) -{ - s64 whole, frac = 0; - int fstart = 0, fend = 0, flen; - - if (!sscanf(input, "%lld.%n%lld%n", &whole, &fstart, &frac, &fend)) - return -EINVAL; - if (frac < 0) - return -EINVAL; - - flen = fend > fstart ? fend - fstart : 0; - if (flen < dec_shift) - frac *= power_of_ten(dec_shift - flen); - else - frac = DIV_ROUND_CLOSEST_ULL(frac, power_of_ten(flen - dec_shift)); - - *v = whole * power_of_ten(dec_shift) + frac; - return 0; -} - #endif /* CONFIG_SYSFS */ -- cgit v1.2.3 From 99c8b231ae6c6ca4ca2fd1c0b3701071f589661f Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Wed, 12 Jun 2019 14:52:41 -0300 Subject: docs: cgroup-v1: convert docs to ReST and rename to *.rst Convert the cgroup-v1 files to ReST format, in order to allow a later addition to the admin-guide. The conversion is actually: - add blank lines and identation in order to identify paragraphs; - fix tables markups; - add some lists markups; - mark literal blocks; - adjust title markups. At its new index.rst, let's add a :orphan: while this is not linked to the main index.rst file, in order to avoid build warnings. Signed-off-by: Mauro Carvalho Chehab Acked-by: Tejun Heo Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 6a1942ed781c..fc6668f9db15 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -729,7 +729,7 @@ static inline int nr_cpusets(void) * load balancing domains (sched domains) as specified by that partial * partition. * - * See "What is sched_load_balance" in Documentation/cgroup-v1/cpusets.txt + * See "What is sched_load_balance" in Documentation/cgroup-v1/cpusets.rst * for a background explanation of this. * * Does not return errors, on the theory that the callers of this -- cgit v1.2.3 From d6a3b247627a3bc0551504eb305d624cc6fb5453 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Wed, 12 Jun 2019 14:53:03 -0300 Subject: docs: scheduler: convert docs to ReST and rename to *.rst In order to prepare to add them to the Kernel API book, convert the files to ReST format. The conversion is actually: - add blank lines and identation in order to identify paragraphs; - fix tables markups; - add some lists markups; - mark literal blocks; - adjust title markups. At its new index.rst, let's add a :orphan: while this is not linked to the main index.rst file, in order to avoid build warnings. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet --- kernel/sched/deadline.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 43901fa3f269..049d795ee9d3 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -726,7 +726,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se, * refill the runtime and set the deadline a period in the future, * because keeping the current (absolute) deadline of the task would * result in breaking guarantees promised to other tasks (refer to - * Documentation/scheduler/sched-deadline.txt for more information). + * Documentation/scheduler/sched-deadline.rst for more information). * * This function returns true if: * -- cgit v1.2.3 From 151f4e2bdc7a04020ae5c533896fb91a16e1f501 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 13 Jun 2019 07:10:36 -0300 Subject: docs: power: convert docs to ReST and rename to *.rst Convert the PM documents to ReST, in order to allow them to build with Sphinx. The conversion is actually: - add blank lines and indentation in order to identify paragraphs; - fix tables markups; - add some lists markups; - mark literal blocks; - adjust title markups. At its new index.rst, let's add a :orphan: while this is not linked to the main index.rst file, in order to avoid build warnings. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Bjorn Helgaas Acked-by: Mark Brown Acked-by: Srivatsa S. Bhat (VMware) --- kernel/power/Kconfig | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 9bbaaab14b36..7a4dda9e5309 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -65,7 +65,7 @@ config HIBERNATION need to run mkswap against the swap partition used for the suspend. It also works with swap files to a limited extent (for details see - ). + ). Right now you may boot without resuming and resume later but in the meantime you cannot use the swap partition(s)/file(s) involved in @@ -74,7 +74,7 @@ config HIBERNATION MOUNT any journaled filesystems mounted before the suspend or they will get corrupted in a nasty way. - For more information take a look at . + For more information take a look at . config ARCH_SAVE_PAGE_KEYS bool @@ -255,7 +255,7 @@ config APM_EMULATION notification of APM "events" (e.g. battery status change). In order to use APM, you will need supporting software. For location - and more information, read + and more information, read and the Battery Powered Linux mini-HOWTO, available from . -- cgit v1.2.3 From 38f2c691a4b3e89d476f8e8350d1ca299974b89d Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Fri, 17 May 2019 12:50:42 +0200 Subject: s390: improve wait logic of stop_machine The stop_machine loop to advance the state machine and to wait for all affected CPUs to check-in calls cpu_relax_yield in a tight loop until the last missing CPUs acknowledged the state transition. On a virtual system where not all logical CPUs are backed by real CPUs all the time it can take a while for all CPUs to check-in. With the current definition of cpu_relax_yield a diagnose 0x44 is done which tells the hypervisor to schedule *some* other CPU. That can be any CPU and not necessarily one of the CPUs that need to run in order to advance the state machine. This can lead to a pretty bad diagnose 0x44 storm until the last missing CPU finally checked-in. Replace the undirected cpu_relax_yield based on diagnose 0x44 with a directed yield. Each CPU in the wait loop will pick up the next CPU in the cpumask of stop_machine. The diagnose 0x9c is used to tell the hypervisor to run this next CPU instead of the current one. If there is only a limited number of real CPUs backing the virtual CPUs we end up with the real CPUs passed around in a round-robin fashion. [heiko.carstens@de.ibm.com]: Use cpumask_next_wrap as suggested by Peter Zijlstra. Signed-off-by: Martin Schwidefsky Acked-by: Peter Zijlstra (Intel) Acked-by: Thomas Gleixner Signed-off-by: Heiko Carstens --- kernel/stop_machine.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 2b5a6754646f..b8b0c5ff8da9 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -183,6 +183,7 @@ static int multi_cpu_stop(void *data) struct multi_stop_data *msdata = data; enum multi_stop_state curstate = MULTI_STOP_NONE; int cpu = smp_processor_id(), err = 0; + const struct cpumask *cpumask; unsigned long flags; bool is_active; @@ -192,15 +193,18 @@ static int multi_cpu_stop(void *data) */ local_save_flags(flags); - if (!msdata->active_cpus) - is_active = cpu == cpumask_first(cpu_online_mask); - else - is_active = cpumask_test_cpu(cpu, msdata->active_cpus); + if (!msdata->active_cpus) { + cpumask = cpu_online_mask; + is_active = cpu == cpumask_first(cpumask); + } else { + cpumask = msdata->active_cpus; + is_active = cpumask_test_cpu(cpu, cpumask); + } /* Simple state machine */ do { /* Chill out and ensure we re-read multi_stop_state. */ - cpu_relax_yield(); + cpu_relax_yield(cpumask); if (msdata->state != curstate) { curstate = msdata->state; switch (curstate) { -- cgit v1.2.3 From 4ecf0a43e729a7e641d800c294faabe87378fc05 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Sat, 8 Jun 2019 12:13:57 +0200 Subject: processor: get rid of cpu_relax_yield stop_machine is the only user left of cpu_relax_yield. Given that it now has special semantics which are tied to stop_machine introduce a weak stop_machine_yield function which architectures can override, and get rid of the generic cpu_relax_yield implementation. Acked-by: Peter Zijlstra (Intel) Acked-by: Thomas Gleixner Signed-off-by: Heiko Carstens --- kernel/stop_machine.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index b8b0c5ff8da9..b4f83f7bdf86 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -177,6 +177,11 @@ static void ack_state(struct multi_stop_data *msdata) set_state(msdata, msdata->state + 1); } +void __weak stop_machine_yield(const struct cpumask *cpumask) +{ + cpu_relax(); +} + /* This is the cpu_stop function which stops the CPU. */ static int multi_cpu_stop(void *data) { @@ -204,7 +209,7 @@ static int multi_cpu_stop(void *data) /* Simple state machine */ do { /* Chill out and ensure we re-read multi_stop_state. */ - cpu_relax_yield(cpumask); + stop_machine_yield(cpumask); if (msdata->state != curstate) { curstate = msdata->state; switch (curstate) { -- cgit v1.2.3 From e1aacb3f4adc1bcd4273a1d538a2dc3e50f1cbb5 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Wed, 12 Jun 2019 11:57:26 +0200 Subject: jump_label: Add a jump_label_can_update() helper Move the check if a jump_entry is valid to a function. No functional change. Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Chris von Recklinghausen Cc: Clark Williams Cc: Greg Kroah-Hartman Cc: H. Peter Anvin Cc: Jason Baron Cc: Jiri Kosina Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Marcelo Tosatti Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Scott Wood Cc: Steven Rostedt (VMware) Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/56b69bd3f8e644ed64f2dbde7c088030b8cbe76b.1560325897.git.bristot@redhat.com Signed-off-by: Ingo Molnar --- kernel/jump_label.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 0bfa10f4410c..24f0d3b1526b 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -384,23 +384,30 @@ static enum jump_label_type jump_label_type(struct jump_entry *entry) return enabled ^ branch; } +static bool jump_label_can_update(struct jump_entry *entry, bool init) +{ + /* + * Cannot update code that was in an init text area. + */ + if (!init && jump_entry_is_init(entry)) + return false; + + if (!kernel_text_address(jump_entry_code(entry))) { + WARN_ONCE(1, "can't patch jump_label at %pS", (void *)jump_entry_code(entry)); + return false; + } + + return true; +} + static void __jump_label_update(struct static_key *key, struct jump_entry *entry, struct jump_entry *stop, bool init) { for (; (entry < stop) && (jump_entry_key(entry) == key); entry++) { - /* - * An entry->code of 0 indicates an entry which has been - * disabled because it was in an init text area. - */ - if (init || !jump_entry_is_init(entry)) { - if (kernel_text_address(jump_entry_code(entry))) - arch_jump_label_transform(entry, jump_label_type(entry)); - else - WARN_ONCE(1, "can't patch jump_label at %pS", - (void *)jump_entry_code(entry)); - } + if (jump_label_can_update(entry, init)) + arch_jump_label_transform(entry, jump_label_type(entry)); } } -- cgit v1.2.3 From 0f133021bd82548a33580bfb7b055e8857f46c2a Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Wed, 12 Jun 2019 11:57:28 +0200 Subject: jump_label: Sort entries of the same key by the code In the batching mode, all the entries of a given key are updated at once. During the update of a key, a hit in the int3 handler will check if the hitting code address belongs to one of these keys. To optimize the search of a given code in the vector of entries being updated, a binary search is used. The binary search relies on the order of the entries of a key by its code. Hence the keys need to be sorted by the code too, so sort the entries of a given key by the code. Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Chris von Recklinghausen Cc: Clark Williams Cc: Greg Kroah-Hartman Cc: H. Peter Anvin Cc: Jason Baron Cc: Jiri Kosina Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Marcelo Tosatti Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Scott Wood Cc: Steven Rostedt (VMware) Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/f57ae83e0592418ba269866bb7ade570fc8632e0.1560325897.git.bristot@redhat.com Signed-off-by: Ingo Molnar --- kernel/jump_label.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'kernel') diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 24f0d3b1526b..ca00ac10d9b9 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -37,12 +37,26 @@ static int jump_label_cmp(const void *a, const void *b) const struct jump_entry *jea = a; const struct jump_entry *jeb = b; + /* + * Entrires are sorted by key. + */ if (jump_entry_key(jea) < jump_entry_key(jeb)) return -1; if (jump_entry_key(jea) > jump_entry_key(jeb)) return 1; + /* + * In the batching mode, entries should also be sorted by the code + * inside the already sorted list of entries, enabling a bsearch in + * the vector. + */ + if (jump_entry_code(jea) < jump_entry_code(jeb)) + return -1; + + if (jump_entry_code(jea) > jump_entry_code(jeb)) + return 1; + return 0; } -- cgit v1.2.3 From c2ba8a15f310d915f8748dd8324c91c82b12b5ff Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Wed, 12 Jun 2019 11:57:30 +0200 Subject: jump_label: Batch updates if arch supports it If the architecture supports the batching of jump label updates, use it! An easy way to see the benefits of this patch is switching the schedstats on and off. For instance: -------------------------- %< ---------------------------- #!/bin/sh while [ true ]; do sysctl -w kernel.sched_schedstats=1 sleep 2 sysctl -w kernel.sched_schedstats=0 sleep 2 done -------------------------- >% ---------------------------- while watching the IPI count: -------------------------- %< ---------------------------- # watch -n1 "cat /proc/interrupts | grep Function" -------------------------- >% ---------------------------- With the current mode, it is possible to see +- 168 IPIs each 2 seconds, while with this patch the number of IPIs goes to 3 each 2 seconds. Regarding the performance impact of this patch set, I made two measurements: The time to update a key (the task that is causing the change) The time to run the int3 handler (the side effect on a thread that hits the code being changed) The schedstats static key was chosen as the key to being switched on and off. The reason being is that it is used in more than 56 places, in a hot path. The change in the schedstats static key will be done with the following command: while [ true ]; do sysctl -w kernel.sched_schedstats=1 usleep 500000 sysctl -w kernel.sched_schedstats=0 usleep 500000 done In this way, they key will be updated twice per second. To force the hit of the int3 handler, the system will also run a kernel compilation with two jobs per CPU. The test machine is a two nodes/24 CPUs box with an Intel Xeon processor @2.27GHz. Regarding the update part, on average, the regular kernel takes 57 ms to update the schedstats key, while the kernel with the batch updates takes just 1.4 ms on average. Although it seems to be too good to be true, it makes sense: the schedstats key is used in 56 places, so it was expected that it would take around 56 times to update the keys with the current implementation, as the IPIs are the most expensive part of the update. Regarding the int3 handler, the non-batch handler takes 45 ns on average, while the batch version takes around 180 ns. At first glance, it seems to be a high value. But it is not, considering that it is doing 56 updates, rather than one! It is taking four times more, only. This gain is possible because the patch uses a binary search in the vector: log2(56)=5.8. So, it was expected to have an overhead within four times. (voice of tv propaganda) But, that is not all! As the int3 handler keeps on for a shorter period (because the update part is on for a shorter time), the number of hits in the int3 handler decreased by 10%. The question then is: Is it worth paying the price of "135 ns" more in the int3 handler? Considering that, in this test case, we are saving the handling of 53 IPIs, that takes more than these 135 ns, it seems to be a meager price to be paid. Moreover, the test case was forcing the hit of the int3, in practice, it does not take that often. While the IPI takes place on all CPUs, hitting the int3 handler or not! For instance, in an isolated CPU with a process running in user-space (nohz_full use-case), the chances of hitting the int3 handler is barely zero, while there is no way to avoid the IPIs. By bounding the IPIs, we are improving a lot this scenario. Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Chris von Recklinghausen Cc: Clark Williams Cc: Greg Kroah-Hartman Cc: H. Peter Anvin Cc: Jason Baron Cc: Jiri Kosina Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Marcelo Tosatti Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Scott Wood Cc: Steven Rostedt (VMware) Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/acc891dbc2dbc9fd616dd680529a2337b1d1274c.1560325897.git.bristot@redhat.com Signed-off-by: Ingo Molnar --- kernel/jump_label.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'kernel') diff --git a/kernel/jump_label.c b/kernel/jump_label.c index ca00ac10d9b9..df3008419a1d 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -414,6 +414,7 @@ static bool jump_label_can_update(struct jump_entry *entry, bool init) return true; } +#ifndef HAVE_JUMP_LABEL_BATCH static void __jump_label_update(struct static_key *key, struct jump_entry *entry, struct jump_entry *stop, @@ -424,6 +425,28 @@ static void __jump_label_update(struct static_key *key, arch_jump_label_transform(entry, jump_label_type(entry)); } } +#else +static void __jump_label_update(struct static_key *key, + struct jump_entry *entry, + struct jump_entry *stop, + bool init) +{ + for (; (entry < stop) && (jump_entry_key(entry) == key); entry++) { + + if (!jump_label_can_update(entry, init)) + continue; + + if (!arch_jump_label_transform_queue(entry, jump_label_type(entry))) { + /* + * Queue is full: Apply the current queue and try again. + */ + arch_jump_label_transform_apply(); + BUG_ON(!arch_jump_label_transform_queue(entry, jump_label_type(entry))); + } + } + arch_jump_label_transform_apply(); +} +#endif void __init jump_label_init(void) { -- cgit v1.2.3 From dd471efe345bf6f9e1206f6c629ca3e80eb43523 Mon Sep 17 00:00:00 2001 From: Kobe Wu Date: Thu, 30 May 2019 19:59:35 +0800 Subject: locking/lockdep: Remove unnecessary DEBUG_LOCKS_WARN_ON() DEBUG_LOCKS_WARN_ON() will turn off debug_locks and makes print_unlock_imbalance_bug() return directly. Remove a redundant whitespace. Signed-off-by: Kobe Wu Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Cc: Eason Lin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Link: https://lkml.kernel.org/r/1559217575-30298-1-git-send-email-kobe-cp.wu@mediatek.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 48a840adb281..5e368f485330 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -4160,7 +4160,7 @@ __lock_release(struct lockdep_map *lock, unsigned long ip) * So we're all set to release this lock.. wait what lock? We don't * own any locks, you've been drinking again? */ - if (DEBUG_LOCKS_WARN_ON(depth <= 0)) { + if (depth <= 0) { print_unlock_imbalance_bug(curr, lock, ip); return 0; } -- cgit v1.2.3 From e3b929b0a184edb35531153c5afcaebb09014f9d Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Mon, 3 Jun 2019 17:13:38 +0800 Subject: sched/core: Add __sched tag for io_schedule() Non-inline io_schedule() was introduced in: commit 10ab56434f2f ("sched/core: Separate out io_schedule_prepare() and io_schedule_finish()") Keep in line with io_schedule_timeout(), otherwise "/proc//wchan" will report io_schedule() rather than its callers when waiting for IO. Reported-by: Jilong Kou Signed-off-by: Gao Xiang Signed-off-by: Peter Zijlstra (Intel) Acked-by: Tejun Heo Cc: Andrew Morton Cc: Linus Torvalds Cc: Miao Xie Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 10ab56434f2f ("sched/core: Separate out io_schedule_prepare() and io_schedule_finish()") Link: https://lkml.kernel.org/r/20190603091338.2695-1-gaoxiang25@huawei.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 29984d8c41f0..cd047927f707 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5122,7 +5122,7 @@ long __sched io_schedule_timeout(long timeout) } EXPORT_SYMBOL(io_schedule_timeout); -void io_schedule(void) +void __sched io_schedule(void) { int token; -- cgit v1.2.3 From b0c792244138d3ef099e7fce978675dc4acae570 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Mon, 3 Jun 2019 12:54:24 +0100 Subject: sched/fair: Clean up definition of NOHZ blocked load functions cfs_rq_has_blocked() and others_have_blocked() are only used within update_blocked_averages(). The !CONFIG_FAIR_GROUP_SCHED version of the latter calls them within a #define CONFIG_NO_HZ_COMMON block, whereas the CONFIG_FAIR_GROUP_SCHED one calls them unconditionnally. As reported by Qian, the above leads to this warning in !CONFIG_NO_HZ_COMMON configs: kernel/sched/fair.c: In function 'update_blocked_averages': kernel/sched/fair.c:7750:7: warning: variable 'done' set but not used [-Wunused-but-set-variable] It wouldn't be wrong to keep cfs_rq_has_blocked() and others_have_blocked() as they are, but since their only current use is to figure out when we can stop calling update_blocked_averages() on fully decayed NOHZ idle CPUs, we can give them a new definition for !CONFIG_NO_HZ_COMMON. Change the definition of cfs_rq_has_blocked() and others_have_blocked() for !CONFIG_NO_HZ_COMMON so that the NOHZ-specific blocks of update_blocked_averages() become no-ops and the 'done' variable gets optimised out. While at it, remove the CONFIG_NO_HZ_COMMON block from the !CONFIG_FAIR_GROUP_SCHED definition of update_blocked_averages() by using the newly-introduced update_blocked_load_status() helper. No change in functionality intended. [ Additions by Peter Zijlstra. ] Reported-by: Qian Cai Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Acked-by: Vincent Guittot Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20190603115424.7951-1-valentin.schneider@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7f8d477f90fe..4c8f45ed093c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7393,6 +7393,7 @@ static void attach_tasks(struct lb_env *env) rq_unlock(env->dst_rq, &rf); } +#ifdef CONFIG_NO_HZ_COMMON static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { if (cfs_rq->avg.load_avg) @@ -7420,6 +7421,19 @@ static inline bool others_have_blocked(struct rq *rq) return false; } +static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) +{ + rq->last_blocked_load_update_tick = jiffies; + + if (!has_blocked) + rq->has_blocked_load = 0; +} +#else +static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; } +static inline bool others_have_blocked(struct rq *rq) { return false; } +static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {} +#endif + #ifdef CONFIG_FAIR_GROUP_SCHED static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) @@ -7485,11 +7499,7 @@ static void update_blocked_averages(int cpu) if (others_have_blocked(rq)) done = false; -#ifdef CONFIG_NO_HZ_COMMON - rq->last_blocked_load_update_tick = jiffies; - if (done) - rq->has_blocked_load = 0; -#endif + update_blocked_load_status(rq, !done); rq_unlock_irqrestore(rq, &rf); } @@ -7555,11 +7565,7 @@ static inline void update_blocked_averages(int cpu) update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class); update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class); update_irq_load_avg(rq, 0); -#ifdef CONFIG_NO_HZ_COMMON - rq->last_blocked_load_update_tick = jiffies; - if (!cfs_rq_has_blocked(cfs_rq) && !others_have_blocked(rq)) - rq->has_blocked_load = 0; -#endif + update_blocked_load_status(rq, cfs_rq_has_blocked(cfs_rq) || others_have_blocked(rq)); rq_unlock_irqrestore(rq, &rf); } -- cgit v1.2.3 From 509466b7d480bc5d22e90b9fbe6122ae0e2fbe39 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Mon, 3 Jun 2019 17:11:44 -0400 Subject: sched/fair: Fix "runnable_avg_yN_inv" not used warnings runnable_avg_yN_inv[] is only used in kernel/sched/pelt.c but was included in several other places because they need other macros all came from kernel/sched/sched-pelt.h which was generated by Documentation/scheduler/sched-pelt. As the result, it causes compilation a lot of warnings, kernel/sched/sched-pelt.h:4:18: warning: 'runnable_avg_yN_inv' defined but not used [-Wunused-const-variable=] kernel/sched/sched-pelt.h:4:18: warning: 'runnable_avg_yN_inv' defined but not used [-Wunused-const-variable=] kernel/sched/sched-pelt.h:4:18: warning: 'runnable_avg_yN_inv' defined but not used [-Wunused-const-variable=] ... Silence it by appending the __maybe_unused attribute for it, so all generated variables and macros can still be kept in the same file. Signed-off-by: Qian Cai Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/1559596304-31581-1-git-send-email-cai@lca.pw Signed-off-by: Ingo Molnar --- kernel/sched/sched-pelt.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/sched-pelt.h b/kernel/sched/sched-pelt.h index a26473674fb7..c529706bed11 100644 --- a/kernel/sched/sched-pelt.h +++ b/kernel/sched/sched-pelt.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* Generated by Documentation/scheduler/sched-pelt; do not modify. */ -static const u32 runnable_avg_yN_inv[] = { +static const u32 runnable_avg_yN_inv[] __maybe_unused = { 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6, 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85, 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581, -- cgit v1.2.3 From aacedf26fb7601222f2452cf0a54cab4fee160c5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 7 Jun 2019 15:39:49 +0200 Subject: sched/core: Optimize try_to_wake_up() for local wakeups Jens reported that significant performance can be had on some block workloads by special casing local wakeups. That is, wakeups on the current task before it schedules out. Given something like the normal wait pattern: for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); if (cond) break; schedule(); } __set_current_state(TASK_RUNNING); Any wakeup (on this CPU) after set_current_state() and before schedule() would benefit from this. Normal wakeups take p->pi_lock, which serializes wakeups to the same task. By eliding that we gain concurrency on: - ttwu_stat(); we already had concurrency on rq stats, this now also brings it to task stats. -ENOCARE - tracepoints; it is now possible to get multiple instances of trace_sched_waking() (and possibly trace_sched_wakeup()) for the same task. Tracers will have to learn to cope. Furthermore, p->pi_lock is used by set_special_state(), to order against TASK_RUNNING stores from other CPUs. But since this is strictly CPU local, we don't need the lock, and set_special_state()'s disabling of IRQs is sufficient. After the normal wakeup takes p->pi_lock it issues smp_mb__after_spinlock(), in order to ensure the woken task must observe prior stores before we observe the p->state. If this is CPU local, this will be satisfied with a compiler barrier, and we rely on try_to_wake_up() being a funcation call, which implies such. Since, when 'p == current', 'p->on_rq' must be true, the normal wakeup would continue into the ttwu_remote() branch, which normally is concerned with exactly this wakeup scenario, except from a remote CPU. IOW we're waking a task that is still running. In this case, we can trivially avoid taking rq->lock, all that's left from this is to set p->state. This then yields an extremely simple and fast path for 'p == current'. Reported-by: Jens Axboe Tested-by: Jens Axboe Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Qian Cai Cc: Thomas Gleixner Cc: akpm@linux-foundation.org Cc: gkohli@codeaurora.org Cc: hch@lst.de Cc: oleg@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 34 +++++++++++++++++++++++++++++----- 1 file changed, 29 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index cd047927f707..83bd6bb32a34 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1991,6 +1991,29 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) unsigned long flags; int cpu, success = 0; + if (p == current) { + /* + * We're waking current, this means 'p->on_rq' and 'task_cpu(p) + * == smp_processor_id()'. Together this means we can special + * case the whole 'p->on_rq && ttwu_remote()' case below + * without taking any locks. + * + * In particular: + * - we rely on Program-Order guarantees for all the ordering, + * - we're serialized against set_special_state() by virtue of + * it disabling IRQs (this allows not taking ->pi_lock). + */ + if (!(p->state & state)) + return false; + + success = 1; + cpu = task_cpu(p); + trace_sched_waking(p); + p->state = TASK_RUNNING; + trace_sched_wakeup(p); + goto out; + } + /* * If we are going to wake up a thread waiting for CONDITION we * need to ensure that CONDITION=1 done by the caller can not be @@ -2000,7 +2023,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) raw_spin_lock_irqsave(&p->pi_lock, flags); smp_mb__after_spinlock(); if (!(p->state & state)) - goto out; + goto unlock; trace_sched_waking(p); @@ -2030,7 +2053,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) */ smp_rmb(); if (p->on_rq && ttwu_remote(p, wake_flags)) - goto stat; + goto unlock; #ifdef CONFIG_SMP /* @@ -2090,10 +2113,11 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) #endif /* CONFIG_SMP */ ttwu_queue(p, cpu, wake_flags); -stat: - ttwu_stat(p, cpu, wake_flags); -out: +unlock: raw_spin_unlock_irqrestore(&p->pi_lock, flags); +out: + if (success) + ttwu_stat(p, cpu, wake_flags); return success; } -- cgit v1.2.3 From 66567fcbaecac455caa1b13643155d686b51ce63 Mon Sep 17 00:00:00 2001 From: "bsegall@google.com" Date: Thu, 6 Jun 2019 10:21:01 -0700 Subject: sched/fair: Don't push cfs_bandwith slack timers forward When a cfs_rq sleeps and returns its quota, we delay for 5ms before waking any throttled cfs_rqs to coalesce with other cfs_rqs going to sleep, as this has to be done outside of the rq lock we hold. The current code waits for 5ms without any sleeps, instead of waiting for 5ms from the first sleep, which can delay the unthrottle more than we want. Switch this around so that we can't push this forward forever. This requires an extra flag rather than using hrtimer_active, since we need to start a new timer if the current one is in the process of finishing. Signed-off-by: Ben Segall Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Xunlei Pang Acked-by: Phil Auld Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/xm26a7euy6iq.fsf_-_@bsegall-linux.svl.corp.google.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 7 +++++++ kernel/sched/sched.h | 8 ++++---- 2 files changed, 11 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4c8f45ed093c..3c11dcdedcbc 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4729,6 +4729,11 @@ static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b) if (runtime_refresh_within(cfs_b, min_left)) return; + /* don't push forwards an existing deferred unthrottle */ + if (cfs_b->slack_started) + return; + cfs_b->slack_started = true; + hrtimer_start(&cfs_b->slack_timer, ns_to_ktime(cfs_bandwidth_slack_period), HRTIMER_MODE_REL); @@ -4782,6 +4787,7 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) /* confirm we're still not at a refresh boundary */ raw_spin_lock_irqsave(&cfs_b->lock, flags); + cfs_b->slack_started = false; if (cfs_b->distribute_running) { raw_spin_unlock_irqrestore(&cfs_b->lock, flags); return; @@ -4945,6 +4951,7 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); cfs_b->slack_timer.function = sched_cfs_slack_timer; cfs_b->distribute_running = 0; + cfs_b->slack_started = false; } static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 607859a18b2a..b08dee29ef5e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -338,8 +338,10 @@ struct cfs_bandwidth { u64 runtime_expires; int expires_seq; - short idle; - short period_active; + u8 idle; + u8 period_active; + u8 distribute_running; + u8 slack_started; struct hrtimer period_timer; struct hrtimer slack_timer; struct list_head throttled_cfs_rq; @@ -348,8 +350,6 @@ struct cfs_bandwidth { int nr_periods; int nr_throttled; u64 throttled_time; - - bool distribute_running; #endif }; -- cgit v1.2.3 From c71fd893f614f205dbc050d60299cc5496491c19 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 20 May 2019 16:59:00 -0400 Subject: locking/rwsem: Make owner available even if !CONFIG_RWSEM_SPIN_ON_OWNER The owner field in the rw_semaphore structure is used primarily for optimistic spinning. However, identifying the rwsem owner can also be helpful in debugging as well as tracing locking related issues when analyzing crash dump. The owner field may also store state information that can be important to the operation of the rwsem. So the owner field is now made a permanent member of the rw_semaphore structure irrespective of CONFIG_RWSEM_SPIN_ON_OWNER. Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Cc: huang ying Link: https://lkml.kernel.org/r/20190520205918.22251-2-longman@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/rwsem-xadd.c | 2 +- kernel/locking/rwsem.h | 23 ----------------------- 2 files changed, 1 insertion(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 0b1f77957240..c0500679fd2f 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -86,8 +86,8 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name, atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE); raw_spin_lock_init(&sem->wait_lock); INIT_LIST_HEAD(&sem->wait_list); -#ifdef CONFIG_RWSEM_SPIN_ON_OWNER sem->owner = NULL; +#ifdef CONFIG_RWSEM_SPIN_ON_OWNER osq_lock_init(&sem->osq); #endif } diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h index 64877f5294e3..eb9c8534299b 100644 --- a/kernel/locking/rwsem.h +++ b/kernel/locking/rwsem.h @@ -61,7 +61,6 @@ #define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS #define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) -#ifdef CONFIG_RWSEM_SPIN_ON_OWNER /* * All writes to owner are protected by WRITE_ONCE() to make sure that * store tearing can't happen as optimistic spinners may read and use @@ -126,7 +125,6 @@ static inline bool rwsem_has_anonymous_owner(struct task_struct *owner) * real owner or one of the real owners. The only exception is when the * unlock is done by up_read_non_owner(). */ -#define rwsem_clear_reader_owned rwsem_clear_reader_owned static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem) { unsigned long val = (unsigned long)current | RWSEM_READER_OWNED @@ -135,28 +133,7 @@ static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem) cmpxchg_relaxed((unsigned long *)&sem->owner, val, RWSEM_READER_OWNED | RWSEM_ANONYMOUSLY_OWNED); } -#endif - #else -static inline void rwsem_set_owner(struct rw_semaphore *sem) -{ -} - -static inline void rwsem_clear_owner(struct rw_semaphore *sem) -{ -} - -static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem, - struct task_struct *owner) -{ -} - -static inline void rwsem_set_reader_owned(struct rw_semaphore *sem) -{ -} -#endif - -#ifndef rwsem_clear_reader_owned static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem) { } -- cgit v1.2.3 From 5c1ec49b60cdb31e51010f8a647f3189b774bddf Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 20 May 2019 16:59:01 -0400 Subject: locking/rwsem: Remove rwsem_wake() wakeup optimization After the following commit: 59aabfc7e959 ("locking/rwsem: Reduce spinlock contention in wakeup after up_read()/up_write()") the rwsem_wake() forgoes doing a wakeup if the wait_lock cannot be directly acquired and an optimistic spinning locker is present. This can help performance by avoiding spinning on the wait_lock when it is contended. With the later commit: 133e89ef5ef3 ("locking/rwsem: Enable lockless waiter wakeup(s)") the performance advantage of the above optimization diminishes as the average wait_lock hold time become much shorter. With a later patch that supports rwsem lock handoff, we can no longer relies on the fact that the presence of an optimistic spinning locker will ensure that the lock will be acquired by a task soon and rwsem_wake() will be called later on to wake up waiters. This can lead to missed wakeup and application hang. So the original 59aabfc7e959 commit has to be reverted. Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Cc: huang ying Link: https://lkml.kernel.org/r/20190520205918.22251-3-longman@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/rwsem-xadd.c | 72 --------------------------------------------- 1 file changed, 72 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index c0500679fd2f..3083fdf50447 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -411,25 +411,11 @@ done: lockevent_cond_inc(rwsem_opt_fail, !taken); return taken; } - -/* - * Return true if the rwsem has active spinner - */ -static inline bool rwsem_has_spinner(struct rw_semaphore *sem) -{ - return osq_is_locked(&sem->osq); -} - #else static bool rwsem_optimistic_spin(struct rw_semaphore *sem) { return false; } - -static inline bool rwsem_has_spinner(struct rw_semaphore *sem) -{ - return false; -} #endif /* @@ -651,65 +637,7 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) unsigned long flags; DEFINE_WAKE_Q(wake_q); - /* - * __rwsem_down_write_failed_common(sem) - * rwsem_optimistic_spin(sem) - * osq_unlock(sem->osq) - * ... - * atomic_long_add_return(&sem->count) - * - * - VS - - * - * __up_write() - * if (atomic_long_sub_return_release(&sem->count) < 0) - * rwsem_wake(sem) - * osq_is_locked(&sem->osq) - * - * And __up_write() must observe !osq_is_locked() when it observes the - * atomic_long_add_return() in order to not miss a wakeup. - * - * This boils down to: - * - * [S.rel] X = 1 [RmW] r0 = (Y += 0) - * MB RMB - * [RmW] Y += 1 [L] r1 = X - * - * exists (r0=1 /\ r1=0) - */ - smp_rmb(); - - /* - * If a spinner is present, it is not necessary to do the wakeup. - * Try to do wakeup only if the trylock succeeds to minimize - * spinlock contention which may introduce too much delay in the - * unlock operation. - * - * spinning writer up_write/up_read caller - * --------------- ----------------------- - * [S] osq_unlock() [L] osq - * MB RMB - * [RmW] rwsem_try_write_lock() [RmW] spin_trylock(wait_lock) - * - * Here, it is important to make sure that there won't be a missed - * wakeup while the rwsem is free and the only spinning writer goes - * to sleep without taking the rwsem. Even when the spinning writer - * is just going to break out of the waiting loop, it will still do - * a trylock in rwsem_down_write_failed() before sleeping. IOW, if - * rwsem_has_spinner() is true, it will guarantee at least one - * trylock attempt on the rwsem later on. - */ - if (rwsem_has_spinner(sem)) { - /* - * The smp_rmb() here is to make sure that the spinner - * state is consulted before reading the wait_lock. - */ - smp_rmb(); - if (!raw_spin_trylock_irqsave(&sem->wait_lock, flags)) - return sem; - goto locked; - } raw_spin_lock_irqsave(&sem->wait_lock, flags); -locked: if (!list_empty(&sem->wait_list)) __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); -- cgit v1.2.3 From 64489e78004cb5623211c75790cac90bd25ff5e9 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 20 May 2019 16:59:02 -0400 Subject: locking/rwsem: Implement a new locking scheme The current way of using various reader, writer and waiting biases in the rwsem code are confusing and hard to understand. I have to reread the rwsem count guide in the rwsem-xadd.c file from time to time to remind myself how this whole thing works. It also makes the rwsem code harder to be optimized. To make rwsem more sane, a new locking scheme similar to the one in qrwlock is now being used. The atomic long count has the following bit definitions: Bit 0 - writer locked bit Bit 1 - waiters present bit Bits 2-7 - reserved for future extension Bits 8-X - reader count (24/56 bits) The cmpxchg instruction is now used to acquire the write lock. The read lock is still acquired with xadd instruction, so there is no change here. This scheme will allow up to 16M/64P active readers which should be more than enough. We can always use some more reserved bits if necessary. With that change, we can deterministically know if a rwsem has been write-locked. Looking at the count alone, however, one cannot determine for certain if a rwsem is owned by readers or not as the readers that set the reader count bits may be in the process of backing out. So we still need the reader-owned bit in the owner field to be sure. With a locking microbenchmark running on 5.1 based kernel, the total locking rates (in kops/s) of the benchmark on a 8-socket 120-core IvyBridge-EX system before and after the patch were as follows: Before Patch After Patch # of Threads wlock rlock wlock rlock ------------ ----- ----- ----- ----- 1 30,659 31,341 31,055 31,283 2 8,909 16,457 9,884 17,659 4 9,028 15,823 8,933 20,233 8 8,410 14,212 7,230 17,140 16 8,217 25,240 7,479 24,607 The locking rates of the benchmark on a Power8 system were as follows: Before Patch After Patch # of Threads wlock rlock wlock rlock ------------ ----- ----- ----- ----- 1 12,963 13,647 13,275 13,601 2 7,570 11,569 7,902 10,829 4 5,232 5,516 5,466 5,435 8 5,233 3,386 5,467 3,168 The locking rates of the benchmark on a 2-socket ARM64 system were as follows: Before Patch After Patch # of Threads wlock rlock wlock rlock ------------ ----- ----- ----- ----- 1 21,495 21,046 21,524 21,074 2 5,293 10,502 5,333 10,504 4 5,325 11,463 5,358 11,631 8 5,391 11,712 5,470 11,680 The performance are roughly the same before and after the patch. There are run-to-run variations in performance. Runs with higher variances usually have higher throughput. Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Cc: huang ying Link: https://lkml.kernel.org/r/20190520205918.22251-4-longman@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/rwsem-xadd.c | 147 +++++++++++++++----------------------------- kernel/locking/rwsem.h | 74 +++++++++++----------- 2 files changed, 85 insertions(+), 136 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 3083fdf50447..7d537b50a849 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -9,6 +9,8 @@ * * Optimistic spinning by Tim Chen * and Davidlohr Bueso . Based on mutexes. + * + * Rwsem count bit fields re-definition by Waiman Long . */ #include #include @@ -22,52 +24,20 @@ #include "rwsem.h" /* - * Guide to the rw_semaphore's count field for common values. - * (32-bit case illustrated, similar for 64-bit) - * - * 0x0000000X (1) X readers active or attempting lock, no writer waiting - * X = #active_readers + #readers attempting to lock - * (X*ACTIVE_BIAS) - * - * 0x00000000 rwsem is unlocked, and no one is waiting for the lock or - * attempting to read lock or write lock. - * - * 0xffff000X (1) X readers active or attempting lock, with waiters for lock - * X = #active readers + # readers attempting lock - * (X*ACTIVE_BIAS + WAITING_BIAS) - * (2) 1 writer attempting lock, no waiters for lock - * X-1 = #active readers + #readers attempting lock - * ((X-1)*ACTIVE_BIAS + ACTIVE_WRITE_BIAS) - * (3) 1 writer active, no waiters for lock - * X-1 = #active readers + #readers attempting lock - * ((X-1)*ACTIVE_BIAS + ACTIVE_WRITE_BIAS) - * - * 0xffff0001 (1) 1 reader active or attempting lock, waiters for lock - * (WAITING_BIAS + ACTIVE_BIAS) - * (2) 1 writer active or attempting lock, no waiters for lock - * (ACTIVE_WRITE_BIAS) + * Guide to the rw_semaphore's count field. * - * 0xffff0000 (1) There are writers or readers queued but none active - * or in the process of attempting lock. - * (WAITING_BIAS) - * Note: writer can attempt to steal lock for this count by adding - * ACTIVE_WRITE_BIAS in cmpxchg and checking the old count + * When the RWSEM_WRITER_LOCKED bit in count is set, the lock is owned + * by a writer. * - * 0xfffe0001 (1) 1 writer active, or attempting lock. Waiters on queue. - * (ACTIVE_WRITE_BIAS + WAITING_BIAS) - * - * Note: Readers attempt to lock by adding ACTIVE_BIAS in down_read and checking - * the count becomes more than 0 for successful lock acquisition, - * i.e. the case where there are only readers or nobody has lock. - * (1st and 2nd case above). - * - * Writers attempt to lock by adding ACTIVE_WRITE_BIAS in down_write and - * checking the count becomes ACTIVE_WRITE_BIAS for successful lock - * acquisition (i.e. nobody else has lock or attempts lock). If - * unsuccessful, in rwsem_down_write_failed, we'll check to see if there - * are only waiters but none active (5th case above), and attempt to - * steal the lock. + * The lock is owned by readers when + * (1) the RWSEM_WRITER_LOCKED isn't set in count, + * (2) some of the reader bits are set in count, and + * (3) the owner field has RWSEM_READ_OWNED bit set. * + * Having some reader bits set is not enough to guarantee a readers owned + * lock as the readers may be in the process of backing out from the count + * and a writer has just released the lock. So another writer may steal + * the lock immediately after that. */ /* @@ -113,9 +83,8 @@ enum rwsem_wake_type { /* * handle the lock release when processes blocked on it that can now run - * - if we come here from up_xxxx(), then: - * - the 'active part' of count (&0x0000ffff) reached 0 (but may have changed) - * - the 'waiting part' of count (&0xffff0000) is -ve (and will still be so) + * - if we come here from up_xxxx(), then the RWSEM_FLAG_WAITERS bit must + * have been set. * - there must be someone on the queue * - the wait_lock must be held by the caller * - tasks are marked for wakeup, the caller must later invoke wake_up_q() @@ -160,22 +129,11 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem, * so we can bail out early if a writer stole the lock. */ if (wake_type != RWSEM_WAKE_READ_OWNED) { - adjustment = RWSEM_ACTIVE_READ_BIAS; - try_reader_grant: + adjustment = RWSEM_READER_BIAS; oldcount = atomic_long_fetch_add(adjustment, &sem->count); - if (unlikely(oldcount < RWSEM_WAITING_BIAS)) { - /* - * If the count is still less than RWSEM_WAITING_BIAS - * after removing the adjustment, it is assumed that - * a writer has stolen the lock. We have to undo our - * reader grant. - */ - if (atomic_long_add_return(-adjustment, &sem->count) < - RWSEM_WAITING_BIAS) - return; - - /* Last active locker left. Retry waking readers. */ - goto try_reader_grant; + if (unlikely(oldcount & RWSEM_WRITER_MASK)) { + atomic_long_sub(adjustment, &sem->count); + return; } /* * Set it to reader-owned to give spinners an early @@ -209,11 +167,11 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem, } list_cut_before(&wlist, &sem->wait_list, &waiter->list); - adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment; + adjustment = woken * RWSEM_READER_BIAS - adjustment; lockevent_cond_inc(rwsem_wake_reader, woken); if (list_empty(&sem->wait_list)) { /* hit end of list above */ - adjustment -= RWSEM_WAITING_BIAS; + adjustment -= RWSEM_FLAG_WAITERS; } if (adjustment) @@ -248,22 +206,15 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem, */ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) { - /* - * Avoid trying to acquire write lock if count isn't RWSEM_WAITING_BIAS. - */ - if (count != RWSEM_WAITING_BIAS) + long new; + + if (count & RWSEM_LOCK_MASK) return false; - /* - * Acquire the lock by trying to set it to ACTIVE_WRITE_BIAS. If there - * are other tasks on the wait list, we need to add on WAITING_BIAS. - */ - count = list_is_singular(&sem->wait_list) ? - RWSEM_ACTIVE_WRITE_BIAS : - RWSEM_ACTIVE_WRITE_BIAS + RWSEM_WAITING_BIAS; + new = count + RWSEM_WRITER_LOCKED - + (list_is_singular(&sem->wait_list) ? RWSEM_FLAG_WAITERS : 0); - if (atomic_long_cmpxchg_acquire(&sem->count, RWSEM_WAITING_BIAS, count) - == RWSEM_WAITING_BIAS) { + if (atomic_long_try_cmpxchg_acquire(&sem->count, &count, new)) { rwsem_set_owner(sem); return true; } @@ -279,9 +230,9 @@ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) { long count = atomic_long_read(&sem->count); - while (!count || count == RWSEM_WAITING_BIAS) { + while (!(count & RWSEM_LOCK_MASK)) { if (atomic_long_try_cmpxchg_acquire(&sem->count, &count, - count + RWSEM_ACTIVE_WRITE_BIAS)) { + count + RWSEM_WRITER_LOCKED)) { rwsem_set_owner(sem); lockevent_inc(rwsem_opt_wlock); return true; @@ -424,7 +375,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem) static inline struct rw_semaphore __sched * __rwsem_down_read_failed_common(struct rw_semaphore *sem, int state) { - long count, adjustment = -RWSEM_ACTIVE_READ_BIAS; + long count, adjustment = -RWSEM_READER_BIAS; struct rwsem_waiter waiter; DEFINE_WAKE_Q(wake_q); @@ -436,16 +387,16 @@ __rwsem_down_read_failed_common(struct rw_semaphore *sem, int state) /* * In case the wait queue is empty and the lock isn't owned * by a writer, this reader can exit the slowpath and return - * immediately as its RWSEM_ACTIVE_READ_BIAS has already - * been set in the count. + * immediately as its RWSEM_READER_BIAS has already been + * set in the count. */ - if (atomic_long_read(&sem->count) >= 0) { + if (!(atomic_long_read(&sem->count) & RWSEM_WRITER_MASK)) { raw_spin_unlock_irq(&sem->wait_lock); rwsem_set_reader_owned(sem); lockevent_inc(rwsem_rlock_fast); return sem; } - adjustment += RWSEM_WAITING_BIAS; + adjustment += RWSEM_FLAG_WAITERS; } list_add_tail(&waiter.list, &sem->wait_list); @@ -458,9 +409,8 @@ __rwsem_down_read_failed_common(struct rw_semaphore *sem, int state) * If there are no writers and we are first in the queue, * wake our own waiter to join the existing active readers ! */ - if (count == RWSEM_WAITING_BIAS || - (count > RWSEM_WAITING_BIAS && - adjustment != -RWSEM_ACTIVE_READ_BIAS)) + if (!(count & RWSEM_LOCK_MASK) || + (!(count & RWSEM_WRITER_MASK) && (adjustment & RWSEM_FLAG_WAITERS))) __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); raw_spin_unlock_irq(&sem->wait_lock); @@ -488,7 +438,7 @@ __rwsem_down_read_failed_common(struct rw_semaphore *sem, int state) out_nolock: list_del(&waiter.list); if (list_empty(&sem->wait_list)) - atomic_long_add(-RWSEM_WAITING_BIAS, &sem->count); + atomic_long_andnot(RWSEM_FLAG_WAITERS, &sem->count); raw_spin_unlock_irq(&sem->wait_lock); __set_current_state(TASK_RUNNING); lockevent_inc(rwsem_rlock_fail); @@ -521,9 +471,6 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) struct rw_semaphore *ret = sem; DEFINE_WAKE_Q(wake_q); - /* undo write bias from down_write operation, stop active locking */ - count = atomic_long_sub_return(RWSEM_ACTIVE_WRITE_BIAS, &sem->count); - /* do optimistic spinning and steal lock if possible */ if (rwsem_optimistic_spin(sem)) return sem; @@ -543,16 +490,18 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) list_add_tail(&waiter.list, &sem->wait_list); - /* we're now waiting on the lock, but no longer actively locking */ + /* we're now waiting on the lock */ if (waiting) { count = atomic_long_read(&sem->count); /* * If there were already threads queued before us and there are - * no active writers, the lock must be read owned; so we try to - * wake any read locks that were queued ahead of us. + * no active writers and some readers, the lock must be read + * owned; so we try to any read locks that were queued ahead + * of us. */ - if (count > RWSEM_WAITING_BIAS) { + if (!(count & RWSEM_WRITER_MASK) && + (count & RWSEM_READER_MASK)) { __rwsem_mark_wake(sem, RWSEM_WAKE_READERS, &wake_q); /* * The wakeup is normally called _after_ the wait_lock @@ -569,8 +518,9 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) wake_q_init(&wake_q); } - } else - count = atomic_long_add_return(RWSEM_WAITING_BIAS, &sem->count); + } else { + count = atomic_long_add_return(RWSEM_FLAG_WAITERS, &sem->count); + } /* wait until we successfully acquire the lock */ set_current_state(state); @@ -587,7 +537,8 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) schedule(); lockevent_inc(rwsem_sleep_writer); set_current_state(state); - } while ((count = atomic_long_read(&sem->count)) & RWSEM_ACTIVE_MASK); + count = atomic_long_read(&sem->count); + } while (count & RWSEM_LOCK_MASK); raw_spin_lock_irq(&sem->wait_lock); } @@ -603,7 +554,7 @@ out_nolock: raw_spin_lock_irq(&sem->wait_lock); list_del(&waiter.list); if (list_empty(&sem->wait_list)) - atomic_long_add(-RWSEM_WAITING_BIAS, &sem->count); + atomic_long_andnot(RWSEM_FLAG_WAITERS, &sem->count); else __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); raw_spin_unlock_irq(&sem->wait_lock); diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h index eb9c8534299b..499a9b2bda82 100644 --- a/kernel/locking/rwsem.h +++ b/kernel/locking/rwsem.h @@ -42,24 +42,24 @@ #endif /* - * R/W semaphores originally for PPC using the stuff in lib/rwsem.c. - * Adapted largely from include/asm-i386/rwsem.h - * by Paul Mackerras . - */ - -/* - * the semaphore definition + * The definition of the atomic counter in the semaphore: + * + * Bit 0 - writer locked bit + * Bit 1 - waiters present bit + * Bits 2-7 - reserved + * Bits 8-X - 24-bit (32-bit) or 56-bit reader count + * + * atomic_long_fetch_add() is used to obtain reader lock, whereas + * atomic_long_cmpxchg() will be used to obtain writer lock. */ -#ifdef CONFIG_64BIT -# define RWSEM_ACTIVE_MASK 0xffffffffL -#else -# define RWSEM_ACTIVE_MASK 0x0000ffffL -#endif - -#define RWSEM_ACTIVE_BIAS 0x00000001L -#define RWSEM_WAITING_BIAS (-RWSEM_ACTIVE_MASK-1) -#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS -#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) +#define RWSEM_WRITER_LOCKED (1UL << 0) +#define RWSEM_FLAG_WAITERS (1UL << 1) +#define RWSEM_READER_SHIFT 8 +#define RWSEM_READER_BIAS (1UL << RWSEM_READER_SHIFT) +#define RWSEM_READER_MASK (~(RWSEM_READER_BIAS - 1)) +#define RWSEM_WRITER_MASK RWSEM_WRITER_LOCKED +#define RWSEM_LOCK_MASK (RWSEM_WRITER_MASK|RWSEM_READER_MASK) +#define RWSEM_READ_FAILED_MASK (RWSEM_WRITER_MASK|RWSEM_FLAG_WAITERS) /* * All writes to owner are protected by WRITE_ONCE() to make sure that @@ -151,7 +151,8 @@ extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem); */ static inline void __down_read(struct rw_semaphore *sem) { - if (unlikely(atomic_long_inc_return_acquire(&sem->count) <= 0)) { + if (unlikely(atomic_long_fetch_add_acquire(RWSEM_READER_BIAS, + &sem->count) & RWSEM_READ_FAILED_MASK)) { rwsem_down_read_failed(sem); DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED), sem); @@ -162,7 +163,8 @@ static inline void __down_read(struct rw_semaphore *sem) static inline int __down_read_killable(struct rw_semaphore *sem) { - if (unlikely(atomic_long_inc_return_acquire(&sem->count) <= 0)) { + if (unlikely(atomic_long_fetch_add_acquire(RWSEM_READER_BIAS, + &sem->count) & RWSEM_READ_FAILED_MASK)) { if (IS_ERR(rwsem_down_read_failed_killable(sem))) return -EINTR; DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & @@ -183,11 +185,11 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) lockevent_inc(rwsem_rtrylock); do { if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, - tmp + RWSEM_ACTIVE_READ_BIAS)) { + tmp + RWSEM_READER_BIAS)) { rwsem_set_reader_owned(sem); return 1; } - } while (tmp >= 0); + } while (!(tmp & RWSEM_READ_FAILED_MASK)); return 0; } @@ -196,22 +198,16 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) */ static inline void __down_write(struct rw_semaphore *sem) { - long tmp; - - tmp = atomic_long_add_return_acquire(RWSEM_ACTIVE_WRITE_BIAS, - &sem->count); - if (unlikely(tmp != RWSEM_ACTIVE_WRITE_BIAS)) + if (unlikely(atomic_long_cmpxchg_acquire(&sem->count, 0, + RWSEM_WRITER_LOCKED))) rwsem_down_write_failed(sem); rwsem_set_owner(sem); } static inline int __down_write_killable(struct rw_semaphore *sem) { - long tmp; - - tmp = atomic_long_add_return_acquire(RWSEM_ACTIVE_WRITE_BIAS, - &sem->count); - if (unlikely(tmp != RWSEM_ACTIVE_WRITE_BIAS)) + if (unlikely(atomic_long_cmpxchg_acquire(&sem->count, 0, + RWSEM_WRITER_LOCKED))) if (IS_ERR(rwsem_down_write_failed_killable(sem))) return -EINTR; rwsem_set_owner(sem); @@ -224,7 +220,7 @@ static inline int __down_write_trylock(struct rw_semaphore *sem) lockevent_inc(rwsem_wtrylock); tmp = atomic_long_cmpxchg_acquire(&sem->count, RWSEM_UNLOCKED_VALUE, - RWSEM_ACTIVE_WRITE_BIAS); + RWSEM_WRITER_LOCKED); if (tmp == RWSEM_UNLOCKED_VALUE) { rwsem_set_owner(sem); return true; @@ -242,8 +238,9 @@ static inline void __up_read(struct rw_semaphore *sem) DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED), sem); rwsem_clear_reader_owned(sem); - tmp = atomic_long_dec_return_release(&sem->count); - if (unlikely(tmp < -1 && (tmp & RWSEM_ACTIVE_MASK) == 0)) + tmp = atomic_long_add_return_release(-RWSEM_READER_BIAS, &sem->count); + if (unlikely((tmp & (RWSEM_LOCK_MASK|RWSEM_FLAG_WAITERS)) + == RWSEM_FLAG_WAITERS)) rwsem_wake(sem); } @@ -254,8 +251,8 @@ static inline void __up_write(struct rw_semaphore *sem) { DEBUG_RWSEMS_WARN_ON(sem->owner != current, sem); rwsem_clear_owner(sem); - if (unlikely(atomic_long_sub_return_release(RWSEM_ACTIVE_WRITE_BIAS, - &sem->count) < 0)) + if (unlikely(atomic_long_fetch_add_release(-RWSEM_WRITER_LOCKED, + &sem->count) & RWSEM_FLAG_WAITERS)) rwsem_wake(sem); } @@ -274,8 +271,9 @@ static inline void __downgrade_write(struct rw_semaphore *sem) * write side. As such, rely on RELEASE semantics. */ DEBUG_RWSEMS_WARN_ON(sem->owner != current, sem); - tmp = atomic_long_add_return_release(-RWSEM_WAITING_BIAS, &sem->count); + tmp = atomic_long_fetch_add_release( + -RWSEM_WRITER_LOCKED+RWSEM_READER_BIAS, &sem->count); rwsem_set_reader_owned(sem); - if (tmp < 0) + if (tmp & RWSEM_FLAG_WAITERS) rwsem_downgrade_wake(sem); } -- cgit v1.2.3 From 5dec94d4923683b1dd6a09dc62427a24d79ee7b4 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 20 May 2019 16:59:03 -0400 Subject: locking/rwsem: Merge rwsem.h and rwsem-xadd.c into rwsem.c Now we only have one implementation of rwsem. Even though we still use xadd to handle reader locking, we use cmpxchg for writer instead. So the filename rwsem-xadd.c is not strictly correct. Also no one outside of the rwsem code need to know the internal implementation other than function prototypes for two internal functions that are called directly from percpu-rwsem.c. So the rwsem-xadd.c and rwsem.h files are now merged into rwsem.c in the following order: The rwsem.h file now contains only 2 function declarations for __up_read() and __down_read(). This is a code relocation patch with no code change at all except making __up_read() and __down_read() non-static functions so they can be used by percpu-rwsem.c. Suggested-by: Peter Zijlstra Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Cc: huang ying Link: https://lkml.kernel.org/r/20190520205918.22251-5-longman@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/Makefile | 2 +- kernel/locking/rwsem-xadd.c | 624 ------------------------------- kernel/locking/rwsem.c | 884 ++++++++++++++++++++++++++++++++++++++++++++ kernel/locking/rwsem.h | 281 +------------- 4 files changed, 891 insertions(+), 900 deletions(-) delete mode 100644 kernel/locking/rwsem-xadd.c (limited to 'kernel') diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 6fe2f333aecb..45452facff3b 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -3,7 +3,7 @@ # and is generally not a function of system call inputs. KCOV_INSTRUMENT := n -obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o rwsem-xadd.o +obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE) diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c deleted file mode 100644 index 7d537b50a849..000000000000 --- a/kernel/locking/rwsem-xadd.c +++ /dev/null @@ -1,624 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* rwsem.c: R/W semaphores: contention handling functions - * - * Written by David Howells (dhowells@redhat.com). - * Derived from arch/i386/kernel/semaphore.c - * - * Writer lock-stealing by Alex Shi - * and Michel Lespinasse - * - * Optimistic spinning by Tim Chen - * and Davidlohr Bueso . Based on mutexes. - * - * Rwsem count bit fields re-definition by Waiman Long . - */ -#include -#include -#include -#include -#include -#include -#include -#include - -#include "rwsem.h" - -/* - * Guide to the rw_semaphore's count field. - * - * When the RWSEM_WRITER_LOCKED bit in count is set, the lock is owned - * by a writer. - * - * The lock is owned by readers when - * (1) the RWSEM_WRITER_LOCKED isn't set in count, - * (2) some of the reader bits are set in count, and - * (3) the owner field has RWSEM_READ_OWNED bit set. - * - * Having some reader bits set is not enough to guarantee a readers owned - * lock as the readers may be in the process of backing out from the count - * and a writer has just released the lock. So another writer may steal - * the lock immediately after that. - */ - -/* - * Initialize an rwsem: - */ -void __init_rwsem(struct rw_semaphore *sem, const char *name, - struct lock_class_key *key) -{ -#ifdef CONFIG_DEBUG_LOCK_ALLOC - /* - * Make sure we are not reinitializing a held semaphore: - */ - debug_check_no_locks_freed((void *)sem, sizeof(*sem)); - lockdep_init_map(&sem->dep_map, name, key, 0); -#endif - atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE); - raw_spin_lock_init(&sem->wait_lock); - INIT_LIST_HEAD(&sem->wait_list); - sem->owner = NULL; -#ifdef CONFIG_RWSEM_SPIN_ON_OWNER - osq_lock_init(&sem->osq); -#endif -} - -EXPORT_SYMBOL(__init_rwsem); - -enum rwsem_waiter_type { - RWSEM_WAITING_FOR_WRITE, - RWSEM_WAITING_FOR_READ -}; - -struct rwsem_waiter { - struct list_head list; - struct task_struct *task; - enum rwsem_waiter_type type; -}; - -enum rwsem_wake_type { - RWSEM_WAKE_ANY, /* Wake whatever's at head of wait list */ - RWSEM_WAKE_READERS, /* Wake readers only */ - RWSEM_WAKE_READ_OWNED /* Waker thread holds the read lock */ -}; - -/* - * handle the lock release when processes blocked on it that can now run - * - if we come here from up_xxxx(), then the RWSEM_FLAG_WAITERS bit must - * have been set. - * - there must be someone on the queue - * - the wait_lock must be held by the caller - * - tasks are marked for wakeup, the caller must later invoke wake_up_q() - * to actually wakeup the blocked task(s) and drop the reference count, - * preferably when the wait_lock is released - * - woken process blocks are discarded from the list after having task zeroed - * - writers are only marked woken if downgrading is false - */ -static void __rwsem_mark_wake(struct rw_semaphore *sem, - enum rwsem_wake_type wake_type, - struct wake_q_head *wake_q) -{ - struct rwsem_waiter *waiter, *tmp; - long oldcount, woken = 0, adjustment = 0; - struct list_head wlist; - - /* - * Take a peek at the queue head waiter such that we can determine - * the wakeup(s) to perform. - */ - waiter = list_first_entry(&sem->wait_list, struct rwsem_waiter, list); - - if (waiter->type == RWSEM_WAITING_FOR_WRITE) { - if (wake_type == RWSEM_WAKE_ANY) { - /* - * Mark writer at the front of the queue for wakeup. - * Until the task is actually later awoken later by - * the caller, other writers are able to steal it. - * Readers, on the other hand, will block as they - * will notice the queued writer. - */ - wake_q_add(wake_q, waiter->task); - lockevent_inc(rwsem_wake_writer); - } - - return; - } - - /* - * Writers might steal the lock before we grant it to the next reader. - * We prefer to do the first reader grant before counting readers - * so we can bail out early if a writer stole the lock. - */ - if (wake_type != RWSEM_WAKE_READ_OWNED) { - adjustment = RWSEM_READER_BIAS; - oldcount = atomic_long_fetch_add(adjustment, &sem->count); - if (unlikely(oldcount & RWSEM_WRITER_MASK)) { - atomic_long_sub(adjustment, &sem->count); - return; - } - /* - * Set it to reader-owned to give spinners an early - * indication that readers now have the lock. - */ - __rwsem_set_reader_owned(sem, waiter->task); - } - - /* - * Grant an infinite number of read locks to the readers at the front - * of the queue. We know that woken will be at least 1 as we accounted - * for above. Note we increment the 'active part' of the count by the - * number of readers before waking any processes up. - * - * We have to do wakeup in 2 passes to prevent the possibility that - * the reader count may be decremented before it is incremented. It - * is because the to-be-woken waiter may not have slept yet. So it - * may see waiter->task got cleared, finish its critical section and - * do an unlock before the reader count increment. - * - * 1) Collect the read-waiters in a separate list, count them and - * fully increment the reader count in rwsem. - * 2) For each waiters in the new list, clear waiter->task and - * put them into wake_q to be woken up later. - */ - list_for_each_entry(waiter, &sem->wait_list, list) { - if (waiter->type == RWSEM_WAITING_FOR_WRITE) - break; - - woken++; - } - list_cut_before(&wlist, &sem->wait_list, &waiter->list); - - adjustment = woken * RWSEM_READER_BIAS - adjustment; - lockevent_cond_inc(rwsem_wake_reader, woken); - if (list_empty(&sem->wait_list)) { - /* hit end of list above */ - adjustment -= RWSEM_FLAG_WAITERS; - } - - if (adjustment) - atomic_long_add(adjustment, &sem->count); - - /* 2nd pass */ - list_for_each_entry_safe(waiter, tmp, &wlist, list) { - struct task_struct *tsk; - - tsk = waiter->task; - get_task_struct(tsk); - - /* - * Ensure calling get_task_struct() before setting the reader - * waiter to nil such that rwsem_down_read_failed() cannot - * race with do_exit() by always holding a reference count - * to the task to wakeup. - */ - smp_store_release(&waiter->task, NULL); - /* - * Ensure issuing the wakeup (either by us or someone else) - * after setting the reader waiter to nil. - */ - wake_q_add_safe(wake_q, tsk); - } -} - -/* - * This function must be called with the sem->wait_lock held to prevent - * race conditions between checking the rwsem wait list and setting the - * sem->count accordingly. - */ -static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) -{ - long new; - - if (count & RWSEM_LOCK_MASK) - return false; - - new = count + RWSEM_WRITER_LOCKED - - (list_is_singular(&sem->wait_list) ? RWSEM_FLAG_WAITERS : 0); - - if (atomic_long_try_cmpxchg_acquire(&sem->count, &count, new)) { - rwsem_set_owner(sem); - return true; - } - - return false; -} - -#ifdef CONFIG_RWSEM_SPIN_ON_OWNER -/* - * Try to acquire write lock before the writer has been put on wait queue. - */ -static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) -{ - long count = atomic_long_read(&sem->count); - - while (!(count & RWSEM_LOCK_MASK)) { - if (atomic_long_try_cmpxchg_acquire(&sem->count, &count, - count + RWSEM_WRITER_LOCKED)) { - rwsem_set_owner(sem); - lockevent_inc(rwsem_opt_wlock); - return true; - } - } - return false; -} - -static inline bool owner_on_cpu(struct task_struct *owner) -{ - /* - * As lock holder preemption issue, we both skip spinning if - * task is not on cpu or its cpu is preempted - */ - return owner->on_cpu && !vcpu_is_preempted(task_cpu(owner)); -} - -static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) -{ - struct task_struct *owner; - bool ret = true; - - BUILD_BUG_ON(!rwsem_has_anonymous_owner(RWSEM_OWNER_UNKNOWN)); - - if (need_resched()) - return false; - - rcu_read_lock(); - owner = READ_ONCE(sem->owner); - if (owner) { - ret = is_rwsem_owner_spinnable(owner) && - owner_on_cpu(owner); - } - rcu_read_unlock(); - return ret; -} - -/* - * Return true only if we can still spin on the owner field of the rwsem. - */ -static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem) -{ - struct task_struct *owner = READ_ONCE(sem->owner); - - if (!is_rwsem_owner_spinnable(owner)) - return false; - - rcu_read_lock(); - while (owner && (READ_ONCE(sem->owner) == owner)) { - /* - * Ensure we emit the owner->on_cpu, dereference _after_ - * checking sem->owner still matches owner, if that fails, - * owner might point to free()d memory, if it still matches, - * the rcu_read_lock() ensures the memory stays valid. - */ - barrier(); - - /* - * abort spinning when need_resched or owner is not running or - * owner's cpu is preempted. - */ - if (need_resched() || !owner_on_cpu(owner)) { - rcu_read_unlock(); - return false; - } - - cpu_relax(); - } - rcu_read_unlock(); - - /* - * If there is a new owner or the owner is not set, we continue - * spinning. - */ - return is_rwsem_owner_spinnable(READ_ONCE(sem->owner)); -} - -static bool rwsem_optimistic_spin(struct rw_semaphore *sem) -{ - bool taken = false; - - preempt_disable(); - - /* sem->wait_lock should not be held when doing optimistic spinning */ - if (!rwsem_can_spin_on_owner(sem)) - goto done; - - if (!osq_lock(&sem->osq)) - goto done; - - /* - * Optimistically spin on the owner field and attempt to acquire the - * lock whenever the owner changes. Spinning will be stopped when: - * 1) the owning writer isn't running; or - * 2) readers own the lock as we can't determine if they are - * actively running or not. - */ - while (rwsem_spin_on_owner(sem)) { - /* - * Try to acquire the lock - */ - if (rwsem_try_write_lock_unqueued(sem)) { - taken = true; - break; - } - - /* - * When there's no owner, we might have preempted between the - * owner acquiring the lock and setting the owner field. If - * we're an RT task that will live-lock because we won't let - * the owner complete. - */ - if (!sem->owner && (need_resched() || rt_task(current))) - break; - - /* - * The cpu_relax() call is a compiler barrier which forces - * everything in this loop to be re-loaded. We don't need - * memory barriers as we'll eventually observe the right - * values at the cost of a few extra spins. - */ - cpu_relax(); - } - osq_unlock(&sem->osq); -done: - preempt_enable(); - lockevent_cond_inc(rwsem_opt_fail, !taken); - return taken; -} -#else -static bool rwsem_optimistic_spin(struct rw_semaphore *sem) -{ - return false; -} -#endif - -/* - * Wait for the read lock to be granted - */ -static inline struct rw_semaphore __sched * -__rwsem_down_read_failed_common(struct rw_semaphore *sem, int state) -{ - long count, adjustment = -RWSEM_READER_BIAS; - struct rwsem_waiter waiter; - DEFINE_WAKE_Q(wake_q); - - waiter.task = current; - waiter.type = RWSEM_WAITING_FOR_READ; - - raw_spin_lock_irq(&sem->wait_lock); - if (list_empty(&sem->wait_list)) { - /* - * In case the wait queue is empty and the lock isn't owned - * by a writer, this reader can exit the slowpath and return - * immediately as its RWSEM_READER_BIAS has already been - * set in the count. - */ - if (!(atomic_long_read(&sem->count) & RWSEM_WRITER_MASK)) { - raw_spin_unlock_irq(&sem->wait_lock); - rwsem_set_reader_owned(sem); - lockevent_inc(rwsem_rlock_fast); - return sem; - } - adjustment += RWSEM_FLAG_WAITERS; - } - list_add_tail(&waiter.list, &sem->wait_list); - - /* we're now waiting on the lock, but no longer actively locking */ - count = atomic_long_add_return(adjustment, &sem->count); - - /* - * If there are no active locks, wake the front queued process(es). - * - * If there are no writers and we are first in the queue, - * wake our own waiter to join the existing active readers ! - */ - if (!(count & RWSEM_LOCK_MASK) || - (!(count & RWSEM_WRITER_MASK) && (adjustment & RWSEM_FLAG_WAITERS))) - __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); - - raw_spin_unlock_irq(&sem->wait_lock); - wake_up_q(&wake_q); - - /* wait to be given the lock */ - while (true) { - set_current_state(state); - if (!waiter.task) - break; - if (signal_pending_state(state, current)) { - raw_spin_lock_irq(&sem->wait_lock); - if (waiter.task) - goto out_nolock; - raw_spin_unlock_irq(&sem->wait_lock); - break; - } - schedule(); - lockevent_inc(rwsem_sleep_reader); - } - - __set_current_state(TASK_RUNNING); - lockevent_inc(rwsem_rlock); - return sem; -out_nolock: - list_del(&waiter.list); - if (list_empty(&sem->wait_list)) - atomic_long_andnot(RWSEM_FLAG_WAITERS, &sem->count); - raw_spin_unlock_irq(&sem->wait_lock); - __set_current_state(TASK_RUNNING); - lockevent_inc(rwsem_rlock_fail); - return ERR_PTR(-EINTR); -} - -__visible struct rw_semaphore * __sched -rwsem_down_read_failed(struct rw_semaphore *sem) -{ - return __rwsem_down_read_failed_common(sem, TASK_UNINTERRUPTIBLE); -} -EXPORT_SYMBOL(rwsem_down_read_failed); - -__visible struct rw_semaphore * __sched -rwsem_down_read_failed_killable(struct rw_semaphore *sem) -{ - return __rwsem_down_read_failed_common(sem, TASK_KILLABLE); -} -EXPORT_SYMBOL(rwsem_down_read_failed_killable); - -/* - * Wait until we successfully acquire the write lock - */ -static inline struct rw_semaphore * -__rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) -{ - long count; - bool waiting = true; /* any queued threads before us */ - struct rwsem_waiter waiter; - struct rw_semaphore *ret = sem; - DEFINE_WAKE_Q(wake_q); - - /* do optimistic spinning and steal lock if possible */ - if (rwsem_optimistic_spin(sem)) - return sem; - - /* - * Optimistic spinning failed, proceed to the slowpath - * and block until we can acquire the sem. - */ - waiter.task = current; - waiter.type = RWSEM_WAITING_FOR_WRITE; - - raw_spin_lock_irq(&sem->wait_lock); - - /* account for this before adding a new element to the list */ - if (list_empty(&sem->wait_list)) - waiting = false; - - list_add_tail(&waiter.list, &sem->wait_list); - - /* we're now waiting on the lock */ - if (waiting) { - count = atomic_long_read(&sem->count); - - /* - * If there were already threads queued before us and there are - * no active writers and some readers, the lock must be read - * owned; so we try to any read locks that were queued ahead - * of us. - */ - if (!(count & RWSEM_WRITER_MASK) && - (count & RWSEM_READER_MASK)) { - __rwsem_mark_wake(sem, RWSEM_WAKE_READERS, &wake_q); - /* - * The wakeup is normally called _after_ the wait_lock - * is released, but given that we are proactively waking - * readers we can deal with the wake_q overhead as it is - * similar to releasing and taking the wait_lock again - * for attempting rwsem_try_write_lock(). - */ - wake_up_q(&wake_q); - - /* - * Reinitialize wake_q after use. - */ - wake_q_init(&wake_q); - } - - } else { - count = atomic_long_add_return(RWSEM_FLAG_WAITERS, &sem->count); - } - - /* wait until we successfully acquire the lock */ - set_current_state(state); - while (true) { - if (rwsem_try_write_lock(count, sem)) - break; - raw_spin_unlock_irq(&sem->wait_lock); - - /* Block until there are no active lockers. */ - do { - if (signal_pending_state(state, current)) - goto out_nolock; - - schedule(); - lockevent_inc(rwsem_sleep_writer); - set_current_state(state); - count = atomic_long_read(&sem->count); - } while (count & RWSEM_LOCK_MASK); - - raw_spin_lock_irq(&sem->wait_lock); - } - __set_current_state(TASK_RUNNING); - list_del(&waiter.list); - raw_spin_unlock_irq(&sem->wait_lock); - lockevent_inc(rwsem_wlock); - - return ret; - -out_nolock: - __set_current_state(TASK_RUNNING); - raw_spin_lock_irq(&sem->wait_lock); - list_del(&waiter.list); - if (list_empty(&sem->wait_list)) - atomic_long_andnot(RWSEM_FLAG_WAITERS, &sem->count); - else - __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); - raw_spin_unlock_irq(&sem->wait_lock); - wake_up_q(&wake_q); - lockevent_inc(rwsem_wlock_fail); - - return ERR_PTR(-EINTR); -} - -__visible struct rw_semaphore * __sched -rwsem_down_write_failed(struct rw_semaphore *sem) -{ - return __rwsem_down_write_failed_common(sem, TASK_UNINTERRUPTIBLE); -} -EXPORT_SYMBOL(rwsem_down_write_failed); - -__visible struct rw_semaphore * __sched -rwsem_down_write_failed_killable(struct rw_semaphore *sem) -{ - return __rwsem_down_write_failed_common(sem, TASK_KILLABLE); -} -EXPORT_SYMBOL(rwsem_down_write_failed_killable); - -/* - * handle waking up a waiter on the semaphore - * - up_read/up_write has decremented the active part of count if we come here - */ -__visible -struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) -{ - unsigned long flags; - DEFINE_WAKE_Q(wake_q); - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - if (!list_empty(&sem->wait_list)) - __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - wake_up_q(&wake_q); - - return sem; -} -EXPORT_SYMBOL(rwsem_wake); - -/* - * downgrade a write lock into a read lock - * - caller incremented waiting part of count and discovered it still negative - * - just wake up any readers at the front of the queue - */ -__visible -struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) -{ - unsigned long flags; - DEFINE_WAKE_Q(wake_q); - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - if (!list_empty(&sem->wait_list)) - __rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q); - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - wake_up_q(&wake_q); - - return sem; -} -EXPORT_SYMBOL(rwsem_downgrade_wake); diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index ccbf18f560ff..8317bcdf063b 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -3,17 +3,901 @@ * * Written by David Howells (dhowells@redhat.com). * Derived from asm-i386/semaphore.h + * + * Writer lock-stealing by Alex Shi + * and Michel Lespinasse + * + * Optimistic spinning by Tim Chen + * and Davidlohr Bueso . Based on mutexes. + * + * Rwsem count bit fields re-definition and rwsem rearchitecture + * by Waiman Long . */ #include #include #include +#include +#include #include +#include +#include #include #include #include #include "rwsem.h" +#include "lock_events.h" + +/* + * The least significant 2 bits of the owner value has the following + * meanings when set. + * - RWSEM_READER_OWNED (bit 0): The rwsem is owned by readers + * - RWSEM_ANONYMOUSLY_OWNED (bit 1): The rwsem is anonymously owned, + * i.e. the owner(s) cannot be readily determined. It can be reader + * owned or the owning writer is indeterminate. + * + * When a writer acquires a rwsem, it puts its task_struct pointer + * into the owner field. It is cleared after an unlock. + * + * When a reader acquires a rwsem, it will also puts its task_struct + * pointer into the owner field with both the RWSEM_READER_OWNED and + * RWSEM_ANONYMOUSLY_OWNED bits set. On unlock, the owner field will + * largely be left untouched. So for a free or reader-owned rwsem, + * the owner value may contain information about the last reader that + * acquires the rwsem. The anonymous bit is set because that particular + * reader may or may not still own the lock. + * + * That information may be helpful in debugging cases where the system + * seems to hang on a reader owned rwsem especially if only one reader + * is involved. Ideally we would like to track all the readers that own + * a rwsem, but the overhead is simply too big. + */ +#define RWSEM_READER_OWNED (1UL << 0) +#define RWSEM_ANONYMOUSLY_OWNED (1UL << 1) + +#ifdef CONFIG_DEBUG_RWSEMS +# define DEBUG_RWSEMS_WARN_ON(c, sem) do { \ + if (!debug_locks_silent && \ + WARN_ONCE(c, "DEBUG_RWSEMS_WARN_ON(%s): count = 0x%lx, owner = 0x%lx, curr 0x%lx, list %sempty\n",\ + #c, atomic_long_read(&(sem)->count), \ + (long)((sem)->owner), (long)current, \ + list_empty(&(sem)->wait_list) ? "" : "not ")) \ + debug_locks_off(); \ + } while (0) +#else +# define DEBUG_RWSEMS_WARN_ON(c, sem) +#endif + +/* + * The definition of the atomic counter in the semaphore: + * + * Bit 0 - writer locked bit + * Bit 1 - waiters present bit + * Bits 2-7 - reserved + * Bits 8-X - 24-bit (32-bit) or 56-bit reader count + * + * atomic_long_fetch_add() is used to obtain reader lock, whereas + * atomic_long_cmpxchg() will be used to obtain writer lock. + */ +#define RWSEM_WRITER_LOCKED (1UL << 0) +#define RWSEM_FLAG_WAITERS (1UL << 1) +#define RWSEM_READER_SHIFT 8 +#define RWSEM_READER_BIAS (1UL << RWSEM_READER_SHIFT) +#define RWSEM_READER_MASK (~(RWSEM_READER_BIAS - 1)) +#define RWSEM_WRITER_MASK RWSEM_WRITER_LOCKED +#define RWSEM_LOCK_MASK (RWSEM_WRITER_MASK|RWSEM_READER_MASK) +#define RWSEM_READ_FAILED_MASK (RWSEM_WRITER_MASK|RWSEM_FLAG_WAITERS) + +/* + * All writes to owner are protected by WRITE_ONCE() to make sure that + * store tearing can't happen as optimistic spinners may read and use + * the owner value concurrently without lock. Read from owner, however, + * may not need READ_ONCE() as long as the pointer value is only used + * for comparison and isn't being dereferenced. + */ +static inline void rwsem_set_owner(struct rw_semaphore *sem) +{ + WRITE_ONCE(sem->owner, current); +} + +static inline void rwsem_clear_owner(struct rw_semaphore *sem) +{ + WRITE_ONCE(sem->owner, NULL); +} + +/* + * The task_struct pointer of the last owning reader will be left in + * the owner field. + * + * Note that the owner value just indicates the task has owned the rwsem + * previously, it may not be the real owner or one of the real owners + * anymore when that field is examined, so take it with a grain of salt. + */ +static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem, + struct task_struct *owner) +{ + unsigned long val = (unsigned long)owner | RWSEM_READER_OWNED + | RWSEM_ANONYMOUSLY_OWNED; + + WRITE_ONCE(sem->owner, (struct task_struct *)val); +} + +static inline void rwsem_set_reader_owned(struct rw_semaphore *sem) +{ + __rwsem_set_reader_owned(sem, current); +} + +/* + * Return true if the a rwsem waiter can spin on the rwsem's owner + * and steal the lock, i.e. the lock is not anonymously owned. + * N.B. !owner is considered spinnable. + */ +static inline bool is_rwsem_owner_spinnable(struct task_struct *owner) +{ + return !((unsigned long)owner & RWSEM_ANONYMOUSLY_OWNED); +} + +/* + * Return true if rwsem is owned by an anonymous writer or readers. + */ +static inline bool rwsem_has_anonymous_owner(struct task_struct *owner) +{ + return (unsigned long)owner & RWSEM_ANONYMOUSLY_OWNED; +} + +#ifdef CONFIG_DEBUG_RWSEMS +/* + * With CONFIG_DEBUG_RWSEMS configured, it will make sure that if there + * is a task pointer in owner of a reader-owned rwsem, it will be the + * real owner or one of the real owners. The only exception is when the + * unlock is done by up_read_non_owner(). + */ +static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem) +{ + unsigned long val = (unsigned long)current | RWSEM_READER_OWNED + | RWSEM_ANONYMOUSLY_OWNED; + if (READ_ONCE(sem->owner) == (struct task_struct *)val) + cmpxchg_relaxed((unsigned long *)&sem->owner, val, + RWSEM_READER_OWNED | RWSEM_ANONYMOUSLY_OWNED); +} +#else +static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem) +{ +} +#endif + +/* + * Guide to the rw_semaphore's count field. + * + * When the RWSEM_WRITER_LOCKED bit in count is set, the lock is owned + * by a writer. + * + * The lock is owned by readers when + * (1) the RWSEM_WRITER_LOCKED isn't set in count, + * (2) some of the reader bits are set in count, and + * (3) the owner field has RWSEM_READ_OWNED bit set. + * + * Having some reader bits set is not enough to guarantee a readers owned + * lock as the readers may be in the process of backing out from the count + * and a writer has just released the lock. So another writer may steal + * the lock immediately after that. + */ + +/* + * Initialize an rwsem: + */ +void __init_rwsem(struct rw_semaphore *sem, const char *name, + struct lock_class_key *key) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /* + * Make sure we are not reinitializing a held semaphore: + */ + debug_check_no_locks_freed((void *)sem, sizeof(*sem)); + lockdep_init_map(&sem->dep_map, name, key, 0); +#endif + atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE); + raw_spin_lock_init(&sem->wait_lock); + INIT_LIST_HEAD(&sem->wait_list); + sem->owner = NULL; +#ifdef CONFIG_RWSEM_SPIN_ON_OWNER + osq_lock_init(&sem->osq); +#endif +} + +EXPORT_SYMBOL(__init_rwsem); + +enum rwsem_waiter_type { + RWSEM_WAITING_FOR_WRITE, + RWSEM_WAITING_FOR_READ +}; + +struct rwsem_waiter { + struct list_head list; + struct task_struct *task; + enum rwsem_waiter_type type; +}; + +enum rwsem_wake_type { + RWSEM_WAKE_ANY, /* Wake whatever's at head of wait list */ + RWSEM_WAKE_READERS, /* Wake readers only */ + RWSEM_WAKE_READ_OWNED /* Waker thread holds the read lock */ +}; + +/* + * handle the lock release when processes blocked on it that can now run + * - if we come here from up_xxxx(), then the RWSEM_FLAG_WAITERS bit must + * have been set. + * - there must be someone on the queue + * - the wait_lock must be held by the caller + * - tasks are marked for wakeup, the caller must later invoke wake_up_q() + * to actually wakeup the blocked task(s) and drop the reference count, + * preferably when the wait_lock is released + * - woken process blocks are discarded from the list after having task zeroed + * - writers are only marked woken if downgrading is false + */ +static void __rwsem_mark_wake(struct rw_semaphore *sem, + enum rwsem_wake_type wake_type, + struct wake_q_head *wake_q) +{ + struct rwsem_waiter *waiter, *tmp; + long oldcount, woken = 0, adjustment = 0; + struct list_head wlist; + + /* + * Take a peek at the queue head waiter such that we can determine + * the wakeup(s) to perform. + */ + waiter = list_first_entry(&sem->wait_list, struct rwsem_waiter, list); + + if (waiter->type == RWSEM_WAITING_FOR_WRITE) { + if (wake_type == RWSEM_WAKE_ANY) { + /* + * Mark writer at the front of the queue for wakeup. + * Until the task is actually later awoken later by + * the caller, other writers are able to steal it. + * Readers, on the other hand, will block as they + * will notice the queued writer. + */ + wake_q_add(wake_q, waiter->task); + lockevent_inc(rwsem_wake_writer); + } + + return; + } + + /* + * Writers might steal the lock before we grant it to the next reader. + * We prefer to do the first reader grant before counting readers + * so we can bail out early if a writer stole the lock. + */ + if (wake_type != RWSEM_WAKE_READ_OWNED) { + adjustment = RWSEM_READER_BIAS; + oldcount = atomic_long_fetch_add(adjustment, &sem->count); + if (unlikely(oldcount & RWSEM_WRITER_MASK)) { + atomic_long_sub(adjustment, &sem->count); + return; + } + /* + * Set it to reader-owned to give spinners an early + * indication that readers now have the lock. + */ + __rwsem_set_reader_owned(sem, waiter->task); + } + + /* + * Grant an infinite number of read locks to the readers at the front + * of the queue. We know that woken will be at least 1 as we accounted + * for above. Note we increment the 'active part' of the count by the + * number of readers before waking any processes up. + * + * We have to do wakeup in 2 passes to prevent the possibility that + * the reader count may be decremented before it is incremented. It + * is because the to-be-woken waiter may not have slept yet. So it + * may see waiter->task got cleared, finish its critical section and + * do an unlock before the reader count increment. + * + * 1) Collect the read-waiters in a separate list, count them and + * fully increment the reader count in rwsem. + * 2) For each waiters in the new list, clear waiter->task and + * put them into wake_q to be woken up later. + */ + list_for_each_entry(waiter, &sem->wait_list, list) { + if (waiter->type == RWSEM_WAITING_FOR_WRITE) + break; + + woken++; + } + list_cut_before(&wlist, &sem->wait_list, &waiter->list); + + adjustment = woken * RWSEM_READER_BIAS - adjustment; + lockevent_cond_inc(rwsem_wake_reader, woken); + if (list_empty(&sem->wait_list)) { + /* hit end of list above */ + adjustment -= RWSEM_FLAG_WAITERS; + } + + if (adjustment) + atomic_long_add(adjustment, &sem->count); + + /* 2nd pass */ + list_for_each_entry_safe(waiter, tmp, &wlist, list) { + struct task_struct *tsk; + + tsk = waiter->task; + get_task_struct(tsk); + + /* + * Ensure calling get_task_struct() before setting the reader + * waiter to nil such that rwsem_down_read_failed() cannot + * race with do_exit() by always holding a reference count + * to the task to wakeup. + */ + smp_store_release(&waiter->task, NULL); + /* + * Ensure issuing the wakeup (either by us or someone else) + * after setting the reader waiter to nil. + */ + wake_q_add_safe(wake_q, tsk); + } +} + +/* + * This function must be called with the sem->wait_lock held to prevent + * race conditions between checking the rwsem wait list and setting the + * sem->count accordingly. + */ +static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) +{ + long new; + + if (count & RWSEM_LOCK_MASK) + return false; + + new = count + RWSEM_WRITER_LOCKED - + (list_is_singular(&sem->wait_list) ? RWSEM_FLAG_WAITERS : 0); + + if (atomic_long_try_cmpxchg_acquire(&sem->count, &count, new)) { + rwsem_set_owner(sem); + return true; + } + + return false; +} + +#ifdef CONFIG_RWSEM_SPIN_ON_OWNER +/* + * Try to acquire write lock before the writer has been put on wait queue. + */ +static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) +{ + long count = atomic_long_read(&sem->count); + + while (!(count & RWSEM_LOCK_MASK)) { + if (atomic_long_try_cmpxchg_acquire(&sem->count, &count, + count + RWSEM_WRITER_LOCKED)) { + rwsem_set_owner(sem); + lockevent_inc(rwsem_opt_wlock); + return true; + } + } + return false; +} + +static inline bool owner_on_cpu(struct task_struct *owner) +{ + /* + * As lock holder preemption issue, we both skip spinning if + * task is not on cpu or its cpu is preempted + */ + return owner->on_cpu && !vcpu_is_preempted(task_cpu(owner)); +} + +static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) +{ + struct task_struct *owner; + bool ret = true; + + BUILD_BUG_ON(!rwsem_has_anonymous_owner(RWSEM_OWNER_UNKNOWN)); + + if (need_resched()) + return false; + + rcu_read_lock(); + owner = READ_ONCE(sem->owner); + if (owner) { + ret = is_rwsem_owner_spinnable(owner) && + owner_on_cpu(owner); + } + rcu_read_unlock(); + return ret; +} + +/* + * Return true only if we can still spin on the owner field of the rwsem. + */ +static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem) +{ + struct task_struct *owner = READ_ONCE(sem->owner); + + if (!is_rwsem_owner_spinnable(owner)) + return false; + + rcu_read_lock(); + while (owner && (READ_ONCE(sem->owner) == owner)) { + /* + * Ensure we emit the owner->on_cpu, dereference _after_ + * checking sem->owner still matches owner, if that fails, + * owner might point to free()d memory, if it still matches, + * the rcu_read_lock() ensures the memory stays valid. + */ + barrier(); + + /* + * abort spinning when need_resched or owner is not running or + * owner's cpu is preempted. + */ + if (need_resched() || !owner_on_cpu(owner)) { + rcu_read_unlock(); + return false; + } + + cpu_relax(); + } + rcu_read_unlock(); + + /* + * If there is a new owner or the owner is not set, we continue + * spinning. + */ + return is_rwsem_owner_spinnable(READ_ONCE(sem->owner)); +} + +static bool rwsem_optimistic_spin(struct rw_semaphore *sem) +{ + bool taken = false; + + preempt_disable(); + + /* sem->wait_lock should not be held when doing optimistic spinning */ + if (!rwsem_can_spin_on_owner(sem)) + goto done; + + if (!osq_lock(&sem->osq)) + goto done; + + /* + * Optimistically spin on the owner field and attempt to acquire the + * lock whenever the owner changes. Spinning will be stopped when: + * 1) the owning writer isn't running; or + * 2) readers own the lock as we can't determine if they are + * actively running or not. + */ + while (rwsem_spin_on_owner(sem)) { + /* + * Try to acquire the lock + */ + if (rwsem_try_write_lock_unqueued(sem)) { + taken = true; + break; + } + + /* + * When there's no owner, we might have preempted between the + * owner acquiring the lock and setting the owner field. If + * we're an RT task that will live-lock because we won't let + * the owner complete. + */ + if (!sem->owner && (need_resched() || rt_task(current))) + break; + + /* + * The cpu_relax() call is a compiler barrier which forces + * everything in this loop to be re-loaded. We don't need + * memory barriers as we'll eventually observe the right + * values at the cost of a few extra spins. + */ + cpu_relax(); + } + osq_unlock(&sem->osq); +done: + preempt_enable(); + lockevent_cond_inc(rwsem_opt_fail, !taken); + return taken; +} +#else +static bool rwsem_optimistic_spin(struct rw_semaphore *sem) +{ + return false; +} +#endif + +/* + * Wait for the read lock to be granted + */ +static inline struct rw_semaphore __sched * +__rwsem_down_read_failed_common(struct rw_semaphore *sem, int state) +{ + long count, adjustment = -RWSEM_READER_BIAS; + struct rwsem_waiter waiter; + DEFINE_WAKE_Q(wake_q); + + waiter.task = current; + waiter.type = RWSEM_WAITING_FOR_READ; + + raw_spin_lock_irq(&sem->wait_lock); + if (list_empty(&sem->wait_list)) { + /* + * In case the wait queue is empty and the lock isn't owned + * by a writer, this reader can exit the slowpath and return + * immediately as its RWSEM_READER_BIAS has already been + * set in the count. + */ + if (!(atomic_long_read(&sem->count) & RWSEM_WRITER_MASK)) { + raw_spin_unlock_irq(&sem->wait_lock); + rwsem_set_reader_owned(sem); + lockevent_inc(rwsem_rlock_fast); + return sem; + } + adjustment += RWSEM_FLAG_WAITERS; + } + list_add_tail(&waiter.list, &sem->wait_list); + + /* we're now waiting on the lock, but no longer actively locking */ + count = atomic_long_add_return(adjustment, &sem->count); + + /* + * If there are no active locks, wake the front queued process(es). + * + * If there are no writers and we are first in the queue, + * wake our own waiter to join the existing active readers ! + */ + if (!(count & RWSEM_LOCK_MASK) || + (!(count & RWSEM_WRITER_MASK) && (adjustment & RWSEM_FLAG_WAITERS))) + __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); + + raw_spin_unlock_irq(&sem->wait_lock); + wake_up_q(&wake_q); + + /* wait to be given the lock */ + while (true) { + set_current_state(state); + if (!waiter.task) + break; + if (signal_pending_state(state, current)) { + raw_spin_lock_irq(&sem->wait_lock); + if (waiter.task) + goto out_nolock; + raw_spin_unlock_irq(&sem->wait_lock); + break; + } + schedule(); + lockevent_inc(rwsem_sleep_reader); + } + + __set_current_state(TASK_RUNNING); + lockevent_inc(rwsem_rlock); + return sem; +out_nolock: + list_del(&waiter.list); + if (list_empty(&sem->wait_list)) + atomic_long_andnot(RWSEM_FLAG_WAITERS, &sem->count); + raw_spin_unlock_irq(&sem->wait_lock); + __set_current_state(TASK_RUNNING); + lockevent_inc(rwsem_rlock_fail); + return ERR_PTR(-EINTR); +} + +__visible struct rw_semaphore * __sched +rwsem_down_read_failed(struct rw_semaphore *sem) +{ + return __rwsem_down_read_failed_common(sem, TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(rwsem_down_read_failed); + +__visible struct rw_semaphore * __sched +rwsem_down_read_failed_killable(struct rw_semaphore *sem) +{ + return __rwsem_down_read_failed_common(sem, TASK_KILLABLE); +} +EXPORT_SYMBOL(rwsem_down_read_failed_killable); + +/* + * Wait until we successfully acquire the write lock + */ +static inline struct rw_semaphore * +__rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) +{ + long count; + bool waiting = true; /* any queued threads before us */ + struct rwsem_waiter waiter; + struct rw_semaphore *ret = sem; + DEFINE_WAKE_Q(wake_q); + + /* do optimistic spinning and steal lock if possible */ + if (rwsem_optimistic_spin(sem)) + return sem; + + /* + * Optimistic spinning failed, proceed to the slowpath + * and block until we can acquire the sem. + */ + waiter.task = current; + waiter.type = RWSEM_WAITING_FOR_WRITE; + + raw_spin_lock_irq(&sem->wait_lock); + + /* account for this before adding a new element to the list */ + if (list_empty(&sem->wait_list)) + waiting = false; + + list_add_tail(&waiter.list, &sem->wait_list); + + /* we're now waiting on the lock */ + if (waiting) { + count = atomic_long_read(&sem->count); + + /* + * If there were already threads queued before us and there are + * no active writers and some readers, the lock must be read + * owned; so we try to any read locks that were queued ahead + * of us. + */ + if (!(count & RWSEM_WRITER_MASK) && + (count & RWSEM_READER_MASK)) { + __rwsem_mark_wake(sem, RWSEM_WAKE_READERS, &wake_q); + /* + * The wakeup is normally called _after_ the wait_lock + * is released, but given that we are proactively waking + * readers we can deal with the wake_q overhead as it is + * similar to releasing and taking the wait_lock again + * for attempting rwsem_try_write_lock(). + */ + wake_up_q(&wake_q); + + /* + * Reinitialize wake_q after use. + */ + wake_q_init(&wake_q); + } + + } else { + count = atomic_long_add_return(RWSEM_FLAG_WAITERS, &sem->count); + } + + /* wait until we successfully acquire the lock */ + set_current_state(state); + while (true) { + if (rwsem_try_write_lock(count, sem)) + break; + raw_spin_unlock_irq(&sem->wait_lock); + + /* Block until there are no active lockers. */ + do { + if (signal_pending_state(state, current)) + goto out_nolock; + + schedule(); + lockevent_inc(rwsem_sleep_writer); + set_current_state(state); + count = atomic_long_read(&sem->count); + } while (count & RWSEM_LOCK_MASK); + + raw_spin_lock_irq(&sem->wait_lock); + } + __set_current_state(TASK_RUNNING); + list_del(&waiter.list); + raw_spin_unlock_irq(&sem->wait_lock); + lockevent_inc(rwsem_wlock); + + return ret; + +out_nolock: + __set_current_state(TASK_RUNNING); + raw_spin_lock_irq(&sem->wait_lock); + list_del(&waiter.list); + if (list_empty(&sem->wait_list)) + atomic_long_andnot(RWSEM_FLAG_WAITERS, &sem->count); + else + __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); + raw_spin_unlock_irq(&sem->wait_lock); + wake_up_q(&wake_q); + lockevent_inc(rwsem_wlock_fail); + + return ERR_PTR(-EINTR); +} + +__visible struct rw_semaphore * __sched +rwsem_down_write_failed(struct rw_semaphore *sem) +{ + return __rwsem_down_write_failed_common(sem, TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(rwsem_down_write_failed); + +__visible struct rw_semaphore * __sched +rwsem_down_write_failed_killable(struct rw_semaphore *sem) +{ + return __rwsem_down_write_failed_common(sem, TASK_KILLABLE); +} +EXPORT_SYMBOL(rwsem_down_write_failed_killable); + +/* + * handle waking up a waiter on the semaphore + * - up_read/up_write has decremented the active part of count if we come here + */ +__visible +struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) +{ + unsigned long flags; + DEFINE_WAKE_Q(wake_q); + + raw_spin_lock_irqsave(&sem->wait_lock, flags); + + if (!list_empty(&sem->wait_list)) + __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); + + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + wake_up_q(&wake_q); + + return sem; +} +EXPORT_SYMBOL(rwsem_wake); + +/* + * downgrade a write lock into a read lock + * - caller incremented waiting part of count and discovered it still negative + * - just wake up any readers at the front of the queue + */ +__visible +struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) +{ + unsigned long flags; + DEFINE_WAKE_Q(wake_q); + + raw_spin_lock_irqsave(&sem->wait_lock, flags); + + if (!list_empty(&sem->wait_list)) + __rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q); + + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + wake_up_q(&wake_q); + + return sem; +} +EXPORT_SYMBOL(rwsem_downgrade_wake); + +/* + * lock for reading + */ +inline void __down_read(struct rw_semaphore *sem) +{ + if (unlikely(atomic_long_fetch_add_acquire(RWSEM_READER_BIAS, + &sem->count) & RWSEM_READ_FAILED_MASK)) { + rwsem_down_read_failed(sem); + DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & + RWSEM_READER_OWNED), sem); + } else { + rwsem_set_reader_owned(sem); + } +} + +static inline int __down_read_killable(struct rw_semaphore *sem) +{ + if (unlikely(atomic_long_fetch_add_acquire(RWSEM_READER_BIAS, + &sem->count) & RWSEM_READ_FAILED_MASK)) { + if (IS_ERR(rwsem_down_read_failed_killable(sem))) + return -EINTR; + DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & + RWSEM_READER_OWNED), sem); + } else { + rwsem_set_reader_owned(sem); + } + return 0; +} + +static inline int __down_read_trylock(struct rw_semaphore *sem) +{ + /* + * Optimize for the case when the rwsem is not locked at all. + */ + long tmp = RWSEM_UNLOCKED_VALUE; + + lockevent_inc(rwsem_rtrylock); + do { + if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, + tmp + RWSEM_READER_BIAS)) { + rwsem_set_reader_owned(sem); + return 1; + } + } while (!(tmp & RWSEM_READ_FAILED_MASK)); + return 0; +} + +/* + * lock for writing + */ +static inline void __down_write(struct rw_semaphore *sem) +{ + if (unlikely(atomic_long_cmpxchg_acquire(&sem->count, 0, + RWSEM_WRITER_LOCKED))) + rwsem_down_write_failed(sem); + rwsem_set_owner(sem); +} + +static inline int __down_write_killable(struct rw_semaphore *sem) +{ + if (unlikely(atomic_long_cmpxchg_acquire(&sem->count, 0, + RWSEM_WRITER_LOCKED))) + if (IS_ERR(rwsem_down_write_failed_killable(sem))) + return -EINTR; + rwsem_set_owner(sem); + return 0; +} + +static inline int __down_write_trylock(struct rw_semaphore *sem) +{ + long tmp; + + lockevent_inc(rwsem_wtrylock); + tmp = atomic_long_cmpxchg_acquire(&sem->count, RWSEM_UNLOCKED_VALUE, + RWSEM_WRITER_LOCKED); + if (tmp == RWSEM_UNLOCKED_VALUE) { + rwsem_set_owner(sem); + return true; + } + return false; +} + +/* + * unlock after reading + */ +inline void __up_read(struct rw_semaphore *sem) +{ + long tmp; + + DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED), + sem); + rwsem_clear_reader_owned(sem); + tmp = atomic_long_add_return_release(-RWSEM_READER_BIAS, &sem->count); + if (unlikely((tmp & (RWSEM_LOCK_MASK|RWSEM_FLAG_WAITERS)) + == RWSEM_FLAG_WAITERS)) + rwsem_wake(sem); +} + +/* + * unlock after writing + */ +static inline void __up_write(struct rw_semaphore *sem) +{ + DEBUG_RWSEMS_WARN_ON(sem->owner != current, sem); + rwsem_clear_owner(sem); + if (unlikely(atomic_long_fetch_add_release(-RWSEM_WRITER_LOCKED, + &sem->count) & RWSEM_FLAG_WAITERS)) + rwsem_wake(sem); +} + +/* + * downgrade write lock to read lock + */ +static inline void __downgrade_write(struct rw_semaphore *sem) +{ + long tmp; + + /* + * When downgrading from exclusive to shared ownership, + * anything inside the write-locked region cannot leak + * into the read side. In contrast, anything in the + * read-locked region is ok to be re-ordered into the + * write side. As such, rely on RELEASE semantics. + */ + DEBUG_RWSEMS_WARN_ON(sem->owner != current, sem); + tmp = atomic_long_fetch_add_release( + -RWSEM_WRITER_LOCKED+RWSEM_READER_BIAS, &sem->count); + rwsem_set_reader_owned(sem); + if (tmp & RWSEM_FLAG_WAITERS) + rwsem_downgrade_wake(sem); +} /* * lock for reading diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h index 499a9b2bda82..2534ce49f648 100644 --- a/kernel/locking/rwsem.h +++ b/kernel/locking/rwsem.h @@ -1,279 +1,10 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* - * The least significant 2 bits of the owner value has the following - * meanings when set. - * - RWSEM_READER_OWNED (bit 0): The rwsem is owned by readers - * - RWSEM_ANONYMOUSLY_OWNED (bit 1): The rwsem is anonymously owned, - * i.e. the owner(s) cannot be readily determined. It can be reader - * owned or the owning writer is indeterminate. - * - * When a writer acquires a rwsem, it puts its task_struct pointer - * into the owner field. It is cleared after an unlock. - * - * When a reader acquires a rwsem, it will also puts its task_struct - * pointer into the owner field with both the RWSEM_READER_OWNED and - * RWSEM_ANONYMOUSLY_OWNED bits set. On unlock, the owner field will - * largely be left untouched. So for a free or reader-owned rwsem, - * the owner value may contain information about the last reader that - * acquires the rwsem. The anonymous bit is set because that particular - * reader may or may not still own the lock. - * - * That information may be helpful in debugging cases where the system - * seems to hang on a reader owned rwsem especially if only one reader - * is involved. Ideally we would like to track all the readers that own - * a rwsem, but the overhead is simply too big. - */ -#include "lock_events.h" -#define RWSEM_READER_OWNED (1UL << 0) -#define RWSEM_ANONYMOUSLY_OWNED (1UL << 1) +#ifndef __INTERNAL_RWSEM_H +#define __INTERNAL_RWSEM_H +#include -#ifdef CONFIG_DEBUG_RWSEMS -# define DEBUG_RWSEMS_WARN_ON(c, sem) do { \ - if (!debug_locks_silent && \ - WARN_ONCE(c, "DEBUG_RWSEMS_WARN_ON(%s): count = 0x%lx, owner = 0x%lx, curr 0x%lx, list %sempty\n",\ - #c, atomic_long_read(&(sem)->count), \ - (long)((sem)->owner), (long)current, \ - list_empty(&(sem)->wait_list) ? "" : "not ")) \ - debug_locks_off(); \ - } while (0) -#else -# define DEBUG_RWSEMS_WARN_ON(c, sem) -#endif +extern void __down_read(struct rw_semaphore *sem); +extern void __up_read(struct rw_semaphore *sem); -/* - * The definition of the atomic counter in the semaphore: - * - * Bit 0 - writer locked bit - * Bit 1 - waiters present bit - * Bits 2-7 - reserved - * Bits 8-X - 24-bit (32-bit) or 56-bit reader count - * - * atomic_long_fetch_add() is used to obtain reader lock, whereas - * atomic_long_cmpxchg() will be used to obtain writer lock. - */ -#define RWSEM_WRITER_LOCKED (1UL << 0) -#define RWSEM_FLAG_WAITERS (1UL << 1) -#define RWSEM_READER_SHIFT 8 -#define RWSEM_READER_BIAS (1UL << RWSEM_READER_SHIFT) -#define RWSEM_READER_MASK (~(RWSEM_READER_BIAS - 1)) -#define RWSEM_WRITER_MASK RWSEM_WRITER_LOCKED -#define RWSEM_LOCK_MASK (RWSEM_WRITER_MASK|RWSEM_READER_MASK) -#define RWSEM_READ_FAILED_MASK (RWSEM_WRITER_MASK|RWSEM_FLAG_WAITERS) - -/* - * All writes to owner are protected by WRITE_ONCE() to make sure that - * store tearing can't happen as optimistic spinners may read and use - * the owner value concurrently without lock. Read from owner, however, - * may not need READ_ONCE() as long as the pointer value is only used - * for comparison and isn't being dereferenced. - */ -static inline void rwsem_set_owner(struct rw_semaphore *sem) -{ - WRITE_ONCE(sem->owner, current); -} - -static inline void rwsem_clear_owner(struct rw_semaphore *sem) -{ - WRITE_ONCE(sem->owner, NULL); -} - -/* - * The task_struct pointer of the last owning reader will be left in - * the owner field. - * - * Note that the owner value just indicates the task has owned the rwsem - * previously, it may not be the real owner or one of the real owners - * anymore when that field is examined, so take it with a grain of salt. - */ -static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem, - struct task_struct *owner) -{ - unsigned long val = (unsigned long)owner | RWSEM_READER_OWNED - | RWSEM_ANONYMOUSLY_OWNED; - - WRITE_ONCE(sem->owner, (struct task_struct *)val); -} - -static inline void rwsem_set_reader_owned(struct rw_semaphore *sem) -{ - __rwsem_set_reader_owned(sem, current); -} - -/* - * Return true if the a rwsem waiter can spin on the rwsem's owner - * and steal the lock, i.e. the lock is not anonymously owned. - * N.B. !owner is considered spinnable. - */ -static inline bool is_rwsem_owner_spinnable(struct task_struct *owner) -{ - return !((unsigned long)owner & RWSEM_ANONYMOUSLY_OWNED); -} - -/* - * Return true if rwsem is owned by an anonymous writer or readers. - */ -static inline bool rwsem_has_anonymous_owner(struct task_struct *owner) -{ - return (unsigned long)owner & RWSEM_ANONYMOUSLY_OWNED; -} - -#ifdef CONFIG_DEBUG_RWSEMS -/* - * With CONFIG_DEBUG_RWSEMS configured, it will make sure that if there - * is a task pointer in owner of a reader-owned rwsem, it will be the - * real owner or one of the real owners. The only exception is when the - * unlock is done by up_read_non_owner(). - */ -static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem) -{ - unsigned long val = (unsigned long)current | RWSEM_READER_OWNED - | RWSEM_ANONYMOUSLY_OWNED; - if (READ_ONCE(sem->owner) == (struct task_struct *)val) - cmpxchg_relaxed((unsigned long *)&sem->owner, val, - RWSEM_READER_OWNED | RWSEM_ANONYMOUSLY_OWNED); -} -#else -static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem) -{ -} -#endif - -extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_down_read_failed_killable(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_down_write_failed_killable(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem); - -/* - * lock for reading - */ -static inline void __down_read(struct rw_semaphore *sem) -{ - if (unlikely(atomic_long_fetch_add_acquire(RWSEM_READER_BIAS, - &sem->count) & RWSEM_READ_FAILED_MASK)) { - rwsem_down_read_failed(sem); - DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & - RWSEM_READER_OWNED), sem); - } else { - rwsem_set_reader_owned(sem); - } -} - -static inline int __down_read_killable(struct rw_semaphore *sem) -{ - if (unlikely(atomic_long_fetch_add_acquire(RWSEM_READER_BIAS, - &sem->count) & RWSEM_READ_FAILED_MASK)) { - if (IS_ERR(rwsem_down_read_failed_killable(sem))) - return -EINTR; - DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & - RWSEM_READER_OWNED), sem); - } else { - rwsem_set_reader_owned(sem); - } - return 0; -} - -static inline int __down_read_trylock(struct rw_semaphore *sem) -{ - /* - * Optimize for the case when the rwsem is not locked at all. - */ - long tmp = RWSEM_UNLOCKED_VALUE; - - lockevent_inc(rwsem_rtrylock); - do { - if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, - tmp + RWSEM_READER_BIAS)) { - rwsem_set_reader_owned(sem); - return 1; - } - } while (!(tmp & RWSEM_READ_FAILED_MASK)); - return 0; -} - -/* - * lock for writing - */ -static inline void __down_write(struct rw_semaphore *sem) -{ - if (unlikely(atomic_long_cmpxchg_acquire(&sem->count, 0, - RWSEM_WRITER_LOCKED))) - rwsem_down_write_failed(sem); - rwsem_set_owner(sem); -} - -static inline int __down_write_killable(struct rw_semaphore *sem) -{ - if (unlikely(atomic_long_cmpxchg_acquire(&sem->count, 0, - RWSEM_WRITER_LOCKED))) - if (IS_ERR(rwsem_down_write_failed_killable(sem))) - return -EINTR; - rwsem_set_owner(sem); - return 0; -} - -static inline int __down_write_trylock(struct rw_semaphore *sem) -{ - long tmp; - - lockevent_inc(rwsem_wtrylock); - tmp = atomic_long_cmpxchg_acquire(&sem->count, RWSEM_UNLOCKED_VALUE, - RWSEM_WRITER_LOCKED); - if (tmp == RWSEM_UNLOCKED_VALUE) { - rwsem_set_owner(sem); - return true; - } - return false; -} - -/* - * unlock after reading - */ -static inline void __up_read(struct rw_semaphore *sem) -{ - long tmp; - - DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED), - sem); - rwsem_clear_reader_owned(sem); - tmp = atomic_long_add_return_release(-RWSEM_READER_BIAS, &sem->count); - if (unlikely((tmp & (RWSEM_LOCK_MASK|RWSEM_FLAG_WAITERS)) - == RWSEM_FLAG_WAITERS)) - rwsem_wake(sem); -} - -/* - * unlock after writing - */ -static inline void __up_write(struct rw_semaphore *sem) -{ - DEBUG_RWSEMS_WARN_ON(sem->owner != current, sem); - rwsem_clear_owner(sem); - if (unlikely(atomic_long_fetch_add_release(-RWSEM_WRITER_LOCKED, - &sem->count) & RWSEM_FLAG_WAITERS)) - rwsem_wake(sem); -} - -/* - * downgrade write lock to read lock - */ -static inline void __downgrade_write(struct rw_semaphore *sem) -{ - long tmp; - - /* - * When downgrading from exclusive to shared ownership, - * anything inside the write-locked region cannot leak - * into the read side. In contrast, anything in the - * read-locked region is ok to be re-ordered into the - * write side. As such, rely on RELEASE semantics. - */ - DEBUG_RWSEMS_WARN_ON(sem->owner != current, sem); - tmp = atomic_long_fetch_add_release( - -RWSEM_WRITER_LOCKED+RWSEM_READER_BIAS, &sem->count); - rwsem_set_reader_owned(sem); - if (tmp & RWSEM_FLAG_WAITERS) - rwsem_downgrade_wake(sem); -} +#endif /* __INTERNAL_RWSEM_H */ -- cgit v1.2.3 From 6cef7ff6e43cbdb9fa8eb91eb9a6b25d45ae11e3 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 20 May 2019 16:59:04 -0400 Subject: locking/rwsem: Code cleanup after files merging After merging all the relevant rwsem code into one single file, there are a number of optimizations and cleanups that can be done: 1) Remove all the EXPORT_SYMBOL() calls for functions that are not accessed elsewhere. 2) Remove all the __visible tags as none of the functions will be called from assembly code anymore. 3) Make all the internal functions static. 4) Remove some unneeded blank lines. 5) Remove the intermediate rwsem_down_{read|write}_failed*() functions and rename __rwsem_down_{read|write}_failed_common() to rwsem_down_{read|write}_slowpath(). 6) Remove "__" prefix of __rwsem_mark_wake(). 7) Use atomic_long_try_cmpxchg_acquire() as much as possible. 8) Remove the rwsem_rtrylock and rwsem_wtrylock lock events as they are not that useful. That enables the compiler to do better optimization and reduce code size. The text+data size of rwsem.o on an x86-64 machine with gcc8 was reduced from 10237 bytes to 5030 bytes with this change. Suggested-by: Peter Zijlstra Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Cc: huang ying Link: https://lkml.kernel.org/r/20190520205918.22251-6-longman@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/lock_events_list.h | 2 - kernel/locking/rwsem.c | 135 ++++++++++++-------------------------- 2 files changed, 42 insertions(+), 95 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lock_events_list.h b/kernel/locking/lock_events_list.h index ad7668cfc9da..11187a1d40b8 100644 --- a/kernel/locking/lock_events_list.h +++ b/kernel/locking/lock_events_list.h @@ -61,7 +61,5 @@ LOCK_EVENT(rwsem_opt_fail) /* # of failed opt-spinnings */ LOCK_EVENT(rwsem_rlock) /* # of read locks acquired */ LOCK_EVENT(rwsem_rlock_fast) /* # of fast read locks acquired */ LOCK_EVENT(rwsem_rlock_fail) /* # of failed read lock acquisitions */ -LOCK_EVENT(rwsem_rtrylock) /* # of read trylock calls */ LOCK_EVENT(rwsem_wlock) /* # of write locks acquired */ LOCK_EVENT(rwsem_wlock_fail) /* # of failed write lock acquisitions */ -LOCK_EVENT(rwsem_wtrylock) /* # of write trylock calls */ diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 8317bcdf063b..f56329240ef1 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -205,7 +205,6 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name, osq_lock_init(&sem->osq); #endif } - EXPORT_SYMBOL(__init_rwsem); enum rwsem_waiter_type { @@ -237,9 +236,9 @@ enum rwsem_wake_type { * - woken process blocks are discarded from the list after having task zeroed * - writers are only marked woken if downgrading is false */ -static void __rwsem_mark_wake(struct rw_semaphore *sem, - enum rwsem_wake_type wake_type, - struct wake_q_head *wake_q) +static void rwsem_mark_wake(struct rw_semaphore *sem, + enum rwsem_wake_type wake_type, + struct wake_q_head *wake_q) { struct rwsem_waiter *waiter, *tmp; long oldcount, woken = 0, adjustment = 0; @@ -330,7 +329,7 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem, /* * Ensure calling get_task_struct() before setting the reader - * waiter to nil such that rwsem_down_read_failed() cannot + * waiter to nil such that rwsem_down_read_slowpath() cannot * race with do_exit() by always holding a reference count * to the task to wakeup. */ @@ -516,8 +515,8 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem) /* * Wait for the read lock to be granted */ -static inline struct rw_semaphore __sched * -__rwsem_down_read_failed_common(struct rw_semaphore *sem, int state) +static struct rw_semaphore __sched * +rwsem_down_read_slowpath(struct rw_semaphore *sem, int state) { long count, adjustment = -RWSEM_READER_BIAS; struct rwsem_waiter waiter; @@ -555,7 +554,7 @@ __rwsem_down_read_failed_common(struct rw_semaphore *sem, int state) */ if (!(count & RWSEM_LOCK_MASK) || (!(count & RWSEM_WRITER_MASK) && (adjustment & RWSEM_FLAG_WAITERS))) - __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); + rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); raw_spin_unlock_irq(&sem->wait_lock); wake_up_q(&wake_q); @@ -589,25 +588,11 @@ out_nolock: return ERR_PTR(-EINTR); } -__visible struct rw_semaphore * __sched -rwsem_down_read_failed(struct rw_semaphore *sem) -{ - return __rwsem_down_read_failed_common(sem, TASK_UNINTERRUPTIBLE); -} -EXPORT_SYMBOL(rwsem_down_read_failed); - -__visible struct rw_semaphore * __sched -rwsem_down_read_failed_killable(struct rw_semaphore *sem) -{ - return __rwsem_down_read_failed_common(sem, TASK_KILLABLE); -} -EXPORT_SYMBOL(rwsem_down_read_failed_killable); - /* * Wait until we successfully acquire the write lock */ -static inline struct rw_semaphore * -__rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) +static struct rw_semaphore * +rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) { long count; bool waiting = true; /* any queued threads before us */ @@ -646,7 +631,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) */ if (!(count & RWSEM_WRITER_MASK) && (count & RWSEM_READER_MASK)) { - __rwsem_mark_wake(sem, RWSEM_WAKE_READERS, &wake_q); + rwsem_mark_wake(sem, RWSEM_WAKE_READERS, &wake_q); /* * The wakeup is normally called _after_ the wait_lock * is released, but given that we are proactively waking @@ -700,7 +685,7 @@ out_nolock: if (list_empty(&sem->wait_list)) atomic_long_andnot(RWSEM_FLAG_WAITERS, &sem->count); else - __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); + rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); raw_spin_unlock_irq(&sem->wait_lock); wake_up_q(&wake_q); lockevent_inc(rwsem_wlock_fail); @@ -708,26 +693,11 @@ out_nolock: return ERR_PTR(-EINTR); } -__visible struct rw_semaphore * __sched -rwsem_down_write_failed(struct rw_semaphore *sem) -{ - return __rwsem_down_write_failed_common(sem, TASK_UNINTERRUPTIBLE); -} -EXPORT_SYMBOL(rwsem_down_write_failed); - -__visible struct rw_semaphore * __sched -rwsem_down_write_failed_killable(struct rw_semaphore *sem) -{ - return __rwsem_down_write_failed_common(sem, TASK_KILLABLE); -} -EXPORT_SYMBOL(rwsem_down_write_failed_killable); - /* * handle waking up a waiter on the semaphore * - up_read/up_write has decremented the active part of count if we come here */ -__visible -struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) +static struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) { unsigned long flags; DEFINE_WAKE_Q(wake_q); @@ -735,22 +705,20 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) raw_spin_lock_irqsave(&sem->wait_lock, flags); if (!list_empty(&sem->wait_list)) - __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); + rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); wake_up_q(&wake_q); return sem; } -EXPORT_SYMBOL(rwsem_wake); /* * downgrade a write lock into a read lock * - caller incremented waiting part of count and discovered it still negative * - just wake up any readers at the front of the queue */ -__visible -struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) +static struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) { unsigned long flags; DEFINE_WAKE_Q(wake_q); @@ -758,14 +726,13 @@ struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) raw_spin_lock_irqsave(&sem->wait_lock, flags); if (!list_empty(&sem->wait_list)) - __rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q); + rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); wake_up_q(&wake_q); return sem; } -EXPORT_SYMBOL(rwsem_downgrade_wake); /* * lock for reading @@ -774,7 +741,7 @@ inline void __down_read(struct rw_semaphore *sem) { if (unlikely(atomic_long_fetch_add_acquire(RWSEM_READER_BIAS, &sem->count) & RWSEM_READ_FAILED_MASK)) { - rwsem_down_read_failed(sem); + rwsem_down_read_slowpath(sem, TASK_UNINTERRUPTIBLE); DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED), sem); } else { @@ -786,7 +753,7 @@ static inline int __down_read_killable(struct rw_semaphore *sem) { if (unlikely(atomic_long_fetch_add_acquire(RWSEM_READER_BIAS, &sem->count) & RWSEM_READ_FAILED_MASK)) { - if (IS_ERR(rwsem_down_read_failed_killable(sem))) + if (IS_ERR(rwsem_down_read_slowpath(sem, TASK_KILLABLE))) return -EINTR; DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED), sem); @@ -803,7 +770,6 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) */ long tmp = RWSEM_UNLOCKED_VALUE; - lockevent_inc(rwsem_rtrylock); do { if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, tmp + RWSEM_READER_BIAS)) { @@ -819,30 +785,33 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) */ static inline void __down_write(struct rw_semaphore *sem) { - if (unlikely(atomic_long_cmpxchg_acquire(&sem->count, 0, - RWSEM_WRITER_LOCKED))) - rwsem_down_write_failed(sem); + long tmp = RWSEM_UNLOCKED_VALUE; + + if (unlikely(!atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, + RWSEM_WRITER_LOCKED))) + rwsem_down_write_slowpath(sem, TASK_UNINTERRUPTIBLE); rwsem_set_owner(sem); } static inline int __down_write_killable(struct rw_semaphore *sem) { - if (unlikely(atomic_long_cmpxchg_acquire(&sem->count, 0, - RWSEM_WRITER_LOCKED))) - if (IS_ERR(rwsem_down_write_failed_killable(sem))) + long tmp = RWSEM_UNLOCKED_VALUE; + + if (unlikely(!atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, + RWSEM_WRITER_LOCKED))) { + if (IS_ERR(rwsem_down_write_slowpath(sem, TASK_KILLABLE))) return -EINTR; + } rwsem_set_owner(sem); return 0; } static inline int __down_write_trylock(struct rw_semaphore *sem) { - long tmp; + long tmp = RWSEM_UNLOCKED_VALUE; - lockevent_inc(rwsem_wtrylock); - tmp = atomic_long_cmpxchg_acquire(&sem->count, RWSEM_UNLOCKED_VALUE, - RWSEM_WRITER_LOCKED); - if (tmp == RWSEM_UNLOCKED_VALUE) { + if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, + RWSEM_WRITER_LOCKED)) { rwsem_set_owner(sem); return true; } @@ -856,12 +825,11 @@ inline void __up_read(struct rw_semaphore *sem) { long tmp; - DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED), - sem); + DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED), sem); rwsem_clear_reader_owned(sem); tmp = atomic_long_add_return_release(-RWSEM_READER_BIAS, &sem->count); - if (unlikely((tmp & (RWSEM_LOCK_MASK|RWSEM_FLAG_WAITERS)) - == RWSEM_FLAG_WAITERS)) + if (unlikely((tmp & (RWSEM_LOCK_MASK|RWSEM_FLAG_WAITERS)) == + RWSEM_FLAG_WAITERS)) rwsem_wake(sem); } @@ -870,10 +838,12 @@ inline void __up_read(struct rw_semaphore *sem) */ static inline void __up_write(struct rw_semaphore *sem) { + long tmp; + DEBUG_RWSEMS_WARN_ON(sem->owner != current, sem); rwsem_clear_owner(sem); - if (unlikely(atomic_long_fetch_add_release(-RWSEM_WRITER_LOCKED, - &sem->count) & RWSEM_FLAG_WAITERS)) + tmp = atomic_long_fetch_add_release(-RWSEM_WRITER_LOCKED, &sem->count); + if (unlikely(tmp & RWSEM_FLAG_WAITERS)) rwsem_wake(sem); } @@ -909,7 +879,6 @@ void __sched down_read(struct rw_semaphore *sem) LOCK_CONTENDED(sem, __down_read_trylock, __down_read); } - EXPORT_SYMBOL(down_read); int __sched down_read_killable(struct rw_semaphore *sem) @@ -924,7 +893,6 @@ int __sched down_read_killable(struct rw_semaphore *sem) return 0; } - EXPORT_SYMBOL(down_read_killable); /* @@ -938,7 +906,6 @@ int down_read_trylock(struct rw_semaphore *sem) rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_); return ret; } - EXPORT_SYMBOL(down_read_trylock); /* @@ -948,10 +915,8 @@ void __sched down_write(struct rw_semaphore *sem) { might_sleep(); rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); - LOCK_CONTENDED(sem, __down_write_trylock, __down_write); } - EXPORT_SYMBOL(down_write); /* @@ -962,14 +927,14 @@ int __sched down_write_killable(struct rw_semaphore *sem) might_sleep(); rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); - if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock, __down_write_killable)) { + if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock, + __down_write_killable)) { rwsem_release(&sem->dep_map, 1, _RET_IP_); return -EINTR; } return 0; } - EXPORT_SYMBOL(down_write_killable); /* @@ -984,7 +949,6 @@ int down_write_trylock(struct rw_semaphore *sem) return ret; } - EXPORT_SYMBOL(down_write_trylock); /* @@ -993,10 +957,8 @@ EXPORT_SYMBOL(down_write_trylock); void up_read(struct rw_semaphore *sem) { rwsem_release(&sem->dep_map, 1, _RET_IP_); - __up_read(sem); } - EXPORT_SYMBOL(up_read); /* @@ -1005,10 +967,8 @@ EXPORT_SYMBOL(up_read); void up_write(struct rw_semaphore *sem) { rwsem_release(&sem->dep_map, 1, _RET_IP_); - __up_write(sem); } - EXPORT_SYMBOL(up_write); /* @@ -1017,10 +977,8 @@ EXPORT_SYMBOL(up_write); void downgrade_write(struct rw_semaphore *sem) { lock_downgrade(&sem->dep_map, _RET_IP_); - __downgrade_write(sem); } - EXPORT_SYMBOL(downgrade_write); #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -1029,40 +987,32 @@ void down_read_nested(struct rw_semaphore *sem, int subclass) { might_sleep(); rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); - LOCK_CONTENDED(sem, __down_read_trylock, __down_read); } - EXPORT_SYMBOL(down_read_nested); void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest) { might_sleep(); rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_); - LOCK_CONTENDED(sem, __down_write_trylock, __down_write); } - EXPORT_SYMBOL(_down_write_nest_lock); void down_read_non_owner(struct rw_semaphore *sem) { might_sleep(); - __down_read(sem); __rwsem_set_reader_owned(sem, NULL); } - EXPORT_SYMBOL(down_read_non_owner); void down_write_nested(struct rw_semaphore *sem, int subclass) { might_sleep(); rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); - LOCK_CONTENDED(sem, __down_write_trylock, __down_write); } - EXPORT_SYMBOL(down_write_nested); int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass) @@ -1070,14 +1020,14 @@ int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass) might_sleep(); rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); - if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock, __down_write_killable)) { + if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock, + __down_write_killable)) { rwsem_release(&sem->dep_map, 1, _RET_IP_); return -EINTR; } return 0; } - EXPORT_SYMBOL(down_write_killable_nested); void up_read_non_owner(struct rw_semaphore *sem) @@ -1086,7 +1036,6 @@ void up_read_non_owner(struct rw_semaphore *sem) sem); __up_read(sem); } - EXPORT_SYMBOL(up_read_non_owner); #endif -- cgit v1.2.3 From 3f6d517a3ece6e6ced7abcbe798ff332ac5ca586 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 20 May 2019 16:59:05 -0400 Subject: locking/rwsem: Make rwsem_spin_on_owner() return owner state This patch modifies rwsem_spin_on_owner() to return four possible values to better reflect the state of lock holder which enables us to make a better decision of what to do next. Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Cc: huang ying Link: https://lkml.kernel.org/r/20190520205918.22251-7-longman@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/rwsem.c | 65 ++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 47 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index f56329240ef1..8d0f2acfe13d 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -414,17 +414,54 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) } /* - * Return true only if we can still spin on the owner field of the rwsem. + * The rwsem_spin_on_owner() function returns the folowing 4 values + * depending on the lock owner state. + * OWNER_NULL : owner is currently NULL + * OWNER_WRITER: when owner changes and is a writer + * OWNER_READER: when owner changes and the new owner may be a reader. + * OWNER_NONSPINNABLE: + * when optimistic spinning has to stop because either the + * owner stops running, is unknown, or its timeslice has + * been used up. */ -static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem) +enum owner_state { + OWNER_NULL = 1 << 0, + OWNER_WRITER = 1 << 1, + OWNER_READER = 1 << 2, + OWNER_NONSPINNABLE = 1 << 3, +}; +#define OWNER_SPINNABLE (OWNER_NULL | OWNER_WRITER) + +static inline enum owner_state rwsem_owner_state(unsigned long owner) { - struct task_struct *owner = READ_ONCE(sem->owner); + if (!owner) + return OWNER_NULL; - if (!is_rwsem_owner_spinnable(owner)) - return false; + if (owner & RWSEM_ANONYMOUSLY_OWNED) + return OWNER_NONSPINNABLE; + + if (owner & RWSEM_READER_OWNED) + return OWNER_READER; + + return OWNER_WRITER; +} + +static noinline enum owner_state rwsem_spin_on_owner(struct rw_semaphore *sem) +{ + struct task_struct *tmp, *owner = READ_ONCE(sem->owner); + enum owner_state state = rwsem_owner_state((unsigned long)owner); + + if (state != OWNER_WRITER) + return state; rcu_read_lock(); - while (owner && (READ_ONCE(sem->owner) == owner)) { + for (;;) { + tmp = READ_ONCE(sem->owner); + if (tmp != owner) { + state = rwsem_owner_state((unsigned long)tmp); + break; + } + /* * Ensure we emit the owner->on_cpu, dereference _after_ * checking sem->owner still matches owner, if that fails, @@ -433,24 +470,16 @@ static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem) */ barrier(); - /* - * abort spinning when need_resched or owner is not running or - * owner's cpu is preempted. - */ if (need_resched() || !owner_on_cpu(owner)) { - rcu_read_unlock(); - return false; + state = OWNER_NONSPINNABLE; + break; } cpu_relax(); } rcu_read_unlock(); - /* - * If there is a new owner or the owner is not set, we continue - * spinning. - */ - return is_rwsem_owner_spinnable(READ_ONCE(sem->owner)); + return state; } static bool rwsem_optimistic_spin(struct rw_semaphore *sem) @@ -473,7 +502,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem) * 2) readers own the lock as we can't determine if they are * actively running or not. */ - while (rwsem_spin_on_owner(sem)) { + while (rwsem_spin_on_owner(sem) & OWNER_SPINNABLE) { /* * Try to acquire the lock */ -- cgit v1.2.3 From 4f23dbc1e657951e5d94c60369bc1db065961fb3 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 20 May 2019 16:59:06 -0400 Subject: locking/rwsem: Implement lock handoff to prevent lock starvation Because of writer lock stealing, it is possible that a constant stream of incoming writers will cause a waiting writer or reader to wait indefinitely leading to lock starvation. This patch implements a lock handoff mechanism to disable lock stealing and force lock handoff to the first waiter or waiters (for readers) in the queue after at least a 4ms waiting period unless it is a RT writer task which doesn't need to wait. The waiting period is used to avoid discouraging lock stealing too much to affect performance. The setting and clearing of the handoff bit is serialized by the wait_lock. So racing is not possible. A rwsem microbenchmark was run for 5 seconds on a 2-socket 40-core 80-thread Skylake system with a v5.1 based kernel and 240 write_lock threads with 5us sleep critical section. Before the patch, the min/mean/max numbers of locking operations for the locking threads were 1/7,792/173,696. After the patch, the figures became 5,842/6,542/7,458. It can be seen that the rwsem became much more fair, though there was a drop of about 16% in the mean locking operations done which was a tradeoff of having better fairness. Making the waiter set the handoff bit right after the first wakeup can impact performance especially with a mixed reader/writer workload. With the same microbenchmark with short critical section and equal number of reader and writer threads (40/40), the reader/writer locking operation counts with the current patch were: 40 readers, Iterations Min/Mean/Max = 1,793/1,794/1,796 40 writers, Iterations Min/Mean/Max = 1,793/34,956/86,081 By making waiter set handoff bit immediately after wakeup: 40 readers, Iterations Min/Mean/Max = 43/44/46 40 writers, Iterations Min/Mean/Max = 43/1,263/3,191 Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Cc: huang ying Link: https://lkml.kernel.org/r/20190520205918.22251-8-longman@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/lock_events_list.h | 2 + kernel/locking/rwsem.c | 225 +++++++++++++++++++++++++++++--------- 2 files changed, 173 insertions(+), 54 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lock_events_list.h b/kernel/locking/lock_events_list.h index 11187a1d40b8..634b47fd8b5e 100644 --- a/kernel/locking/lock_events_list.h +++ b/kernel/locking/lock_events_list.h @@ -61,5 +61,7 @@ LOCK_EVENT(rwsem_opt_fail) /* # of failed opt-spinnings */ LOCK_EVENT(rwsem_rlock) /* # of read locks acquired */ LOCK_EVENT(rwsem_rlock_fast) /* # of fast read locks acquired */ LOCK_EVENT(rwsem_rlock_fail) /* # of failed read lock acquisitions */ +LOCK_EVENT(rwsem_rlock_handoff) /* # of read lock handoffs */ LOCK_EVENT(rwsem_wlock) /* # of write locks acquired */ LOCK_EVENT(rwsem_wlock_fail) /* # of failed write lock acquisitions */ +LOCK_EVENT(rwsem_wlock_handoff) /* # of write lock handoffs */ diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 8d0f2acfe13d..decda9fb8c6d 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -10,8 +10,9 @@ * Optimistic spinning by Tim Chen * and Davidlohr Bueso . Based on mutexes. * - * Rwsem count bit fields re-definition and rwsem rearchitecture - * by Waiman Long . + * Rwsem count bit fields re-definition and rwsem rearchitecture by + * Waiman Long and + * Peter Zijlstra . */ #include @@ -74,20 +75,33 @@ * * Bit 0 - writer locked bit * Bit 1 - waiters present bit - * Bits 2-7 - reserved + * Bit 2 - lock handoff bit + * Bits 3-7 - reserved * Bits 8-X - 24-bit (32-bit) or 56-bit reader count * * atomic_long_fetch_add() is used to obtain reader lock, whereas * atomic_long_cmpxchg() will be used to obtain writer lock. + * + * There are three places where the lock handoff bit may be set or cleared. + * 1) rwsem_mark_wake() for readers. + * 2) rwsem_try_write_lock() for writers. + * 3) Error path of rwsem_down_write_slowpath(). + * + * For all the above cases, wait_lock will be held. A writer must also + * be the first one in the wait_list to be eligible for setting the handoff + * bit. So concurrent setting/clearing of handoff bit is not possible. */ #define RWSEM_WRITER_LOCKED (1UL << 0) #define RWSEM_FLAG_WAITERS (1UL << 1) +#define RWSEM_FLAG_HANDOFF (1UL << 2) + #define RWSEM_READER_SHIFT 8 #define RWSEM_READER_BIAS (1UL << RWSEM_READER_SHIFT) #define RWSEM_READER_MASK (~(RWSEM_READER_BIAS - 1)) #define RWSEM_WRITER_MASK RWSEM_WRITER_LOCKED #define RWSEM_LOCK_MASK (RWSEM_WRITER_MASK|RWSEM_READER_MASK) -#define RWSEM_READ_FAILED_MASK (RWSEM_WRITER_MASK|RWSEM_FLAG_WAITERS) +#define RWSEM_READ_FAILED_MASK (RWSEM_WRITER_MASK|RWSEM_FLAG_WAITERS|\ + RWSEM_FLAG_HANDOFF) /* * All writes to owner are protected by WRITE_ONCE() to make sure that @@ -216,7 +230,10 @@ struct rwsem_waiter { struct list_head list; struct task_struct *task; enum rwsem_waiter_type type; + unsigned long timeout; }; +#define rwsem_first_waiter(sem) \ + list_first_entry(&sem->wait_list, struct rwsem_waiter, list) enum rwsem_wake_type { RWSEM_WAKE_ANY, /* Wake whatever's at head of wait list */ @@ -224,6 +241,19 @@ enum rwsem_wake_type { RWSEM_WAKE_READ_OWNED /* Waker thread holds the read lock */ }; +enum writer_wait_state { + WRITER_NOT_FIRST, /* Writer is not first in wait list */ + WRITER_FIRST, /* Writer is first in wait list */ + WRITER_HANDOFF /* Writer is first & handoff needed */ +}; + +/* + * The typical HZ value is either 250 or 1000. So set the minimum waiting + * time to at least 4ms or 1 jiffy (if it is higher than 4ms) in the wait + * queue before initiating the handoff protocol. + */ +#define RWSEM_WAIT_TIMEOUT DIV_ROUND_UP(HZ, 250) + /* * handle the lock release when processes blocked on it that can now run * - if we come here from up_xxxx(), then the RWSEM_FLAG_WAITERS bit must @@ -244,11 +274,13 @@ static void rwsem_mark_wake(struct rw_semaphore *sem, long oldcount, woken = 0, adjustment = 0; struct list_head wlist; + lockdep_assert_held(&sem->wait_lock); + /* * Take a peek at the queue head waiter such that we can determine * the wakeup(s) to perform. */ - waiter = list_first_entry(&sem->wait_list, struct rwsem_waiter, list); + waiter = rwsem_first_waiter(sem); if (waiter->type == RWSEM_WAITING_FOR_WRITE) { if (wake_type == RWSEM_WAKE_ANY) { @@ -275,7 +307,18 @@ static void rwsem_mark_wake(struct rw_semaphore *sem, adjustment = RWSEM_READER_BIAS; oldcount = atomic_long_fetch_add(adjustment, &sem->count); if (unlikely(oldcount & RWSEM_WRITER_MASK)) { - atomic_long_sub(adjustment, &sem->count); + /* + * When we've been waiting "too" long (for writers + * to give up the lock), request a HANDOFF to + * force the issue. + */ + if (!(oldcount & RWSEM_FLAG_HANDOFF) && + time_after(jiffies, waiter->timeout)) { + adjustment -= RWSEM_FLAG_HANDOFF; + lockevent_inc(rwsem_rlock_handoff); + } + + atomic_long_add(-adjustment, &sem->count); return; } /* @@ -317,6 +360,13 @@ static void rwsem_mark_wake(struct rw_semaphore *sem, adjustment -= RWSEM_FLAG_WAITERS; } + /* + * When we've woken a reader, we no longer need to force writers + * to give up the lock and we can clear HANDOFF. + */ + if (woken && (atomic_long_read(&sem->count) & RWSEM_FLAG_HANDOFF)) + adjustment -= RWSEM_FLAG_HANDOFF; + if (adjustment) atomic_long_add(adjustment, &sem->count); @@ -346,23 +396,48 @@ static void rwsem_mark_wake(struct rw_semaphore *sem, * This function must be called with the sem->wait_lock held to prevent * race conditions between checking the rwsem wait list and setting the * sem->count accordingly. + * + * If wstate is WRITER_HANDOFF, it will make sure that either the handoff + * bit is set or the lock is acquired with handoff bit cleared. */ -static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) +static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem, + enum writer_wait_state wstate) { long new; - if (count & RWSEM_LOCK_MASK) - return false; + lockdep_assert_held(&sem->wait_lock); - new = count + RWSEM_WRITER_LOCKED - - (list_is_singular(&sem->wait_list) ? RWSEM_FLAG_WAITERS : 0); + do { + bool has_handoff = !!(count & RWSEM_FLAG_HANDOFF); - if (atomic_long_try_cmpxchg_acquire(&sem->count, &count, new)) { - rwsem_set_owner(sem); - return true; - } + if (has_handoff && wstate == WRITER_NOT_FIRST) + return false; - return false; + new = count; + + if (count & RWSEM_LOCK_MASK) { + if (has_handoff || (wstate != WRITER_HANDOFF)) + return false; + + new |= RWSEM_FLAG_HANDOFF; + } else { + new |= RWSEM_WRITER_LOCKED; + new &= ~RWSEM_FLAG_HANDOFF; + + if (list_is_singular(&sem->wait_list)) + new &= ~RWSEM_FLAG_WAITERS; + } + } while (!atomic_long_try_cmpxchg_acquire(&sem->count, &count, new)); + + /* + * We have either acquired the lock with handoff bit cleared or + * set the handoff bit. + */ + if (new & RWSEM_FLAG_HANDOFF) + return false; + + rwsem_set_owner(sem); + return true; } #ifdef CONFIG_RWSEM_SPIN_ON_OWNER @@ -373,9 +448,9 @@ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) { long count = atomic_long_read(&sem->count); - while (!(count & RWSEM_LOCK_MASK)) { + while (!(count & (RWSEM_LOCK_MASK|RWSEM_FLAG_HANDOFF))) { if (atomic_long_try_cmpxchg_acquire(&sem->count, &count, - count + RWSEM_WRITER_LOCKED)) { + count | RWSEM_WRITER_LOCKED)) { rwsem_set_owner(sem); lockevent_inc(rwsem_opt_wlock); return true; @@ -456,6 +531,11 @@ static noinline enum owner_state rwsem_spin_on_owner(struct rw_semaphore *sem) rcu_read_lock(); for (;;) { + if (atomic_long_read(&sem->count) & RWSEM_FLAG_HANDOFF) { + state = OWNER_NONSPINNABLE; + break; + } + tmp = READ_ONCE(sem->owner); if (tmp != owner) { state = rwsem_owner_state((unsigned long)tmp); @@ -553,16 +633,18 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, int state) waiter.task = current; waiter.type = RWSEM_WAITING_FOR_READ; + waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT; raw_spin_lock_irq(&sem->wait_lock); if (list_empty(&sem->wait_list)) { /* * In case the wait queue is empty and the lock isn't owned - * by a writer, this reader can exit the slowpath and return - * immediately as its RWSEM_READER_BIAS has already been - * set in the count. + * by a writer or has the handoff bit set, this reader can + * exit the slowpath and return immediately as its + * RWSEM_READER_BIAS has already been set in the count. */ - if (!(atomic_long_read(&sem->count) & RWSEM_WRITER_MASK)) { + if (!(atomic_long_read(&sem->count) & + (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) { raw_spin_unlock_irq(&sem->wait_lock); rwsem_set_reader_owned(sem); lockevent_inc(rwsem_rlock_fast); @@ -609,8 +691,10 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, int state) return sem; out_nolock: list_del(&waiter.list); - if (list_empty(&sem->wait_list)) - atomic_long_andnot(RWSEM_FLAG_WAITERS, &sem->count); + if (list_empty(&sem->wait_list)) { + atomic_long_andnot(RWSEM_FLAG_WAITERS|RWSEM_FLAG_HANDOFF, + &sem->count); + } raw_spin_unlock_irq(&sem->wait_lock); __set_current_state(TASK_RUNNING); lockevent_inc(rwsem_rlock_fail); @@ -624,7 +708,7 @@ static struct rw_semaphore * rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) { long count; - bool waiting = true; /* any queued threads before us */ + enum writer_wait_state wstate; struct rwsem_waiter waiter; struct rw_semaphore *ret = sem; DEFINE_WAKE_Q(wake_q); @@ -639,66 +723,95 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) */ waiter.task = current; waiter.type = RWSEM_WAITING_FOR_WRITE; + waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT; raw_spin_lock_irq(&sem->wait_lock); /* account for this before adding a new element to the list */ - if (list_empty(&sem->wait_list)) - waiting = false; + wstate = list_empty(&sem->wait_list) ? WRITER_FIRST : WRITER_NOT_FIRST; list_add_tail(&waiter.list, &sem->wait_list); /* we're now waiting on the lock */ - if (waiting) { + if (wstate == WRITER_NOT_FIRST) { count = atomic_long_read(&sem->count); /* - * If there were already threads queued before us and there are - * no active writers and some readers, the lock must be read - * owned; so we try to any read locks that were queued ahead - * of us. + * If there were already threads queued before us and: + * 1) there are no no active locks, wake the front + * queued process(es) as the handoff bit might be set. + * 2) there are no active writers and some readers, the lock + * must be read owned; so we try to wake any read lock + * waiters that were queued ahead of us. */ - if (!(count & RWSEM_WRITER_MASK) && - (count & RWSEM_READER_MASK)) { - rwsem_mark_wake(sem, RWSEM_WAKE_READERS, &wake_q); - /* - * The wakeup is normally called _after_ the wait_lock - * is released, but given that we are proactively waking - * readers we can deal with the wake_q overhead as it is - * similar to releasing and taking the wait_lock again - * for attempting rwsem_try_write_lock(). - */ - wake_up_q(&wake_q); + if (count & RWSEM_WRITER_MASK) + goto wait; - /* - * Reinitialize wake_q after use. - */ - wake_q_init(&wake_q); - } + rwsem_mark_wake(sem, (count & RWSEM_READER_MASK) + ? RWSEM_WAKE_READERS + : RWSEM_WAKE_ANY, &wake_q); + /* + * The wakeup is normally called _after_ the wait_lock + * is released, but given that we are proactively waking + * readers we can deal with the wake_q overhead as it is + * similar to releasing and taking the wait_lock again + * for attempting rwsem_try_write_lock(). + */ + wake_up_q(&wake_q); + + /* We need wake_q again below, reinitialize */ + wake_q_init(&wake_q); } else { count = atomic_long_add_return(RWSEM_FLAG_WAITERS, &sem->count); } +wait: /* wait until we successfully acquire the lock */ set_current_state(state); while (true) { - if (rwsem_try_write_lock(count, sem)) + if (rwsem_try_write_lock(count, sem, wstate)) break; + raw_spin_unlock_irq(&sem->wait_lock); /* Block until there are no active lockers. */ - do { + for (;;) { if (signal_pending_state(state, current)) goto out_nolock; schedule(); lockevent_inc(rwsem_sleep_writer); set_current_state(state); + /* + * If HANDOFF bit is set, unconditionally do + * a trylock. + */ + if (wstate == WRITER_HANDOFF) + break; + + if ((wstate == WRITER_NOT_FIRST) && + (rwsem_first_waiter(sem) == &waiter)) + wstate = WRITER_FIRST; + count = atomic_long_read(&sem->count); - } while (count & RWSEM_LOCK_MASK); + if (!(count & RWSEM_LOCK_MASK)) + break; + + /* + * The setting of the handoff bit is deferred + * until rwsem_try_write_lock() is called. + */ + if ((wstate == WRITER_FIRST) && (rt_task(current) || + time_after(jiffies, waiter.timeout))) { + wstate = WRITER_HANDOFF; + lockevent_inc(rwsem_wlock_handoff); + break; + } + } raw_spin_lock_irq(&sem->wait_lock); + count = atomic_long_read(&sem->count); } __set_current_state(TASK_RUNNING); list_del(&waiter.list); @@ -711,6 +824,10 @@ out_nolock: __set_current_state(TASK_RUNNING); raw_spin_lock_irq(&sem->wait_lock); list_del(&waiter.list); + + if (unlikely(wstate == WRITER_HANDOFF)) + atomic_long_add(-RWSEM_FLAG_HANDOFF, &sem->count); + if (list_empty(&sem->wait_list)) atomic_long_andnot(RWSEM_FLAG_WAITERS, &sem->count); else @@ -726,7 +843,7 @@ out_nolock: * handle waking up a waiter on the semaphore * - up_read/up_write has decremented the active part of count if we come here */ -static struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) +static struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem, long count) { unsigned long flags; DEFINE_WAKE_Q(wake_q); @@ -859,7 +976,7 @@ inline void __up_read(struct rw_semaphore *sem) tmp = atomic_long_add_return_release(-RWSEM_READER_BIAS, &sem->count); if (unlikely((tmp & (RWSEM_LOCK_MASK|RWSEM_FLAG_WAITERS)) == RWSEM_FLAG_WAITERS)) - rwsem_wake(sem); + rwsem_wake(sem, tmp); } /* @@ -873,7 +990,7 @@ static inline void __up_write(struct rw_semaphore *sem) rwsem_clear_owner(sem); tmp = atomic_long_fetch_add_release(-RWSEM_WRITER_LOCKED, &sem->count); if (unlikely(tmp & RWSEM_FLAG_WAITERS)) - rwsem_wake(sem); + rwsem_wake(sem, tmp); } /* -- cgit v1.2.3 From 00f3c5a3df2c1e3dab14d0dd2b71f852d46be97f Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 20 May 2019 16:59:07 -0400 Subject: locking/rwsem: Always release wait_lock before waking up tasks With the use of wake_q, we can do task wakeups without holding the wait_lock. There is one exception in the rwsem code, though. It is when the writer in the slowpath detects that there are waiters ahead but the rwsem is not held by a writer. This can lead to a long wait_lock hold time especially when a large number of readers are to be woken up. Remediate this situation by releasing the wait_lock before waking up tasks and re-acquiring it afterward. The rwsem_try_write_lock() function is also modified to read the rwsem count directly to avoid stale count value. Suggested-by: Peter Zijlstra Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Cc: huang ying Link: https://lkml.kernel.org/r/20190520205918.22251-9-longman@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/rwsem.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index decda9fb8c6d..5532304406f7 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -400,13 +400,14 @@ static void rwsem_mark_wake(struct rw_semaphore *sem, * If wstate is WRITER_HANDOFF, it will make sure that either the handoff * bit is set or the lock is acquired with handoff bit cleared. */ -static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem, +static inline bool rwsem_try_write_lock(struct rw_semaphore *sem, enum writer_wait_state wstate) { - long new; + long count, new; lockdep_assert_held(&sem->wait_lock); + count = atomic_long_read(&sem->count); do { bool has_handoff = !!(count & RWSEM_FLAG_HANDOFF); @@ -751,26 +752,25 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) ? RWSEM_WAKE_READERS : RWSEM_WAKE_ANY, &wake_q); - /* - * The wakeup is normally called _after_ the wait_lock - * is released, but given that we are proactively waking - * readers we can deal with the wake_q overhead as it is - * similar to releasing and taking the wait_lock again - * for attempting rwsem_try_write_lock(). - */ - wake_up_q(&wake_q); - - /* We need wake_q again below, reinitialize */ - wake_q_init(&wake_q); + if (!wake_q_empty(&wake_q)) { + /* + * We want to minimize wait_lock hold time especially + * when a large number of readers are to be woken up. + */ + raw_spin_unlock_irq(&sem->wait_lock); + wake_up_q(&wake_q); + wake_q_init(&wake_q); /* Used again, reinit */ + raw_spin_lock_irq(&sem->wait_lock); + } } else { - count = atomic_long_add_return(RWSEM_FLAG_WAITERS, &sem->count); + atomic_long_or(RWSEM_FLAG_WAITERS, &sem->count); } wait: /* wait until we successfully acquire the lock */ set_current_state(state); while (true) { - if (rwsem_try_write_lock(count, sem, wstate)) + if (rwsem_try_write_lock(sem, wstate)) break; raw_spin_unlock_irq(&sem->wait_lock); @@ -811,7 +811,6 @@ wait: } raw_spin_lock_irq(&sem->wait_lock); - count = atomic_long_read(&sem->count); } __set_current_state(TASK_RUNNING); list_del(&waiter.list); -- cgit v1.2.3 From 990fa7384a3057a3298bcf493651c6e14416c47c Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 20 May 2019 16:59:08 -0400 Subject: locking/rwsem: More optimal RT task handling of null owner An RT task can do optimistic spinning only if the lock holder is actually running. If the state of the lock holder isn't known, there is a possibility that high priority of the RT task may block forward progress of the lock holder if it happens to reside on the same CPU. This will lead to deadlock. So we have to make sure that an RT task will not spin on a reader-owned rwsem. When the owner is temporarily set to NULL, there are two cases where we may want to continue spinning: 1) The lock owner is in the process of releasing the lock, sem->owner is cleared but the lock has not been released yet. 2) The lock was free and owner cleared, but another task just comes in and acquire the lock before we try to get it. The new owner may be a spinnable writer. So an RT task is now made to retry one more time to see if it can acquire the lock or continue spinning on the new owning writer. When testing on a 8-socket IvyBridge-EX system, the one additional retry seems to improve locking performance of RT write locking threads under heavy contentions. The table below shows the locking rates (in kops/s) with various write locking threads before and after the patch. Locking threads Pre-patch Post-patch --------------- --------- ----------- 4 2,753 2,608 8 2,529 2,520 16 1,727 1,918 32 1,263 1,956 64 889 1,343 Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Cc: huang ying Link: https://lkml.kernel.org/r/20190520205918.22251-10-longman@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/rwsem.c | 51 +++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 44 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 5532304406f7..e1840b7c5310 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -566,6 +566,7 @@ static noinline enum owner_state rwsem_spin_on_owner(struct rw_semaphore *sem) static bool rwsem_optimistic_spin(struct rw_semaphore *sem) { bool taken = false; + int prev_owner_state = OWNER_NULL; preempt_disable(); @@ -583,7 +584,12 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem) * 2) readers own the lock as we can't determine if they are * actively running or not. */ - while (rwsem_spin_on_owner(sem) & OWNER_SPINNABLE) { + for (;;) { + enum owner_state owner_state = rwsem_spin_on_owner(sem); + + if (!(owner_state & OWNER_SPINNABLE)) + break; + /* * Try to acquire the lock */ @@ -593,13 +599,44 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem) } /* - * When there's no owner, we might have preempted between the - * owner acquiring the lock and setting the owner field. If - * we're an RT task that will live-lock because we won't let - * the owner complete. + * An RT task cannot do optimistic spinning if it cannot + * be sure the lock holder is running or live-lock may + * happen if the current task and the lock holder happen + * to run in the same CPU. However, aborting optimistic + * spinning while a NULL owner is detected may miss some + * opportunity where spinning can continue without causing + * problem. + * + * There are 2 possible cases where an RT task may be able + * to continue spinning. + * + * 1) The lock owner is in the process of releasing the + * lock, sem->owner is cleared but the lock has not + * been released yet. + * 2) The lock was free and owner cleared, but another + * task just comes in and acquire the lock before + * we try to get it. The new owner may be a spinnable + * writer. + * + * To take advantage of two scenarios listed agove, the RT + * task is made to retry one more time to see if it can + * acquire the lock or continue spinning on the new owning + * writer. Of course, if the time lag is long enough or the + * new owner is not a writer or spinnable, the RT task will + * quit spinning. + * + * If the owner is a writer, the need_resched() check is + * done inside rwsem_spin_on_owner(). If the owner is not + * a writer, need_resched() check needs to be done here. */ - if (!sem->owner && (need_resched() || rt_task(current))) - break; + if (owner_state != OWNER_WRITER) { + if (need_resched()) + break; + if (rt_task(current) && + (prev_owner_state != OWNER_WRITER)) + break; + } + prev_owner_state = owner_state; /* * The cpu_relax() call is a compiler barrier which forces -- cgit v1.2.3 From d3681e269fff84048c94012342c3434b227c4706 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 20 May 2019 16:59:09 -0400 Subject: locking/rwsem: Wake up almost all readers in wait queue When the front of the wait queue is a reader, other readers immediately following the first reader will also be woken up at the same time. However, if there is a writer in between. Those readers behind the writer will not be woken up. Because of optimistic spinning, the lock acquisition order is not FIFO anyway. The lock handoff mechanism will ensure that lock starvation will not happen. Assuming that the lock hold times of the other readers still in the queue will be about the same as the readers that are being woken up, there is really not much additional cost other than the additional latency due to the wakeup of additional tasks by the waker. Therefore all the readers up to a maximum of 256 in the queue are woken up when the first waiter is a reader to improve reader throughput. This is somewhat similar in concept to a phase-fair R/W lock. With a locking microbenchmark running on 5.1 based kernel, the total locking rates (in kops/s) on a 8-socket IvyBridge-EX system with equal numbers of readers and writers before and after this patch were as follows: # of Threads Pre-Patch Post-patch ------------ --------- ---------- 4 1,641 1,674 8 731 1,062 16 564 924 32 78 300 64 38 195 240 50 149 There is no performance gain at low contention level. At high contention level, however, this patch gives a pretty decent performance boost. Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Cc: huang ying Link: https://lkml.kernel.org/r/20190520205918.22251-11-longman@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/rwsem.c | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index e1840b7c5310..ded96023f4dc 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -254,6 +254,14 @@ enum writer_wait_state { */ #define RWSEM_WAIT_TIMEOUT DIV_ROUND_UP(HZ, 250) +/* + * Magic number to batch-wakeup waiting readers, even when writers are + * also present in the queue. This both limits the amount of work the + * waking thread must do and also prevents any potential counter overflow, + * however unlikely. + */ +#define MAX_READERS_WAKEUP 0x100 + /* * handle the lock release when processes blocked on it that can now run * - if we come here from up_xxxx(), then the RWSEM_FLAG_WAITERS bit must @@ -329,11 +337,17 @@ static void rwsem_mark_wake(struct rw_semaphore *sem, } /* - * Grant an infinite number of read locks to the readers at the front - * of the queue. We know that woken will be at least 1 as we accounted + * Grant up to MAX_READERS_WAKEUP read locks to all the readers in the + * queue. We know that the woken will be at least 1 as we accounted * for above. Note we increment the 'active part' of the count by the * number of readers before waking any processes up. * + * This is an adaptation of the phase-fair R/W locks where at the + * reader phase (first waiter is a reader), all readers are eligible + * to acquire the lock at the same time irrespective of their order + * in the queue. The writers acquire the lock according to their + * order in the queue. + * * We have to do wakeup in 2 passes to prevent the possibility that * the reader count may be decremented before it is incremented. It * is because the to-be-woken waiter may not have slept yet. So it @@ -345,13 +359,20 @@ static void rwsem_mark_wake(struct rw_semaphore *sem, * 2) For each waiters in the new list, clear waiter->task and * put them into wake_q to be woken up later. */ - list_for_each_entry(waiter, &sem->wait_list, list) { + INIT_LIST_HEAD(&wlist); + list_for_each_entry_safe(waiter, tmp, &sem->wait_list, list) { if (waiter->type == RWSEM_WAITING_FOR_WRITE) - break; + continue; woken++; + list_move_tail(&waiter->list, &wlist); + + /* + * Limit # of readers that can be woken up per wakeup call. + */ + if (woken >= MAX_READERS_WAKEUP) + break; } - list_cut_before(&wlist, &sem->wait_list, &waiter->list); adjustment = woken * RWSEM_READER_BIAS - adjustment; lockevent_cond_inc(rwsem_wake_reader, woken); -- cgit v1.2.3 From 02f1082b003a0cd48f48f12533d969cdbf1c2b63 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 20 May 2019 16:59:10 -0400 Subject: locking/rwsem: Clarify usage of owner's nonspinaable bit Bit 1 of sem->owner (RWSEM_ANONYMOUSLY_OWNED) is used to designate an anonymous owner - readers or an anonymous writer. The setting of this anonymous bit is used as an indicator that optimistic spinning cannot be done on this rwsem. With the upcoming reader optimistic spinning patches, a reader-owned rwsem can be spinned on for a limit period of time. We still need this bit to indicate a rwsem is nonspinnable, but not setting this bit loses its meaning that the owner is known. So rename the bit to RWSEM_NONSPINNABLE to clarify its meaning. This patch also fixes a DEBUG_RWSEMS_WARN_ON() bug in __up_write(). Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Cc: huang ying Link: https://lkml.kernel.org/r/20190520205918.22251-12-longman@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/rwsem.c | 43 +++++++++++++++++++++---------------------- 1 file changed, 21 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index ded96023f4dc..180455b6b0d4 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -33,17 +33,18 @@ /* * The least significant 2 bits of the owner value has the following * meanings when set. - * - RWSEM_READER_OWNED (bit 0): The rwsem is owned by readers - * - RWSEM_ANONYMOUSLY_OWNED (bit 1): The rwsem is anonymously owned, - * i.e. the owner(s) cannot be readily determined. It can be reader - * owned or the owning writer is indeterminate. + * - Bit 0: RWSEM_READER_OWNED - The rwsem is owned by readers + * - Bit 1: RWSEM_NONSPINNABLE - Waiters cannot spin on the rwsem + * The rwsem is anonymously owned, i.e. the owner(s) cannot be + * readily determined. It can be reader owned or the owning writer + * is indeterminate. * * When a writer acquires a rwsem, it puts its task_struct pointer * into the owner field. It is cleared after an unlock. * * When a reader acquires a rwsem, it will also puts its task_struct * pointer into the owner field with both the RWSEM_READER_OWNED and - * RWSEM_ANONYMOUSLY_OWNED bits set. On unlock, the owner field will + * RWSEM_NONSPINNABLE bits set. On unlock, the owner field will * largely be left untouched. So for a free or reader-owned rwsem, * the owner value may contain information about the last reader that * acquires the rwsem. The anonymous bit is set because that particular @@ -55,7 +56,8 @@ * a rwsem, but the overhead is simply too big. */ #define RWSEM_READER_OWNED (1UL << 0) -#define RWSEM_ANONYMOUSLY_OWNED (1UL << 1) +#define RWSEM_NONSPINNABLE (1UL << 1) +#define RWSEM_OWNER_FLAGS_MASK (RWSEM_READER_OWNED | RWSEM_NONSPINNABLE) #ifdef CONFIG_DEBUG_RWSEMS # define DEBUG_RWSEMS_WARN_ON(c, sem) do { \ @@ -132,7 +134,7 @@ static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem, struct task_struct *owner) { unsigned long val = (unsigned long)owner | RWSEM_READER_OWNED - | RWSEM_ANONYMOUSLY_OWNED; + | RWSEM_NONSPINNABLE; WRITE_ONCE(sem->owner, (struct task_struct *)val); } @@ -144,20 +146,12 @@ static inline void rwsem_set_reader_owned(struct rw_semaphore *sem) /* * Return true if the a rwsem waiter can spin on the rwsem's owner - * and steal the lock, i.e. the lock is not anonymously owned. + * and steal the lock. * N.B. !owner is considered spinnable. */ static inline bool is_rwsem_owner_spinnable(struct task_struct *owner) { - return !((unsigned long)owner & RWSEM_ANONYMOUSLY_OWNED); -} - -/* - * Return true if rwsem is owned by an anonymous writer or readers. - */ -static inline bool rwsem_has_anonymous_owner(struct task_struct *owner) -{ - return (unsigned long)owner & RWSEM_ANONYMOUSLY_OWNED; + return !((unsigned long)owner & RWSEM_NONSPINNABLE); } #ifdef CONFIG_DEBUG_RWSEMS @@ -170,10 +164,10 @@ static inline bool rwsem_has_anonymous_owner(struct task_struct *owner) static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem) { unsigned long val = (unsigned long)current | RWSEM_READER_OWNED - | RWSEM_ANONYMOUSLY_OWNED; + | RWSEM_NONSPINNABLE; if (READ_ONCE(sem->owner) == (struct task_struct *)val) cmpxchg_relaxed((unsigned long *)&sem->owner, val, - RWSEM_READER_OWNED | RWSEM_ANONYMOUSLY_OWNED); + RWSEM_READER_OWNED | RWSEM_NONSPINNABLE); } #else static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem) @@ -495,7 +489,7 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) struct task_struct *owner; bool ret = true; - BUILD_BUG_ON(!rwsem_has_anonymous_owner(RWSEM_OWNER_UNKNOWN)); + BUILD_BUG_ON(is_rwsem_owner_spinnable(RWSEM_OWNER_UNKNOWN)); if (need_resched()) return false; @@ -534,7 +528,7 @@ static inline enum owner_state rwsem_owner_state(unsigned long owner) if (!owner) return OWNER_NULL; - if (owner & RWSEM_ANONYMOUSLY_OWNED) + if (owner & RWSEM_NONSPINNABLE) return OWNER_NONSPINNABLE; if (owner & RWSEM_READER_OWNED) @@ -1043,7 +1037,12 @@ static inline void __up_write(struct rw_semaphore *sem) { long tmp; - DEBUG_RWSEMS_WARN_ON(sem->owner != current, sem); + /* + * sem->owner may differ from current if the ownership is transferred + * to an anonymous writer by setting the RWSEM_NONSPINNABLE bits. + */ + DEBUG_RWSEMS_WARN_ON((sem->owner != current) && + !((long)sem->owner & RWSEM_NONSPINNABLE), sem); rwsem_clear_owner(sem); tmp = atomic_long_fetch_add_release(-RWSEM_WRITER_LOCKED, &sem->count); if (unlikely(tmp & RWSEM_FLAG_WAITERS)) -- cgit v1.2.3 From cf69482d62d996d3ce840eeead8e160de281ac6c Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 20 May 2019 16:59:11 -0400 Subject: locking/rwsem: Enable readers spinning on writer This patch enables readers to optimistically spin on a rwsem when it is owned by a writer instead of going to sleep directly. The rwsem_can_spin_on_owner() function is extracted out of rwsem_optimistic_spin() and is called directly by rwsem_down_read_slowpath() and rwsem_down_write_slowpath(). With a locking microbenchmark running on 5.1 based kernel, the total locking rates (in kops/s) on a 8-socket IvyBrige-EX system with equal numbers of readers and writers before and after the patch were as follows: # of Threads Pre-patch Post-patch ------------ --------- ---------- 4 1,674 1,684 8 1,062 1,074 16 924 900 32 300 458 64 195 208 128 164 168 240 149 143 The performance change wasn't significant in this case, but this change is required by a follow-on patch. Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Cc: huang ying Link: https://lkml.kernel.org/r/20190520205918.22251-13-longman@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/lock_events_list.h | 1 + kernel/locking/rwsem.c | 86 +++++++++++++++++++++++++++++++++------ 2 files changed, 75 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lock_events_list.h b/kernel/locking/lock_events_list.h index 634b47fd8b5e..ca954e4e00e4 100644 --- a/kernel/locking/lock_events_list.h +++ b/kernel/locking/lock_events_list.h @@ -56,6 +56,7 @@ LOCK_EVENT(rwsem_sleep_reader) /* # of reader sleeps */ LOCK_EVENT(rwsem_sleep_writer) /* # of writer sleeps */ LOCK_EVENT(rwsem_wake_reader) /* # of reader wakeups */ LOCK_EVENT(rwsem_wake_writer) /* # of writer wakeups */ +LOCK_EVENT(rwsem_opt_rlock) /* # of read locks opt-spin acquired */ LOCK_EVENT(rwsem_opt_wlock) /* # of write locks opt-spin acquired */ LOCK_EVENT(rwsem_opt_fail) /* # of failed opt-spinnings */ LOCK_EVENT(rwsem_rlock) /* # of read locks acquired */ diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 180455b6b0d4..985a03ad3f8c 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -457,6 +457,30 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem, } #ifdef CONFIG_RWSEM_SPIN_ON_OWNER +/* + * Try to acquire read lock before the reader is put on wait queue. + * Lock acquisition isn't allowed if the rwsem is locked or a writer handoff + * is ongoing. + */ +static inline bool rwsem_try_read_lock_unqueued(struct rw_semaphore *sem) +{ + long count = atomic_long_read(&sem->count); + + if (count & (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF)) + return false; + + count = atomic_long_fetch_add_acquire(RWSEM_READER_BIAS, &sem->count); + if (!(count & (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) { + rwsem_set_reader_owned(sem); + lockevent_inc(rwsem_opt_rlock); + return true; + } + + /* Back out the change */ + atomic_long_add(-RWSEM_READER_BIAS, &sem->count); + return false; +} + /* * Try to acquire write lock before the writer has been put on wait queue. */ @@ -491,9 +515,12 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) BUILD_BUG_ON(is_rwsem_owner_spinnable(RWSEM_OWNER_UNKNOWN)); - if (need_resched()) + if (need_resched()) { + lockevent_inc(rwsem_opt_fail); return false; + } + preempt_disable(); rcu_read_lock(); owner = READ_ONCE(sem->owner); if (owner) { @@ -501,6 +528,9 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) owner_on_cpu(owner); } rcu_read_unlock(); + preempt_enable(); + + lockevent_cond_inc(rwsem_opt_fail, !ret); return ret; } @@ -578,7 +608,7 @@ static noinline enum owner_state rwsem_spin_on_owner(struct rw_semaphore *sem) return state; } -static bool rwsem_optimistic_spin(struct rw_semaphore *sem) +static bool rwsem_optimistic_spin(struct rw_semaphore *sem, bool wlock) { bool taken = false; int prev_owner_state = OWNER_NULL; @@ -586,9 +616,6 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem) preempt_disable(); /* sem->wait_lock should not be held when doing optimistic spinning */ - if (!rwsem_can_spin_on_owner(sem)) - goto done; - if (!osq_lock(&sem->osq)) goto done; @@ -608,10 +635,11 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem) /* * Try to acquire the lock */ - if (rwsem_try_write_lock_unqueued(sem)) { - taken = true; + taken = wlock ? rwsem_try_write_lock_unqueued(sem) + : rwsem_try_read_lock_unqueued(sem); + + if (taken) break; - } /* * An RT task cannot do optimistic spinning if it cannot @@ -668,7 +696,12 @@ done: return taken; } #else -static bool rwsem_optimistic_spin(struct rw_semaphore *sem) +static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) +{ + return false; +} + +static inline bool rwsem_optimistic_spin(struct rw_semaphore *sem, bool wlock) { return false; } @@ -684,6 +717,31 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, int state) struct rwsem_waiter waiter; DEFINE_WAKE_Q(wake_q); + if (!rwsem_can_spin_on_owner(sem)) + goto queue; + + /* + * Undo read bias from down_read() and do optimistic spinning. + */ + atomic_long_add(-RWSEM_READER_BIAS, &sem->count); + adjustment = 0; + if (rwsem_optimistic_spin(sem, false)) { + /* + * Wake up other readers in the wait list if the front + * waiter is a reader. + */ + if ((atomic_long_read(&sem->count) & RWSEM_FLAG_WAITERS)) { + raw_spin_lock_irq(&sem->wait_lock); + if (!list_empty(&sem->wait_list)) + rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, + &wake_q); + raw_spin_unlock_irq(&sem->wait_lock); + wake_up_q(&wake_q); + } + return sem; + } + +queue: waiter.task = current; waiter.type = RWSEM_WAITING_FOR_READ; waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT; @@ -696,7 +754,7 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, int state) * exit the slowpath and return immediately as its * RWSEM_READER_BIAS has already been set in the count. */ - if (!(atomic_long_read(&sem->count) & + if (adjustment && !(atomic_long_read(&sem->count) & (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) { raw_spin_unlock_irq(&sem->wait_lock); rwsem_set_reader_owned(sem); @@ -708,7 +766,10 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, int state) list_add_tail(&waiter.list, &sem->wait_list); /* we're now waiting on the lock, but no longer actively locking */ - count = atomic_long_add_return(adjustment, &sem->count); + if (adjustment) + count = atomic_long_add_return(adjustment, &sem->count); + else + count = atomic_long_read(&sem->count); /* * If there are no active locks, wake the front queued process(es). @@ -767,7 +828,8 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) DEFINE_WAKE_Q(wake_q); /* do optimistic spinning and steal lock if possible */ - if (rwsem_optimistic_spin(sem)) + if (rwsem_can_spin_on_owner(sem) && + rwsem_optimistic_spin(sem, true)) return sem; /* -- cgit v1.2.3 From 94a9717b3c40e77a54e4afacd8f19a9a86bfeead Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 20 May 2019 16:59:12 -0400 Subject: locking/rwsem: Make rwsem->owner an atomic_long_t The rwsem->owner contains not just the task structure pointer, it also holds some flags for storing the current state of the rwsem. Some of the flags may have to be atomically updated. To reflect the new reality, the owner is now changed to an atomic_long_t type. New helper functions are added to properly separate out the task structure pointer and the embedded flags. Suggested-by: Peter Zijlstra Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Cc: huang ying Link: https://lkml.kernel.org/r/20190520205918.22251-14-longman@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/rwsem.c | 125 +++++++++++++++++++++++++++++++------------------ 1 file changed, 80 insertions(+), 45 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 985a03ad3f8c..fae557be8334 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -64,7 +64,7 @@ if (!debug_locks_silent && \ WARN_ONCE(c, "DEBUG_RWSEMS_WARN_ON(%s): count = 0x%lx, owner = 0x%lx, curr 0x%lx, list %sempty\n",\ #c, atomic_long_read(&(sem)->count), \ - (long)((sem)->owner), (long)current, \ + atomic_long_read(&(sem)->owner), (long)current, \ list_empty(&(sem)->wait_list) ? "" : "not ")) \ debug_locks_off(); \ } while (0) @@ -114,12 +114,20 @@ */ static inline void rwsem_set_owner(struct rw_semaphore *sem) { - WRITE_ONCE(sem->owner, current); + atomic_long_set(&sem->owner, (long)current); } static inline void rwsem_clear_owner(struct rw_semaphore *sem) { - WRITE_ONCE(sem->owner, NULL); + atomic_long_set(&sem->owner, 0); +} + +/* + * Test the flags in the owner field. + */ +static inline bool rwsem_test_oflags(struct rw_semaphore *sem, long flags) +{ + return atomic_long_read(&sem->owner) & flags; } /* @@ -133,10 +141,9 @@ static inline void rwsem_clear_owner(struct rw_semaphore *sem) static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem, struct task_struct *owner) { - unsigned long val = (unsigned long)owner | RWSEM_READER_OWNED - | RWSEM_NONSPINNABLE; + unsigned long val = (unsigned long)owner | RWSEM_READER_OWNED | RWSEM_NONSPINNABLE; - WRITE_ONCE(sem->owner, (struct task_struct *)val); + atomic_long_set(&sem->owner, val); } static inline void rwsem_set_reader_owned(struct rw_semaphore *sem) @@ -145,13 +152,20 @@ static inline void rwsem_set_reader_owned(struct rw_semaphore *sem) } /* - * Return true if the a rwsem waiter can spin on the rwsem's owner - * and steal the lock. - * N.B. !owner is considered spinnable. + * Return true if the rwsem is owned by a reader. */ -static inline bool is_rwsem_owner_spinnable(struct task_struct *owner) +static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem) { - return !((unsigned long)owner & RWSEM_NONSPINNABLE); +#ifdef CONFIG_DEBUG_RWSEMS + /* + * Check the count to see if it is write-locked. + */ + long count = atomic_long_read(&sem->count); + + if (count & RWSEM_WRITER_MASK) + return false; +#endif + return rwsem_test_oflags(sem, RWSEM_READER_OWNED); } #ifdef CONFIG_DEBUG_RWSEMS @@ -163,11 +177,13 @@ static inline bool is_rwsem_owner_spinnable(struct task_struct *owner) */ static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem) { - unsigned long val = (unsigned long)current | RWSEM_READER_OWNED - | RWSEM_NONSPINNABLE; - if (READ_ONCE(sem->owner) == (struct task_struct *)val) - cmpxchg_relaxed((unsigned long *)&sem->owner, val, - RWSEM_READER_OWNED | RWSEM_NONSPINNABLE); + unsigned long val = atomic_long_read(&sem->owner); + + while ((val & ~RWSEM_OWNER_FLAGS_MASK) == (unsigned long)current) { + if (atomic_long_try_cmpxchg(&sem->owner, &val, + val & RWSEM_OWNER_FLAGS_MASK)) + return; + } } #else static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem) @@ -175,6 +191,28 @@ static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem) } #endif +/* + * Return just the real task structure pointer of the owner + */ +static inline struct task_struct *rwsem_owner(struct rw_semaphore *sem) +{ + return (struct task_struct *) + (atomic_long_read(&sem->owner) & ~RWSEM_OWNER_FLAGS_MASK); +} + +/* + * Return the real task structure pointer of the owner and the embedded + * flags in the owner. pflags must be non-NULL. + */ +static inline struct task_struct * +rwsem_owner_flags(struct rw_semaphore *sem, unsigned long *pflags) +{ + unsigned long owner = atomic_long_read(&sem->owner); + + *pflags = owner & RWSEM_OWNER_FLAGS_MASK; + return (struct task_struct *)(owner & ~RWSEM_OWNER_FLAGS_MASK); +} + /* * Guide to the rw_semaphore's count field. * @@ -208,7 +246,7 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name, atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE); raw_spin_lock_init(&sem->wait_lock); INIT_LIST_HEAD(&sem->wait_list); - sem->owner = NULL; + atomic_long_set(&sem->owner, 0L); #ifdef CONFIG_RWSEM_SPIN_ON_OWNER osq_lock_init(&sem->osq); #endif @@ -511,9 +549,10 @@ static inline bool owner_on_cpu(struct task_struct *owner) static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) { struct task_struct *owner; + unsigned long flags; bool ret = true; - BUILD_BUG_ON(is_rwsem_owner_spinnable(RWSEM_OWNER_UNKNOWN)); + BUILD_BUG_ON(!(RWSEM_OWNER_UNKNOWN & RWSEM_NONSPINNABLE)); if (need_resched()) { lockevent_inc(rwsem_opt_fail); @@ -522,11 +561,9 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) preempt_disable(); rcu_read_lock(); - owner = READ_ONCE(sem->owner); - if (owner) { - ret = is_rwsem_owner_spinnable(owner) && - owner_on_cpu(owner); - } + owner = rwsem_owner_flags(sem, &flags); + if ((flags & RWSEM_NONSPINNABLE) || (owner && !owner_on_cpu(owner))) + ret = false; rcu_read_unlock(); preempt_enable(); @@ -553,25 +590,26 @@ enum owner_state { }; #define OWNER_SPINNABLE (OWNER_NULL | OWNER_WRITER) -static inline enum owner_state rwsem_owner_state(unsigned long owner) +static inline enum owner_state +rwsem_owner_state(struct task_struct *owner, unsigned long flags) { - if (!owner) - return OWNER_NULL; - - if (owner & RWSEM_NONSPINNABLE) + if (flags & RWSEM_NONSPINNABLE) return OWNER_NONSPINNABLE; - if (owner & RWSEM_READER_OWNED) + if (flags & RWSEM_READER_OWNED) return OWNER_READER; - return OWNER_WRITER; + return owner ? OWNER_WRITER : OWNER_NULL; } static noinline enum owner_state rwsem_spin_on_owner(struct rw_semaphore *sem) { - struct task_struct *tmp, *owner = READ_ONCE(sem->owner); - enum owner_state state = rwsem_owner_state((unsigned long)owner); + struct task_struct *new, *owner; + unsigned long flags, new_flags; + enum owner_state state; + owner = rwsem_owner_flags(sem, &flags); + state = rwsem_owner_state(owner, flags); if (state != OWNER_WRITER) return state; @@ -582,9 +620,9 @@ static noinline enum owner_state rwsem_spin_on_owner(struct rw_semaphore *sem) break; } - tmp = READ_ONCE(sem->owner); - if (tmp != owner) { - state = rwsem_owner_state((unsigned long)tmp); + new = rwsem_owner_flags(sem, &new_flags); + if ((new != owner) || (new_flags != flags)) { + state = rwsem_owner_state(new, new_flags); break; } @@ -1001,8 +1039,7 @@ inline void __down_read(struct rw_semaphore *sem) if (unlikely(atomic_long_fetch_add_acquire(RWSEM_READER_BIAS, &sem->count) & RWSEM_READ_FAILED_MASK)) { rwsem_down_read_slowpath(sem, TASK_UNINTERRUPTIBLE); - DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & - RWSEM_READER_OWNED), sem); + DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); } else { rwsem_set_reader_owned(sem); } @@ -1014,8 +1051,7 @@ static inline int __down_read_killable(struct rw_semaphore *sem) &sem->count) & RWSEM_READ_FAILED_MASK)) { if (IS_ERR(rwsem_down_read_slowpath(sem, TASK_KILLABLE))) return -EINTR; - DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & - RWSEM_READER_OWNED), sem); + DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); } else { rwsem_set_reader_owned(sem); } @@ -1084,7 +1120,7 @@ inline void __up_read(struct rw_semaphore *sem) { long tmp; - DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED), sem); + DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); rwsem_clear_reader_owned(sem); tmp = atomic_long_add_return_release(-RWSEM_READER_BIAS, &sem->count); if (unlikely((tmp & (RWSEM_LOCK_MASK|RWSEM_FLAG_WAITERS)) == @@ -1103,8 +1139,8 @@ static inline void __up_write(struct rw_semaphore *sem) * sem->owner may differ from current if the ownership is transferred * to an anonymous writer by setting the RWSEM_NONSPINNABLE bits. */ - DEBUG_RWSEMS_WARN_ON((sem->owner != current) && - !((long)sem->owner & RWSEM_NONSPINNABLE), sem); + DEBUG_RWSEMS_WARN_ON((rwsem_owner(sem) != current) && + !rwsem_test_oflags(sem, RWSEM_NONSPINNABLE), sem); rwsem_clear_owner(sem); tmp = atomic_long_fetch_add_release(-RWSEM_WRITER_LOCKED, &sem->count); if (unlikely(tmp & RWSEM_FLAG_WAITERS)) @@ -1125,7 +1161,7 @@ static inline void __downgrade_write(struct rw_semaphore *sem) * read-locked region is ok to be re-ordered into the * write side. As such, rely on RELEASE semantics. */ - DEBUG_RWSEMS_WARN_ON(sem->owner != current, sem); + DEBUG_RWSEMS_WARN_ON(rwsem_owner(sem) != current, sem); tmp = atomic_long_fetch_add_release( -RWSEM_WRITER_LOCKED+RWSEM_READER_BIAS, &sem->count); rwsem_set_reader_owned(sem); @@ -1296,8 +1332,7 @@ EXPORT_SYMBOL(down_write_killable_nested); void up_read_non_owner(struct rw_semaphore *sem) { - DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED), - sem); + DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); __up_read(sem); } EXPORT_SYMBOL(up_read_non_owner); -- cgit v1.2.3 From 7d43f1ce9dd075d8b2aa3ad1f3970ef386a5c358 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 20 May 2019 16:59:13 -0400 Subject: locking/rwsem: Enable time-based spinning on reader-owned rwsem When the rwsem is owned by reader, writers stop optimistic spinning simply because there is no easy way to figure out if all the readers are actively running or not. However, there are scenarios where the readers are unlikely to sleep and optimistic spinning can help performance. This patch provides a simple mechanism for spinning on a reader-owned rwsem by a writer. It is a time threshold based spinning where the allowable spinning time can vary from 10us to 25us depending on the condition of the rwsem. When the time threshold is exceeded, the nonspinnable bits will be set in the owner field to indicate that no more optimistic spinning will be allowed on this rwsem until it becomes writer owned again. Not even readers is allowed to acquire the reader-locked rwsem by optimistic spinning for fairness. We also want a writer to acquire the lock after the readers hold the lock for a relatively long time. In order to give preference to writers under such a circumstance, the single RWSEM_NONSPINNABLE bit is now split into two - one for reader and one for writer. When optimistic spinning is disabled, both bits will be set. When the reader count drop down to 0, the writer nonspinnable bit will be cleared to allow writers to spin on the lock, but not the readers. When a writer acquires the lock, it will write its own task structure pointer into sem->owner and clear the reader nonspinnable bit in the process. The time taken for each iteration of the reader-owned rwsem spinning loop varies. Below are sample minimum elapsed times for 16 iterations of the loop. System Time for 16 Iterations ------ ---------------------- 1-socket Skylake ~800ns 4-socket Broadwell ~300ns 2-socket ThunderX2 (arm64) ~250ns When the lock cacheline is contended, we can see up to almost 10X increase in elapsed time. So 25us will be at most 500, 1300 and 1600 iterations for each of the above systems. With a locking microbenchmark running on 5.1 based kernel, the total locking rates (in kops/s) on a 8-socket IvyBridge-EX system with equal numbers of readers and writers before and after this patch were as follows: # of Threads Pre-patch Post-patch ------------ --------- ---------- 2 1,759 6,684 4 1,684 6,738 8 1,074 7,222 16 900 7,163 32 458 7,316 64 208 520 128 168 425 240 143 474 This patch gives a big boost in performance for mixed reader/writer workloads. With 32 locking threads, the rwsem lock event data were: rwsem_opt_fail=79850 rwsem_opt_nospin=5069 rwsem_opt_rlock=597484 rwsem_opt_wlock=957339 rwsem_sleep_reader=57782 rwsem_sleep_writer=55663 With 64 locking threads, the data looked like: rwsem_opt_fail=346723 rwsem_opt_nospin=6293 rwsem_opt_rlock=1127119 rwsem_opt_wlock=1400628 rwsem_sleep_reader=308201 rwsem_sleep_writer=72281 So a lot more threads acquired the lock in the slowpath and more threads went to sleep. Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Cc: huang ying Link: https://lkml.kernel.org/r/20190520205918.22251-15-longman@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/lock_events_list.h | 1 + kernel/locking/rwsem.c | 173 +++++++++++++++++++++++++++++++------- 2 files changed, 144 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lock_events_list.h b/kernel/locking/lock_events_list.h index ca954e4e00e4..baa998401052 100644 --- a/kernel/locking/lock_events_list.h +++ b/kernel/locking/lock_events_list.h @@ -59,6 +59,7 @@ LOCK_EVENT(rwsem_wake_writer) /* # of writer wakeups */ LOCK_EVENT(rwsem_opt_rlock) /* # of read locks opt-spin acquired */ LOCK_EVENT(rwsem_opt_wlock) /* # of write locks opt-spin acquired */ LOCK_EVENT(rwsem_opt_fail) /* # of failed opt-spinnings */ +LOCK_EVENT(rwsem_opt_nospin) /* # of disabled reader opt-spinnings */ LOCK_EVENT(rwsem_rlock) /* # of read locks acquired */ LOCK_EVENT(rwsem_rlock_fast) /* # of fast read locks acquired */ LOCK_EVENT(rwsem_rlock_fail) /* # of failed read lock acquisitions */ diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index fae557be8334..2d7cabcfca50 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -31,24 +32,28 @@ #include "lock_events.h" /* - * The least significant 2 bits of the owner value has the following + * The least significant 3 bits of the owner value has the following * meanings when set. * - Bit 0: RWSEM_READER_OWNED - The rwsem is owned by readers - * - Bit 1: RWSEM_NONSPINNABLE - Waiters cannot spin on the rwsem - * The rwsem is anonymously owned, i.e. the owner(s) cannot be - * readily determined. It can be reader owned or the owning writer - * is indeterminate. + * - Bit 1: RWSEM_RD_NONSPINNABLE - Readers cannot spin on this lock. + * - Bit 2: RWSEM_WR_NONSPINNABLE - Writers cannot spin on this lock. * + * When the rwsem is either owned by an anonymous writer, or it is + * reader-owned, but a spinning writer has timed out, both nonspinnable + * bits will be set to disable optimistic spinning by readers and writers. + * In the later case, the last unlocking reader should then check the + * writer nonspinnable bit and clear it only to give writers preference + * to acquire the lock via optimistic spinning, but not readers. Similar + * action is also done in the reader slowpath. + * When a writer acquires a rwsem, it puts its task_struct pointer * into the owner field. It is cleared after an unlock. * * When a reader acquires a rwsem, it will also puts its task_struct - * pointer into the owner field with both the RWSEM_READER_OWNED and - * RWSEM_NONSPINNABLE bits set. On unlock, the owner field will - * largely be left untouched. So for a free or reader-owned rwsem, - * the owner value may contain information about the last reader that - * acquires the rwsem. The anonymous bit is set because that particular - * reader may or may not still own the lock. + * pointer into the owner field with the RWSEM_READER_OWNED bit set. + * On unlock, the owner field will largely be left untouched. So + * for a free or reader-owned rwsem, the owner value may contain + * information about the last reader that acquires the rwsem. * * That information may be helpful in debugging cases where the system * seems to hang on a reader owned rwsem especially if only one reader @@ -56,7 +61,9 @@ * a rwsem, but the overhead is simply too big. */ #define RWSEM_READER_OWNED (1UL << 0) -#define RWSEM_NONSPINNABLE (1UL << 1) +#define RWSEM_RD_NONSPINNABLE (1UL << 1) +#define RWSEM_WR_NONSPINNABLE (1UL << 2) +#define RWSEM_NONSPINNABLE (RWSEM_RD_NONSPINNABLE | RWSEM_WR_NONSPINNABLE) #define RWSEM_OWNER_FLAGS_MASK (RWSEM_READER_OWNED | RWSEM_NONSPINNABLE) #ifdef CONFIG_DEBUG_RWSEMS @@ -141,7 +148,7 @@ static inline bool rwsem_test_oflags(struct rw_semaphore *sem, long flags) static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem, struct task_struct *owner) { - unsigned long val = (unsigned long)owner | RWSEM_READER_OWNED | RWSEM_NONSPINNABLE; + unsigned long val = (unsigned long)owner | RWSEM_READER_OWNED; atomic_long_set(&sem->owner, val); } @@ -191,6 +198,23 @@ static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem) } #endif +/* + * Set the RWSEM_NONSPINNABLE bits if the RWSEM_READER_OWNED flag + * remains set. Otherwise, the operation will be aborted. + */ +static inline void rwsem_set_nonspinnable(struct rw_semaphore *sem) +{ + unsigned long owner = atomic_long_read(&sem->owner); + + do { + if (!(owner & RWSEM_READER_OWNED)) + break; + if (owner & RWSEM_NONSPINNABLE) + break; + } while (!atomic_long_try_cmpxchg(&sem->owner, &owner, + owner | RWSEM_NONSPINNABLE)); +} + /* * Return just the real task structure pointer of the owner */ @@ -546,7 +570,8 @@ static inline bool owner_on_cpu(struct task_struct *owner) return owner->on_cpu && !vcpu_is_preempted(task_cpu(owner)); } -static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) +static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem, + unsigned long nonspinnable) { struct task_struct *owner; unsigned long flags; @@ -562,7 +587,7 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) preempt_disable(); rcu_read_lock(); owner = rwsem_owner_flags(sem, &flags); - if ((flags & RWSEM_NONSPINNABLE) || (owner && !owner_on_cpu(owner))) + if ((flags & nonspinnable) || (owner && !owner_on_cpu(owner))) ret = false; rcu_read_unlock(); preempt_enable(); @@ -588,12 +613,12 @@ enum owner_state { OWNER_READER = 1 << 2, OWNER_NONSPINNABLE = 1 << 3, }; -#define OWNER_SPINNABLE (OWNER_NULL | OWNER_WRITER) +#define OWNER_SPINNABLE (OWNER_NULL | OWNER_WRITER | OWNER_READER) static inline enum owner_state -rwsem_owner_state(struct task_struct *owner, unsigned long flags) +rwsem_owner_state(struct task_struct *owner, unsigned long flags, unsigned long nonspinnable) { - if (flags & RWSEM_NONSPINNABLE) + if (flags & nonspinnable) return OWNER_NONSPINNABLE; if (flags & RWSEM_READER_OWNED) @@ -602,14 +627,15 @@ rwsem_owner_state(struct task_struct *owner, unsigned long flags) return owner ? OWNER_WRITER : OWNER_NULL; } -static noinline enum owner_state rwsem_spin_on_owner(struct rw_semaphore *sem) +static noinline enum owner_state +rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable) { struct task_struct *new, *owner; unsigned long flags, new_flags; enum owner_state state; owner = rwsem_owner_flags(sem, &flags); - state = rwsem_owner_state(owner, flags); + state = rwsem_owner_state(owner, flags, nonspinnable); if (state != OWNER_WRITER) return state; @@ -622,7 +648,7 @@ static noinline enum owner_state rwsem_spin_on_owner(struct rw_semaphore *sem) new = rwsem_owner_flags(sem, &new_flags); if ((new != owner) || (new_flags != flags)) { - state = rwsem_owner_state(new, new_flags); + state = rwsem_owner_state(new, new_flags, nonspinnable); break; } @@ -646,10 +672,39 @@ static noinline enum owner_state rwsem_spin_on_owner(struct rw_semaphore *sem) return state; } +/* + * Calculate reader-owned rwsem spinning threshold for writer + * + * The more readers own the rwsem, the longer it will take for them to + * wind down and free the rwsem. So the empirical formula used to + * determine the actual spinning time limit here is: + * + * Spinning threshold = (10 + nr_readers/2)us + * + * The limit is capped to a maximum of 25us (30 readers). This is just + * a heuristic and is subjected to change in the future. + */ +static inline u64 rwsem_rspin_threshold(struct rw_semaphore *sem) +{ + long count = atomic_long_read(&sem->count); + int readers = count >> RWSEM_READER_SHIFT; + u64 delta; + + if (readers > 30) + readers = 30; + delta = (20 + readers) * NSEC_PER_USEC / 2; + + return sched_clock() + delta; +} + static bool rwsem_optimistic_spin(struct rw_semaphore *sem, bool wlock) { bool taken = false; int prev_owner_state = OWNER_NULL; + int loop = 0; + u64 rspin_threshold = 0; + unsigned long nonspinnable = wlock ? RWSEM_WR_NONSPINNABLE + : RWSEM_RD_NONSPINNABLE; preempt_disable(); @@ -661,12 +716,12 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem, bool wlock) * Optimistically spin on the owner field and attempt to acquire the * lock whenever the owner changes. Spinning will be stopped when: * 1) the owning writer isn't running; or - * 2) readers own the lock as we can't determine if they are - * actively running or not. + * 2) readers own the lock and spinning time has exceeded limit. */ for (;;) { - enum owner_state owner_state = rwsem_spin_on_owner(sem); + enum owner_state owner_state; + owner_state = rwsem_spin_on_owner(sem, nonspinnable); if (!(owner_state & OWNER_SPINNABLE)) break; @@ -679,6 +734,38 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem, bool wlock) if (taken) break; + /* + * Time-based reader-owned rwsem optimistic spinning + */ + if (wlock && (owner_state == OWNER_READER)) { + /* + * Re-initialize rspin_threshold every time when + * the owner state changes from non-reader to reader. + * This allows a writer to steal the lock in between + * 2 reader phases and have the threshold reset at + * the beginning of the 2nd reader phase. + */ + if (prev_owner_state != OWNER_READER) { + if (rwsem_test_oflags(sem, nonspinnable)) + break; + rspin_threshold = rwsem_rspin_threshold(sem); + loop = 0; + } + + /* + * Check time threshold once every 16 iterations to + * avoid calling sched_clock() too frequently so + * as to reduce the average latency between the times + * when the lock becomes free and when the spinner + * is ready to do a trylock. + */ + else if (!(++loop & 0xf) && (sched_clock() > rspin_threshold)) { + rwsem_set_nonspinnable(sem); + lockevent_inc(rwsem_opt_nospin); + break; + } + } + /* * An RT task cannot do optimistic spinning if it cannot * be sure the lock holder is running or live-lock may @@ -733,8 +820,25 @@ done: lockevent_cond_inc(rwsem_opt_fail, !taken); return taken; } + +/* + * Clear the owner's RWSEM_WR_NONSPINNABLE bit if it is set. This should + * only be called when the reader count reaches 0. + * + * This give writers better chance to acquire the rwsem first before + * readers when the rwsem was being held by readers for a relatively long + * period of time. Race can happen that an optimistic spinner may have + * just stolen the rwsem and set the owner, but just clearing the + * RWSEM_WR_NONSPINNABLE bit will do no harm anyway. + */ +static inline void clear_wr_nonspinnable(struct rw_semaphore *sem) +{ + if (rwsem_test_oflags(sem, RWSEM_WR_NONSPINNABLE)) + atomic_long_andnot(RWSEM_WR_NONSPINNABLE, &sem->owner); +} #else -static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) +static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem, + unsigned long nonspinnable) { return false; } @@ -743,6 +847,8 @@ static inline bool rwsem_optimistic_spin(struct rw_semaphore *sem, bool wlock) { return false; } + +static inline void clear_wr_nonspinnable(struct rw_semaphore *sem) { } #endif /* @@ -752,10 +858,11 @@ static struct rw_semaphore __sched * rwsem_down_read_slowpath(struct rw_semaphore *sem, int state) { long count, adjustment = -RWSEM_READER_BIAS; + bool wake = false; struct rwsem_waiter waiter; DEFINE_WAKE_Q(wake_q); - if (!rwsem_can_spin_on_owner(sem)) + if (!rwsem_can_spin_on_owner(sem, RWSEM_RD_NONSPINNABLE)) goto queue; /* @@ -815,8 +922,12 @@ queue: * If there are no writers and we are first in the queue, * wake our own waiter to join the existing active readers ! */ - if (!(count & RWSEM_LOCK_MASK) || - (!(count & RWSEM_WRITER_MASK) && (adjustment & RWSEM_FLAG_WAITERS))) + if (!(count & RWSEM_LOCK_MASK)) { + clear_wr_nonspinnable(sem); + wake = true; + } + if (wake || (!(count & RWSEM_WRITER_MASK) && + (adjustment & RWSEM_FLAG_WAITERS))) rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); raw_spin_unlock_irq(&sem->wait_lock); @@ -866,7 +977,7 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) DEFINE_WAKE_Q(wake_q); /* do optimistic spinning and steal lock if possible */ - if (rwsem_can_spin_on_owner(sem) && + if (rwsem_can_spin_on_owner(sem, RWSEM_WR_NONSPINNABLE) && rwsem_optimistic_spin(sem, true)) return sem; @@ -1124,8 +1235,10 @@ inline void __up_read(struct rw_semaphore *sem) rwsem_clear_reader_owned(sem); tmp = atomic_long_add_return_release(-RWSEM_READER_BIAS, &sem->count); if (unlikely((tmp & (RWSEM_LOCK_MASK|RWSEM_FLAG_WAITERS)) == - RWSEM_FLAG_WAITERS)) + RWSEM_FLAG_WAITERS)) { + clear_wr_nonspinnable(sem); rwsem_wake(sem, tmp); + } } /* -- cgit v1.2.3 From 5cfd92e12e13432251981b9d0cd68dbd7aa8d690 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 20 May 2019 16:59:14 -0400 Subject: locking/rwsem: Adaptive disabling of reader optimistic spinning Reader optimistic spinning is helpful when the reader critical section is short and there aren't that many readers around. It makes readers relatively more preferred than writers. When a writer times out spinning on a reader-owned lock and set the nospinnable bits, there are two main reasons for that. 1) The reader critical section is long, perhaps the task sleeps after acquiring the read lock. 2) There are just too many readers contending the lock causing it to take a while to service all of them. In the former case, long reader critical section will impede the progress of writers which is usually more important for system performance. In the later case, reader optimistic spinning tends to make the reader groups that contain readers that acquire the lock together smaller leading to more of them. That may hurt performance in some cases. In other words, the setting of nonspinnable bits indicates that reader optimistic spinning may not be helpful for those workloads that cause it. Therefore, any writers that have observed the setting of the writer nonspinnable bit for a given rwsem after they fail to acquire the lock via optimistic spinning will set the reader nonspinnable bit once they acquire the write lock. Similarly, readers that observe the setting of reader nonspinnable bit at slowpath entry will also set the reader nonspinnable bit when they acquire the read lock via the wakeup path. Once the reader nonspinnable bit is on, it will only be reset when a writer is able to acquire the rwsem in the fast path or somehow a reader or writer in the slowpath doesn't observe the nonspinable bit. This is to discourage reader optmistic spinning on that particular rwsem and make writers more preferred. This adaptive disabling of reader optimistic spinning will alleviate some of the negative side effect of this feature. In addition, this patch tries to make readers in the spinning queue follow the phase-fair principle after quitting optimistic spinning by checking if another reader has somehow acquired a read lock after this reader enters the optimistic spinning queue. If so and the rwsem is still reader-owned, this reader is in the right read-phase and can attempt to acquire the lock. On a 2-socket 40-core 80-thread Skylake system, the page_fault1 test of the will-it-scale benchmark was run with various number of threads. The number of operations done before reader optimistic spinning patches, this patch and after this patch were: Threads Before rspin Before patch After patch %change ------- ------------ ------------ ----------- ------- 20 5541068 5345484 5455667 -3.5%/ +2.1% 40 10185150 7292313 9219276 -28.5%/+26.4% 60 8196733 6460517 7181209 -21.2%/+11.2% 80 9508864 6739559 8107025 -29.1%/+20.3% This patch doesn't recover all the lost performance, but it is more than half. Given the fact that reader optimistic spinning does benefit some workloads, this is a good compromise. Using the rwsem locking microbenchmark with very short critical section, this patch doesn't have too much impact on locking performance as shown by the locking rates (kops/s) below with equal numbers of readers and writers before and after this patch: # of Threads Pre-patch Post-patch ------------ --------- ---------- 2 4,730 4,969 4 4,814 4,786 8 4,866 4,815 16 4,715 4,511 32 3,338 3,500 64 3,212 3,389 80 3,110 3,044 When running the locking microbenchmark with 40 dedicated reader and writer threads, however, the reader performance is curtailed to favor the writer. Before patch: 40 readers, Iterations Min/Mean/Max = 204,026/234,309/254,816 40 writers, Iterations Min/Mean/Max = 88,515/95,884/115,644 After patch: 40 readers, Iterations Min/Mean/Max = 33,813/35,260/36,791 40 writers, Iterations Min/Mean/Max = 95,368/96,565/97,798 Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Cc: huang ying Link: https://lkml.kernel.org/r/20190520205918.22251-16-longman@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/lock_events_list.h | 10 +-- kernel/locking/rwsem.c | 133 ++++++++++++++++++++++++++++++++++++-- 2 files changed, 135 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lock_events_list.h b/kernel/locking/lock_events_list.h index baa998401052..239039d0ce21 100644 --- a/kernel/locking/lock_events_list.h +++ b/kernel/locking/lock_events_list.h @@ -56,10 +56,12 @@ LOCK_EVENT(rwsem_sleep_reader) /* # of reader sleeps */ LOCK_EVENT(rwsem_sleep_writer) /* # of writer sleeps */ LOCK_EVENT(rwsem_wake_reader) /* # of reader wakeups */ LOCK_EVENT(rwsem_wake_writer) /* # of writer wakeups */ -LOCK_EVENT(rwsem_opt_rlock) /* # of read locks opt-spin acquired */ -LOCK_EVENT(rwsem_opt_wlock) /* # of write locks opt-spin acquired */ -LOCK_EVENT(rwsem_opt_fail) /* # of failed opt-spinnings */ -LOCK_EVENT(rwsem_opt_nospin) /* # of disabled reader opt-spinnings */ +LOCK_EVENT(rwsem_opt_rlock) /* # of opt-acquired read locks */ +LOCK_EVENT(rwsem_opt_wlock) /* # of opt-acquired write locks */ +LOCK_EVENT(rwsem_opt_fail) /* # of failed optspins */ +LOCK_EVENT(rwsem_opt_nospin) /* # of disabled optspins */ +LOCK_EVENT(rwsem_opt_norspin) /* # of disabled reader-only optspins */ +LOCK_EVENT(rwsem_opt_rlock2) /* # of opt-acquired 2ndary read locks */ LOCK_EVENT(rwsem_rlock) /* # of read locks acquired */ LOCK_EVENT(rwsem_rlock_fast) /* # of fast read locks acquired */ LOCK_EVENT(rwsem_rlock_fail) /* # of failed read lock acquisitions */ diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 2d7cabcfca50..e1e0bac957c4 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -59,6 +59,42 @@ * seems to hang on a reader owned rwsem especially if only one reader * is involved. Ideally we would like to track all the readers that own * a rwsem, but the overhead is simply too big. + * + * Reader optimistic spinning is helpful when the reader critical section + * is short and there aren't that many readers around. It makes readers + * relatively more preferred than writers. When a writer times out spinning + * on a reader-owned lock and set the nospinnable bits, there are two main + * reasons for that. + * + * 1) The reader critical section is long, perhaps the task sleeps after + * acquiring the read lock. + * 2) There are just too many readers contending the lock causing it to + * take a while to service all of them. + * + * In the former case, long reader critical section will impede the progress + * of writers which is usually more important for system performance. In + * the later case, reader optimistic spinning tends to make the reader + * groups that contain readers that acquire the lock together smaller + * leading to more of them. That may hurt performance in some cases. In + * other words, the setting of nonspinnable bits indicates that reader + * optimistic spinning may not be helpful for those workloads that cause + * it. + * + * Therefore, any writers that had observed the setting of the writer + * nonspinnable bit for a given rwsem after they fail to acquire the lock + * via optimistic spinning will set the reader nonspinnable bit once they + * acquire the write lock. Similarly, readers that observe the setting + * of reader nonspinnable bit at slowpath entry will set the reader + * nonspinnable bits when they acquire the read lock via the wakeup path. + * + * Once the reader nonspinnable bit is on, it will only be reset when + * a writer is able to acquire the rwsem in the fast path or somehow a + * reader or writer in the slowpath doesn't observe the nonspinable bit. + * + * This is to discourage reader optmistic spinning on that particular + * rwsem and make writers more preferred. This adaptive disabling of reader + * optimistic spinning will alleviate the negative side effect of this + * feature. */ #define RWSEM_READER_OWNED (1UL << 0) #define RWSEM_RD_NONSPINNABLE (1UL << 1) @@ -144,11 +180,14 @@ static inline bool rwsem_test_oflags(struct rw_semaphore *sem, long flags) * Note that the owner value just indicates the task has owned the rwsem * previously, it may not be the real owner or one of the real owners * anymore when that field is examined, so take it with a grain of salt. + * + * The reader non-spinnable bit is preserved. */ static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem, struct task_struct *owner) { - unsigned long val = (unsigned long)owner | RWSEM_READER_OWNED; + unsigned long val = (unsigned long)owner | RWSEM_READER_OWNED | + (atomic_long_read(&sem->owner) & RWSEM_RD_NONSPINNABLE); atomic_long_set(&sem->owner, val); } @@ -287,6 +326,7 @@ struct rwsem_waiter { struct task_struct *task; enum rwsem_waiter_type type; unsigned long timeout; + unsigned long last_rowner; }; #define rwsem_first_waiter(sem) \ list_first_entry(&sem->wait_list, struct rwsem_waiter, list) @@ -368,6 +408,8 @@ static void rwsem_mark_wake(struct rw_semaphore *sem, * so we can bail out early if a writer stole the lock. */ if (wake_type != RWSEM_WAKE_READ_OWNED) { + struct task_struct *owner; + adjustment = RWSEM_READER_BIAS; oldcount = atomic_long_fetch_add(adjustment, &sem->count); if (unlikely(oldcount & RWSEM_WRITER_MASK)) { @@ -388,8 +430,15 @@ static void rwsem_mark_wake(struct rw_semaphore *sem, /* * Set it to reader-owned to give spinners an early * indication that readers now have the lock. + * The reader nonspinnable bit seen at slowpath entry of + * the reader is copied over. */ - __rwsem_set_reader_owned(sem, waiter->task); + owner = waiter->task; + if (waiter->last_rowner & RWSEM_RD_NONSPINNABLE) { + owner = (void *)((unsigned long)owner | RWSEM_RD_NONSPINNABLE); + lockevent_inc(rwsem_opt_norspin); + } + __rwsem_set_reader_owned(sem, owner); } /* @@ -836,6 +885,42 @@ static inline void clear_wr_nonspinnable(struct rw_semaphore *sem) if (rwsem_test_oflags(sem, RWSEM_WR_NONSPINNABLE)) atomic_long_andnot(RWSEM_WR_NONSPINNABLE, &sem->owner); } + +/* + * This function is called when the reader fails to acquire the lock via + * optimistic spinning. In this case we will still attempt to do a trylock + * when comparing the rwsem state right now with the state when entering + * the slowpath indicates that the reader is still in a valid reader phase. + * This happens when the following conditions are true: + * + * 1) The lock is currently reader owned, and + * 2) The lock is previously not reader-owned or the last read owner changes. + * + * In the former case, we have transitioned from a writer phase to a + * reader-phase while spinning. In the latter case, it means the reader + * phase hasn't ended when we entered the optimistic spinning loop. In + * both cases, the reader is eligible to acquire the lock. This is the + * secondary path where a read lock is acquired optimistically. + * + * The reader non-spinnable bit wasn't set at time of entry or it will + * not be here at all. + */ +static inline bool rwsem_reader_phase_trylock(struct rw_semaphore *sem, + unsigned long last_rowner) +{ + unsigned long owner = atomic_long_read(&sem->owner); + + if (!(owner & RWSEM_READER_OWNED)) + return false; + + if (((owner ^ last_rowner) & ~RWSEM_OWNER_FLAGS_MASK) && + rwsem_try_read_lock_unqueued(sem)) { + lockevent_inc(rwsem_opt_rlock2); + lockevent_add(rwsem_opt_fail, -1); + return true; + } + return false; +} #else static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable) @@ -849,6 +934,12 @@ static inline bool rwsem_optimistic_spin(struct rw_semaphore *sem, bool wlock) } static inline void clear_wr_nonspinnable(struct rw_semaphore *sem) { } + +static inline bool rwsem_reader_phase_trylock(struct rw_semaphore *sem, + unsigned long last_rowner) +{ + return false; +} #endif /* @@ -862,6 +953,14 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, int state) struct rwsem_waiter waiter; DEFINE_WAKE_Q(wake_q); + /* + * Save the current read-owner of rwsem, if available, and the + * reader nonspinnable bit. + */ + waiter.last_rowner = atomic_long_read(&sem->owner); + if (!(waiter.last_rowner & RWSEM_READER_OWNED)) + waiter.last_rowner &= RWSEM_RD_NONSPINNABLE; + if (!rwsem_can_spin_on_owner(sem, RWSEM_RD_NONSPINNABLE)) goto queue; @@ -884,6 +983,8 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, int state) wake_up_q(&wake_q); } return sem; + } else if (rwsem_reader_phase_trylock(sem, waiter.last_rowner)) { + return sem; } queue: @@ -964,6 +1065,19 @@ out_nolock: return ERR_PTR(-EINTR); } +/* + * This function is called by the a write lock owner. So the owner value + * won't get changed by others. + */ +static inline void rwsem_disable_reader_optspin(struct rw_semaphore *sem, + bool disable) +{ + if (unlikely(disable)) { + atomic_long_or(RWSEM_RD_NONSPINNABLE, &sem->owner); + lockevent_inc(rwsem_opt_norspin); + } +} + /* * Wait until we successfully acquire the write lock */ @@ -971,6 +1085,7 @@ static struct rw_semaphore * rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) { long count; + bool disable_rspin; enum writer_wait_state wstate; struct rwsem_waiter waiter; struct rw_semaphore *ret = sem; @@ -981,6 +1096,13 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) rwsem_optimistic_spin(sem, true)) return sem; + /* + * Disable reader optimistic spinning for this rwsem after + * acquiring the write lock when the setting of the nonspinnable + * bits are observed. + */ + disable_rspin = atomic_long_read(&sem->owner) & RWSEM_NONSPINNABLE; + /* * Optimistic spinning failed, proceed to the slowpath * and block until we can acquire the sem. @@ -1077,6 +1199,7 @@ wait: } __set_current_state(TASK_RUNNING); list_del(&waiter.list); + rwsem_disable_reader_optspin(sem, disable_rspin); raw_spin_unlock_irq(&sem->wait_lock); lockevent_inc(rwsem_wlock); @@ -1196,7 +1319,8 @@ static inline void __down_write(struct rw_semaphore *sem) if (unlikely(!atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, RWSEM_WRITER_LOCKED))) rwsem_down_write_slowpath(sem, TASK_UNINTERRUPTIBLE); - rwsem_set_owner(sem); + else + rwsem_set_owner(sem); } static inline int __down_write_killable(struct rw_semaphore *sem) @@ -1207,8 +1331,9 @@ static inline int __down_write_killable(struct rw_semaphore *sem) RWSEM_WRITER_LOCKED))) { if (IS_ERR(rwsem_down_write_slowpath(sem, TASK_KILLABLE))) return -EINTR; + } else { + rwsem_set_owner(sem); } - rwsem_set_owner(sem); return 0; } -- cgit v1.2.3 From a15ea1a35f1b2782befc8b958c123c5d6a7cab0a Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 20 May 2019 16:59:15 -0400 Subject: locking/rwsem: Guard against making count negative The upper bits of the count field is used as reader count. When sufficient number of active readers are present, the most significant bit will be set and the count becomes negative. If the number of active readers keep on piling up, we may eventually overflow the reader counts. This is not likely to happen unless the number of bits reserved for reader count is reduced because those bits are need for other purpose. To prevent this count overflow from happening, the most significant bit is now treated as a guard bit (RWSEM_FLAG_READFAIL). Read-lock attempts will now fail for both the fast and slow paths whenever this bit is set. So all those extra readers will be put to sleep in the wait list. Wakeup will not happen until the reader count reaches 0. Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Cc: huang ying Link: https://lkml.kernel.org/r/20190520205918.22251-17-longman@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/rwsem.c | 53 ++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 41 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index e1e0bac957c4..37524a47f002 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -116,13 +116,28 @@ #endif /* - * The definition of the atomic counter in the semaphore: + * On 64-bit architectures, the bit definitions of the count are: * - * Bit 0 - writer locked bit - * Bit 1 - waiters present bit - * Bit 2 - lock handoff bit - * Bits 3-7 - reserved - * Bits 8-X - 24-bit (32-bit) or 56-bit reader count + * Bit 0 - writer locked bit + * Bit 1 - waiters present bit + * Bit 2 - lock handoff bit + * Bits 3-7 - reserved + * Bits 8-62 - 55-bit reader count + * Bit 63 - read fail bit + * + * On 32-bit architectures, the bit definitions of the count are: + * + * Bit 0 - writer locked bit + * Bit 1 - waiters present bit + * Bit 2 - lock handoff bit + * Bits 3-7 - reserved + * Bits 8-30 - 23-bit reader count + * Bit 31 - read fail bit + * + * It is not likely that the most significant bit (read fail bit) will ever + * be set. This guard bit is still checked anyway in the down_read() fastpath + * just in case we need to use up more of the reader bits for other purpose + * in the future. * * atomic_long_fetch_add() is used to obtain reader lock, whereas * atomic_long_cmpxchg() will be used to obtain writer lock. @@ -139,6 +154,7 @@ #define RWSEM_WRITER_LOCKED (1UL << 0) #define RWSEM_FLAG_WAITERS (1UL << 1) #define RWSEM_FLAG_HANDOFF (1UL << 2) +#define RWSEM_FLAG_READFAIL (1UL << (BITS_PER_LONG - 1)) #define RWSEM_READER_SHIFT 8 #define RWSEM_READER_BIAS (1UL << RWSEM_READER_SHIFT) @@ -146,7 +162,7 @@ #define RWSEM_WRITER_MASK RWSEM_WRITER_LOCKED #define RWSEM_LOCK_MASK (RWSEM_WRITER_MASK|RWSEM_READER_MASK) #define RWSEM_READ_FAILED_MASK (RWSEM_WRITER_MASK|RWSEM_FLAG_WAITERS|\ - RWSEM_FLAG_HANDOFF) + RWSEM_FLAG_HANDOFF|RWSEM_FLAG_READFAIL) /* * All writes to owner are protected by WRITE_ONCE() to make sure that @@ -254,6 +270,14 @@ static inline void rwsem_set_nonspinnable(struct rw_semaphore *sem) owner | RWSEM_NONSPINNABLE)); } +static inline bool rwsem_read_trylock(struct rw_semaphore *sem) +{ + long cnt = atomic_long_add_return_acquire(RWSEM_READER_BIAS, &sem->count); + if (WARN_ON_ONCE(cnt < 0)) + rwsem_set_nonspinnable(sem); + return !(cnt & RWSEM_READ_FAILED_MASK); +} + /* * Return just the real task structure pointer of the owner */ @@ -402,6 +426,12 @@ static void rwsem_mark_wake(struct rw_semaphore *sem, return; } + /* + * No reader wakeup if there are too many of them already. + */ + if (unlikely(atomic_long_read(&sem->count) < 0)) + return; + /* * Writers might steal the lock before we grant it to the next reader. * We prefer to do the first reader grant before counting readers @@ -949,9 +979,9 @@ static struct rw_semaphore __sched * rwsem_down_read_slowpath(struct rw_semaphore *sem, int state) { long count, adjustment = -RWSEM_READER_BIAS; - bool wake = false; struct rwsem_waiter waiter; DEFINE_WAKE_Q(wake_q); + bool wake = false; /* * Save the current read-owner of rwsem, if available, and the @@ -1270,8 +1300,7 @@ static struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) */ inline void __down_read(struct rw_semaphore *sem) { - if (unlikely(atomic_long_fetch_add_acquire(RWSEM_READER_BIAS, - &sem->count) & RWSEM_READ_FAILED_MASK)) { + if (!rwsem_read_trylock(sem)) { rwsem_down_read_slowpath(sem, TASK_UNINTERRUPTIBLE); DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); } else { @@ -1281,8 +1310,7 @@ inline void __down_read(struct rw_semaphore *sem) static inline int __down_read_killable(struct rw_semaphore *sem) { - if (unlikely(atomic_long_fetch_add_acquire(RWSEM_READER_BIAS, - &sem->count) & RWSEM_READ_FAILED_MASK)) { + if (!rwsem_read_trylock(sem)) { if (IS_ERR(rwsem_down_read_slowpath(sem, TASK_KILLABLE))) return -EINTR; DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); @@ -1359,6 +1387,7 @@ inline void __up_read(struct rw_semaphore *sem) DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); rwsem_clear_reader_owned(sem); tmp = atomic_long_add_return_release(-RWSEM_READER_BIAS, &sem->count); + DEBUG_RWSEMS_WARN_ON(tmp < 0, sem); if (unlikely((tmp & (RWSEM_LOCK_MASK|RWSEM_FLAG_WAITERS)) == RWSEM_FLAG_WAITERS)) { clear_wr_nonspinnable(sem); -- cgit v1.2.3 From 8dc2d993cf1aea578d947e34ea2d5c6ffd1644e4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 27 Feb 2019 10:27:58 +0100 Subject: x86/percpu, sched/fair: Avoid local_clock() Nadav reported that code-gen changed because of the this_cpu_*() constraints, avoid this for select_idle_cpu() because that runs with preemption (and IRQs) disabled anyway. Reported-by: Nadav Amit Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f35930f5e528..8591529e1753 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6189,6 +6189,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t u64 time, cost; s64 delta; int cpu, nr = INT_MAX; + int this = smp_processor_id(); this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); if (!this_sd) @@ -6212,7 +6213,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t nr = 4; } - time = local_clock(); + time = cpu_clock(this); for_each_cpu_wrap(cpu, sched_domain_span(sd), target) { if (!--nr) @@ -6223,7 +6224,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t break; } - time = local_clock() - time; + time = cpu_clock(this) - time; cost = this_sd->avg_scan_cost; delta = (s64)(time - cost) / 8; this_sd->avg_scan_cost += delta; -- cgit v1.2.3 From f0553dcb9778c343641d3a41f1db01be02e7551b Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 10 Jun 2019 16:22:19 -0500 Subject: tracepoint: Use struct_size() in kmalloc() One of the more common cases of allocation size calculations is finding the size of a structure that has a zero-sized array at the end, along with memory for some number of elements for that array. For example: struct tp_probes { ... struct tracepoint_func probes[0]; }; instance = kmalloc(sizeof(sizeof(struct tp_probes) + sizeof(struct tracepoint_func) * count, GFP_KERNEL); Instead of leaving these open-coded and prone to type mistakes, we can now use the new struct_size() helper: instance = kmalloc(struct_size(instance, probes, count) GFP_KERNEL); This code was detected with the help of Coccinelle. Signed-off-by: Gustavo A. R. Silva Signed-off-by: Steven Rostedt (VMware) --- kernel/tracepoint.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 46f2ab1e08a9..fb9353ed901b 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -68,8 +68,8 @@ struct tp_probes { static inline void *allocate_probes(int count) { - struct tp_probes *p = kmalloc(count * sizeof(struct tracepoint_func) - + sizeof(struct tp_probes), GFP_KERNEL); + struct tp_probes *p = kmalloc(struct_size(p, probes, count), + GFP_KERNEL); return p == NULL ? NULL : p->probes; } -- cgit v1.2.3 From f7cf25b2026dc8441e0fa3a202c2aa8a56211e30 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Sat, 15 Jun 2019 12:12:17 -0700 Subject: bpf: track spill/fill of constants Compilers often spill induction variables into the stack, hence it is necessary for the verifier to track scalar values of the registers through stack slots. Also few bpf programs were incorrectly rejected in the past, since the verifier was not able to track such constants while they were used to compute offsets into packet headers. Tracking constants through the stack significantly decreases the chances of state pruning, since two different constants are considered to be different by state equivalency. End result that cilium tests suffer serious degradation in the number of states processed and corresponding verification time increase. before after bpf_lb-DLB_L3.o 1838 6441 bpf_lb-DLB_L4.o 3218 5908 bpf_lb-DUNKNOWN.o 1064 1064 bpf_lxc-DDROP_ALL.o 26935 93790 bpf_lxc-DUNKNOWN.o 34439 123886 bpf_netdev.o 9721 31413 bpf_overlay.o 6184 18561 bpf_lxc_jit.o 39389 359445 After further debugging turned out that cillium progs are getting hurt by clang due to the same constant tracking issue. Newer clang generates better code by spilling less to the stack. Instead it keeps more constants in the registers which hurts state pruning since the verifier already tracks constants in the registers: old clang new clang (no spill/fill tracking introduced by this patch) bpf_lb-DLB_L3.o 1838 1923 bpf_lb-DLB_L4.o 3218 3077 bpf_lb-DUNKNOWN.o 1064 1062 bpf_lxc-DDROP_ALL.o 26935 166729 bpf_lxc-DUNKNOWN.o 34439 174607 bpf_netdev.o 9721 8407 bpf_overlay.o 6184 5420 bpf_lcx_jit.o 39389 39389 The final table is depressing: old clang old clang new clang new clang const spill/fill const spill/fill bpf_lb-DLB_L3.o 1838 6441 1923 8128 bpf_lb-DLB_L4.o 3218 5908 3077 6707 bpf_lb-DUNKNOWN.o 1064 1064 1062 1062 bpf_lxc-DDROP_ALL.o 26935 93790 166729 380712 bpf_lxc-DUNKNOWN.o 34439 123886 174607 440652 bpf_netdev.o 9721 31413 8407 31904 bpf_overlay.o 6184 18561 5420 23569 bpf_lxc_jit.o 39389 359445 39389 359445 Tracking constants in the registers hurts state pruning already. Adding tracking of constants through stack hurts pruning even more. The later patch address this general constant tracking issue with coarse/precise logic. Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 90 +++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 65 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8d1786357a09..6c13d86569a6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1378,6 +1378,23 @@ static bool register_is_null(struct bpf_reg_state *reg) return reg->type == SCALAR_VALUE && tnum_equals_const(reg->var_off, 0); } +static bool register_is_const(struct bpf_reg_state *reg) +{ + return reg->type == SCALAR_VALUE && tnum_is_const(reg->var_off); +} + +static void save_register_state(struct bpf_func_state *state, + int spi, struct bpf_reg_state *reg) +{ + int i; + + state->stack[spi].spilled_ptr = *reg; + state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; + + for (i = 0; i < BPF_REG_SIZE; i++) + state->stack[spi].slot_type[i] = STACK_SPILL; +} + /* check_stack_read/write functions track spill/fill of registers, * stack boundary and alignment are checked in check_mem_access() */ @@ -1387,7 +1404,7 @@ static int check_stack_write(struct bpf_verifier_env *env, { struct bpf_func_state *cur; /* state of the current function */ int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err; - enum bpf_reg_type type; + struct bpf_reg_state *reg = NULL; err = realloc_func_state(state, round_up(slot + 1, BPF_REG_SIZE), state->acquired_refs, true); @@ -1404,27 +1421,37 @@ static int check_stack_write(struct bpf_verifier_env *env, } cur = env->cur_state->frame[env->cur_state->curframe]; - if (value_regno >= 0 && - is_spillable_regtype((type = cur->regs[value_regno].type))) { + if (value_regno >= 0) + reg = &cur->regs[value_regno]; + if (reg && size == BPF_REG_SIZE && register_is_const(reg) && + !register_is_null(reg) && env->allow_ptr_leaks) { + save_register_state(state, spi, reg); + } else if (reg && is_spillable_regtype(reg->type)) { /* register containing pointer is being spilled into stack */ if (size != BPF_REG_SIZE) { + verbose_linfo(env, insn_idx, "; "); verbose(env, "invalid size of register spill\n"); return -EACCES; } - if (state != cur && type == PTR_TO_STACK) { + if (state != cur && reg->type == PTR_TO_STACK) { verbose(env, "cannot spill pointers to stack into stack frame of the caller\n"); return -EINVAL; } - /* save register state */ - state->stack[spi].spilled_ptr = cur->regs[value_regno]; - state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; + if (!env->allow_ptr_leaks) { + bool sanitize = false; - for (i = 0; i < BPF_REG_SIZE; i++) { - if (state->stack[spi].slot_type[i] == STACK_MISC && - !env->allow_ptr_leaks) { + if (state->stack[spi].slot_type[0] == STACK_SPILL && + register_is_const(&state->stack[spi].spilled_ptr)) + sanitize = true; + for (i = 0; i < BPF_REG_SIZE; i++) + if (state->stack[spi].slot_type[i] == STACK_MISC) { + sanitize = true; + break; + } + if (sanitize) { int *poff = &env->insn_aux_data[insn_idx].sanitize_stack_off; int soff = (-spi - 1) * BPF_REG_SIZE; @@ -1447,8 +1474,8 @@ static int check_stack_write(struct bpf_verifier_env *env, } *poff = soff; } - state->stack[spi].slot_type[i] = STACK_SPILL; } + save_register_state(state, spi, reg); } else { u8 type = STACK_MISC; @@ -1471,8 +1498,7 @@ static int check_stack_write(struct bpf_verifier_env *env, state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; /* when we zero initialize stack slots mark them as such */ - if (value_regno >= 0 && - register_is_null(&cur->regs[value_regno])) + if (reg && register_is_null(reg)) type = STACK_ZERO; /* Mark slots affected by this stack write. */ @@ -1490,6 +1516,7 @@ static int check_stack_read(struct bpf_verifier_env *env, struct bpf_verifier_state *vstate = env->cur_state; struct bpf_func_state *state = vstate->frame[vstate->curframe]; int i, slot = -off - 1, spi = slot / BPF_REG_SIZE; + struct bpf_reg_state *reg; u8 *stype; if (reg_state->allocated_stack <= slot) { @@ -1498,11 +1525,21 @@ static int check_stack_read(struct bpf_verifier_env *env, return -EACCES; } stype = reg_state->stack[spi].slot_type; + reg = ®_state->stack[spi].spilled_ptr; if (stype[0] == STACK_SPILL) { if (size != BPF_REG_SIZE) { - verbose(env, "invalid size of register spill\n"); - return -EACCES; + if (reg->type != SCALAR_VALUE) { + verbose_linfo(env, env->insn_idx, "; "); + verbose(env, "invalid size of register fill\n"); + return -EACCES; + } + if (value_regno >= 0) { + mark_reg_unknown(env, state->regs, value_regno); + state->regs[value_regno].live |= REG_LIVE_WRITTEN; + } + mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64); + return 0; } for (i = 1; i < BPF_REG_SIZE; i++) { if (stype[(slot - i) % BPF_REG_SIZE] != STACK_SPILL) { @@ -1513,17 +1550,14 @@ static int check_stack_read(struct bpf_verifier_env *env, if (value_regno >= 0) { /* restore register state from stack */ - state->regs[value_regno] = reg_state->stack[spi].spilled_ptr; + state->regs[value_regno] = *reg; /* mark reg as written since spilled pointer state likely * has its liveness marks cleared by is_state_visited() * which resets stack/reg liveness for state transitions */ state->regs[value_regno].live |= REG_LIVE_WRITTEN; } - mark_reg_read(env, ®_state->stack[spi].spilled_ptr, - reg_state->stack[spi].spilled_ptr.parent, - REG_LIVE_READ64); - return 0; + mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64); } else { int zeros = 0; @@ -1538,9 +1572,7 @@ static int check_stack_read(struct bpf_verifier_env *env, off, i, size); return -EACCES; } - mark_reg_read(env, ®_state->stack[spi].spilled_ptr, - reg_state->stack[spi].spilled_ptr.parent, - REG_LIVE_READ64); + mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64); if (value_regno >= 0) { if (zeros == size) { /* any size read into register is zero extended, @@ -1553,8 +1585,8 @@ static int check_stack_read(struct bpf_verifier_env *env, } state->regs[value_regno].live |= REG_LIVE_WRITTEN; } - return 0; } + return 0; } static int check_stack_access(struct bpf_verifier_env *env, @@ -2415,7 +2447,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, { struct bpf_reg_state *reg = reg_state(env, regno); struct bpf_func_state *state = func(env, reg); - int err, min_off, max_off, i, slot, spi; + int err, min_off, max_off, i, j, slot, spi; if (reg->type != PTR_TO_STACK) { /* Allow zero-byte read from NULL, regardless of pointer type */ @@ -2503,6 +2535,14 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, *stype = STACK_MISC; goto mark; } + if (state->stack[spi].slot_type[0] == STACK_SPILL && + state->stack[spi].spilled_ptr.type == SCALAR_VALUE) { + __mark_reg_unknown(&state->stack[spi].spilled_ptr); + for (j = 0; j < BPF_REG_SIZE; j++) + state->stack[spi].slot_type[j] = STACK_MISC; + goto mark; + } + err: if (tnum_is_const(reg->var_off)) { verbose(env, "invalid indirect read from stack off %d+%d size %d\n", -- cgit v1.2.3 From fb8d251ee2a6bf4d7f4af5548e9c8f4fb5f90402 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Sat, 15 Jun 2019 12:12:19 -0700 Subject: bpf: extend is_branch_taken to registers This patch extends is_branch_taken() logic from JMP+K instructions to JMP+X instructions. Conditional branches are often done when src and dst registers contain known scalars. In such case the verifier can follow the branch that is going to be taken when program executes. That speeds up the verification and is essential feature to support bounded loops. Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6c13d86569a6..8d3a4ef1d969 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5266,9 +5266,10 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, struct bpf_verifier_state *this_branch = env->cur_state; struct bpf_verifier_state *other_branch; struct bpf_reg_state *regs = this_branch->frame[this_branch->curframe]->regs; - struct bpf_reg_state *dst_reg, *other_branch_regs; + struct bpf_reg_state *dst_reg, *other_branch_regs, *src_reg = NULL; u8 opcode = BPF_OP(insn->code); bool is_jmp32; + int pred = -1; int err; /* Only conditional jumps are expected to reach here. */ @@ -5293,6 +5294,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, insn->src_reg); return -EACCES; } + src_reg = ®s[insn->src_reg]; } else { if (insn->src_reg != BPF_REG_0) { verbose(env, "BPF_JMP/JMP32 uses reserved fields\n"); @@ -5308,20 +5310,22 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, dst_reg = ®s[insn->dst_reg]; is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32; - if (BPF_SRC(insn->code) == BPF_K) { - int pred = is_branch_taken(dst_reg, insn->imm, opcode, - is_jmp32); - - if (pred == 1) { - /* only follow the goto, ignore fall-through */ - *insn_idx += insn->off; - return 0; - } else if (pred == 0) { - /* only follow fall-through branch, since - * that's where the program will go - */ - return 0; - } + if (BPF_SRC(insn->code) == BPF_K) + pred = is_branch_taken(dst_reg, insn->imm, + opcode, is_jmp32); + else if (src_reg->type == SCALAR_VALUE && + tnum_is_const(src_reg->var_off)) + pred = is_branch_taken(dst_reg, src_reg->var_off.value, + opcode, is_jmp32); + if (pred == 1) { + /* only follow the goto, ignore fall-through */ + *insn_idx += insn->off; + return 0; + } else if (pred == 0) { + /* only follow fall-through branch, since + * that's where the program will go + */ + return 0; } other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx, -- cgit v1.2.3 From 2589726d12a1b12eaaa93c7f1ea64287e383c7a5 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Sat, 15 Jun 2019 12:12:20 -0700 Subject: bpf: introduce bounded loops Allow the verifier to validate the loops by simulating their execution. Exisiting programs have used '#pragma unroll' to unroll the loops by the compiler. Instead let the verifier simulate all iterations of the loop. In order to do that introduce parentage chain of bpf_verifier_state and 'branches' counter for the number of branches left to explore. See more detailed algorithm description in bpf_verifier.h This algorithm borrows the key idea from Edward Cree approach: https://patchwork.ozlabs.org/patch/877222/ Additional state pruning heuristics make such brute force loop walk practical even for large loops. Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 143 +++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 131 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8d3a4ef1d969..25baa3c8cdd2 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -721,6 +721,8 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, dst_state->speculative = src->speculative; dst_state->curframe = src->curframe; dst_state->active_spin_lock = src->active_spin_lock; + dst_state->branches = src->branches; + dst_state->parent = src->parent; for (i = 0; i <= src->curframe; i++) { dst = dst_state->frame[i]; if (!dst) { @@ -736,6 +738,23 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, return 0; } +static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifier_state *st) +{ + while (st) { + u32 br = --st->branches; + + /* WARN_ON(br > 1) technically makes sense here, + * but see comment in push_stack(), hence: + */ + WARN_ONCE((int)br < 0, + "BUG update_branch_counts:branches_to_explore=%d\n", + br); + if (br) + break; + st = st->parent; + } +} + static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx, int *insn_idx) { @@ -789,6 +808,18 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, env->stack_size); goto err; } + if (elem->st.parent) { + ++elem->st.parent->branches; + /* WARN_ON(branches > 2) technically makes sense here, + * but + * 1. speculative states will bump 'branches' for non-branch + * instructions + * 2. is_state_visited() heuristics may decide not to create + * a new state for a sequence of branches and all such current + * and cloned states will be pointing to a single parent state + * which might have large 'branches' count. + */ + } return &elem->st; err: free_verifier_state(env->cur_state, true); @@ -5682,7 +5713,8 @@ static void init_explored_state(struct bpf_verifier_env *env, int idx) * w - next instruction * e - edge */ -static int push_insn(int t, int w, int e, struct bpf_verifier_env *env) +static int push_insn(int t, int w, int e, struct bpf_verifier_env *env, + bool loop_ok) { int *insn_stack = env->cfg.insn_stack; int *insn_state = env->cfg.insn_state; @@ -5712,6 +5744,8 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env) insn_stack[env->cfg.cur_stack++] = w; return 1; } else if ((insn_state[w] & 0xF0) == DISCOVERED) { + if (loop_ok && env->allow_ptr_leaks) + return 0; verbose_linfo(env, t, "%d: ", t); verbose_linfo(env, w, "%d: ", w); verbose(env, "back-edge from insn %d to %d\n", t, w); @@ -5763,7 +5797,7 @@ peek_stack: if (opcode == BPF_EXIT) { goto mark_explored; } else if (opcode == BPF_CALL) { - ret = push_insn(t, t + 1, FALLTHROUGH, env); + ret = push_insn(t, t + 1, FALLTHROUGH, env, false); if (ret == 1) goto peek_stack; else if (ret < 0) @@ -5772,7 +5806,8 @@ peek_stack: init_explored_state(env, t + 1); if (insns[t].src_reg == BPF_PSEUDO_CALL) { init_explored_state(env, t); - ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env); + ret = push_insn(t, t + insns[t].imm + 1, BRANCH, + env, false); if (ret == 1) goto peek_stack; else if (ret < 0) @@ -5785,7 +5820,7 @@ peek_stack: } /* unconditional jump with single edge */ ret = push_insn(t, t + insns[t].off + 1, - FALLTHROUGH, env); + FALLTHROUGH, env, true); if (ret == 1) goto peek_stack; else if (ret < 0) @@ -5798,13 +5833,13 @@ peek_stack: } else { /* conditional jump with two edges */ init_explored_state(env, t); - ret = push_insn(t, t + 1, FALLTHROUGH, env); + ret = push_insn(t, t + 1, FALLTHROUGH, env, true); if (ret == 1) goto peek_stack; else if (ret < 0) goto err_free; - ret = push_insn(t, t + insns[t].off + 1, BRANCH, env); + ret = push_insn(t, t + insns[t].off + 1, BRANCH, env, true); if (ret == 1) goto peek_stack; else if (ret < 0) @@ -5814,7 +5849,7 @@ peek_stack: /* all other non-branch instructions with single * fall-through edge */ - ret = push_insn(t, t + 1, FALLTHROUGH, env); + ret = push_insn(t, t + 1, FALLTHROUGH, env, false); if (ret == 1) goto peek_stack; else if (ret < 0) @@ -6247,6 +6282,8 @@ static void clean_live_states(struct bpf_verifier_env *env, int insn, sl = *explored_state(env, insn); while (sl) { + if (sl->state.branches) + goto next; if (sl->state.insn_idx != insn || sl->state.curframe != cur->curframe) goto next; @@ -6611,12 +6648,32 @@ static int propagate_liveness(struct bpf_verifier_env *env, return 0; } +static bool states_maybe_looping(struct bpf_verifier_state *old, + struct bpf_verifier_state *cur) +{ + struct bpf_func_state *fold, *fcur; + int i, fr = cur->curframe; + + if (old->curframe != fr) + return false; + + fold = old->frame[fr]; + fcur = cur->frame[fr]; + for (i = 0; i < MAX_BPF_REG; i++) + if (memcmp(&fold->regs[i], &fcur->regs[i], + offsetof(struct bpf_reg_state, parent))) + return false; + return true; +} + + static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) { struct bpf_verifier_state_list *new_sl; struct bpf_verifier_state_list *sl, **pprev; struct bpf_verifier_state *cur = env->cur_state, *new; int i, j, err, states_cnt = 0; + bool add_new_state = false; if (!env->insn_aux_data[insn_idx].prune_point) /* this 'insn_idx' instruction wasn't marked, so we will not @@ -6624,6 +6681,18 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) */ return 0; + /* bpf progs typically have pruning point every 4 instructions + * http://vger.kernel.org/bpfconf2019.html#session-1 + * Do not add new state for future pruning if the verifier hasn't seen + * at least 2 jumps and at least 8 instructions. + * This heuristics helps decrease 'total_states' and 'peak_states' metric. + * In tests that amounts to up to 50% reduction into total verifier + * memory consumption and 20% verifier time speedup. + */ + if (env->jmps_processed - env->prev_jmps_processed >= 2 && + env->insn_processed - env->prev_insn_processed >= 8) + add_new_state = true; + pprev = explored_state(env, insn_idx); sl = *pprev; @@ -6633,6 +6702,30 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) states_cnt++; if (sl->state.insn_idx != insn_idx) goto next; + if (sl->state.branches) { + if (states_maybe_looping(&sl->state, cur) && + states_equal(env, &sl->state, cur)) { + verbose_linfo(env, insn_idx, "; "); + verbose(env, "infinite loop detected at insn %d\n", insn_idx); + return -EINVAL; + } + /* if the verifier is processing a loop, avoid adding new state + * too often, since different loop iterations have distinct + * states and may not help future pruning. + * This threshold shouldn't be too low to make sure that + * a loop with large bound will be rejected quickly. + * The most abusive loop will be: + * r1 += 1 + * if r1 < 1000000 goto pc-2 + * 1M insn_procssed limit / 100 == 10k peak states. + * This threshold shouldn't be too high either, since states + * at the end of the loop are likely to be useful in pruning. + */ + if (env->jmps_processed - env->prev_jmps_processed < 20 && + env->insn_processed - env->prev_insn_processed < 100) + add_new_state = false; + goto miss; + } if (states_equal(env, &sl->state, cur)) { sl->hit_cnt++; /* reached equivalent register/stack state, @@ -6650,7 +6743,15 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) return err; return 1; } - sl->miss_cnt++; +miss: + /* when new state is not going to be added do not increase miss count. + * Otherwise several loop iterations will remove the state + * recorded earlier. The goal of these heuristics is to have + * states from some iterations of the loop (some in the beginning + * and some at the end) to help pruning. + */ + if (add_new_state) + sl->miss_cnt++; /* heuristic to determine whether this state is beneficial * to keep checking from state equivalence point of view. * Higher numbers increase max_states_per_insn and verification time, @@ -6662,6 +6763,11 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) */ *pprev = sl->next; if (sl->state.frame[0]->regs[0].live & REG_LIVE_DONE) { + u32 br = sl->state.branches; + + WARN_ONCE(br, + "BUG live_done but branches_to_explore %d\n", + br); free_verifier_state(&sl->state, false); kfree(sl); env->peak_states--; @@ -6687,18 +6793,25 @@ next: if (!env->allow_ptr_leaks && states_cnt > BPF_COMPLEXITY_LIMIT_STATES) return 0; - /* there were no equivalent states, remember current one. - * technically the current state is not proven to be safe yet, + if (!add_new_state) + return 0; + + /* There were no equivalent states, remember the current one. + * Technically the current state is not proven to be safe yet, * but it will either reach outer most bpf_exit (which means it's safe) - * or it will be rejected. Since there are no loops, we won't be + * or it will be rejected. When there are no loops the verifier won't be * seeing this tuple (frame[0].callsite, frame[1].callsite, .. insn_idx) - * again on the way to bpf_exit + * again on the way to bpf_exit. + * When looping the sl->state.branches will be > 0 and this state + * will not be considered for equivalence until branches == 0. */ new_sl = kzalloc(sizeof(struct bpf_verifier_state_list), GFP_KERNEL); if (!new_sl) return -ENOMEM; env->total_states++; env->peak_states++; + env->prev_jmps_processed = env->jmps_processed; + env->prev_insn_processed = env->insn_processed; /* add new state to the head of linked list */ new = &new_sl->state; @@ -6709,6 +6822,9 @@ next: return err; } new->insn_idx = insn_idx; + WARN_ONCE(new->branches != 1, + "BUG is_state_visited:branches_to_explore=%d insn %d\n", new->branches, insn_idx); + cur->parent = new; new_sl->next = *explored_state(env, insn_idx); *explored_state(env, insn_idx) = new_sl; /* connect new state to parentage chain. Current frame needs all @@ -6795,6 +6911,7 @@ static int do_check(struct bpf_verifier_env *env) return -ENOMEM; state->curframe = 0; state->speculative = false; + state->branches = 1; state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL); if (!state->frame[0]) { kfree(state); @@ -7001,6 +7118,7 @@ static int do_check(struct bpf_verifier_env *env) } else if (class == BPF_JMP || class == BPF_JMP32) { u8 opcode = BPF_OP(insn->code); + env->jmps_processed++; if (opcode == BPF_CALL) { if (BPF_SRC(insn->code) != BPF_K || insn->off != 0 || @@ -7086,6 +7204,7 @@ static int do_check(struct bpf_verifier_env *env) if (err) return err; process_bpf_exit: + update_branch_counts(env, env->cur_state); err = pop_stack(env, &env->prev_insn_idx, &env->insn_idx); if (err < 0) { -- cgit v1.2.3 From eea1c227b9e9bad295e8ef984004a9acf12bb68c Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Sat, 15 Jun 2019 12:12:21 -0700 Subject: bpf: fix callees pruning callers The commit 7640ead93924 partially resolved the issue of callees incorrectly pruning the callers. With introduction of bounded loops and jmps_processed heuristic single verifier state may contain multiple branches and calls. It's possible that new verifier state (for future pruning) will be allocated inside callee. Then callee will exit (still within the same verifier state). It will go back to the caller and there R6-R9 registers will be read and will trigger mark_reg_read. But the reg->live for all frames but the top frame is not set to LIVE_NONE. Hence mark_reg_read will fail to propagate liveness into parent and future walking will incorrectly conclude that the states are equivalent because LIVE_READ is not set. In other words the rule for parent/live should be: whenever register parentage chain is set the reg->live should be set to LIVE_NONE. is_state_visited logic already follows this rule for spilled registers. Fixes: 7640ead93924 ("bpf: verifier: make sure callees don't prune with caller differences") Fixes: f4d7e40a5b71 ("bpf: introduce function calls (verification)") Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 25baa3c8cdd2..870c8f19ce80 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6834,17 +6834,18 @@ next: * the state of the call instruction (with WRITTEN set), and r0 comes * from callee with its full parentage chain, anyway. */ - for (j = 0; j <= cur->curframe; j++) - for (i = j < cur->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++) - cur->frame[j]->regs[i].parent = &new->frame[j]->regs[i]; /* clear write marks in current state: the writes we did are not writes * our child did, so they don't screen off its reads from us. * (There are no read marks in current state, because reads always mark * their parent and current state never has children yet. Only * explored_states can get read marks.) */ - for (i = 0; i < BPF_REG_FP; i++) - cur->frame[cur->curframe]->regs[i].live = REG_LIVE_NONE; + for (j = 0; j <= cur->curframe; j++) { + for (i = j < cur->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++) + cur->frame[j]->regs[i].parent = &new->frame[j]->regs[i]; + for (i = 0; i < BPF_REG_FP; i++) + cur->frame[j]->regs[i].live = REG_LIVE_NONE; + } /* all stack frames are accessible from callee, clear them all */ for (j = 0; j <= cur->curframe; j++) { -- cgit v1.2.3 From b5dc0163d8fd78e64a7e21f309cf932fda34353e Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Sat, 15 Jun 2019 12:12:25 -0700 Subject: bpf: precise scalar_value tracking Introduce precision tracking logic that helps cilium programs the most: old clang old clang new clang new clang with all patches with all patches bpf_lb-DLB_L3.o 1838 2283 1923 1863 bpf_lb-DLB_L4.o 3218 2657 3077 2468 bpf_lb-DUNKNOWN.o 1064 545 1062 544 bpf_lxc-DDROP_ALL.o 26935 23045 166729 22629 bpf_lxc-DUNKNOWN.o 34439 35240 174607 28805 bpf_netdev.o 9721 8753 8407 6801 bpf_overlay.o 6184 7901 5420 4754 bpf_lxc_jit.o 39389 50925 39389 50925 Consider code: 654: (85) call bpf_get_hash_recalc#34 655: (bf) r7 = r0 656: (15) if r8 == 0x0 goto pc+29 657: (bf) r2 = r10 658: (07) r2 += -48 659: (18) r1 = 0xffff8881e41e1b00 661: (85) call bpf_map_lookup_elem#1 662: (15) if r0 == 0x0 goto pc+23 663: (69) r1 = *(u16 *)(r0 +0) 664: (15) if r1 == 0x0 goto pc+21 665: (bf) r8 = r7 666: (57) r8 &= 65535 667: (bf) r2 = r8 668: (3f) r2 /= r1 669: (2f) r2 *= r1 670: (bf) r1 = r8 671: (1f) r1 -= r2 672: (57) r1 &= 255 673: (25) if r1 > 0x1e goto pc+12 R0=map_value(id=0,off=0,ks=20,vs=64,imm=0) R1_w=inv(id=0,umax_value=30,var_off=(0x0; 0x1f)) 674: (67) r1 <<= 1 675: (0f) r0 += r1 At this point the verifier will notice that scalar R1 is used in map pointer adjustment. R1 has to be precise for later operations on R0 to be validated properly. The verifier will backtrack the above code in the following way: last_idx 675 first_idx 664 regs=2 stack=0 before 675: (0f) r0 += r1 // started backtracking R1 regs=2 is a bitmask regs=2 stack=0 before 674: (67) r1 <<= 1 regs=2 stack=0 before 673: (25) if r1 > 0x1e goto pc+12 regs=2 stack=0 before 672: (57) r1 &= 255 regs=2 stack=0 before 671: (1f) r1 -= r2 // now both R1 and R2 has to be precise -> regs=6 mask regs=6 stack=0 before 670: (bf) r1 = r8 // after this insn R8 and R2 has to be precise regs=104 stack=0 before 669: (2f) r2 *= r1 // after this one R8, R2, and R1 regs=106 stack=0 before 668: (3f) r2 /= r1 regs=106 stack=0 before 667: (bf) r2 = r8 regs=102 stack=0 before 666: (57) r8 &= 65535 regs=102 stack=0 before 665: (bf) r8 = r7 regs=82 stack=0 before 664: (15) if r1 == 0x0 goto pc+21 // this is the end of verifier state. The following regs will be marked precised: R1_rw=invP(id=0,umax_value=65535,var_off=(0x0; 0xffff)) R7_rw=invP(id=0) parent didn't have regs=82 stack=0 marks // so backtracking continues into parent state last_idx 663 first_idx 655 regs=82 stack=0 before 663: (69) r1 = *(u16 *)(r0 +0) // R1 was assigned no need to track it further regs=80 stack=0 before 662: (15) if r0 == 0x0 goto pc+23 // keep tracking R7 regs=80 stack=0 before 661: (85) call bpf_map_lookup_elem#1 // keep tracking R7 regs=80 stack=0 before 659: (18) r1 = 0xffff8881e41e1b00 regs=80 stack=0 before 658: (07) r2 += -48 regs=80 stack=0 before 657: (bf) r2 = r10 regs=80 stack=0 before 656: (15) if r8 == 0x0 goto pc+29 regs=80 stack=0 before 655: (bf) r7 = r0 // here the assignment into R7 // mark R0 to be precise: R0_rw=invP(id=0) parent didn't have regs=1 stack=0 marks // regs=1 -> tracking R0 last_idx 654 first_idx 644 regs=1 stack=0 before 654: (85) call bpf_get_hash_recalc#34 // and in the parent frame it was a return value // nothing further to backtrack Two scalar registers not marked precise are equivalent from state pruning point of view. More details in the patch comments. It doesn't support bpf2bpf calls yet and enabled for root only. Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 491 ++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 480 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 870c8f19ce80..709ce4cef8ba 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -455,12 +455,12 @@ static void print_verifier_state(struct bpf_verifier_env *env, verbose(env, " R%d", i); print_liveness(env, reg->live); verbose(env, "=%s", reg_type_str[t]); + if (t == SCALAR_VALUE && reg->precise) + verbose(env, "P"); if ((t == SCALAR_VALUE || t == PTR_TO_STACK) && tnum_is_const(reg->var_off)) { /* reg->off should be 0 for SCALAR_VALUE */ verbose(env, "%lld", reg->var_off.value + reg->off); - if (t == PTR_TO_STACK) - verbose(env, ",call_%d", func(env, reg)->callsite); } else { verbose(env, "(id=%d", reg->id); if (reg_type_may_be_refcounted_or_null(t)) @@ -522,11 +522,17 @@ static void print_verifier_state(struct bpf_verifier_env *env, continue; verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); print_liveness(env, state->stack[i].spilled_ptr.live); - if (state->stack[i].slot_type[0] == STACK_SPILL) - verbose(env, "=%s", - reg_type_str[state->stack[i].spilled_ptr.type]); - else + if (state->stack[i].slot_type[0] == STACK_SPILL) { + reg = &state->stack[i].spilled_ptr; + t = reg->type; + verbose(env, "=%s", reg_type_str[t]); + if (t == SCALAR_VALUE && reg->precise) + verbose(env, "P"); + if (t == SCALAR_VALUE && tnum_is_const(reg->var_off)) + verbose(env, "%lld", reg->var_off.value + reg->off); + } else { verbose(env, "=%s", types_buf); + } } if (state->acquired_refs && state->refs[0].id) { verbose(env, " refs=%d", state->refs[0].id); @@ -675,6 +681,13 @@ static void free_func_state(struct bpf_func_state *state) kfree(state); } +static void clear_jmp_history(struct bpf_verifier_state *state) +{ + kfree(state->jmp_history); + state->jmp_history = NULL; + state->jmp_history_cnt = 0; +} + static void free_verifier_state(struct bpf_verifier_state *state, bool free_self) { @@ -684,6 +697,7 @@ static void free_verifier_state(struct bpf_verifier_state *state, free_func_state(state->frame[i]); state->frame[i] = NULL; } + clear_jmp_history(state); if (free_self) kfree(state); } @@ -711,8 +725,18 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, const struct bpf_verifier_state *src) { struct bpf_func_state *dst; + u32 jmp_sz = sizeof(struct bpf_idx_pair) * src->jmp_history_cnt; int i, err; + if (dst_state->jmp_history_cnt < src->jmp_history_cnt) { + kfree(dst_state->jmp_history); + dst_state->jmp_history = kmalloc(jmp_sz, GFP_USER); + if (!dst_state->jmp_history) + return -ENOMEM; + } + memcpy(dst_state->jmp_history, src->jmp_history, jmp_sz); + dst_state->jmp_history_cnt = src->jmp_history_cnt; + /* if dst has more stack frames then src frame, free them */ for (i = src->curframe + 1; i <= dst_state->curframe; i++) { free_func_state(dst_state->frame[i]); @@ -723,6 +747,8 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, dst_state->active_spin_lock = src->active_spin_lock; dst_state->branches = src->branches; dst_state->parent = src->parent; + dst_state->first_insn_idx = src->first_insn_idx; + dst_state->last_insn_idx = src->last_insn_idx; for (i = 0; i <= src->curframe; i++) { dst = dst_state->frame[i]; if (!dst) { @@ -967,6 +993,9 @@ static void __mark_reg_unbounded(struct bpf_reg_state *reg) reg->smax_value = S64_MAX; reg->umin_value = 0; reg->umax_value = U64_MAX; + + /* constant backtracking is enabled for root only for now */ + reg->precise = capable(CAP_SYS_ADMIN) ? false : true; } /* Mark a register as having a completely unknown (scalar) value. */ @@ -1378,6 +1407,389 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, return 0; } +/* for any branch, call, exit record the history of jmps in the given state */ +static int push_jmp_history(struct bpf_verifier_env *env, + struct bpf_verifier_state *cur) +{ + u32 cnt = cur->jmp_history_cnt; + struct bpf_idx_pair *p; + + cnt++; + p = krealloc(cur->jmp_history, cnt * sizeof(*p), GFP_USER); + if (!p) + return -ENOMEM; + p[cnt - 1].idx = env->insn_idx; + p[cnt - 1].prev_idx = env->prev_insn_idx; + cur->jmp_history = p; + cur->jmp_history_cnt = cnt; + return 0; +} + +/* Backtrack one insn at a time. If idx is not at the top of recorded + * history then previous instruction came from straight line execution. + */ +static int get_prev_insn_idx(struct bpf_verifier_state *st, int i, + u32 *history) +{ + u32 cnt = *history; + + if (cnt && st->jmp_history[cnt - 1].idx == i) { + i = st->jmp_history[cnt - 1].prev_idx; + (*history)--; + } else { + i--; + } + return i; +} + +/* For given verifier state backtrack_insn() is called from the last insn to + * the first insn. Its purpose is to compute a bitmask of registers and + * stack slots that needs precision in the parent verifier state. + */ +static int backtrack_insn(struct bpf_verifier_env *env, int idx, + u32 *reg_mask, u64 *stack_mask) +{ + const struct bpf_insn_cbs cbs = { + .cb_print = verbose, + .private_data = env, + }; + struct bpf_insn *insn = env->prog->insnsi + idx; + u8 class = BPF_CLASS(insn->code); + u8 opcode = BPF_OP(insn->code); + u8 mode = BPF_MODE(insn->code); + u32 dreg = 1u << insn->dst_reg; + u32 sreg = 1u << insn->src_reg; + u32 spi; + + if (insn->code == 0) + return 0; + if (env->log.level & BPF_LOG_LEVEL) { + verbose(env, "regs=%x stack=%llx before ", *reg_mask, *stack_mask); + verbose(env, "%d: ", idx); + print_bpf_insn(&cbs, insn, env->allow_ptr_leaks); + } + + if (class == BPF_ALU || class == BPF_ALU64) { + if (!(*reg_mask & dreg)) + return 0; + if (opcode == BPF_MOV) { + if (BPF_SRC(insn->code) == BPF_X) { + /* dreg = sreg + * dreg needs precision after this insn + * sreg needs precision before this insn + */ + *reg_mask &= ~dreg; + *reg_mask |= sreg; + } else { + /* dreg = K + * dreg needs precision after this insn. + * Corresponding register is already marked + * as precise=true in this verifier state. + * No further markings in parent are necessary + */ + *reg_mask &= ~dreg; + } + } else { + if (BPF_SRC(insn->code) == BPF_X) { + /* dreg += sreg + * both dreg and sreg need precision + * before this insn + */ + *reg_mask |= sreg; + } /* else dreg += K + * dreg still needs precision before this insn + */ + } + } else if (class == BPF_LDX) { + if (!(*reg_mask & dreg)) + return 0; + *reg_mask &= ~dreg; + + /* scalars can only be spilled into stack w/o losing precision. + * Load from any other memory can be zero extended. + * The desire to keep that precision is already indicated + * by 'precise' mark in corresponding register of this state. + * No further tracking necessary. + */ + if (insn->src_reg != BPF_REG_FP) + return 0; + if (BPF_SIZE(insn->code) != BPF_DW) + return 0; + + /* dreg = *(u64 *)[fp - off] was a fill from the stack. + * that [fp - off] slot contains scalar that needs to be + * tracked with precision + */ + spi = (-insn->off - 1) / BPF_REG_SIZE; + if (spi >= 64) { + verbose(env, "BUG spi %d\n", spi); + WARN_ONCE(1, "verifier backtracking bug"); + return -EFAULT; + } + *stack_mask |= 1ull << spi; + } else if (class == BPF_STX) { + if (*reg_mask & dreg) + /* stx shouldn't be using _scalar_ dst_reg + * to access memory. It means backtracking + * encountered a case of pointer subtraction. + */ + return -ENOTSUPP; + /* scalars can only be spilled into stack */ + if (insn->dst_reg != BPF_REG_FP) + return 0; + if (BPF_SIZE(insn->code) != BPF_DW) + return 0; + spi = (-insn->off - 1) / BPF_REG_SIZE; + if (spi >= 64) { + verbose(env, "BUG spi %d\n", spi); + WARN_ONCE(1, "verifier backtracking bug"); + return -EFAULT; + } + if (!(*stack_mask & (1ull << spi))) + return 0; + *stack_mask &= ~(1ull << spi); + *reg_mask |= sreg; + } else if (class == BPF_JMP || class == BPF_JMP32) { + if (opcode == BPF_CALL) { + if (insn->src_reg == BPF_PSEUDO_CALL) + return -ENOTSUPP; + /* regular helper call sets R0 */ + *reg_mask &= ~1; + if (*reg_mask & 0x3f) { + /* if backtracing was looking for registers R1-R5 + * they should have been found already. + */ + verbose(env, "BUG regs %x\n", *reg_mask); + WARN_ONCE(1, "verifier backtracking bug"); + return -EFAULT; + } + } else if (opcode == BPF_EXIT) { + return -ENOTSUPP; + } + } else if (class == BPF_LD) { + if (!(*reg_mask & dreg)) + return 0; + *reg_mask &= ~dreg; + /* It's ld_imm64 or ld_abs or ld_ind. + * For ld_imm64 no further tracking of precision + * into parent is necessary + */ + if (mode == BPF_IND || mode == BPF_ABS) + /* to be analyzed */ + return -ENOTSUPP; + } else if (class == BPF_ST) { + if (*reg_mask & dreg) + /* likely pointer subtraction */ + return -ENOTSUPP; + } + return 0; +} + +/* the scalar precision tracking algorithm: + * . at the start all registers have precise=false. + * . scalar ranges are tracked as normal through alu and jmp insns. + * . once precise value of the scalar register is used in: + * . ptr + scalar alu + * . if (scalar cond K|scalar) + * . helper_call(.., scalar, ...) where ARG_CONST is expected + * backtrack through the verifier states and mark all registers and + * stack slots with spilled constants that these scalar regisers + * should be precise. + * . during state pruning two registers (or spilled stack slots) + * are equivalent if both are not precise. + * + * Note the verifier cannot simply walk register parentage chain, + * since many different registers and stack slots could have been + * used to compute single precise scalar. + * + * The approach of starting with precise=true for all registers and then + * backtrack to mark a register as not precise when the verifier detects + * that program doesn't care about specific value (e.g., when helper + * takes register as ARG_ANYTHING parameter) is not safe. + * + * It's ok to walk single parentage chain of the verifier states. + * It's possible that this backtracking will go all the way till 1st insn. + * All other branches will be explored for needing precision later. + * + * The backtracking needs to deal with cases like: + * R8=map_value(id=0,off=0,ks=4,vs=1952,imm=0) R9_w=map_value(id=0,off=40,ks=4,vs=1952,imm=0) + * r9 -= r8 + * r5 = r9 + * if r5 > 0x79f goto pc+7 + * R5_w=inv(id=0,umax_value=1951,var_off=(0x0; 0x7ff)) + * r5 += 1 + * ... + * call bpf_perf_event_output#25 + * where .arg5_type = ARG_CONST_SIZE_OR_ZERO + * + * and this case: + * r6 = 1 + * call foo // uses callee's r6 inside to compute r0 + * r0 += r6 + * if r0 == 0 goto + * + * to track above reg_mask/stack_mask needs to be independent for each frame. + * + * Also if parent's curframe > frame where backtracking started, + * the verifier need to mark registers in both frames, otherwise callees + * may incorrectly prune callers. This is similar to + * commit 7640ead93924 ("bpf: verifier: make sure callees don't prune with caller differences") + * + * For now backtracking falls back into conservative marking. + */ +static void mark_all_scalars_precise(struct bpf_verifier_env *env, + struct bpf_verifier_state *st) +{ + struct bpf_func_state *func; + struct bpf_reg_state *reg; + int i, j; + + /* big hammer: mark all scalars precise in this path. + * pop_stack may still get !precise scalars. + */ + for (; st; st = st->parent) + for (i = 0; i <= st->curframe; i++) { + func = st->frame[i]; + for (j = 0; j < BPF_REG_FP; j++) { + reg = &func->regs[j]; + if (reg->type != SCALAR_VALUE) + continue; + reg->precise = true; + } + for (j = 0; j < func->allocated_stack / BPF_REG_SIZE; j++) { + if (func->stack[j].slot_type[0] != STACK_SPILL) + continue; + reg = &func->stack[j].spilled_ptr; + if (reg->type != SCALAR_VALUE) + continue; + reg->precise = true; + } + } +} + +static int mark_chain_precision(struct bpf_verifier_env *env, int regno) +{ + struct bpf_verifier_state *st = env->cur_state; + int first_idx = st->first_insn_idx; + int last_idx = env->insn_idx; + struct bpf_func_state *func; + struct bpf_reg_state *reg; + u32 reg_mask = 1u << regno; + u64 stack_mask = 0; + bool skip_first = true; + int i, err; + + if (!env->allow_ptr_leaks) + /* backtracking is root only for now */ + return 0; + + func = st->frame[st->curframe]; + reg = &func->regs[regno]; + if (reg->type != SCALAR_VALUE) { + WARN_ONCE(1, "backtracing misuse"); + return -EFAULT; + } + if (reg->precise) + return 0; + func->regs[regno].precise = true; + + for (;;) { + DECLARE_BITMAP(mask, 64); + bool new_marks = false; + u32 history = st->jmp_history_cnt; + + if (env->log.level & BPF_LOG_LEVEL) + verbose(env, "last_idx %d first_idx %d\n", last_idx, first_idx); + for (i = last_idx;;) { + if (skip_first) { + err = 0; + skip_first = false; + } else { + err = backtrack_insn(env, i, ®_mask, &stack_mask); + } + if (err == -ENOTSUPP) { + mark_all_scalars_precise(env, st); + return 0; + } else if (err) { + return err; + } + if (!reg_mask && !stack_mask) + /* Found assignment(s) into tracked register in this state. + * Since this state is already marked, just return. + * Nothing to be tracked further in the parent state. + */ + return 0; + if (i == first_idx) + break; + i = get_prev_insn_idx(st, i, &history); + if (i >= env->prog->len) { + /* This can happen if backtracking reached insn 0 + * and there are still reg_mask or stack_mask + * to backtrack. + * It means the backtracking missed the spot where + * particular register was initialized with a constant. + */ + verbose(env, "BUG backtracking idx %d\n", i); + WARN_ONCE(1, "verifier backtracking bug"); + return -EFAULT; + } + } + st = st->parent; + if (!st) + break; + + func = st->frame[st->curframe]; + bitmap_from_u64(mask, reg_mask); + for_each_set_bit(i, mask, 32) { + reg = &func->regs[i]; + if (reg->type != SCALAR_VALUE) + continue; + if (!reg->precise) + new_marks = true; + reg->precise = true; + } + + bitmap_from_u64(mask, stack_mask); + for_each_set_bit(i, mask, 64) { + if (i >= func->allocated_stack / BPF_REG_SIZE) { + /* This can happen if backtracking + * is propagating stack precision where + * caller has larger stack frame + * than callee, but backtrack_insn() should + * have returned -ENOTSUPP. + */ + verbose(env, "BUG spi %d stack_size %d\n", + i, func->allocated_stack); + WARN_ONCE(1, "verifier backtracking bug"); + return -EFAULT; + } + + if (func->stack[i].slot_type[0] != STACK_SPILL) + continue; + reg = &func->stack[i].spilled_ptr; + if (reg->type != SCALAR_VALUE) + continue; + if (!reg->precise) + new_marks = true; + reg->precise = true; + } + if (env->log.level & BPF_LOG_LEVEL) { + print_verifier_state(env, func); + verbose(env, "parent %s regs=%x stack=%llx marks\n", + new_marks ? "didn't have" : "already had", + reg_mask, stack_mask); + } + + if (!new_marks) + break; + + last_idx = st->last_insn_idx; + first_idx = st->first_insn_idx; + } + return 0; +} + + static bool is_spillable_regtype(enum bpf_reg_type type) { switch (type) { @@ -1435,6 +1847,7 @@ static int check_stack_write(struct bpf_verifier_env *env, { struct bpf_func_state *cur; /* state of the current function */ int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err; + u32 dst_reg = env->prog->insnsi[insn_idx].dst_reg; struct bpf_reg_state *reg = NULL; err = realloc_func_state(state, round_up(slot + 1, BPF_REG_SIZE), @@ -1457,6 +1870,17 @@ static int check_stack_write(struct bpf_verifier_env *env, if (reg && size == BPF_REG_SIZE && register_is_const(reg) && !register_is_null(reg) && env->allow_ptr_leaks) { + if (dst_reg != BPF_REG_FP) { + /* The backtracking logic can only recognize explicit + * stack slot address like [fp - 8]. Other spill of + * scalar via different register has to be conervative. + * Backtrack from here and mark all registers as precise + * that contributed into 'reg' being a constant. + */ + err = mark_chain_precision(env, value_regno); + if (err) + return err; + } save_register_state(state, spi, reg); } else if (reg && is_spillable_regtype(reg->type)) { /* register containing pointer is being spilled into stack */ @@ -1529,8 +1953,13 @@ static int check_stack_write(struct bpf_verifier_env *env, state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; /* when we zero initialize stack slots mark them as such */ - if (reg && register_is_null(reg)) + if (reg && register_is_null(reg)) { + /* backtracking doesn't work for STACK_ZERO yet. */ + err = mark_chain_precision(env, value_regno); + if (err) + return err; type = STACK_ZERO; + } /* Mark slots affected by this stack write. */ for (i = 0; i < size; i++) @@ -1610,6 +2039,17 @@ static int check_stack_read(struct bpf_verifier_env *env, * so the whole register == const_zero */ __mark_reg_const_zero(&state->regs[value_regno]); + /* backtracking doesn't support STACK_ZERO yet, + * so mark it precise here, so that later + * backtracking can stop here. + * Backtracking may not need this if this register + * doesn't participate in pointer adjustment. + * Forward propagation of precise flag is not + * necessary either. This mark is only to stop + * backtracking. Any register that contributed + * to const 0 was marked precise before spill. + */ + state->regs[value_regno].precise = true; } else { /* have read misc data from the stack */ mark_reg_unknown(env, state->regs, value_regno); @@ -2925,6 +3365,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, err = check_helper_mem_access(env, regno - 1, reg->umax_value, zero_size_allowed, meta); + if (!err) + err = mark_chain_precision(env, regno); } else if (arg_type_is_int_ptr(arg_type)) { int size = int_ptr_type_to_size(arg_type); @@ -4361,6 +4803,7 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, struct bpf_reg_state *regs = state->regs, *dst_reg, *src_reg; struct bpf_reg_state *ptr_reg = NULL, off_reg = {0}; u8 opcode = BPF_OP(insn->code); + int err; dst_reg = ®s[insn->dst_reg]; src_reg = NULL; @@ -4387,11 +4830,17 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, * This is legal, but we have to reverse our * src/dest handling in computing the range */ + err = mark_chain_precision(env, insn->dst_reg); + if (err) + return err; return adjust_ptr_min_max_vals(env, insn, src_reg, dst_reg); } } else if (ptr_reg) { /* pointer += scalar */ + err = mark_chain_precision(env, insn->src_reg); + if (err) + return err; return adjust_ptr_min_max_vals(env, insn, dst_reg, src_reg); } @@ -5348,6 +5797,13 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, tnum_is_const(src_reg->var_off)) pred = is_branch_taken(dst_reg, src_reg->var_off.value, opcode, is_jmp32); + if (pred >= 0) { + err = mark_chain_precision(env, insn->dst_reg); + if (BPF_SRC(insn->code) == BPF_X && !err) + err = mark_chain_precision(env, insn->src_reg); + if (err) + return err; + } if (pred == 1) { /* only follow the goto, ignore fall-through */ *insn_idx += insn->off; @@ -5825,6 +6281,11 @@ peek_stack: goto peek_stack; else if (ret < 0) goto err_free; + /* unconditional jmp is not a good pruning point, + * but it's marked, since backtracking needs + * to record jmp history in is_state_visited(). + */ + init_explored_state(env, t + insns[t].off + 1); /* tell verifier to check for equivalent states * after every call and jump */ @@ -6325,6 +6786,8 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, switch (rold->type) { case SCALAR_VALUE: if (rcur->type == SCALAR_VALUE) { + if (!rold->precise && !rcur->precise) + return true; /* new val must satisfy old val knowledge */ return range_within(rold, rcur) && tnum_in(rold->var_off, rcur->var_off); @@ -6675,6 +7138,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) int i, j, err, states_cnt = 0; bool add_new_state = false; + cur->last_insn_idx = env->prev_insn_idx; if (!env->insn_aux_data[insn_idx].prune_point) /* this 'insn_idx' instruction wasn't marked, so we will not * be doing state search here @@ -6791,10 +7255,10 @@ next: env->max_states_per_insn = states_cnt; if (!env->allow_ptr_leaks && states_cnt > BPF_COMPLEXITY_LIMIT_STATES) - return 0; + return push_jmp_history(env, cur); if (!add_new_state) - return 0; + return push_jmp_history(env, cur); /* There were no equivalent states, remember the current one. * Technically the current state is not proven to be safe yet, @@ -6824,7 +7288,10 @@ next: new->insn_idx = insn_idx; WARN_ONCE(new->branches != 1, "BUG is_state_visited:branches_to_explore=%d insn %d\n", new->branches, insn_idx); + cur->parent = new; + cur->first_insn_idx = insn_idx; + clear_jmp_history(cur); new_sl->next = *explored_state(env, insn_idx); *explored_state(env, insn_idx) = new_sl; /* connect new state to parentage chain. Current frame needs all @@ -6904,6 +7371,7 @@ static int do_check(struct bpf_verifier_env *env) struct bpf_reg_state *regs; int insn_cnt = env->prog->len; bool do_print_state = false; + int prev_insn_idx = -1; env->prev_linfo = NULL; @@ -6929,6 +7397,7 @@ static int do_check(struct bpf_verifier_env *env) u8 class; int err; + env->prev_insn_idx = prev_insn_idx; if (env->insn_idx >= insn_cnt) { verbose(env, "invalid insn idx %d insn_cnt %d\n", env->insn_idx, insn_cnt); @@ -7001,6 +7470,7 @@ static int do_check(struct bpf_verifier_env *env) regs = cur_regs(env); env->insn_aux_data[env->insn_idx].seen = true; + prev_insn_idx = env->insn_idx; if (class == BPF_ALU || class == BPF_ALU64) { err = check_alu_op(env, insn); @@ -7174,7 +7644,6 @@ static int do_check(struct bpf_verifier_env *env) if (state->curframe) { /* exit from nested function */ - env->prev_insn_idx = env->insn_idx; err = prepare_func_exit(env, &env->insn_idx); if (err) return err; @@ -7206,7 +7675,7 @@ static int do_check(struct bpf_verifier_env *env) return err; process_bpf_exit: update_branch_counts(env, env->cur_state); - err = pop_stack(env, &env->prev_insn_idx, + err = pop_stack(env, &prev_insn_idx, &env->insn_idx); if (err < 0) { if (err != -ENOENT) -- cgit v1.2.3 From 0b385a0c3bd3f6d1044728b732bfc7dfb01c9fb5 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 18 Jun 2019 10:18:28 +0200 Subject: PM: suspend: Rename pm_suspend_via_s2idle() The name of pm_suspend_via_s2idle() is confusing, as it doesn't reflect the purpose of the function precisely enough and it is very similar to pm_suspend_via_firmware(), which has a different purpose, so rename it as pm_suspend_default_s2idle() and update its only caller, i8042_register_ports(), accordingly. Signed-off-by: Rafael J. Wysocki Acked-by: Dmitry Torokhov --- kernel/power/suspend.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 9505101ed2bc..8703b0ca4986 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -62,16 +62,16 @@ enum s2idle_states __read_mostly s2idle_state; static DEFINE_RAW_SPINLOCK(s2idle_lock); /** - * pm_suspend_via_s2idle - Check if suspend-to-idle is the default suspend. + * pm_suspend_default_s2idle - Check if suspend-to-idle is the default suspend. * * Return 'true' if suspend-to-idle has been selected as the default system * suspend method. */ -bool pm_suspend_via_s2idle(void) +bool pm_suspend_default_s2idle(void) { return mem_sleep_current == PM_SUSPEND_TO_IDLE; } -EXPORT_SYMBOL_GPL(pm_suspend_via_s2idle); +EXPORT_SYMBOL_GPL(pm_suspend_default_s2idle); void s2idle_set_ops(const struct platform_s2idle_ops *ops) { -- cgit v1.2.3 From 9c106119f6538f65bdddb7948a157d90625effa7 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 17 Jun 2019 15:28:43 +0200 Subject: swiotlb: fix phys_addr_t overflow warning On architectures that have a larger dma_addr_t than phys_addr_t, the swiotlb_tbl_map_single() function truncates its return code in the failure path, making it impossible to identify the error later, as we compare to the original value: kernel/dma/swiotlb.c:551:9: error: implicit conversion from 'dma_addr_t' (aka 'unsigned long long') to 'phys_addr_t' (aka 'unsigned int') changes value from 18446744073709551615 to 4294967295 [-Werror,-Wconstant-conversion] return DMA_MAPPING_ERROR; Use an explicit typecast here to convert it to the narrower type, and use the same expression in the error handling later. Fixes: b907e20508d0 ("swiotlb: remove SWIOTLB_MAP_ERROR") Acked-by: Stefano Stabellini Signed-off-by: Arnd Bergmann Signed-off-by: Konrad Rzeszutek Wilk --- kernel/dma/swiotlb.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 38d57218809c..48e7488e3482 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -538,7 +538,7 @@ not_found: if (!(attrs & DMA_ATTR_NO_WARN) && printk_ratelimit()) dev_warn(hwdev, "swiotlb buffer is full (sz: %zd bytes), total %lu (slots), used %lu (slots)\n", size, io_tlb_nslabs, tmp_io_tlb_used); - return DMA_MAPPING_ERROR; + return (phys_addr_t)DMA_MAPPING_ERROR; found: io_tlb_used += nslots; spin_unlock_irqrestore(&io_tlb_lock, flags); @@ -656,7 +656,7 @@ bool swiotlb_map(struct device *dev, phys_addr_t *phys, dma_addr_t *dma_addr, /* Oh well, have to allocate and map a bounce buffer. */ *phys = swiotlb_tbl_map_single(dev, __phys_to_dma(dev, io_tlb_start), *phys, size, dir, attrs); - if (*phys == DMA_MAPPING_ERROR) + if (*phys == (phys_addr_t)DMA_MAPPING_ERROR) return false; /* Ensure that the address returned is DMA'ble */ -- cgit v1.2.3 From 7743c48e54ee9be9c799cbf3b8e3e9f2b8d19e72 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 19 Jun 2019 16:10:15 +0100 Subject: keys: Cache result of request_key*() temporarily in task_struct If a filesystem uses keys to hold authentication tokens, then it needs a token for each VFS operation that might perform an authentication check - either by passing it to the server, or using to perform a check based on authentication data cached locally. For open files this isn't a problem, since the key should be cached in the file struct since it represents the subject performing operations on that file descriptor. During pathwalk, however, there isn't anywhere to cache the key, except perhaps in the nameidata struct - but that isn't exposed to the filesystems. Further, a pathwalk can incur a lot of operations, calling one or more of the following, for instance: ->lookup() ->permission() ->d_revalidate() ->d_automount() ->get_acl() ->getxattr() on each dentry/inode it encounters - and each one may need to call request_key(). And then, at the end of pathwalk, it will call the actual operation: ->mkdir() ->mknod() ->getattr() ->open() ... which may need to go and get the token again. However, it is very likely that all of the operations on a single dentry/inode - and quite possibly a sequence of them - will all want to use the same authentication token, which suggests that caching it would be a good idea. To this end: (1) Make it so that a positive result of request_key() and co. that didn't require upcalling to userspace is cached temporarily in task_struct. (2) The cache is 1 deep, so a new result displaces the old one. (3) The key is released by exit and by notify-resume. (4) The cache is cleared in a newly forked process. Signed-off-by: David Howells --- kernel/cred.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/cred.c b/kernel/cred.c index 3bd40de9e192..26da7e77098f 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -174,6 +174,11 @@ void exit_creds(struct task_struct *tsk) validate_creds(cred); alter_cred_subscribers(cred, -1); put_cred(cred); + +#ifdef CONFIG_KEYS_REQUEST_CACHE + key_put(current->cached_requested_key); + current->cached_requested_key = NULL; +#endif } /** @@ -327,6 +332,10 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) struct cred *new; int ret; +#ifdef CONFIG_KEYS_REQUEST_CACHE + p->cached_requested_key = NULL; +#endif + if ( #ifdef CONFIG_KEYS !p->cred->thread_keyring && -- cgit v1.2.3 From 6bf071bf09d4b2ff3ee8783531e2ce814f0870cb Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 18 Jun 2019 15:05:27 +0200 Subject: xdp: page_pool related fix to cpumap When converting an xdp_frame into an SKB, and sending this into the network stack, then the underlying XDP memory model need to release associated resources, because the network stack don't have callbacks for XDP memory models. The only memory model that needs this is page_pool, when a driver use the DMA-mapping feature. Introduce page_pool_release_page(), which basically does the same as page_pool_unmap_page(). Add xdp_release_frame() as the XDP memory model interface for calling it, if the memory model match MEM_TYPE_PAGE_POOL, to save the function call overhead for others. Have cpumap call xdp_release_frame() before xdp_scrub_frame(). Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- kernel/bpf/cpumap.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 8ee5532cf6a6..8dff08768087 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -208,6 +208,9 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, * - RX ring dev queue index (skb_record_rx_queue) */ + /* Until page_pool get SKB return path, release DMA here */ + xdp_release_frame(xdpf); + /* Allow SKB to reuse area used by xdp_frame */ xdp_scrub_frame(xdpf); -- cgit v1.2.3 From 380178ef7fde58f2040788a1bab972ce4867ac58 Mon Sep 17 00:00:00 2001 From: Miroslav Benes Date: Tue, 11 Jun 2019 16:13:18 +0200 Subject: stacktrace: Remove weak version of save_stack_trace_tsk_reliable() Recent rework of stack trace infrastructure introduced a new set of helpers for common stack trace operations (commit e9b98e162aa5 ("stacktrace: Provide helpers for common stack trace operations") and related). As a result, save_stack_trace_tsk_reliable() is not directly called anywhere. Livepatch, currently the only user of the reliable stack trace feature, now calls stack_trace_save_tsk_reliable(). When CONFIG_HAVE_RELIABLE_STACKTRACE is set and depending on CONFIG_ARCH_STACKWALK, stack_trace_save_tsk_reliable() calls either arch_stack_walk_reliable() or mentioned save_stack_trace_tsk_reliable(). x86_64 defines the former, ppc64le the latter. All other architectures do not have HAVE_RELIABLE_STACKTRACE and include/linux/stacktrace.h defines -ENOSYS returning version for them. In short, stack_trace_save_tsk_reliable() returning -ENOSYS defined in include/linux/stacktrace.h serves the same purpose as the old weak version of save_stack_trace_tsk_reliable() which is therefore no longer needed. Signed-off-by: Miroslav Benes Acked-by: Josh Poimboeuf Reviewed-by: Thomas Gleixner Reviewed-by: Kamalesh Babulal Signed-off-by: Petr Mladek --- kernel/stacktrace.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'kernel') diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index 27bafc1e271e..319e7fcf3083 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c @@ -254,14 +254,6 @@ save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace) WARN_ONCE(1, KERN_INFO "save_stack_trace_regs() not implemented yet.\n"); } -__weak int -save_stack_trace_tsk_reliable(struct task_struct *tsk, - struct stack_trace *trace) -{ - WARN_ONCE(1, KERN_INFO "save_stack_tsk_reliable() not implemented yet.\n"); - return -ENOSYS; -} - /** * stack_trace_save - Save a stack trace into a storage array * @store: Pointer to storage array -- cgit v1.2.3 From 67059d65f7da537c41513fc52e78eff096092b8c Mon Sep 17 00:00:00 2001 From: Miroslav Benes Date: Tue, 11 Jun 2019 16:13:19 +0200 Subject: Revert "livepatch: Remove reliable stacktrace check in klp_try_switch_task()" This reverts commit 1d98a69e5cef3aeb68bcefab0e67e342d6bb4dad. Commit 31adf2308f33 ("livepatch: Convert error about unsupported reliable stacktrace into a warning") weakened the enforcement for architectures to have reliable stack traces support. The system only warns now about it. It only makes sense to reintroduce the compile time checking in klp_try_switch_task() again and bail out early. Signed-off-by: Miroslav Benes Acked-by: Josh Poimboeuf Reviewed-by: Kamalesh Babulal Signed-off-by: Petr Mladek --- kernel/livepatch/transition.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c index 0a3889c4f617..cb85dae09ce5 100644 --- a/kernel/livepatch/transition.c +++ b/kernel/livepatch/transition.c @@ -305,6 +305,13 @@ static bool klp_try_switch_task(struct task_struct *task) if (task->patch_state == klp_target_state) return true; + /* + * For arches which don't have reliable stack traces, we have to rely + * on other methods (e.g., switching tasks at kernel exit). + */ + if (!klp_have_reliable_stack()) + return false; + /* * Now try to check the stack for any to-be-patched or to-be-unpatched * functions. If all goes well, switch the task to the target patch -- cgit v1.2.3 From ac59a471e9371e3184425efd6a2fd8ac5c7e4c2b Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Tue, 11 Jun 2019 16:13:20 +0200 Subject: livepatch: Remove duplicate warning about missing reliable stacktrace support WARN_ON_ONCE() could not be called safely under rq lock because of console deadlock issues. Moreover WARN_ON_ONCE() is superfluous in klp_check_stack(), because stack_trace_save_tsk_reliable() cannot return -ENOSYS thanks to klp_have_reliable_stack() check in klp_try_switch_task(). [ mbenes: changelog edited ] Signed-off-by: Miroslav Benes Acked-by: Josh Poimboeuf Reviewed-by: Kamalesh Babulal Signed-off-by: Petr Mladek --- kernel/livepatch/transition.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c index cb85dae09ce5..3c764e73b032 100644 --- a/kernel/livepatch/transition.c +++ b/kernel/livepatch/transition.c @@ -259,7 +259,6 @@ static int klp_check_stack(struct task_struct *task, char *err_buf) int ret, nr_entries; ret = stack_trace_save_tsk_reliable(task, entries, ARRAY_SIZE(entries)); - WARN_ON_ONCE(ret == -ENOSYS); if (ret < 0) { snprintf(err_buf, STACK_ERR_BUF_SIZE, "%s: %s:%d has an unreliable stack\n", -- cgit v1.2.3 From d68dbb0c9ac8b1ff52eb09aa58ce6358400fa939 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 21 Jun 2019 01:26:35 +0200 Subject: arch: handle arches who do not yet define clone3 This cleanly handles arches who do not yet define clone3. clone3() was initially placed under __ARCH_WANT_SYS_CLONE under the assumption that this would cleanly handle all architectures. It does not. Architectures such as nios2 or h8300 simply take the asm-generic syscall definitions and generate their syscall table from it. Since they don't define __ARCH_WANT_SYS_CLONE the build would fail complaining about sys_clone3 missing. The reason this doesn't happen for legacy clone is that nios2 and h8300 provide assembly stubs for sys_clone. This seems to be done for architectural reasons. The build failures for nios2 and h8300 were caught int -next luckily. The solution is to define __ARCH_WANT_SYS_CLONE3 that architectures can add. Additionally, we need a cond_syscall(clone3) for architectures such as nios2 or h8300 that generate their syscall table in the way I explained above. Fixes: 8f3220a80654 ("arch: wire-up clone3() syscall") Signed-off-by: Christian Brauner Cc: Arnd Bergmann Cc: Kees Cook Cc: David Howells Cc: Andrew Morton Cc: Oleg Nesterov Cc: Adrian Reber Cc: Linus Torvalds Cc: Al Viro Cc: Florian Weimer Cc: linux-api@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: x86@kernel.org --- kernel/fork.c | 2 ++ kernel/sys_ni.c | 2 ++ 2 files changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 08ff131f26b4..98abea995629 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2490,7 +2490,9 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, return _do_fork(&args); } +#endif +#ifdef __ARCH_WANT_SYS_CLONE3 noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs, struct clone_args __user *uargs, size_t size) diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 4d9ae5ea6caf..34b76895b81e 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -137,6 +137,8 @@ COND_SYSCALL(capset); /* kernel/exit.c */ /* kernel/fork.c */ +/* __ARCH_WANT_SYS_CLONE3 */ +COND_SYSCALL(clone3); /* kernel/futex.c */ COND_SYSCALL(futex); -- cgit v1.2.3 From 474a280036e8d319ef852f1dec59bedf4eda0107 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 Jun 2019 10:22:48 +0200 Subject: cgroup: export css_next_descendant_pre for bfq The bfq schedule now uses css_next_descendant_pre directly after the stats functionality depending on it has been from the core blk-cgroup code to bfq. Export the symbol so that bfq can still be build modular. Fixes: d6258980daf2 ("bfq-iosched: move bfq_stat_recursive_sum into the only caller") Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- kernel/cgroup/cgroup.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 426a0026225c..30aba80858e3 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4221,6 +4221,7 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos, return NULL; } +EXPORT_SYMBOL_GPL(css_next_descendant_pre); /** * css_rightmost_descendant - return the rightmost descendant of a css -- cgit v1.2.3 From 17ce302f3117e9518395847a3120c8a108b587b8 Mon Sep 17 00:00:00 2001 From: Julien Thierry Date: Tue, 11 Jun 2019 10:38:09 +0100 Subject: arm64: Fix interrupt tracing in the presence of NMIs In the presence of any form of instrumentation, nmi_enter() should be done before calling any traceable code and any instrumentation code. Currently, nmi_enter() is done in handle_domain_nmi(), which is much too late as instrumentation code might get called before. Move the nmi_enter/exit() calls to the arch IRQ vector handler. On arm64, it is not possible to know if the IRQ vector handler was called because of an NMI before acknowledging the interrupt. However, It is possible to know whether normal interrupts could be taken in the interrupted context (i.e. if taking an NMI in that context could introduce a potential race condition). When interrupting a context with IRQs disabled, call nmi_enter() as soon as possible. In contexts with IRQs enabled, defer this to the interrupt controller, which is in a better position to know if an interrupt taken is an NMI. Fixes: bc3c03ccb464 ("arm64: Enable the support of pseudo-NMIs") Cc: # 5.1.x- Cc: Will Deacon Cc: Thomas Gleixner Cc: Jason Cooper Cc: Mark Rutland Reviewed-by: Marc Zyngier Signed-off-by: Julien Thierry Signed-off-by: Catalin Marinas --- kernel/irq/irqdesc.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index c52b737ab8e3..a92b33593b8d 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -680,6 +680,8 @@ int __handle_domain_irq(struct irq_domain *domain, unsigned int hwirq, * @hwirq: The HW irq number to convert to a logical one * @regs: Register file coming from the low-level handling code * + * This function must be called from an NMI context. + * * Returns: 0 on success, or -EINVAL if conversion has failed */ int handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq, @@ -689,7 +691,10 @@ int handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq, unsigned int irq; int ret = 0; - nmi_enter(); + /* + * NMI context needs to be setup earlier in order to deal with tracing. + */ + WARN_ON(!in_nmi()); irq = irq_find_mapping(domain, hwirq); @@ -702,7 +707,6 @@ int handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq, else ret = -EINVAL; - nmi_exit(); set_irq_regs(old_regs); return ret; } -- cgit v1.2.3 From d897a4ab11dc8a9fda50d2eccc081a96a6385998 Mon Sep 17 00:00:00 2001 From: Miroslav Lichvar Date: Tue, 18 Jun 2019 17:47:13 +0200 Subject: ntp: Limit TAI-UTC offset Don't allow the TAI-UTC offset of the system clock to be set by adjtimex() to a value larger than 100000 seconds. This prevents an overflow in the conversion to int, prevents the CLOCK_TAI clock from getting too far ahead of the CLOCK_REALTIME clock, and it is still large enough to allow leap seconds to be inserted at the maximum rate currently supported by the kernel (once per day) for the next ~270 years, however unlikely it is that someone can survive a catastrophic event which slowed down the rotation of the Earth so much. Reported-by: Weikang shi Signed-off-by: Miroslav Lichvar Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Prarit Bhargava Cc: Richard Cochran Cc: Stephen Boyd Link: https://lkml.kernel.org/r/20190618154713.20929-1-mlichvar@redhat.com --- kernel/time/ntp.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 8de4f789dc1b..65eb796610dc 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -43,6 +43,7 @@ static u64 tick_length_base; #define MAX_TICKADJ 500LL /* usecs */ #define MAX_TICKADJ_SCALED \ (((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ) +#define MAX_TAI_OFFSET 100000 /* * phase-lock loop variables @@ -691,7 +692,8 @@ static inline void process_adjtimex_modes(const struct __kernel_timex *txc, time_constant = max(time_constant, 0l); } - if (txc->modes & ADJ_TAI && txc->constant >= 0) + if (txc->modes & ADJ_TAI && + txc->constant >= 0 && txc->constant <= MAX_TAI_OFFSET) *time_tai = txc->constant; if (txc->modes & ADJ_OFFSET) -- cgit v1.2.3 From 0354c1a3cdf31f44b035cfad14d32282e815a572 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Fri, 21 Jun 2019 22:32:47 +0200 Subject: timekeeping: Use proper ktime_add when adding nsecs in coarse offset While this doesn't actually amount to a real difference, since the macro evaluates to the same thing, every place else operates on ktime_t using these functions, so let's not break the pattern. Fixes: e3ff9c3678b4 ("timekeeping: Repair ktime_get_coarse*() granularity") Signed-off-by: Jason A. Donenfeld Signed-off-by: Thomas Gleixner Reviewed-by: Arnd Bergmann Link: https://lkml.kernel.org/r/20190621203249.3909-1-Jason@zx2c4.com --- kernel/time/timekeeping.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 44b726bab4bd..d911c8470149 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -819,7 +819,7 @@ ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs) } while (read_seqcount_retry(&tk_core.seq, seq)); - return base + nsecs; + return ktime_add_ns(base, nsecs); } EXPORT_SYMBOL_GPL(ktime_get_coarse_with_offset); -- cgit v1.2.3 From 9285ec4c8b61d4930a575081abeba2cd4f449a74 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Fri, 21 Jun 2019 22:32:48 +0200 Subject: timekeeping: Use proper clock specifier names in functions This makes boot uniformly boottime and tai uniformly clocktai, to address the remaining oversights. Signed-off-by: Jason A. Donenfeld Signed-off-by: Thomas Gleixner Reviewed-by: Arnd Bergmann Link: https://lkml.kernel.org/r/20190621203249.3909-2-Jason@zx2c4.com --- kernel/bpf/syscall.c | 2 +- kernel/events/core.c | 4 ++-- kernel/fork.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index ef63d26622f2..96c8928b468b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1666,7 +1666,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) if (err < 0) goto free_prog; - prog->aux->load_time = ktime_get_boot_ns(); + prog->aux->load_time = ktime_get_boottime_ns(); err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name); if (err) goto free_prog; diff --git a/kernel/events/core.c b/kernel/events/core.c index abbd4b3b96c2..e2d014395fc6 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -10680,11 +10680,11 @@ static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id) break; case CLOCK_BOOTTIME: - event->clock = &ktime_get_boot_ns; + event->clock = &ktime_get_boottime_ns; break; case CLOCK_TAI: - event->clock = &ktime_get_tai_ns; + event->clock = &ktime_get_clocktai_ns; break; default: diff --git a/kernel/fork.c b/kernel/fork.c index 75675b9bf6df..4722f1a320bf 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2139,7 +2139,7 @@ static __latent_entropy struct task_struct *copy_process( */ p->start_time = ktime_get_ns(); - p->real_start_time = ktime_get_boot_ns(); + p->real_start_time = ktime_get_boottime_ns(); /* * Make it visible to the rest of the system, but dont wake it up yet. -- cgit v1.2.3 From 12063d431078be73d11cb5e48a17c6db5f0d8254 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 21 Jun 2019 16:36:42 +0200 Subject: posix-timers: Remove "it_signal = NULL" assignment in itimer_delete() itimer_delete() is invoked during do_exit(). At this point it is the last thread in the group dying and doing the clean up. Since it is the last thread in the group, there can not be any other task attempting to lock the itimer which means the NULL assignment (which avoids lookups in __lock_timer()) is not required. The assignment and comment was copied in commit 0e568881178ff ("[PATCH] fix posix-timers to have proper per-process scope") from sys_timer_delete() which was/is the syscall interface and requires the assignment. Remove the superfluous ->it_signal = NULL assignment. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20190621143643.25649-2-bigeasy@linutronix.de --- kernel/time/posix-timers.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 29176635991f..caa63e58e3d8 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -990,11 +990,6 @@ retry_delete: goto retry_delete; } list_del(&timer->list); - /* - * This keeps any tasks waiting on the spin lock from thinking - * they got something (see the lock code above). - */ - timer->it_signal = NULL; unlock_timer(timer, flags); release_posix_timer(timer, IT_ID_SET); -- cgit v1.2.3 From 7586addb99322faf4d096fc8beb140f879409212 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 21 Jun 2019 16:36:43 +0200 Subject: posix-timers: Use spin_lock_irq() in itimer_delete() itimer_delete() uses spin_lock_irqsave() to obtain a `flags' variable which can then be passed to unlock_timer(). It uses already spin_lock locking for the structure instead of lock_timer() because it has a timer which can not be removed by others at this point. The cleanup is always performed with enabled interrupts. Use spin_lock_irq() / spin_unlock_irq() so the `flags' variable can be removed. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20190621143643.25649-3-bigeasy@linutronix.de --- kernel/time/posix-timers.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index caa63e58e3d8..d7f2d91acdac 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -980,18 +980,16 @@ retry_delete: */ static void itimer_delete(struct k_itimer *timer) { - unsigned long flags; - retry_delete: - spin_lock_irqsave(&timer->it_lock, flags); + spin_lock_irq(&timer->it_lock); if (timer_delete_hook(timer) == TIMER_RETRY) { - unlock_timer(timer, flags); + spin_unlock_irq(&timer->it_lock); goto retry_delete; } list_del(&timer->list); - unlock_timer(timer, flags); + spin_unlock_irq(&timer->it_lock); release_posix_timer(timer, IT_ID_SET); } -- cgit v1.2.3 From 44f57d788e7deecb504843534081d3449c2eede9 Mon Sep 17 00:00:00 2001 From: Vincenzo Frascino Date: Fri, 21 Jun 2019 10:52:30 +0100 Subject: timekeeping: Provide a generic update_vsyscall() implementation The new generic VDSO library allows to unify the update_vsyscall[_tz]() implementations. Provide a generic implementation based on the x86 code and the bindings which need to be implemented in architecture specific code. [ tglx: Moved it into kernel/time where it belongs. Removed the pointless line breaks in the stub functions. Massaged changelog ] Signed-off-by: Vincenzo Frascino Signed-off-by: Thomas Gleixner Tested-by: Shijith Thotton Tested-by: Andre Przywara Cc: linux-arch@vger.kernel.org Cc: linux-arm-kernel@lists.infradead.org Cc: linux-mips@vger.kernel.org Cc: linux-kselftest@vger.kernel.org Cc: Catalin Marinas Cc: Will Deacon Cc: Arnd Bergmann Cc: Russell King Cc: Ralf Baechle Cc: Paul Burton Cc: Daniel Lezcano Cc: Mark Salyzyn Cc: Peter Collingbourne Cc: Shuah Khan Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: Rasmus Villemoes Cc: Huw Davies Link: https://lkml.kernel.org/r/20190621095252.32307-4-vincenzo.frascino@arm.com --- kernel/time/Makefile | 1 + kernel/time/vsyscall.c | 133 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 134 insertions(+) create mode 100644 kernel/time/vsyscall.c (limited to 'kernel') diff --git a/kernel/time/Makefile b/kernel/time/Makefile index f1e46f338a9c..1867044800bb 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -16,5 +16,6 @@ ifeq ($(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST),y) endif obj-$(CONFIG_GENERIC_SCHED_CLOCK) += sched_clock.o obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o tick-sched.o +obj-$(CONFIG_HAVE_GENERIC_VDSO) += vsyscall.o obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o obj-$(CONFIG_TEST_UDELAY) += test_udelay.o diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c new file mode 100644 index 000000000000..a80893180826 --- /dev/null +++ b/kernel/time/vsyscall.c @@ -0,0 +1,133 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2019 ARM Ltd. + * + * Generic implementation of update_vsyscall and update_vsyscall_tz. + * + * Based on the x86 specific implementation. + */ + +#include +#include +#include +#include +#include + +static inline void update_vdso_data(struct vdso_data *vdata, + struct timekeeper *tk) +{ + struct vdso_timestamp *vdso_ts; + u64 nsec; + + vdata[CS_HRES_COARSE].cycle_last = tk->tkr_mono.cycle_last; + vdata[CS_HRES_COARSE].mask = tk->tkr_mono.mask; + vdata[CS_HRES_COARSE].mult = tk->tkr_mono.mult; + vdata[CS_HRES_COARSE].shift = tk->tkr_mono.shift; + vdata[CS_RAW].cycle_last = tk->tkr_raw.cycle_last; + vdata[CS_RAW].mask = tk->tkr_raw.mask; + vdata[CS_RAW].mult = tk->tkr_raw.mult; + vdata[CS_RAW].shift = tk->tkr_raw.shift; + + /* CLOCK_REALTIME */ + vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME]; + vdso_ts->sec = tk->xtime_sec; + vdso_ts->nsec = tk->tkr_mono.xtime_nsec; + + /* CLOCK_MONOTONIC */ + vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC]; + vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; + + nsec = tk->tkr_mono.xtime_nsec; + nsec += ((u64)tk->wall_to_monotonic.tv_nsec << tk->tkr_mono.shift); + while (nsec >= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) { + nsec -= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift); + vdso_ts->sec++; + } + vdso_ts->nsec = nsec; + + /* CLOCK_MONOTONIC_RAW */ + vdso_ts = &vdata[CS_RAW].basetime[CLOCK_MONOTONIC_RAW]; + vdso_ts->sec = tk->raw_sec; + vdso_ts->nsec = tk->tkr_raw.xtime_nsec; + + /* CLOCK_BOOTTIME */ + vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_BOOTTIME]; + vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; + nsec = tk->tkr_mono.xtime_nsec; + nsec += ((u64)(tk->wall_to_monotonic.tv_nsec + + ktime_to_ns(tk->offs_boot)) << tk->tkr_mono.shift); + while (nsec >= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) { + nsec -= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift); + vdso_ts->sec++; + } + vdso_ts->nsec = nsec; + + /* CLOCK_TAI */ + vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_TAI]; + vdso_ts->sec = tk->xtime_sec + (s64)tk->tai_offset; + vdso_ts->nsec = tk->tkr_mono.xtime_nsec; + + /* + * Read without the seqlock held by clock_getres(). + * Note: No need to have a second copy. + */ + WRITE_ONCE(vdata[CS_HRES_COARSE].hrtimer_res, hrtimer_resolution); +} + +void update_vsyscall(struct timekeeper *tk) +{ + struct vdso_data *vdata = __arch_get_k_vdso_data(); + struct vdso_timestamp *vdso_ts; + u64 nsec; + + if (__arch_update_vdso_data()) { + /* + * Some architectures might want to skip the update of the + * data page. + */ + return; + } + + /* copy vsyscall data */ + vdso_write_begin(vdata); + + vdata[CS_HRES_COARSE].clock_mode = __arch_get_clock_mode(tk); + vdata[CS_RAW].clock_mode = __arch_get_clock_mode(tk); + + /* CLOCK_REALTIME_COARSE */ + vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME_COARSE]; + vdso_ts->sec = tk->xtime_sec; + vdso_ts->nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; + + /* CLOCK_MONOTONIC_COARSE */ + vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC_COARSE]; + vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; + nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; + nsec = nsec + tk->wall_to_monotonic.tv_nsec; + while (nsec >= NSEC_PER_SEC) { + nsec = nsec - NSEC_PER_SEC; + vdso_ts->sec++; + } + vdso_ts->nsec = nsec; + + if (__arch_use_vsyscall(vdata)) + update_vdso_data(vdata, tk); + + __arch_update_vsyscall(vdata, tk); + + vdso_write_end(vdata); + + __arch_sync_vdso_data(vdata); +} + +void update_vsyscall_tz(void) +{ + struct vdso_data *vdata = __arch_get_k_vdso_data(); + + if (__arch_use_vsyscall(vdata)) { + vdata[CS_HRES_COARSE].tz_minuteswest = sys_tz.tz_minuteswest; + vdata[CS_HRES_COARSE].tz_dsttime = sys_tz.tz_dsttime; + } + + __arch_sync_vdso_data(vdata); +} -- cgit v1.2.3 From a9314773a91a1d3b36270085246a6715a326ff00 Mon Sep 17 00:00:00 2001 From: Nathan Huckleberry Date: Fri, 14 Jun 2019 11:16:04 -0700 Subject: timer_list: Guard procfs specific code With CONFIG_PROC_FS=n the following warning is emitted: kernel/time/timer_list.c:361:36: warning: unused variable 'timer_list_sops' [-Wunused-const-variable] static const struct seq_operations timer_list_sops = { Add #ifdef guard around procfs specific code. Signed-off-by: Nathan Huckleberry Signed-off-by: Thomas Gleixner Reviewed-by: Nick Desaulniers Cc: john.stultz@linaro.org Cc: sboyd@kernel.org Cc: clang-built-linux@googlegroups.com Link: https://github.com/ClangBuiltLinux/linux/issues/534 Link: https://lkml.kernel.org/r/20190614181604.112297-1-nhuck@google.com --- kernel/time/timer_list.c | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 98ba50dcb1b2..acb326f5f50a 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -282,23 +282,6 @@ static inline void timer_list_header(struct seq_file *m, u64 now) SEQ_printf(m, "\n"); } -static int timer_list_show(struct seq_file *m, void *v) -{ - struct timer_list_iter *iter = v; - - if (iter->cpu == -1 && !iter->second_pass) - timer_list_header(m, iter->now); - else if (!iter->second_pass) - print_cpu(m, iter->cpu, iter->now); -#ifdef CONFIG_GENERIC_CLOCKEVENTS - else if (iter->cpu == -1 && iter->second_pass) - timer_list_show_tickdevices_header(m); - else - print_tickdevice(m, tick_get_device(iter->cpu), iter->cpu); -#endif - return 0; -} - void sysrq_timer_list_show(void) { u64 now = ktime_to_ns(ktime_get()); @@ -317,6 +300,24 @@ void sysrq_timer_list_show(void) return; } +#ifdef CONFIG_PROC_FS +static int timer_list_show(struct seq_file *m, void *v) +{ + struct timer_list_iter *iter = v; + + if (iter->cpu == -1 && !iter->second_pass) + timer_list_header(m, iter->now); + else if (!iter->second_pass) + print_cpu(m, iter->cpu, iter->now); +#ifdef CONFIG_GENERIC_CLOCKEVENTS + else if (iter->cpu == -1 && iter->second_pass) + timer_list_show_tickdevices_header(m); + else + print_tickdevice(m, tick_get_device(iter->cpu), iter->cpu); +#endif + return 0; +} + static void *move_iter(struct timer_list_iter *iter, loff_t offset) { for (; offset; offset--) { @@ -376,3 +377,4 @@ static int __init init_timer_list_procfs(void) return 0; } __initcall(init_timer_list_procfs); +#endif -- cgit v1.2.3 From a22793c79d6ea0a492ce1a308ec46df52ee9406e Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Wed, 12 Jun 2019 23:48:11 -0700 Subject: smp: Do not mark call_function_data as shared cfd_data is marked as shared, but although it hold pointers to shared data structures, it is private per core. Signed-off-by: Nadav Amit Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Rik van Riel Link: https://lkml.kernel.org/r/20190613064813.8102-8-namit@vmware.com --- kernel/smp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index d155374632eb..220ad142f5dd 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -34,7 +34,7 @@ struct call_function_data { cpumask_var_t cpumask_ipi; }; -static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data); +static DEFINE_PER_CPU_ALIGNED(struct call_function_data, cfd_data); static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue); -- cgit v1.2.3 From caa759323c73676b3e48c8d9c86093c88b4aba97 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Wed, 12 Jun 2019 23:48:05 -0700 Subject: smp: Remove smp_call_function() and on_each_cpu() return values The return value is fixed. Remove it and amend the callers. [ tglx: Fixup arm/bL_switcher and powerpc/rtas ] Signed-off-by: Nadav Amit Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Tony Luck Cc: Fenghua Yu Cc: Andrew Morton Link: https://lkml.kernel.org/r/20190613064813.8102-2-namit@vmware.com --- kernel/smp.c | 10 +++------- kernel/up.c | 3 +-- 2 files changed, 4 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index 220ad142f5dd..616d4d114847 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -487,13 +487,11 @@ EXPORT_SYMBOL(smp_call_function_many); * You must not call this function with disabled interrupts or from a * hardware interrupt handler or from a bottom half handler. */ -int smp_call_function(smp_call_func_t func, void *info, int wait) +void smp_call_function(smp_call_func_t func, void *info, int wait) { preempt_disable(); smp_call_function_many(cpu_online_mask, func, info, wait); preempt_enable(); - - return 0; } EXPORT_SYMBOL(smp_call_function); @@ -594,18 +592,16 @@ void __init smp_init(void) * early_boot_irqs_disabled is set. Use local_irq_save/restore() instead * of local_irq_disable/enable(). */ -int on_each_cpu(void (*func) (void *info), void *info, int wait) +void on_each_cpu(void (*func) (void *info), void *info, int wait) { unsigned long flags; - int ret = 0; preempt_disable(); - ret = smp_call_function(func, info, wait); + smp_call_function(func, info, wait); local_irq_save(flags); func(info); local_irq_restore(flags); preempt_enable(); - return ret; } EXPORT_SYMBOL(on_each_cpu); diff --git a/kernel/up.c b/kernel/up.c index 483c9962c999..862b460ab97a 100644 --- a/kernel/up.c +++ b/kernel/up.c @@ -35,14 +35,13 @@ int smp_call_function_single_async(int cpu, call_single_data_t *csd) } EXPORT_SYMBOL(smp_call_function_single_async); -int on_each_cpu(smp_call_func_t func, void *info, int wait) +void on_each_cpu(smp_call_func_t func, void *info, int wait) { unsigned long flags; local_irq_save(flags); func(info); local_irq_restore(flags); - return 0; } EXPORT_SYMBOL(on_each_cpu); -- cgit v1.2.3 From 8afecaa68df1e94a9d634f1f961533a925f239fc Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Tue, 18 Jun 2019 22:33:05 +0800 Subject: softirq: Use __this_cpu_write() in takeover_tasklets() The code is executed with interrupts disabled, so it's safe to use __this_cpu_write(). [ tglx: Massaged changelog ] Signed-off-by: Muchun Song Signed-off-by: Thomas Gleixner Cc: joel@joelfernandes.org Cc: rostedt@goodmis.org Cc: frederic@kernel.org Cc: paulmck@linux.vnet.ibm.com Cc: alexander.levin@verizon.com Cc: peterz@infradead.org Link: https://lkml.kernel.org/r/20190618143305.2038-1-smuchun@gmail.com --- kernel/softirq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 2c3382378d94..eaf3bdf7c749 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -650,7 +650,7 @@ static int takeover_tasklets(unsigned int cpu) /* Find end, append list for that CPU. */ if (&per_cpu(tasklet_vec, cpu).head != per_cpu(tasklet_vec, cpu).tail) { *__this_cpu_read(tasklet_vec.tail) = per_cpu(tasklet_vec, cpu).head; - this_cpu_write(tasklet_vec.tail, per_cpu(tasklet_vec, cpu).tail); + __this_cpu_write(tasklet_vec.tail, per_cpu(tasklet_vec, cpu).tail); per_cpu(tasklet_vec, cpu).head = NULL; per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head; } -- cgit v1.2.3 From 2eef1399a866c57687962e15142b141a4f8e7862 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Thu, 20 Jun 2019 10:18:14 +0800 Subject: modules: fix BUG when load module with rodata=n When loading a module with rodata=n, it causes an executing NX-protected page BUG. [ 32.379191] kernel tried to execute NX-protected page - exploit attempt? (uid: 0) [ 32.382917] BUG: unable to handle page fault for address: ffffffffc0005000 [ 32.385947] #PF: supervisor instruction fetch in kernel mode [ 32.387662] #PF: error_code(0x0011) - permissions violation [ 32.389352] PGD 240c067 P4D 240c067 PUD 240e067 PMD 421a52067 PTE 8000000421a53063 [ 32.391396] Oops: 0011 [#1] SMP PTI [ 32.392478] CPU: 7 PID: 2697 Comm: insmod Tainted: G O 5.2.0-rc5+ #202 [ 32.394588] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.1-0-ga5cab58e9a3f-prebuilt.qemu.org 04/01/2014 [ 32.398157] RIP: 0010:ko_test_init+0x0/0x1000 [ko_test] [ 32.399662] Code: Bad RIP value. [ 32.400621] RSP: 0018:ffffc900029f3ca8 EFLAGS: 00010246 [ 32.402171] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000 [ 32.404332] RDX: 00000000000004c7 RSI: 0000000000000cc0 RDI: ffffffffc0005000 [ 32.406347] RBP: ffffffffc0005000 R08: ffff88842fbebc40 R09: ffffffff810ede4a [ 32.408392] R10: ffffea00108e3480 R11: 0000000000000000 R12: ffff88842bee21a0 [ 32.410472] R13: 0000000000000001 R14: 0000000000000001 R15: ffffc900029f3e78 [ 32.412609] FS: 00007fb4f0c0a700(0000) GS:ffff88842fbc0000(0000) knlGS:0000000000000000 [ 32.414722] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 32.416290] CR2: ffffffffc0004fd6 CR3: 0000000421a90004 CR4: 0000000000020ee0 [ 32.418471] Call Trace: [ 32.419136] do_one_initcall+0x41/0x1df [ 32.420199] ? _cond_resched+0x10/0x40 [ 32.421433] ? kmem_cache_alloc_trace+0x36/0x160 [ 32.422827] do_init_module+0x56/0x1f7 [ 32.423946] load_module+0x1e67/0x2580 [ 32.424947] ? __alloc_pages_nodemask+0x150/0x2c0 [ 32.426413] ? map_vm_area+0x2d/0x40 [ 32.427530] ? __vmalloc_node_range+0x1ef/0x260 [ 32.428850] ? __do_sys_init_module+0x135/0x170 [ 32.430060] ? _cond_resched+0x10/0x40 [ 32.431249] __do_sys_init_module+0x135/0x170 [ 32.432547] do_syscall_64+0x43/0x120 [ 32.433853] entry_SYSCALL_64_after_hwframe+0x44/0xa9 Because if rodata=n, set_memory_x() can't be called, fix this by calling set_memory_x in complete_formation(); Fixes: f2c65fb3221a ("x86/modules: Avoid breaking W^X while loading modules") Suggested-by: Jian Cheng Reviewed-by: Nadav Amit Signed-off-by: Yang Yingliang Signed-off-by: Jessica Yu --- kernel/module.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index c1517053e9d6..41258bab24f1 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1972,13 +1972,9 @@ void module_enable_ro(const struct module *mod, bool after_init) set_vm_flush_reset_perms(mod->core_layout.base); set_vm_flush_reset_perms(mod->init_layout.base); frob_text(&mod->core_layout, set_memory_ro); - frob_text(&mod->core_layout, set_memory_x); frob_rodata(&mod->core_layout, set_memory_ro); - frob_text(&mod->init_layout, set_memory_ro); - frob_text(&mod->init_layout, set_memory_x); - frob_rodata(&mod->init_layout, set_memory_ro); if (after_init) @@ -2041,6 +2037,12 @@ void set_all_modules_text_ro(void) static void module_enable_nx(const struct module *mod) { } #endif +static void module_enable_x(const struct module *mod) +{ + frob_text(&mod->core_layout, set_memory_x); + frob_text(&mod->init_layout, set_memory_x); +} + #ifdef CONFIG_LIVEPATCH /* * Persist Elf information about a module. Copy the Elf header, @@ -3637,6 +3639,7 @@ static int complete_formation(struct module *mod, struct load_info *info) module_enable_ro(mod, false); module_enable_nx(mod); + module_enable_x(mod); /* Mark state as coming so strong_try_module_get() ignores us, * but kallsyms etc. can see us. */ -- cgit v1.2.3 From 38b37d631aec80da0c65ac03a7ef680b468c7857 Mon Sep 17 00:00:00 2001 From: Matthias Schiffer Date: Fri, 7 Jun 2019 12:49:11 +0200 Subject: module: allow arch overrides for .exit section names Some archs like ARM store unwind information for .exit.text in sections with unusual names. As this unwind information refers to .exit.text, it must not be loaded when .exit.text is not loaded (when CONFIG_MODULE_UNLOAD is unset); otherwise, loading a module can fail due to relocation failures. Signed-off-by: Matthias Schiffer Signed-off-by: Jessica Yu --- kernel/module.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 41258bab24f1..537c456ce3ee 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2748,6 +2748,11 @@ void * __weak module_alloc(unsigned long size) return vmalloc_exec(size); } +bool __weak module_exit_section(const char *name) +{ + return strstarts(name, ".exit"); +} + #ifdef CONFIG_DEBUG_KMEMLEAK static void kmemleak_load_module(const struct module *mod, const struct load_info *info) @@ -2937,7 +2942,7 @@ static int rewrite_section_headers(struct load_info *info, int flags) #ifndef CONFIG_MODULE_UNLOAD /* Don't load .exit sections */ - if (strstarts(info->secstrings+shdr->sh_name, ".exit")) + if (module_exit_section(info->secstrings+shdr->sh_name)) shdr->sh_flags &= ~(unsigned long)SHF_ALLOC; #endif } -- cgit v1.2.3 From e4f07120210a1794c1f1ae64d209a2fbc7bd2682 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 19 Jun 2019 12:01:05 -0700 Subject: bpf: fix NULL deref in btf_type_is_resolve_source_only Commit 1dc92851849c ("bpf: kernel side support for BTF Var and DataSec") added invocations of btf_type_is_resolve_source_only before btf_type_nosize_or_null which checks for the NULL pointer. Swap the order of btf_type_nosize_or_null and btf_type_is_resolve_source_only to make sure the do the NULL pointer check first. Fixes: 1dc92851849c ("bpf: kernel side support for BTF Var and DataSec") Reported-by: syzbot Signed-off-by: Stanislav Fomichev Acked-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann --- kernel/bpf/btf.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index cad09858a5f2..546ebee39e2a 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -1928,8 +1928,8 @@ static int btf_array_resolve(struct btf_verifier_env *env, /* Check array->index_type */ index_type_id = array->index_type; index_type = btf_type_by_id(btf, index_type_id); - if (btf_type_is_resolve_source_only(index_type) || - btf_type_nosize_or_null(index_type)) { + if (btf_type_nosize_or_null(index_type) || + btf_type_is_resolve_source_only(index_type)) { btf_verifier_log_type(env, v->t, "Invalid index"); return -EINVAL; } @@ -1948,8 +1948,8 @@ static int btf_array_resolve(struct btf_verifier_env *env, /* Check array->type */ elem_type_id = array->type; elem_type = btf_type_by_id(btf, elem_type_id); - if (btf_type_is_resolve_source_only(elem_type) || - btf_type_nosize_or_null(elem_type)) { + if (btf_type_nosize_or_null(elem_type) || + btf_type_is_resolve_source_only(elem_type)) { btf_verifier_log_type(env, v->t, "Invalid elem"); return -EINVAL; @@ -2170,8 +2170,8 @@ static int btf_struct_resolve(struct btf_verifier_env *env, const struct btf_type *member_type = btf_type_by_id(env->btf, member_type_id); - if (btf_type_is_resolve_source_only(member_type) || - btf_type_nosize_or_null(member_type)) { + if (btf_type_nosize_or_null(member_type) || + btf_type_is_resolve_source_only(member_type)) { btf_verifier_log_member(env, v->t, member, "Invalid member"); return -EINVAL; -- cgit v1.2.3 From 8ec59c0f5f4966f89f4e3e3cab81710c7fa959d0 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Mon, 17 Jun 2019 17:00:17 +0200 Subject: sched/topology: Remove unused 'sd' parameter from arch_scale_cpu_capacity() The 'struct sched_domain *sd' parameter to arch_scale_cpu_capacity() is unused since commit: 765d0af19f5f ("sched/topology: Remove the ::smt_gain field from 'struct sched_domain'") Remove it. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Viresh Kumar Reviewed-by: Valentin Schneider Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: gregkh@linuxfoundation.org Cc: linux@armlinux.org.uk Cc: quentin.perret@arm.com Cc: rafael@kernel.org Link: https://lkml.kernel.org/r/1560783617-5827-1-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/power/energy_model.c | 2 +- kernel/sched/cpufreq_schedutil.c | 2 +- kernel/sched/deadline.c | 2 +- kernel/sched/fair.c | 6 +++--- kernel/sched/pelt.c | 2 +- kernel/sched/pelt.h | 2 +- kernel/sched/sched.h | 2 +- kernel/sched/topology.c | 8 ++++---- 8 files changed, 13 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index 7d66ee68aaaf..0a9326f5f421 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -223,7 +223,7 @@ int em_register_perf_domain(cpumask_t *span, unsigned int nr_states, * All CPUs of a domain must have the same micro-architecture * since they all share the same table. */ - cap = arch_scale_cpu_capacity(NULL, cpu); + cap = arch_scale_cpu_capacity(cpu); if (prev_cap && prev_cap != cap) { pr_err("CPUs of %*pbl must have the same capacity\n", cpumask_pr_args(span)); diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 962cf343f798..7c4ce69067c4 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -276,7 +276,7 @@ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) { struct rq *rq = cpu_rq(sg_cpu->cpu); unsigned long util = cpu_util_cfs(rq); - unsigned long max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu); + unsigned long max = arch_scale_cpu_capacity(sg_cpu->cpu); sg_cpu->max = max; sg_cpu->bw_dl = cpu_bw_dl(rq); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index c1ef30861068..8b5bb2ac16e2 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1195,7 +1195,7 @@ static void update_curr_dl(struct rq *rq) &curr->dl); } else { unsigned long scale_freq = arch_scale_freq_capacity(cpu); - unsigned long scale_cpu = arch_scale_cpu_capacity(NULL, cpu); + unsigned long scale_cpu = arch_scale_cpu_capacity(cpu); scaled_delta_exec = cap_scale(delta_exec, scale_freq); scaled_delta_exec = cap_scale(scaled_delta_exec, scale_cpu); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3c11dcdedcbc..4f8754157763 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -764,7 +764,7 @@ void post_init_entity_util_avg(struct task_struct *p) struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); struct sched_avg *sa = &se->avg; - long cpu_scale = arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq))); + long cpu_scale = arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq))); long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2; if (cap > 0) { @@ -7646,7 +7646,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds) static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu) { struct rq *rq = cpu_rq(cpu); - unsigned long max = arch_scale_cpu_capacity(sd, cpu); + unsigned long max = arch_scale_cpu_capacity(cpu); unsigned long used, free; unsigned long irq; @@ -7671,7 +7671,7 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu) unsigned long capacity = scale_rt_capacity(sd, cpu); struct sched_group *sdg = sd->groups; - cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(sd, cpu); + cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu); if (!capacity) capacity = 1; diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index befce29bd882..42ea66b07b1d 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -366,7 +366,7 @@ int update_irq_load_avg(struct rq *rq, u64 running) * reflect the real amount of computation */ running = cap_scale(running, arch_scale_freq_capacity(cpu_of(rq))); - running = cap_scale(running, arch_scale_cpu_capacity(NULL, cpu_of(rq))); + running = cap_scale(running, arch_scale_cpu_capacity(cpu_of(rq))); /* * We know the time that has been used by interrupt since last update diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h index 7489d5f56960..afff644da065 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h @@ -79,7 +79,7 @@ static inline void update_rq_clock_pelt(struct rq *rq, s64 delta) * Scale the elapsed time to reflect the real amount of * computation */ - delta = cap_scale(delta, arch_scale_cpu_capacity(NULL, cpu_of(rq))); + delta = cap_scale(delta, arch_scale_cpu_capacity(cpu_of(rq))); delta = cap_scale(delta, arch_scale_freq_capacity(cpu_of(rq))); rq->clock_pelt += delta; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b08dee29ef5e..e58ab597ec88 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2248,7 +2248,7 @@ unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs, static inline unsigned long schedutil_energy_util(int cpu, unsigned long cfs) { - unsigned long max = arch_scale_cpu_capacity(NULL, cpu); + unsigned long max = arch_scale_cpu_capacity(cpu); return schedutil_freq_util(cpu, cfs, max, ENERGY_UTIL); } diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 63184cf0d0d7..f751ce0b783e 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1874,10 +1874,10 @@ static struct sched_domain_topology_level unsigned long cap; /* Is there any asymmetry? */ - cap = arch_scale_cpu_capacity(NULL, cpumask_first(cpu_map)); + cap = arch_scale_cpu_capacity(cpumask_first(cpu_map)); for_each_cpu(i, cpu_map) { - if (arch_scale_cpu_capacity(NULL, i) != cap) { + if (arch_scale_cpu_capacity(i) != cap) { asym = true; break; } @@ -1892,7 +1892,7 @@ static struct sched_domain_topology_level * to everyone. */ for_each_cpu(i, cpu_map) { - unsigned long max_capacity = arch_scale_cpu_capacity(NULL, i); + unsigned long max_capacity = arch_scale_cpu_capacity(i); int tl_id = 0; for_each_sd_topology(tl) { @@ -1902,7 +1902,7 @@ static struct sched_domain_topology_level for_each_cpu_and(j, tl->mask(i), cpu_map) { unsigned long capacity; - capacity = arch_scale_cpu_capacity(NULL, j); + capacity = arch_scale_cpu_capacity(j); if (capacity <= max_capacity) continue; -- cgit v1.2.3 From 016190a4b5824df2d5bb97951a04dd3629973671 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 11 Jun 2019 15:29:07 +0300 Subject: sched/wait: Deduplicate code with do-while Statements in the loop's body and before it are identical. Use do-while to not repeat it. Signed-off-by: Pavel Begunkov Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/43ffea6ee2152b90dedf962eac851609e4197218.1560256112.git.asml.silence@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/wait.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index fa0f9adfb752..c1e566a114ca 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -118,16 +118,12 @@ static void __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int bookmark.func = NULL; INIT_LIST_HEAD(&bookmark.entry); - spin_lock_irqsave(&wq_head->lock, flags); - nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive, wake_flags, key, &bookmark); - spin_unlock_irqrestore(&wq_head->lock, flags); - - while (bookmark.flags & WQ_FLAG_BOOKMARK) { + do { spin_lock_irqsave(&wq_head->lock, flags); nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive, wake_flags, key, &bookmark); spin_unlock_irqrestore(&wq_head->lock, flags); - } + } while (bookmark.flags & WQ_FLAG_BOOKMARK); } /** -- cgit v1.2.3 From 9ba5090aecac08ff3ae54ac3bd94b61db7708ffc Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 4 Jun 2019 12:14:54 +0100 Subject: sched/autogroup: Make autogroup_path() always available Remove the #ifdef CONFIG_SCHED_DEBUG. Some of the tracepoints to be introduced in later patches need to access this function. Hence make it always available since the tracepoints are not protected by CONFIG_SCHED_DEBUG. Signed-off-by: Qais Yousef Signed-off-by: Peter Zijlstra (Intel) Cc: Dietmar Eggemann Cc: Linus Torvalds Cc: Pavankumar Kondeti Cc: Peter Zijlstra Cc: Quentin Perret Cc: Sebastian Andrzej Siewior Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Uwe Kleine-Konig Link: https://lkml.kernel.org/r/20190604111459.2862-2-qais.yousef@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/autogroup.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c index 2d4ff5353ded..2067080bb235 100644 --- a/kernel/sched/autogroup.c +++ b/kernel/sched/autogroup.c @@ -259,7 +259,6 @@ out: } #endif /* CONFIG_PROC_FS */ -#ifdef CONFIG_SCHED_DEBUG int autogroup_path(struct task_group *tg, char *buf, int buflen) { if (!task_group_is_autogroup(tg)) @@ -267,4 +266,3 @@ int autogroup_path(struct task_group *tg, char *buf, int buflen) return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); } -#endif -- cgit v1.2.3 From 3c93a0c04dfdcba199982b53b97488b1b1d90eff Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 4 Jun 2019 12:14:55 +0100 Subject: sched/debug: Add a new sched_trace_*() helper functions The new functions allow modules to access internal data structures of unexported struct cfs_rq and struct rq to extract important information from the tracepoints to be introduced in later patches. While at it fix alphabetical order of struct declarations in sched.h Signed-off-by: Qais Yousef Signed-off-by: Peter Zijlstra (Intel) Cc: Dietmar Eggemann Cc: Linus Torvalds Cc: Pavankumar Kondeti Cc: Peter Zijlstra Cc: Quentin Perret Cc: Sebastian Andrzej Siewior Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Uwe Kleine-Konig Link: https://lkml.kernel.org/r/20190604111459.2862-3-qais.yousef@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 99 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 99 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4f8754157763..461c3e9a67b2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -275,6 +275,19 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) return grp->my_q; } +static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len) +{ + if (!path) + return; + + if (cfs_rq && task_group_is_autogroup(cfs_rq->tg)) + autogroup_path(cfs_rq->tg, path, len); + else if (cfs_rq && cfs_rq->tg->css.cgroup) + cgroup_path(cfs_rq->tg->css.cgroup, path, len); + else + strlcpy(path, "(null)", len); +} + static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); @@ -449,6 +462,12 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) return NULL; } +static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len) +{ + if (path) + strlcpy(path, "(null)", len); +} + static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) { return true; @@ -10408,3 +10427,83 @@ __init void init_sched_fair_class(void) #endif /* SMP */ } + +/* + * Helper functions to facilitate extracting info from tracepoints. + */ + +const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq) +{ +#ifdef CONFIG_SMP + return cfs_rq ? &cfs_rq->avg : NULL; +#else + return NULL; +#endif +} +EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_avg); + +char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len) +{ + if (!cfs_rq) { + if (str) + strlcpy(str, "(null)", len); + else + return NULL; + } + + cfs_rq_tg_path(cfs_rq, str, len); + return str; +} +EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_path); + +int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq) +{ + return cfs_rq ? cpu_of(rq_of(cfs_rq)) : -1; +} +EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_cpu); + +const struct sched_avg *sched_trace_rq_avg_rt(struct rq *rq) +{ +#ifdef CONFIG_SMP + return rq ? &rq->avg_rt : NULL; +#else + return NULL; +#endif +} +EXPORT_SYMBOL_GPL(sched_trace_rq_avg_rt); + +const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq) +{ +#ifdef CONFIG_SMP + return rq ? &rq->avg_dl : NULL; +#else + return NULL; +#endif +} +EXPORT_SYMBOL_GPL(sched_trace_rq_avg_dl); + +const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq) +{ +#if defined(CONFIG_SMP) && defined(CONFIG_HAVE_SCHED_AVG_IRQ) + return rq ? &rq->avg_irq : NULL; +#else + return NULL; +#endif +} +EXPORT_SYMBOL_GPL(sched_trace_rq_avg_irq); + +int sched_trace_rq_cpu(struct rq *rq) +{ + return rq ? cpu_of(rq) : -1; +} +EXPORT_SYMBOL_GPL(sched_trace_rq_cpu); + +const struct cpumask *sched_trace_rd_span(struct root_domain *rd) +{ +#ifdef CONFIG_SMP + return rd ? rd->span : NULL; +#else + return NULL; +#endif +} +EXPORT_SYMBOL_GPL(sched_trace_rd_span); -- cgit v1.2.3 From ba19f51fcb549c7ee6261da243eea55a47e98d78 Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 4 Jun 2019 12:14:56 +0100 Subject: sched/debug: Add new tracepoints to track PELT at rq level The new tracepoints allow tracking PELT signals at rq level for all scheduling classes + irq. Signed-off-by: Qais Yousef Signed-off-by: Peter Zijlstra (Intel) Cc: Dietmar Eggemann Cc: Linus Torvalds Cc: Pavankumar Kondeti Cc: Peter Zijlstra Cc: Quentin Perret Cc: Sebastian Andrzej Siewior Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Uwe Kleine-Konig Link: https://lkml.kernel.org/r/20190604111459.2862-4-qais.yousef@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 6 ++++++ kernel/sched/pelt.c | 9 ++++++++- 2 files changed, 14 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 461c3e9a67b2..e883d7e17e36 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3347,6 +3347,8 @@ static inline int propagate_entity_load_avg(struct sched_entity *se) update_tg_cfs_util(cfs_rq, se, gcfs_rq); update_tg_cfs_runnable(cfs_rq, se, gcfs_rq); + trace_pelt_cfs_tp(cfs_rq); + return 1; } @@ -3499,6 +3501,8 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s add_tg_cfs_propagate(cfs_rq, se->avg.load_sum); cfs_rq_util_change(cfs_rq, flags); + + trace_pelt_cfs_tp(cfs_rq); } /** @@ -3518,6 +3522,8 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum); cfs_rq_util_change(cfs_rq, 0); + + trace_pelt_cfs_tp(cfs_rq); } /* diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index 42ea66b07b1d..4e961b55b5ea 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -28,6 +28,8 @@ #include "sched.h" #include "pelt.h" +#include + /* * Approximate: * val * y^n, where y^32 ~= 0.5 (~1 scheduling period) @@ -292,6 +294,7 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq) cfs_rq->curr != NULL)) { ___update_load_avg(&cfs_rq->avg, 1, 1); + trace_pelt_cfs_tp(cfs_rq); return 1; } @@ -317,6 +320,7 @@ int update_rt_rq_load_avg(u64 now, struct rq *rq, int running) running)) { ___update_load_avg(&rq->avg_rt, 1, 1); + trace_pelt_rt_tp(rq); return 1; } @@ -340,6 +344,7 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running) running)) { ___update_load_avg(&rq->avg_dl, 1, 1); + trace_pelt_dl_tp(rq); return 1; } @@ -388,8 +393,10 @@ int update_irq_load_avg(struct rq *rq, u64 running) 1, 1); - if (ret) + if (ret) { ___update_load_avg(&rq->avg_irq, 1, 1); + trace_pelt_irq_tp(rq); + } return ret; } -- cgit v1.2.3 From 8de6242cca17d9299e654e29c966d8612d397272 Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 4 Jun 2019 12:14:57 +0100 Subject: sched/debug: Add new tracepoint to track PELT at se level The new tracepoint allows tracking PELT signals at sched_entity level. Which is supported in CFS tasks and taskgroups only. Signed-off-by: Qais Yousef Signed-off-by: Peter Zijlstra (Intel) Cc: Dietmar Eggemann Cc: Linus Torvalds Cc: Pavankumar Kondeti Cc: Peter Zijlstra Cc: Quentin Perret Cc: Sebastian Andrzej Siewior Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Uwe Kleine-Konig Link: https://lkml.kernel.org/r/20190604111459.2862-5-qais.yousef@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 1 + kernel/sched/pelt.c | 2 ++ 2 files changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e883d7e17e36..75218ab1fa07 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3348,6 +3348,7 @@ static inline int propagate_entity_load_avg(struct sched_entity *se) update_tg_cfs_runnable(cfs_rq, se, gcfs_rq); trace_pelt_cfs_tp(cfs_rq); + trace_pelt_se_tp(se); return 1; } diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index 4e961b55b5ea..a96db50d40e0 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -267,6 +267,7 @@ int __update_load_avg_blocked_se(u64 now, struct sched_entity *se) { if (___update_load_sum(now, &se->avg, 0, 0, 0)) { ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); + trace_pelt_se_tp(se); return 1; } @@ -280,6 +281,7 @@ int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); cfs_se_util_change(&se->avg); + trace_pelt_se_tp(se); return 1; } -- cgit v1.2.3 From f9f240f96efc5bcec62379eac701523e11fbb45b Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 4 Jun 2019 12:14:58 +0100 Subject: sched/debug: Add sched_overutilized tracepoint The new tracepoint allows us to track the changes in overutilized status. Overutilized status is associated with EAS. It indicates that the system is in high performance state. EAS is disabled when the system is in this state since there's not much energy savings while high performance tasks are pushing the system to the limit and it's better to default to the spreading behavior of the scheduler. This tracepoint helps understanding and debugging the conditions under which this happens. Signed-off-by: Qais Yousef Signed-off-by: Peter Zijlstra (Intel) Cc: Dietmar Eggemann Cc: Linus Torvalds Cc: Pavankumar Kondeti Cc: Peter Zijlstra Cc: Quentin Perret Cc: Sebastian Andrzej Siewior Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Uwe Kleine-Konig Link: https://lkml.kernel.org/r/20190604111459.2862-6-qais.yousef@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 75218ab1fa07..11ec52709323 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5181,8 +5181,10 @@ static inline bool cpu_overutilized(int cpu) static inline void update_overutilized_status(struct rq *rq) { - if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) + if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) { WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED); + trace_sched_overutilized_tp(rq->rd, SG_OVERUTILIZED); + } } #else static inline void update_overutilized_status(struct rq *rq) { } @@ -8214,8 +8216,12 @@ next_group: /* Update over-utilization (tipping point, U >= 0) indicator */ WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED); + trace_sched_overutilized_tp(rd, sg_status & SG_OVERUTILIZED); } else if (sg_status & SG_OVERUTILIZED) { - WRITE_ONCE(env->dst_rq->rd->overutilized, SG_OVERUTILIZED); + struct root_domain *rd = env->dst_rq->rd; + + WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED); + trace_sched_overutilized_tp(rd, SG_OVERUTILIZED); } } -- cgit v1.2.3 From a056a5bed7fa67706574b00cf1122c38596b2be1 Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 4 Jun 2019 12:14:59 +0100 Subject: sched/debug: Export the newly added tracepoints So that external modules can hook into them and extract the info they need. Since these new tracepoints have no events associated with them exporting these tracepoints make them useful for external modules to perform testing and debugging. There's no other way otherwise to access them. BPF doesn't have infrastructure to access these bare tracepoints either. Signed-off-by: Qais Yousef Signed-off-by: Peter Zijlstra (Intel) Cc: Dietmar Eggemann Cc: Linus Torvalds Cc: Pavankumar Kondeti Cc: Peter Zijlstra Cc: Quentin Perret Cc: Sebastian Andrzej Siewior Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Uwe Kleine-Konig Link: https://lkml.kernel.org/r/20190604111459.2862-7-qais.yousef@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 83bd6bb32a34..e5e02d23e693 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -23,6 +23,17 @@ #define CREATE_TRACE_POINTS #include +/* + * Export tracepoints that act as a bare tracehook (ie: have no trace event + * associated with them) to allow external modules to probe them. + */ +EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_cfs_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp); + DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_JUMP_LABEL) -- cgit v1.2.3 From a3df067974c52df936f548ed218120f623c4c560 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Tue, 18 Jun 2019 14:23:10 +0200 Subject: sched/fair: Rename weighted_cpuload() to cpu_runnable_load() The term 'weighted' is not needed since there is no 'unweighted' load. Instead use the term 'runnable' to distinguish 'runnable' load (avg.runnable_load_avg) used in load balance from load (avg.load_avg) which is the sum of 'runnable' and 'blocked' load. Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Patrick Bellasi Cc: Peter Zijlstra Cc: Quentin Perret Cc: Rik van Riel Cc: Thomas Gleixner Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lkml.kernel.org/r/57f27a7f-2775-d832-e965-0f4d51bb1954@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 11ec52709323..3bdcd3c718bc 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1485,7 +1485,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4; } -static unsigned long weighted_cpuload(struct rq *rq); +static unsigned long cpu_runnable_load(struct rq *rq); /* Cached statistics for all CPUs within a node */ struct numa_stats { @@ -1506,7 +1506,7 @@ static void update_numa_stats(struct numa_stats *ns, int nid) for_each_cpu(cpu, cpumask_of_node(nid)) { struct rq *rq = cpu_rq(cpu); - ns->load += weighted_cpuload(rq); + ns->load += cpu_runnable_load(rq); ns->compute_capacity += capacity_of(cpu); } @@ -5366,7 +5366,7 @@ static struct { #endif /* CONFIG_NO_HZ_COMMON */ -static unsigned long weighted_cpuload(struct rq *rq) +static unsigned long cpu_runnable_load(struct rq *rq) { return cfs_rq_runnable_load_avg(&rq->cfs); } @@ -5380,7 +5380,7 @@ static unsigned long cpu_avg_load_per_task(int cpu) { struct rq *rq = cpu_rq(cpu); unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running); - unsigned long load_avg = weighted_cpuload(rq); + unsigned long load_avg = cpu_runnable_load(rq); if (nr_running) return load_avg / nr_running; @@ -5478,7 +5478,7 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p, s64 this_eff_load, prev_eff_load; unsigned long task_load; - this_eff_load = weighted_cpuload(cpu_rq(this_cpu)); + this_eff_load = cpu_runnable_load(cpu_rq(this_cpu)); if (sync) { unsigned long current_load = task_h_load(current); @@ -5496,7 +5496,7 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p, this_eff_load *= 100; this_eff_load *= capacity_of(prev_cpu); - prev_eff_load = weighted_cpuload(cpu_rq(prev_cpu)); + prev_eff_load = cpu_runnable_load(cpu_rq(prev_cpu)); prev_eff_load -= task_load; if (sched_feat(WA_BIAS)) prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2; @@ -5584,7 +5584,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, max_spare_cap = 0; for_each_cpu(i, sched_group_span(group)) { - load = weighted_cpuload(cpu_rq(i)); + load = cpu_runnable_load(cpu_rq(i)); runnable_load += load; avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs); @@ -5720,7 +5720,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this shallowest_idle_cpu = i; } } else if (shallowest_idle_cpu == -1) { - load = weighted_cpuload(cpu_rq(i)); + load = cpu_runnable_load(cpu_rq(i)); if (load < min_load) { min_load = load; least_loaded_cpu = i; @@ -7291,7 +7291,7 @@ static struct task_struct *detach_one_task(struct lb_env *env) static const unsigned int sched_nr_migrate_break = 32; /* - * detach_tasks() -- tries to detach up to imbalance weighted load from + * detach_tasks() -- tries to detach up to imbalance runnable load from * busiest_rq, as part of a balancing operation within domain "sd". * * Returns number of detached tasks if successful and 0 otherwise. @@ -7359,7 +7359,7 @@ static int detach_tasks(struct lb_env *env) /* * We only want to steal up to the prescribed amount of - * weighted load. + * runnable load. */ if (env->imbalance <= 0) break; @@ -7969,7 +7969,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false)) env->flags |= LBF_NOHZ_AGAIN; - sgs->group_load += weighted_cpuload(rq); + sgs->group_load += cpu_runnable_load(rq); sgs->group_util += cpu_util(i); sgs->sum_nr_running += rq->cfs.h_nr_running; @@ -8427,7 +8427,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * find_busiest_group - Returns the busiest group within the sched_domain * if there is an imbalance. * - * Also calculates the amount of weighted load which should be moved + * Also calculates the amount of runnable load which should be moved * to restore balance. * * @env: The load balancing environment. @@ -8546,7 +8546,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, int i; for_each_cpu_and(i, sched_group_span(group), env->cpus) { - unsigned long capacity, wl; + unsigned long capacity, load; enum fbq_type rt; rq = cpu_rq(i); @@ -8600,30 +8600,30 @@ static struct rq *find_busiest_queue(struct lb_env *env, rq->nr_running == 1) continue; - wl = weighted_cpuload(rq); + load = cpu_runnable_load(rq); /* - * When comparing with imbalance, use weighted_cpuload() + * When comparing with imbalance, use cpu_runnable_load() * which is not scaled with the CPU capacity. */ - if (rq->nr_running == 1 && wl > env->imbalance && + if (rq->nr_running == 1 && load > env->imbalance && !check_cpu_capacity(rq, env->sd)) continue; /* * For the load comparisons with the other CPU's, consider - * the weighted_cpuload() scaled with the CPU capacity, so + * the cpu_runnable_load() scaled with the CPU capacity, so * that the load can be moved away from the CPU that is * potentially running at a lower capacity. * - * Thus we're looking for max(wl_i / capacity_i), crosswise + * Thus we're looking for max(load_i / capacity_i), crosswise * multiplication to rid ourselves of the division works out - * to: wl_i * capacity_j > wl_j * capacity_i; where j is + * to: load_i * capacity_j > load_j * capacity_i; where j is * our previous maximum. */ - if (wl * busiest_capacity > busiest_load * capacity) { - busiest_load = wl; + if (load * busiest_capacity > busiest_load * capacity) { + busiest_load = load; busiest_capacity = capacity; busiest = rq; } -- cgit v1.2.3 From 69842cba9ace84849bb9b8edcdf2cefccd97901c Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Fri, 21 Jun 2019 09:42:02 +0100 Subject: sched/uclamp: Add CPU's clamp buckets refcounting Utilization clamping allows to clamp the CPU's utilization within a [util_min, util_max] range, depending on the set of RUNNABLE tasks on that CPU. Each task references two "clamp buckets" defining its minimum and maximum (util_{min,max}) utilization "clamp values". A CPU's clamp bucket is active if there is at least one RUNNABLE tasks enqueued on that CPU and refcounting that bucket. When a task is {en,de}queued {on,from} a rq, the set of active clamp buckets on that CPU can change. If the set of active clamp buckets changes for a CPU a new "aggregated" clamp value is computed for that CPU. This is because each clamp bucket enforces a different utilization clamp value. Clamp values are always MAX aggregated for both util_min and util_max. This ensures that no task can affect the performance of other co-scheduled tasks which are more boosted (i.e. with higher util_min clamp) or less capped (i.e. with higher util_max clamp). A task has: task_struct::uclamp[clamp_id]::bucket_id to track the "bucket index" of the CPU's clamp bucket it refcounts while enqueued, for each clamp index (clamp_id). A runqueue has: rq::uclamp[clamp_id]::bucket[bucket_id].tasks to track how many RUNNABLE tasks on that CPU refcount each clamp bucket (bucket_id) of a clamp index (clamp_id). It also has a: rq::uclamp[clamp_id]::bucket[bucket_id].value to track the clamp value of each clamp bucket (bucket_id) of a clamp index (clamp_id). The rq::uclamp::bucket[clamp_id][] array is scanned every time it's needed to find a new MAX aggregated clamp value for a clamp_id. This operation is required only when it's dequeued the last task of a clamp bucket tracking the current MAX aggregated clamp value. In this case, the CPU is either entering IDLE or going to schedule a less boosted or more clamped task. The expected number of different clamp values configured at build time is small enough to fit the full unordered array into a single cache line, for configurations of up to 7 buckets. Add to struct rq the basic data structures required to refcount the number of RUNNABLE tasks for each clamp bucket. Add also the max aggregation required to update the rq's clamp value at each enqueue/dequeue event. Use a simple linear mapping of clamp values into clamp buckets. Pre-compute and cache bucket_id to avoid integer divisions at enqueue/dequeue time. Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Cc: Alessio Balsini Cc: Dietmar Eggemann Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Quentin Perret Cc: Rafael J . Wysocki Cc: Steve Muckle Cc: Suren Baghdasaryan Cc: Tejun Heo Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Cc: Viresh Kumar Link: https://lkml.kernel.org/r/20190621084217.8167-2-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 166 +++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/sched/sched.h | 51 ++++++++++++++++ 2 files changed, 217 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e5e02d23e693..d8c1e67afd82 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -772,6 +772,168 @@ static void set_load_weight(struct task_struct *p, bool update_load) } } +#ifdef CONFIG_UCLAMP_TASK + +/* Integer rounded range for each bucket */ +#define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS) + +#define for_each_clamp_id(clamp_id) \ + for ((clamp_id) = 0; (clamp_id) < UCLAMP_CNT; (clamp_id)++) + +static inline unsigned int uclamp_bucket_id(unsigned int clamp_value) +{ + return clamp_value / UCLAMP_BUCKET_DELTA; +} + +static inline unsigned int uclamp_none(int clamp_id) +{ + if (clamp_id == UCLAMP_MIN) + return 0; + return SCHED_CAPACITY_SCALE; +} + +static inline void uclamp_se_set(struct uclamp_se *uc_se, unsigned int value) +{ + uc_se->value = value; + uc_se->bucket_id = uclamp_bucket_id(value); +} + +static inline +unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id) +{ + struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket; + int bucket_id = UCLAMP_BUCKETS - 1; + + /* + * Since both min and max clamps are max aggregated, find the + * top most bucket with tasks in. + */ + for ( ; bucket_id >= 0; bucket_id--) { + if (!bucket[bucket_id].tasks) + continue; + return bucket[bucket_id].value; + } + + /* No tasks -- default clamp values */ + return uclamp_none(clamp_id); +} + +/* + * When a task is enqueued on a rq, the clamp bucket currently defined by the + * task's uclamp::bucket_id is refcounted on that rq. This also immediately + * updates the rq's clamp value if required. + */ +static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p, + unsigned int clamp_id) +{ + struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id]; + struct uclamp_se *uc_se = &p->uclamp[clamp_id]; + struct uclamp_bucket *bucket; + + lockdep_assert_held(&rq->lock); + + bucket = &uc_rq->bucket[uc_se->bucket_id]; + bucket->tasks++; + + if (uc_se->value > READ_ONCE(uc_rq->value)) + WRITE_ONCE(uc_rq->value, bucket->value); +} + +/* + * When a task is dequeued from a rq, the clamp bucket refcounted by the task + * is released. If this is the last task reference counting the rq's max + * active clamp value, then the rq's clamp value is updated. + * + * Both refcounted tasks and rq's cached clamp values are expected to be + * always valid. If it's detected they are not, as defensive programming, + * enforce the expected state and warn. + */ +static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p, + unsigned int clamp_id) +{ + struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id]; + struct uclamp_se *uc_se = &p->uclamp[clamp_id]; + struct uclamp_bucket *bucket; + unsigned int rq_clamp; + + lockdep_assert_held(&rq->lock); + + bucket = &uc_rq->bucket[uc_se->bucket_id]; + SCHED_WARN_ON(!bucket->tasks); + if (likely(bucket->tasks)) + bucket->tasks--; + + if (likely(bucket->tasks)) + return; + + rq_clamp = READ_ONCE(uc_rq->value); + /* + * Defensive programming: this should never happen. If it happens, + * e.g. due to future modification, warn and fixup the expected value. + */ + SCHED_WARN_ON(bucket->value > rq_clamp); + if (bucket->value >= rq_clamp) + WRITE_ONCE(uc_rq->value, uclamp_rq_max_value(rq, clamp_id)); +} + +static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) +{ + unsigned int clamp_id; + + if (unlikely(!p->sched_class->uclamp_enabled)) + return; + + for_each_clamp_id(clamp_id) + uclamp_rq_inc_id(rq, p, clamp_id); +} + +static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) +{ + unsigned int clamp_id; + + if (unlikely(!p->sched_class->uclamp_enabled)) + return; + + for_each_clamp_id(clamp_id) + uclamp_rq_dec_id(rq, p, clamp_id); +} + +static void __init init_uclamp(void) +{ + unsigned int clamp_id; + int cpu; + + for_each_possible_cpu(cpu) { + struct uclamp_bucket *bucket; + struct uclamp_rq *uc_rq; + unsigned int bucket_id; + + memset(&cpu_rq(cpu)->uclamp, 0, sizeof(struct uclamp_rq)); + + for_each_clamp_id(clamp_id) { + uc_rq = &cpu_rq(cpu)->uclamp[clamp_id]; + + bucket_id = 1; + while (bucket_id < UCLAMP_BUCKETS) { + bucket = &uc_rq->bucket[bucket_id]; + bucket->value = bucket_id * UCLAMP_BUCKET_DELTA; + ++bucket_id; + } + } + } + + for_each_clamp_id(clamp_id) { + uclamp_se_set(&init_task.uclamp[clamp_id], + uclamp_none(clamp_id)); + } +} + +#else /* CONFIG_UCLAMP_TASK */ +static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { } +static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { } +static inline void init_uclamp(void) { } +#endif /* CONFIG_UCLAMP_TASK */ + static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) { if (!(flags & ENQUEUE_NOCLOCK)) @@ -782,6 +944,7 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) psi_enqueue(p, flags & ENQUEUE_WAKEUP); } + uclamp_rq_inc(rq, p); p->sched_class->enqueue_task(rq, p, flags); } @@ -795,6 +958,7 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) psi_dequeue(p, flags & DEQUEUE_SLEEP); } + uclamp_rq_dec(rq, p); p->sched_class->dequeue_task(rq, p, flags); } @@ -6093,6 +6257,8 @@ void __init sched_init(void) psi_init(); + init_uclamp(); + scheduler_running = 1; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index e58ab597ec88..cecc6baaba93 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -791,6 +791,48 @@ extern void rto_push_irq_work_func(struct irq_work *work); #endif #endif /* CONFIG_SMP */ +#ifdef CONFIG_UCLAMP_TASK +/* + * struct uclamp_bucket - Utilization clamp bucket + * @value: utilization clamp value for tasks on this clamp bucket + * @tasks: number of RUNNABLE tasks on this clamp bucket + * + * Keep track of how many tasks are RUNNABLE for a given utilization + * clamp value. + */ +struct uclamp_bucket { + unsigned long value : bits_per(SCHED_CAPACITY_SCALE); + unsigned long tasks : BITS_PER_LONG - bits_per(SCHED_CAPACITY_SCALE); +}; + +/* + * struct uclamp_rq - rq's utilization clamp + * @value: currently active clamp values for a rq + * @bucket: utilization clamp buckets affecting a rq + * + * Keep track of RUNNABLE tasks on a rq to aggregate their clamp values. + * A clamp value is affecting a rq when there is at least one task RUNNABLE + * (or actually running) with that value. + * + * There are up to UCLAMP_CNT possible different clamp values, currently there + * are only two: minimum utilization and maximum utilization. + * + * All utilization clamping values are MAX aggregated, since: + * - for util_min: we want to run the CPU at least at the max of the minimum + * utilization required by its currently RUNNABLE tasks. + * - for util_max: we want to allow the CPU to run up to the max of the + * maximum utilization allowed by its currently RUNNABLE tasks. + * + * Since on each system we expect only a limited number of different + * utilization clamp values (UCLAMP_BUCKETS), use a simple array to track + * the metrics required to compute all the per-rq utilization clamp values. + */ +struct uclamp_rq { + unsigned int value; + struct uclamp_bucket bucket[UCLAMP_BUCKETS]; +}; +#endif /* CONFIG_UCLAMP_TASK */ + /* * This is the main, per-CPU runqueue data structure. * @@ -825,6 +867,11 @@ struct rq { unsigned long nr_load_updates; u64 nr_switches; +#ifdef CONFIG_UCLAMP_TASK + /* Utilization clamp values based on CPU's RUNNABLE tasks */ + struct uclamp_rq uclamp[UCLAMP_CNT] ____cacheline_aligned; +#endif + struct cfs_rq cfs; struct rt_rq rt; struct dl_rq dl; @@ -1639,6 +1686,10 @@ extern const u32 sched_prio_to_wmult[40]; struct sched_class { const struct sched_class *next; +#ifdef CONFIG_UCLAMP_TASK + int uclamp_enabled; +#endif + void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); void (*yield_task) (struct rq *rq); -- cgit v1.2.3 From 60daf9c19410604f08c99e146bc378c8a64f4ccd Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Fri, 21 Jun 2019 09:42:03 +0100 Subject: sched/uclamp: Add bucket local max tracking Because of bucketization, different task-specific clamp values are tracked in the same bucket. For example, with 20% bucket size and assuming to have: Task1: util_min=25% Task2: util_min=35% both tasks will be refcounted in the [20..39]% bucket and always boosted only up to 20% thus implementing a simple floor aggregation normally used in histograms. In systems with only few and well-defined clamp values, it would be useful to track the exact clamp value required by a task whenever possible. For example, if a system requires only 23% and 47% boost values then it's possible to track the exact boost required by each task using only 3 buckets of ~33% size each. Introduce a mechanism to max aggregate the requested clamp values of RUNNABLE tasks in the same bucket. Keep it simple by resetting the bucket value to its base value only when a bucket becomes inactive. Allow a limited and controlled overboosting margin for tasks recounted in the same bucket. In systems where the boost values are not known in advance, it is still possible to control the maximum acceptable overboosting margin by tuning the number of clamp groups. For example, 20 groups ensure a 5% maximum overboost. Remove the rq bucket initialization code since a correct bucket value is now computed when a task is refcounted into a CPU's rq. Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Cc: Alessio Balsini Cc: Dietmar Eggemann Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Quentin Perret Cc: Rafael J . Wysocki Cc: Steve Muckle Cc: Suren Baghdasaryan Cc: Tejun Heo Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Cc: Viresh Kumar Link: https://lkml.kernel.org/r/20190621084217.8167-3-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 43 +++++++++++++++++++++++++------------------ 1 file changed, 25 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d8c1e67afd82..0a6eff8a278b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -785,6 +785,11 @@ static inline unsigned int uclamp_bucket_id(unsigned int clamp_value) return clamp_value / UCLAMP_BUCKET_DELTA; } +static inline unsigned int uclamp_bucket_base_value(unsigned int clamp_value) +{ + return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value); +} + static inline unsigned int uclamp_none(int clamp_id) { if (clamp_id == UCLAMP_MIN) @@ -822,6 +827,11 @@ unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id) * When a task is enqueued on a rq, the clamp bucket currently defined by the * task's uclamp::bucket_id is refcounted on that rq. This also immediately * updates the rq's clamp value if required. + * + * Tasks can have a task-specific value requested from user-space, track + * within each bucket the maximum value for tasks refcounted in it. + * This "local max aggregation" allows to track the exact "requested" value + * for each bucket when all its RUNNABLE tasks require the same clamp. */ static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p, unsigned int clamp_id) @@ -835,8 +845,15 @@ static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p, bucket = &uc_rq->bucket[uc_se->bucket_id]; bucket->tasks++; + /* + * Local max aggregation: rq buckets always track the max + * "requested" clamp value of its RUNNABLE tasks. + */ + if (bucket->tasks == 1 || uc_se->value > bucket->value) + bucket->value = uc_se->value; + if (uc_se->value > READ_ONCE(uc_rq->value)) - WRITE_ONCE(uc_rq->value, bucket->value); + WRITE_ONCE(uc_rq->value, uc_se->value); } /* @@ -863,6 +880,12 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p, if (likely(bucket->tasks)) bucket->tasks--; + /* + * Keep "local max aggregation" simple and accept to (possibly) + * overboost some RUNNABLE tasks in the same bucket. + * The rq clamp bucket value is reset to its base value whenever + * there are no more RUNNABLE tasks refcounting it. + */ if (likely(bucket->tasks)) return; @@ -903,25 +926,9 @@ static void __init init_uclamp(void) unsigned int clamp_id; int cpu; - for_each_possible_cpu(cpu) { - struct uclamp_bucket *bucket; - struct uclamp_rq *uc_rq; - unsigned int bucket_id; - + for_each_possible_cpu(cpu) memset(&cpu_rq(cpu)->uclamp, 0, sizeof(struct uclamp_rq)); - for_each_clamp_id(clamp_id) { - uc_rq = &cpu_rq(cpu)->uclamp[clamp_id]; - - bucket_id = 1; - while (bucket_id < UCLAMP_BUCKETS) { - bucket = &uc_rq->bucket[bucket_id]; - bucket->value = bucket_id * UCLAMP_BUCKET_DELTA; - ++bucket_id; - } - } - } - for_each_clamp_id(clamp_id) { uclamp_se_set(&init_task.uclamp[clamp_id], uclamp_none(clamp_id)); -- cgit v1.2.3 From e496187da71070687b55ff455e7d8d7d7f0ae0b9 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Fri, 21 Jun 2019 09:42:04 +0100 Subject: sched/uclamp: Enforce last task's UCLAMP_MAX When a task sleeps it removes its max utilization clamp from its CPU. However, the blocked utilization on that CPU can be higher than the max clamp value enforced while the task was running. This allows undesired CPU frequency increases while a CPU is idle, for example, when another CPU on the same frequency domain triggers a frequency update, since schedutil can now see the full not clamped blocked utilization of the idle CPU. Fix this by using: uclamp_rq_dec_id(p, rq, UCLAMP_MAX) uclamp_rq_max_value(rq, UCLAMP_MAX, clamp_value) to detect when a CPU has no more RUNNABLE clamped tasks and to flag this condition. Don't track any minimum utilization clamps since an idle CPU never requires a minimum frequency. The decay of the blocked utilization is good enough to reduce the CPU frequency. Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Cc: Alessio Balsini Cc: Dietmar Eggemann Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Quentin Perret Cc: Rafael J . Wysocki Cc: Steve Muckle Cc: Suren Baghdasaryan Cc: Tejun Heo Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Cc: Viresh Kumar Link: https://lkml.kernel.org/r/20190621084217.8167-4-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 49 ++++++++++++++++++++++++++++++++++++++++++++----- kernel/sched/sched.h | 2 ++ 2 files changed, 46 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0a6eff8a278b..2dde735635ec 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -803,8 +803,36 @@ static inline void uclamp_se_set(struct uclamp_se *uc_se, unsigned int value) uc_se->bucket_id = uclamp_bucket_id(value); } +static inline unsigned int +uclamp_idle_value(struct rq *rq, unsigned int clamp_id, + unsigned int clamp_value) +{ + /* + * Avoid blocked utilization pushing up the frequency when we go + * idle (which drops the max-clamp) by retaining the last known + * max-clamp. + */ + if (clamp_id == UCLAMP_MAX) { + rq->uclamp_flags |= UCLAMP_FLAG_IDLE; + return clamp_value; + } + + return uclamp_none(UCLAMP_MIN); +} + +static inline void uclamp_idle_reset(struct rq *rq, unsigned int clamp_id, + unsigned int clamp_value) +{ + /* Reset max-clamp retention only on idle exit */ + if (!(rq->uclamp_flags & UCLAMP_FLAG_IDLE)) + return; + + WRITE_ONCE(rq->uclamp[clamp_id].value, clamp_value); +} + static inline -unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id) +unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id, + unsigned int clamp_value) { struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket; int bucket_id = UCLAMP_BUCKETS - 1; @@ -820,7 +848,7 @@ unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id) } /* No tasks -- default clamp values */ - return uclamp_none(clamp_id); + return uclamp_idle_value(rq, clamp_id, clamp_value); } /* @@ -845,6 +873,8 @@ static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p, bucket = &uc_rq->bucket[uc_se->bucket_id]; bucket->tasks++; + uclamp_idle_reset(rq, clamp_id, uc_se->value); + /* * Local max aggregation: rq buckets always track the max * "requested" clamp value of its RUNNABLE tasks. @@ -871,6 +901,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p, struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id]; struct uclamp_se *uc_se = &p->uclamp[clamp_id]; struct uclamp_bucket *bucket; + unsigned int bkt_clamp; unsigned int rq_clamp; lockdep_assert_held(&rq->lock); @@ -895,8 +926,10 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p, * e.g. due to future modification, warn and fixup the expected value. */ SCHED_WARN_ON(bucket->value > rq_clamp); - if (bucket->value >= rq_clamp) - WRITE_ONCE(uc_rq->value, uclamp_rq_max_value(rq, clamp_id)); + if (bucket->value >= rq_clamp) { + bkt_clamp = uclamp_rq_max_value(rq, clamp_id, uc_se->value); + WRITE_ONCE(uc_rq->value, bkt_clamp); + } } static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) @@ -908,6 +941,10 @@ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) for_each_clamp_id(clamp_id) uclamp_rq_inc_id(rq, p, clamp_id); + + /* Reset clamp idle holding when there is one RUNNABLE task */ + if (rq->uclamp_flags & UCLAMP_FLAG_IDLE) + rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE; } static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) @@ -926,8 +963,10 @@ static void __init init_uclamp(void) unsigned int clamp_id; int cpu; - for_each_possible_cpu(cpu) + for_each_possible_cpu(cpu) { memset(&cpu_rq(cpu)->uclamp, 0, sizeof(struct uclamp_rq)); + cpu_rq(cpu)->uclamp_flags = 0; + } for_each_clamp_id(clamp_id) { uclamp_se_set(&init_task.uclamp[clamp_id], diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index cecc6baaba93..0d2ba8bb2cb3 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -870,6 +870,8 @@ struct rq { #ifdef CONFIG_UCLAMP_TASK /* Utilization clamp values based on CPU's RUNNABLE tasks */ struct uclamp_rq uclamp[UCLAMP_CNT] ____cacheline_aligned; + unsigned int uclamp_flags; +#define UCLAMP_FLAG_IDLE 0x01 #endif struct cfs_rq cfs; -- cgit v1.2.3 From e8f14172c6b11e9a86c65532497087f8eb0f91b1 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Fri, 21 Jun 2019 09:42:05 +0100 Subject: sched/uclamp: Add system default clamps Tasks without a user-defined clamp value are considered not clamped and by default their utilization can have any value in the [0..SCHED_CAPACITY_SCALE] range. Tasks with a user-defined clamp value are allowed to request any value in that range, and the required clamp is unconditionally enforced. However, a "System Management Software" could be interested in limiting the range of clamp values allowed for all tasks. Add a privileged interface to define a system default configuration via: /proc/sys/kernel/sched_uclamp_util_{min,max} which works as an unconditional clamp range restriction for all tasks. With the default configuration, the full SCHED_CAPACITY_SCALE range of values is allowed for each clamp index. Otherwise, the task-specific clamp is capped by the corresponding system default value. Do that by tracking, for each task, the "effective" clamp value and bucket the task has been refcounted in at enqueue time. This allows to lazy aggregate "requested" and "system default" values at enqueue time and simplifies refcounting updates at dequeue time. The cached bucket ids are used to avoid (relatively) more expensive integer divisions every time a task is enqueued. An active flag is used to report when the "effective" value is valid and thus the task is actually refcounted in the corresponding rq's bucket. Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Cc: Alessio Balsini Cc: Dietmar Eggemann Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Quentin Perret Cc: Rafael J . Wysocki Cc: Steve Muckle Cc: Suren Baghdasaryan Cc: Tejun Heo Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Cc: Viresh Kumar Link: https://lkml.kernel.org/r/20190621084217.8167-5-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 99 ++++++++++++++++++++++++++++++++++++++++++++++++++++- kernel/sysctl.c | 16 +++++++++ 2 files changed, 114 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2dde735635ec..b74de86b68c7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -773,6 +773,14 @@ static void set_load_weight(struct task_struct *p, bool update_load) } #ifdef CONFIG_UCLAMP_TASK +/* Max allowed minimum utilization */ +unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE; + +/* Max allowed maximum utilization */ +unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE; + +/* All clamps are required to be less or equal than these values */ +static struct uclamp_se uclamp_default[UCLAMP_CNT]; /* Integer rounded range for each bucket */ #define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS) @@ -851,6 +859,25 @@ unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id, return uclamp_idle_value(rq, clamp_id, clamp_value); } +/* + * The effective clamp bucket index of a task depends on, by increasing + * priority: + * - the task specific clamp value, when explicitly requested from userspace + * - the system default clamp value, defined by the sysadmin + */ +static inline struct uclamp_se +uclamp_eff_get(struct task_struct *p, unsigned int clamp_id) +{ + struct uclamp_se uc_req = p->uclamp_req[clamp_id]; + struct uclamp_se uc_max = uclamp_default[clamp_id]; + + /* System default restrictions always apply */ + if (unlikely(uc_req.value > uc_max.value)) + return uc_max; + + return uc_req; +} + /* * When a task is enqueued on a rq, the clamp bucket currently defined by the * task's uclamp::bucket_id is refcounted on that rq. This also immediately @@ -870,8 +897,12 @@ static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p, lockdep_assert_held(&rq->lock); + /* Update task effective clamp */ + p->uclamp[clamp_id] = uclamp_eff_get(p, clamp_id); + bucket = &uc_rq->bucket[uc_se->bucket_id]; bucket->tasks++; + uc_se->active = true; uclamp_idle_reset(rq, clamp_id, uc_se->value); @@ -910,6 +941,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p, SCHED_WARN_ON(!bucket->tasks); if (likely(bucket->tasks)) bucket->tasks--; + uc_se->active = false; /* * Keep "local max aggregation" simple and accept to (possibly) @@ -958,8 +990,65 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) uclamp_rq_dec_id(rq, p, clamp_id); } +int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int old_min, old_max; + static DEFINE_MUTEX(mutex); + int result; + + mutex_lock(&mutex); + old_min = sysctl_sched_uclamp_util_min; + old_max = sysctl_sched_uclamp_util_max; + + result = proc_dointvec(table, write, buffer, lenp, ppos); + if (result) + goto undo; + if (!write) + goto done; + + if (sysctl_sched_uclamp_util_min > sysctl_sched_uclamp_util_max || + sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE) { + result = -EINVAL; + goto undo; + } + + if (old_min != sysctl_sched_uclamp_util_min) { + uclamp_se_set(&uclamp_default[UCLAMP_MIN], + sysctl_sched_uclamp_util_min); + } + if (old_max != sysctl_sched_uclamp_util_max) { + uclamp_se_set(&uclamp_default[UCLAMP_MAX], + sysctl_sched_uclamp_util_max); + } + + /* + * Updating all the RUNNABLE task is expensive, keep it simple and do + * just a lazy update at each next enqueue time. + */ + goto done; + +undo: + sysctl_sched_uclamp_util_min = old_min; + sysctl_sched_uclamp_util_max = old_max; +done: + mutex_unlock(&mutex); + + return result; +} + +static void uclamp_fork(struct task_struct *p) +{ + unsigned int clamp_id; + + for_each_clamp_id(clamp_id) + p->uclamp[clamp_id].active = false; +} + static void __init init_uclamp(void) { + struct uclamp_se uc_max = {}; unsigned int clamp_id; int cpu; @@ -969,14 +1058,20 @@ static void __init init_uclamp(void) } for_each_clamp_id(clamp_id) { - uclamp_se_set(&init_task.uclamp[clamp_id], + uclamp_se_set(&init_task.uclamp_req[clamp_id], uclamp_none(clamp_id)); } + + /* System defaults allow max clamp values for both indexes */ + uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX)); + for_each_clamp_id(clamp_id) + uclamp_default[clamp_id] = uc_max; } #else /* CONFIG_UCLAMP_TASK */ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { } static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { } +static inline void uclamp_fork(struct task_struct *p) { } static inline void init_uclamp(void) { } #endif /* CONFIG_UCLAMP_TASK */ @@ -2545,6 +2640,8 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) */ p->prio = current->normal_prio; + uclamp_fork(p); + /* * Revert to default priority/policy on fork if requested. */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 1beca96fb625..1c1ad1e14f21 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -452,6 +452,22 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = sched_rr_handler, }, +#ifdef CONFIG_UCLAMP_TASK + { + .procname = "sched_util_clamp_min", + .data = &sysctl_sched_uclamp_util_min, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_sched_uclamp_handler, + }, + { + .procname = "sched_util_clamp_max", + .data = &sysctl_sched_uclamp_util_max, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_sched_uclamp_handler, + }, +#endif #ifdef CONFIG_SCHED_AUTOGROUP { .procname = "sched_autogroup_enabled", -- cgit v1.2.3 From 1d6362fa0cfc8c7b243fa92924429d826599e691 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Fri, 21 Jun 2019 09:42:06 +0100 Subject: sched/core: Allow sched_setattr() to use the current policy The sched_setattr() syscall mandates that a policy is always specified. This requires to always know which policy a task will have when attributes are configured and this makes it impossible to add more generic task attributes valid across different scheduling policies. Reading the policy before setting generic tasks attributes is racy since we cannot be sure it is not changed concurrently. Introduce the required support to change generic task attributes without affecting the current task policy. This is done by adding an attribute flag (SCHED_FLAG_KEEP_POLICY) to enforce the usage of the current policy. Add support for the SETPARAM_POLICY policy, which is already used by the sched_setparam() POSIX syscall, to the sched_setattr() non-POSIX syscall. Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Cc: Alessio Balsini Cc: Dietmar Eggemann Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Quentin Perret Cc: Rafael J . Wysocki Cc: Steve Muckle Cc: Suren Baghdasaryan Cc: Tejun Heo Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Cc: Viresh Kumar Link: https://lkml.kernel.org/r/20190621084217.8167-6-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b74de86b68c7..6d519f3f9789 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4897,6 +4897,8 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, if ((int)attr.sched_policy < 0) return -EINVAL; + if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY) + attr.sched_policy = SETPARAM_POLICY; rcu_read_lock(); retval = -ESRCH; -- cgit v1.2.3 From a509a7cd79747074a2c018a45bbbc52d1f4aed44 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Fri, 21 Jun 2019 09:42:07 +0100 Subject: sched/uclamp: Extend sched_setattr() to support utilization clamping The SCHED_DEADLINE scheduling class provides an advanced and formal model to define tasks requirements that can translate into proper decisions for both task placements and frequencies selections. Other classes have a more simplified model based on the POSIX concept of priorities. Such a simple priority based model however does not allow to exploit most advanced features of the Linux scheduler like, for example, driving frequencies selection via the schedutil cpufreq governor. However, also for non SCHED_DEADLINE tasks, it's still interesting to define tasks properties to support scheduler decisions. Utilization clamping exposes to user-space a new set of per-task attributes the scheduler can use as hints about the expected/required utilization for a task. This allows to implement a "proactive" per-task frequency control policy, a more advanced policy than the current one based just on "passive" measured task utilization. For example, it's possible to boost interactive tasks (e.g. to get better performance) or cap background tasks (e.g. to be more energy/thermal efficient). Introduce a new API to set utilization clamping values for a specified task by extending sched_setattr(), a syscall which already allows to define task specific properties for different scheduling classes. A new pair of attributes allows to specify a minimum and maximum utilization the scheduler can consider for a task. Do that by validating the required clamp values before and then applying the required changes using _the_ same pattern already in use for __setscheduler(). This ensures that the task is re-enqueued with the new clamp values. Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Cc: Alessio Balsini Cc: Dietmar Eggemann Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Quentin Perret Cc: Rafael J . Wysocki Cc: Steve Muckle Cc: Suren Baghdasaryan Cc: Tejun Heo Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Cc: Viresh Kumar Link: https://lkml.kernel.org/r/20190621084217.8167-7-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 91 ++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 84 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6d519f3f9789..e9a669266fa9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -805,10 +805,12 @@ static inline unsigned int uclamp_none(int clamp_id) return SCHED_CAPACITY_SCALE; } -static inline void uclamp_se_set(struct uclamp_se *uc_se, unsigned int value) +static inline void uclamp_se_set(struct uclamp_se *uc_se, + unsigned int value, bool user_defined) { uc_se->value = value; uc_se->bucket_id = uclamp_bucket_id(value); + uc_se->user_defined = user_defined; } static inline unsigned int @@ -1016,11 +1018,11 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, if (old_min != sysctl_sched_uclamp_util_min) { uclamp_se_set(&uclamp_default[UCLAMP_MIN], - sysctl_sched_uclamp_util_min); + sysctl_sched_uclamp_util_min, false); } if (old_max != sysctl_sched_uclamp_util_max) { uclamp_se_set(&uclamp_default[UCLAMP_MAX], - sysctl_sched_uclamp_util_max); + sysctl_sched_uclamp_util_max, false); } /* @@ -1038,6 +1040,42 @@ done: return result; } +static int uclamp_validate(struct task_struct *p, + const struct sched_attr *attr) +{ + unsigned int lower_bound = p->uclamp_req[UCLAMP_MIN].value; + unsigned int upper_bound = p->uclamp_req[UCLAMP_MAX].value; + + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) + lower_bound = attr->sched_util_min; + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) + upper_bound = attr->sched_util_max; + + if (lower_bound > upper_bound) + return -EINVAL; + if (upper_bound > SCHED_CAPACITY_SCALE) + return -EINVAL; + + return 0; +} + +static void __setscheduler_uclamp(struct task_struct *p, + const struct sched_attr *attr) +{ + if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP))) + return; + + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) { + uclamp_se_set(&p->uclamp_req[UCLAMP_MIN], + attr->sched_util_min, true); + } + + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) { + uclamp_se_set(&p->uclamp_req[UCLAMP_MAX], + attr->sched_util_max, true); + } +} + static void uclamp_fork(struct task_struct *p) { unsigned int clamp_id; @@ -1059,11 +1097,11 @@ static void __init init_uclamp(void) for_each_clamp_id(clamp_id) { uclamp_se_set(&init_task.uclamp_req[clamp_id], - uclamp_none(clamp_id)); + uclamp_none(clamp_id), false); } /* System defaults allow max clamp values for both indexes */ - uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX)); + uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX), false); for_each_clamp_id(clamp_id) uclamp_default[clamp_id] = uc_max; } @@ -1071,6 +1109,13 @@ static void __init init_uclamp(void) #else /* CONFIG_UCLAMP_TASK */ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { } static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { } +static inline int uclamp_validate(struct task_struct *p, + const struct sched_attr *attr) +{ + return -EOPNOTSUPP; +} +static void __setscheduler_uclamp(struct task_struct *p, + const struct sched_attr *attr) { } static inline void uclamp_fork(struct task_struct *p) { } static inline void init_uclamp(void) { } #endif /* CONFIG_UCLAMP_TASK */ @@ -4412,6 +4457,13 @@ static void __setscheduler_params(struct task_struct *p, static void __setscheduler(struct rq *rq, struct task_struct *p, const struct sched_attr *attr, bool keep_boost) { + /* + * If params can't change scheduling class changes aren't allowed + * either. + */ + if (attr->sched_flags & SCHED_FLAG_KEEP_PARAMS) + return; + __setscheduler_params(p, attr); /* @@ -4549,6 +4601,13 @@ recheck: return retval; } + /* Update task specific "requested" clamps */ + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) { + retval = uclamp_validate(p, attr); + if (retval) + return retval; + } + /* * Make sure no PI-waiters arrive (or leave) while we are * changing the priority of the task: @@ -4578,6 +4637,8 @@ recheck: goto change; if (dl_policy(policy) && dl_param_changed(p, attr)) goto change; + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) + goto change; p->sched_reset_on_fork = reset_on_fork; task_rq_unlock(rq, p, &rf); @@ -4658,7 +4719,9 @@ change: put_prev_task(rq, p); prev_class = p->sched_class; + __setscheduler(rq, p, attr, pi); + __setscheduler_uclamp(p, attr); if (queued) { /* @@ -4834,6 +4897,10 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a if (ret) return -EFAULT; + if ((attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) && + size < SCHED_ATTR_SIZE_VER1) + return -EINVAL; + /* * XXX: Do we want to be lenient like existing syscalls; or do we want * to be strict and return an error on out-of-bounds values? @@ -4903,10 +4970,15 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, rcu_read_lock(); retval = -ESRCH; p = find_process_by_pid(pid); - if (p != NULL) - retval = sched_setattr(p, &attr); + if (likely(p)) + get_task_struct(p); rcu_read_unlock(); + if (likely(p)) { + retval = sched_setattr(p, &attr); + put_task_struct(p); + } + return retval; } @@ -5057,6 +5129,11 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, else attr.sched_nice = task_nice(p); +#ifdef CONFIG_UCLAMP_TASK + attr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value; + attr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value; +#endif + rcu_read_unlock(); retval = sched_read_attr(uattr, &attr, size); -- cgit v1.2.3 From a87498ace58e23b62a572dc7267579ede4c8495c Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Fri, 21 Jun 2019 09:42:08 +0100 Subject: sched/uclamp: Reset uclamp values on RESET_ON_FORK A forked tasks gets the same clamp values of its parent however, when the RESET_ON_FORK flag is set on parent, e.g. via: sys_sched_setattr() sched_setattr() __sched_setscheduler(attr::SCHED_FLAG_RESET_ON_FORK) the new forked task is expected to start with all attributes reset to default values. Do that for utilization clamp values too by checking the reset request from the existing uclamp_fork() call which already provides the required initialization for other uclamp related bits. Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Cc: Alessio Balsini Cc: Dietmar Eggemann Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Quentin Perret Cc: Rafael J . Wysocki Cc: Steve Muckle Cc: Suren Baghdasaryan Cc: Tejun Heo Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Cc: Viresh Kumar Link: https://lkml.kernel.org/r/20190621084217.8167-8-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e9a669266fa9..ecc304ab906f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1082,6 +1082,14 @@ static void uclamp_fork(struct task_struct *p) for_each_clamp_id(clamp_id) p->uclamp[clamp_id].active = false; + + if (likely(!p->sched_reset_on_fork)) + return; + + for_each_clamp_id(clamp_id) { + uclamp_se_set(&p->uclamp_req[clamp_id], + uclamp_none(clamp_id), false); + } } static void __init init_uclamp(void) -- cgit v1.2.3 From 1a00d999971c78ab024a17b0efc37d78404dd120 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Fri, 21 Jun 2019 09:42:09 +0100 Subject: sched/uclamp: Set default clamps for RT tasks By default FAIR tasks start without clamps, i.e. neither boosted nor capped, and they run at the best frequency matching their utilization demand. This default behavior does not fit RT tasks which instead are expected to run at the maximum available frequency, if not otherwise required by explicitly capping them. Enforce the correct behavior for RT tasks by setting util_min to max whenever: 1. the task is switched to the RT class and it does not already have a user-defined clamp value assigned. 2. an RT task is forked from a parent with RESET_ON_FORK set. NOTE: utilization clamp values are cross scheduling class attributes and thus they are never changed/reset once a value has been explicitly defined from user-space. Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Cc: Alessio Balsini Cc: Dietmar Eggemann Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Quentin Perret Cc: Rafael J . Wysocki Cc: Steve Muckle Cc: Suren Baghdasaryan Cc: Tejun Heo Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Cc: Viresh Kumar Link: https://lkml.kernel.org/r/20190621084217.8167-9-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ecc304ab906f..6cd5133f0c2a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1062,6 +1062,27 @@ static int uclamp_validate(struct task_struct *p, static void __setscheduler_uclamp(struct task_struct *p, const struct sched_attr *attr) { + unsigned int clamp_id; + + /* + * On scheduling class change, reset to default clamps for tasks + * without a task-specific value. + */ + for_each_clamp_id(clamp_id) { + struct uclamp_se *uc_se = &p->uclamp_req[clamp_id]; + unsigned int clamp_value = uclamp_none(clamp_id); + + /* Keep using defined clamps across class changes */ + if (uc_se->user_defined) + continue; + + /* By default, RT tasks always get 100% boost */ + if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN)) + clamp_value = uclamp_none(UCLAMP_MAX); + + uclamp_se_set(uc_se, clamp_value, false); + } + if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP))) return; @@ -1087,8 +1108,13 @@ static void uclamp_fork(struct task_struct *p) return; for_each_clamp_id(clamp_id) { - uclamp_se_set(&p->uclamp_req[clamp_id], - uclamp_none(clamp_id), false); + unsigned int clamp_value = uclamp_none(clamp_id); + + /* By default, RT tasks always get 100% boost */ + if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN)) + clamp_value = uclamp_none(UCLAMP_MAX); + + uclamp_se_set(&p->uclamp_req[clamp_id], clamp_value, false); } } -- cgit v1.2.3 From 982d9cdc22c9f6df5ad790caa229ff74fb1d95e7 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Fri, 21 Jun 2019 09:42:10 +0100 Subject: sched/cpufreq, sched/uclamp: Add clamps for FAIR and RT tasks Each time a frequency update is required via schedutil, a frequency is selected to (possibly) satisfy the utilization reported by each scheduling class and irqs. However, when utilization clamping is in use, the frequency selection should consider userspace utilization clamping hints. This will allow, for example, to: - boost tasks which are directly affecting the user experience by running them at least at a minimum "requested" frequency - cap low priority tasks not directly affecting the user experience by running them only up to a maximum "allowed" frequency These constraints are meant to support a per-task based tuning of the frequency selection thus supporting a fine grained definition of performance boosting vs energy saving strategies in kernel space. Add support to clamp the utilization of RUNNABLE FAIR and RT tasks within the boundaries defined by their aggregated utilization clamp constraints. Do that by considering the max(min_util, max_util) to give boosted tasks the performance they need even when they happen to be co-scheduled with other capped tasks. Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Cc: Alessio Balsini Cc: Dietmar Eggemann Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Quentin Perret Cc: Rafael J . Wysocki Cc: Steve Muckle Cc: Suren Baghdasaryan Cc: Tejun Heo Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Cc: Viresh Kumar Link: https://lkml.kernel.org/r/20190621084217.8167-10-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/cpufreq_schedutil.c | 15 ++++++++++++--- kernel/sched/fair.c | 4 ++++ kernel/sched/rt.c | 4 ++++ kernel/sched/sched.h | 23 +++++++++++++++++++++++ 4 files changed, 43 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 7c4ce69067c4..d84e036a7536 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -202,8 +202,10 @@ unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs, unsigned long dl_util, util, irq; struct rq *rq = cpu_rq(cpu); - if (type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) + if (!IS_BUILTIN(CONFIG_UCLAMP_TASK) && + type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) { return max; + } /* * Early check to see if IRQ/steal time saturates the CPU, can be @@ -219,9 +221,16 @@ unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs, * CFS tasks and we use the same metric to track the effective * utilization (PELT windows are synchronized) we can directly add them * to obtain the CPU's actual utilization. + * + * CFS and RT utilization can be boosted or capped, depending on + * utilization clamp constraints requested by currently RUNNABLE + * tasks. + * When there are no CFS RUNNABLE tasks, clamps are released and + * frequency will be gracefully reduced with the utilization decay. */ - util = util_cfs; - util += cpu_util_rt(rq); + util = util_cfs + cpu_util_rt(rq); + if (type == FREQUENCY_UTIL) + util = uclamp_util(rq, util); dl_util = cpu_util_dl(rq); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3bdcd3c718bc..28db7ce5c3a6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -10393,6 +10393,10 @@ const struct sched_class fair_sched_class = { #ifdef CONFIG_FAIR_GROUP_SCHED .task_change_group = task_change_group_fair, #endif + +#ifdef CONFIG_UCLAMP_TASK + .uclamp_enabled = 1, +#endif }; #ifdef CONFIG_SCHED_DEBUG diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 63ad7c90822c..a532558a5176 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2400,6 +2400,10 @@ const struct sched_class rt_sched_class = { .switched_to = switched_to_rt, .update_curr = update_curr_rt, + +#ifdef CONFIG_UCLAMP_TASK + .uclamp_enabled = 1, +#endif }; #ifdef CONFIG_RT_GROUP_SCHED diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 0d2ba8bb2cb3..9b0c77a99346 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2265,6 +2265,29 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} #endif /* CONFIG_CPU_FREQ */ +#ifdef CONFIG_UCLAMP_TASK +static inline unsigned int uclamp_util(struct rq *rq, unsigned int util) +{ + unsigned int min_util = READ_ONCE(rq->uclamp[UCLAMP_MIN].value); + unsigned int max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value); + + /* + * Since CPU's {min,max}_util clamps are MAX aggregated considering + * RUNNABLE tasks with _different_ clamps, we can end up with an + * inversion. Fix it now when the clamps are applied. + */ + if (unlikely(min_util >= max_util)) + return min_util; + + return clamp(util, min_util, max_util); +} +#else /* CONFIG_UCLAMP_TASK */ +static inline unsigned int uclamp_util(struct rq *rq, unsigned int util) +{ + return util; +} +#endif /* CONFIG_UCLAMP_TASK */ + #ifdef arch_scale_freq_capacity # ifndef arch_scale_freq_invariant # define arch_scale_freq_invariant() true -- cgit v1.2.3 From 9d20ad7dfc9a5cc64e33d725902d3863d350a66a Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Fri, 21 Jun 2019 09:42:11 +0100 Subject: sched/uclamp: Add uclamp_util_with() So far uclamp_util() allows to clamp a specified utilization considering the clamp values requested by RUNNABLE tasks in a CPU. For the Energy Aware Scheduler (EAS) it is interesting to test how clamp values will change when a task is becoming RUNNABLE on a given CPU. For example, EAS is interested in comparing the energy impact of different scheduling decisions and the clamp values can play a role on that. Add uclamp_util_with() which allows to clamp a given utilization by considering the possible impact on CPU clamp values of a specified task. Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Cc: Alessio Balsini Cc: Dietmar Eggemann Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Quentin Perret Cc: Rafael J . Wysocki Cc: Steve Muckle Cc: Suren Baghdasaryan Cc: Tejun Heo Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Cc: Viresh Kumar Link: https://lkml.kernel.org/r/20190621084217.8167-11-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 13 +++++++++++++ kernel/sched/sched.h | 21 ++++++++++++++++++++- 2 files changed, 33 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6cd5133f0c2a..fa43ce3962e7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -880,6 +880,19 @@ uclamp_eff_get(struct task_struct *p, unsigned int clamp_id) return uc_req; } +unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id) +{ + struct uclamp_se uc_eff; + + /* Task currently refcounted: use back-annotated (effective) value */ + if (p->uclamp[clamp_id].active) + return p->uclamp[clamp_id].value; + + uc_eff = uclamp_eff_get(p, clamp_id); + + return uc_eff.value; +} + /* * When a task is enqueued on a rq, the clamp bucket currently defined by the * task's uclamp::bucket_id is refcounted on that rq. This also immediately diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 9b0c77a99346..1783f6b4c2e0 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2266,11 +2266,20 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} #endif /* CONFIG_CPU_FREQ */ #ifdef CONFIG_UCLAMP_TASK -static inline unsigned int uclamp_util(struct rq *rq, unsigned int util) +unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id); + +static __always_inline +unsigned int uclamp_util_with(struct rq *rq, unsigned int util, + struct task_struct *p) { unsigned int min_util = READ_ONCE(rq->uclamp[UCLAMP_MIN].value); unsigned int max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value); + if (p) { + min_util = max(min_util, uclamp_eff_value(p, UCLAMP_MIN)); + max_util = max(max_util, uclamp_eff_value(p, UCLAMP_MAX)); + } + /* * Since CPU's {min,max}_util clamps are MAX aggregated considering * RUNNABLE tasks with _different_ clamps, we can end up with an @@ -2281,7 +2290,17 @@ static inline unsigned int uclamp_util(struct rq *rq, unsigned int util) return clamp(util, min_util, max_util); } + +static inline unsigned int uclamp_util(struct rq *rq, unsigned int util) +{ + return uclamp_util_with(rq, util, NULL); +} #else /* CONFIG_UCLAMP_TASK */ +static inline unsigned int uclamp_util_with(struct rq *rq, unsigned int util, + struct task_struct *p) +{ + return util; +} static inline unsigned int uclamp_util(struct rq *rq, unsigned int util) { return util; -- cgit v1.2.3 From af24bde8df2029f067dc46aff0393c8f18ff6e2f Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Fri, 21 Jun 2019 09:42:12 +0100 Subject: sched/uclamp: Add uclamp support to energy_compute() The Energy Aware Scheduler (EAS) estimates the energy impact of waking up a task on a given CPU. This estimation is based on: a) an (active) power consumption defined for each CPU frequency b) an estimation of which frequency will be used on each CPU c) an estimation of the busy time (utilization) of each CPU Utilization clamping can affect both b) and c). A CPU is expected to run: - on an higher than required frequency, but for a shorter time, in case its estimated utilization will be smaller than the minimum utilization enforced by uclamp - on a smaller than required frequency, but for a longer time, in case its estimated utilization is bigger than the maximum utilization enforced by uclamp While compute_energy() already accounts clamping effects on busy time, the clamping effects on frequency selection are currently ignored. Fix it by considering how CPU clamp values will be affected by a task waking up and being RUNNABLE on that CPU. Do that by refactoring schedutil_freq_util() to take an additional task_struct* which allows EAS to evaluate the impact on clamp values of a task being eventually queued in a CPU. Clamp values are applied to the RT+CFS utilization only when a FREQUENCY_UTIL is required by compute_energy(). Do note that switching from ENERGY_UTIL to FREQUENCY_UTIL in the computation of the cpu_util signal implies that we are more likely to estimate the highest OPP when a RT task is running in another CPU of the same performance domain. This can have an impact on energy estimation but: - it's not easy to say which approach is better, since it depends on the use case - the original approach could still be obtained by setting a smaller task-specific util_min whenever required Since we are at that: - rename schedutil_freq_util() into schedutil_cpu_util(), since it's not only used for frequency selection. Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Cc: Alessio Balsini Cc: Dietmar Eggemann Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Quentin Perret Cc: Rafael J . Wysocki Cc: Steve Muckle Cc: Suren Baghdasaryan Cc: Tejun Heo Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Cc: Viresh Kumar Link: https://lkml.kernel.org/r/20190621084217.8167-12-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/cpufreq_schedutil.c | 9 +++++---- kernel/sched/fair.c | 40 ++++++++++++++++++++++++++++++++++------ kernel/sched/sched.h | 21 +++++++++------------ 3 files changed, 48 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index d84e036a7536..636ca6f88c8e 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -196,8 +196,9 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, * based on the task model parameters and gives the minimal utilization * required to meet deadlines. */ -unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs, - unsigned long max, enum schedutil_type type) +unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, + unsigned long max, enum schedutil_type type, + struct task_struct *p) { unsigned long dl_util, util, irq; struct rq *rq = cpu_rq(cpu); @@ -230,7 +231,7 @@ unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs, */ util = util_cfs + cpu_util_rt(rq); if (type == FREQUENCY_UTIL) - util = uclamp_util(rq, util); + util = uclamp_util_with(rq, util, p); dl_util = cpu_util_dl(rq); @@ -290,7 +291,7 @@ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) sg_cpu->max = max; sg_cpu->bw_dl = cpu_bw_dl(rq); - return schedutil_freq_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL); + return schedutil_cpu_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL, NULL); } /** diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 28db7ce5c3a6..b798fe7ff7cd 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6231,11 +6231,21 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu) static long compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) { - long util, max_util, sum_util, energy = 0; + unsigned int max_util, util_cfs, cpu_util, cpu_cap; + unsigned long sum_util, energy = 0; + struct task_struct *tsk; int cpu; for (; pd; pd = pd->next) { + struct cpumask *pd_mask = perf_domain_span(pd); + + /* + * The energy model mandates all the CPUs of a performance + * domain have the same capacity. + */ + cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask)); max_util = sum_util = 0; + /* * The capacity state of CPUs of the current rd can be driven by * CPUs of another rd if they belong to the same performance @@ -6246,11 +6256,29 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) * it will not appear in its pd list and will not be accounted * by compute_energy(). */ - for_each_cpu_and(cpu, perf_domain_span(pd), cpu_online_mask) { - util = cpu_util_next(cpu, p, dst_cpu); - util = schedutil_energy_util(cpu, util); - max_util = max(util, max_util); - sum_util += util; + for_each_cpu_and(cpu, pd_mask, cpu_online_mask) { + util_cfs = cpu_util_next(cpu, p, dst_cpu); + + /* + * Busy time computation: utilization clamping is not + * required since the ratio (sum_util / cpu_capacity) + * is already enough to scale the EM reported power + * consumption at the (eventually clamped) cpu_capacity. + */ + sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap, + ENERGY_UTIL, NULL); + + /* + * Performance domain frequency: utilization clamping + * must be considered since it affects the selection + * of the performance domain frequency. + * NOTE: in case RT tasks are running, by default the + * FREQUENCY_UTIL's utilization can be max OPP. + */ + tsk = cpu == dst_cpu ? p : NULL; + cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap, + FREQUENCY_UTIL, tsk); + max_util = max(max_util, cpu_util); } energy += em_pd_energy(pd->em_pd, max_util, sum_util); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 1783f6b4c2e0..802b1f3405f2 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2322,7 +2322,6 @@ static inline unsigned long capacity_orig_of(int cpu) } #endif -#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL /** * enum schedutil_type - CPU utilization type * @FREQUENCY_UTIL: Utilization used to select frequency @@ -2338,15 +2337,11 @@ enum schedutil_type { ENERGY_UTIL, }; -unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs, - unsigned long max, enum schedutil_type type); +#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL -static inline unsigned long schedutil_energy_util(int cpu, unsigned long cfs) -{ - unsigned long max = arch_scale_cpu_capacity(cpu); - - return schedutil_freq_util(cpu, cfs, max, ENERGY_UTIL); -} +unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, + unsigned long max, enum schedutil_type type, + struct task_struct *p); static inline unsigned long cpu_bw_dl(struct rq *rq) { @@ -2375,11 +2370,13 @@ static inline unsigned long cpu_util_rt(struct rq *rq) return READ_ONCE(rq->avg_rt.util_avg); } #else /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ -static inline unsigned long schedutil_energy_util(int cpu, unsigned long cfs) +static inline unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, + unsigned long max, enum schedutil_type type, + struct task_struct *p) { - return cfs; + return 0; } -#endif +#endif /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ #ifdef CONFIG_HAVE_SCHED_AVG_IRQ static inline unsigned long cpu_util_irq(struct rq *rq) -- cgit v1.2.3 From fd7d55172d1e2e501e6da0a5c1de25f06612dc2e Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Sat, 1 Jun 2019 01:27:22 -0700 Subject: perf/cgroups: Don't rotate events for cgroups unnecessarily Currently perf_rotate_context assumes that if the context's nr_events != nr_active a rotation is necessary for perf event multiplexing. With cgroups, nr_events is the total count of events for all cgroups and nr_active will not include events in a cgroup other than the current task's. This makes rotation appear necessary for cgroups when it is not. Add a perf_event_context flag that is set when rotation is necessary. Clear the flag during sched_out and set it when a flexible sched_in fails due to resources. Signed-off-by: Ian Rogers Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: https://lkml.kernel.org/r/20190601082722.44543-1-irogers@google.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 42 ++++++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 118ad1aef6af..23efe6792abc 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2952,6 +2952,12 @@ static void ctx_sched_out(struct perf_event_context *ctx, if (!ctx->nr_active || !(is_active & EVENT_ALL)) return; + /* + * If we had been multiplexing, no rotations are necessary, now no events + * are active. + */ + ctx->rotate_necessary = 0; + perf_pmu_disable(ctx->pmu); if (is_active & EVENT_PINNED) { list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list) @@ -3319,10 +3325,13 @@ static int flexible_sched_in(struct perf_event *event, void *data) return 0; if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) { - if (!group_sched_in(event, sid->cpuctx, sid->ctx)) - list_add_tail(&event->active_list, &sid->ctx->flexible_active); - else + int ret = group_sched_in(event, sid->cpuctx, sid->ctx); + if (ret) { sid->can_add_hw = 0; + sid->ctx->rotate_necessary = 1; + return 0; + } + list_add_tail(&event->active_list, &sid->ctx->flexible_active); } return 0; @@ -3690,24 +3699,17 @@ ctx_first_active(struct perf_event_context *ctx) static bool perf_rotate_context(struct perf_cpu_context *cpuctx) { struct perf_event *cpu_event = NULL, *task_event = NULL; - bool cpu_rotate = false, task_rotate = false; - struct perf_event_context *ctx = NULL; + struct perf_event_context *task_ctx = NULL; + int cpu_rotate, task_rotate; /* * Since we run this from IRQ context, nobody can install new * events, thus the event count values are stable. */ - if (cpuctx->ctx.nr_events) { - if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) - cpu_rotate = true; - } - - ctx = cpuctx->task_ctx; - if (ctx && ctx->nr_events) { - if (ctx->nr_events != ctx->nr_active) - task_rotate = true; - } + cpu_rotate = cpuctx->ctx.rotate_necessary; + task_ctx = cpuctx->task_ctx; + task_rotate = task_ctx ? task_ctx->rotate_necessary : 0; if (!(cpu_rotate || task_rotate)) return false; @@ -3716,7 +3718,7 @@ static bool perf_rotate_context(struct perf_cpu_context *cpuctx) perf_pmu_disable(cpuctx->ctx.pmu); if (task_rotate) - task_event = ctx_first_active(ctx); + task_event = ctx_first_active(task_ctx); if (cpu_rotate) cpu_event = ctx_first_active(&cpuctx->ctx); @@ -3724,17 +3726,17 @@ static bool perf_rotate_context(struct perf_cpu_context *cpuctx) * As per the order given at ctx_resched() first 'pop' task flexible * and then, if needed CPU flexible. */ - if (task_event || (ctx && cpu_event)) - ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE); + if (task_event || (task_ctx && cpu_event)) + ctx_sched_out(task_ctx, cpuctx, EVENT_FLEXIBLE); if (cpu_event) cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); if (task_event) - rotate_ctx(ctx, task_event); + rotate_ctx(task_ctx, task_event); if (cpu_event) rotate_ctx(&cpuctx->ctx, cpu_event); - perf_event_sched_in(cpuctx, ctx, current); + perf_event_sched_in(cpuctx, task_ctx, current); perf_pmu_enable(cpuctx->ctx.pmu); perf_ctx_unlock(cpuctx, cpuctx->task_ctx); -- cgit v1.2.3 From 4a54d16f615f41489b2ecbc940f6eb2618ddafd6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 29 Apr 2019 09:16:42 -0500 Subject: dma-mapping: truncate dma masks to what dma_addr_t can hold The dma masks in struct device are always 64-bits wide. But for builds using a 32-bit dma_addr_t we need to ensure we don't store an unsupportable value. Before Linux 5.0 this was handled at least by the ARM dma mapping code by never allowing to set a larger dma_mask, but these days we allow the driver to just set the largest supported value and never fall back to a smaller one. Ensure this always works by truncating the value. Fixes: 9eb9e96e97b3 ("Documentation/DMA-API-HOWTO: update dma_mask sections") Signed-off-by: Christoph Hellwig --- kernel/dma/mapping.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index f7afdadb6770..1f628e7ac709 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -317,6 +317,12 @@ void arch_dma_set_mask(struct device *dev, u64 mask); int dma_set_mask(struct device *dev, u64 mask) { + /* + * Truncate the mask to the actually supported dma_addr_t width to + * avoid generating unsupportable addresses. + */ + mask = (dma_addr_t)mask; + if (!dev->dma_mask || !dma_supported(dev, mask)) return -EIO; @@ -330,6 +336,12 @@ EXPORT_SYMBOL(dma_set_mask); #ifndef CONFIG_ARCH_HAS_DMA_SET_COHERENT_MASK int dma_set_coherent_mask(struct device *dev, u64 mask) { + /* + * Truncate the mask to the actually supported dma_addr_t width to + * avoid generating unsupportable addresses. + */ + mask = (dma_addr_t)mask; + if (!dma_supported(dev, mask)) return -EIO; -- cgit v1.2.3 From 4b85faed211ccfbcc7f3adf1cd62f0b00d1a172b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 14 Jun 2019 16:06:10 +0200 Subject: dma-mapping: add a dma_alloc_need_uncached helper Check if we need to allocate uncached memory for a device given the allocation flags. Switch over the uncached segment check to this helper to deal with architectures that do not support the dma_cache_sync operation and thus should not returned cacheable memory for DMA_ATTR_NON_CONSISTENT allocations. Signed-off-by: Christoph Hellwig --- kernel/dma/direct.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index b67f0aa08aa3..c2893713bf80 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -160,7 +160,7 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, memset(ret, 0, size); if (IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) && - !dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_NON_CONSISTENT)) { + dma_alloc_need_uncached(dev, attrs)) { arch_dma_prep_coherent(page, size); ret = uncached_kernel_address(ret); } @@ -182,7 +182,7 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, set_memory_encrypted((unsigned long)cpu_addr, 1 << page_order); if (IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) && - !dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_NON_CONSISTENT)) + dma_alloc_need_uncached(dev, attrs)) cpu_addr = cached_kernel_address(cpu_addr); __dma_direct_free_pages(dev, size, virt_to_page(cpu_addr)); } -- cgit v1.2.3 From 886532aee3cd42d95196601ed16d7c3d4679e9e5 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 17 Jun 2019 14:47:05 +0200 Subject: locking/lockdep: Move mark_lock() inside CONFIG_TRACE_IRQFLAGS && CONFIG_PROVE_LOCKING The last cleanup patch triggered another issue, as now another function should be moved into the same section: kernel/locking/lockdep.c:3580:12: error: 'mark_lock' defined but not used [-Werror=unused-function] static int mark_lock(struct task_struct *curr, struct held_lock *this, Move mark_lock() into the same #ifdef section as its only caller, and remove the now-unused mark_lock_irq() stub helper. Signed-off-by: Arnd Bergmann Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Bart Van Assche Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Waiman Long Cc: Will Deacon Cc: Yuyang Du Fixes: 0d2cc3b34532 ("locking/lockdep: Move valid_state() inside CONFIG_TRACE_IRQFLAGS && CONFIG_PROVE_LOCKING") Link: https://lkml.kernel.org/r/20190617124718.1232976-1-arnd@arndb.de Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 73 ++++++++++++++++++++++-------------------------- 1 file changed, 34 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 5e368f485330..341f52117f88 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -437,13 +437,6 @@ static int verbose(struct lock_class *class) return 0; } -/* - * Stack-trace: tightly packed array of stack backtrace - * addresses. Protected by the graph_lock. - */ -unsigned long nr_stack_trace_entries; -static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES]; - static void print_lockdep_off(const char *bug_msg) { printk(KERN_DEBUG "%s\n", bug_msg); @@ -453,6 +446,15 @@ static void print_lockdep_off(const char *bug_msg) #endif } +unsigned long nr_stack_trace_entries; + +#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) +/* + * Stack-trace: tightly packed array of stack backtrace + * addresses. Protected by the graph_lock. + */ +static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES]; + static int save_trace(struct lock_trace *trace) { unsigned long *entries = stack_trace + nr_stack_trace_entries; @@ -475,6 +477,7 @@ static int save_trace(struct lock_trace *trace) return 1; } +#endif unsigned int nr_hardirq_chains; unsigned int nr_softirq_chains; @@ -488,6 +491,7 @@ unsigned int max_lockdep_depth; DEFINE_PER_CPU(struct lockdep_stats, lockdep_stats); #endif +#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) /* * Locking printouts: */ @@ -505,6 +509,7 @@ static const char *usage_str[] = #undef LOCKDEP_STATE [LOCK_USED] = "INITIAL USE", }; +#endif const char * __get_key_name(struct lockdep_subclass_key *key, char *str) { @@ -2964,12 +2969,10 @@ static void check_chain_key(struct task_struct *curr) #endif } +#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) static int mark_lock(struct task_struct *curr, struct held_lock *this, enum lock_usage_bit new_bit); -#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) - - static void print_usage_bug_scenario(struct held_lock *lock) { struct lock_class *class = hlock_class(lock); @@ -3545,35 +3548,6 @@ static int separate_irq_context(struct task_struct *curr, return 0; } -#else /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */ - -static inline -int mark_lock_irq(struct task_struct *curr, struct held_lock *this, - enum lock_usage_bit new_bit) -{ - WARN_ON(1); /* Impossible innit? when we don't have TRACE_IRQFLAG */ - return 1; -} - -static inline int -mark_usage(struct task_struct *curr, struct held_lock *hlock, int check) -{ - return 1; -} - -static inline unsigned int task_irq_context(struct task_struct *task) -{ - return 0; -} - -static inline int separate_irq_context(struct task_struct *curr, - struct held_lock *hlock) -{ - return 0; -} - -#endif /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */ - /* * Mark a lock with a usage bit, and validate the state transition: */ @@ -3634,6 +3608,27 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, return ret; } +#else /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */ + +static inline int +mark_usage(struct task_struct *curr, struct held_lock *hlock, int check) +{ + return 1; +} + +static inline unsigned int task_irq_context(struct task_struct *task) +{ + return 0; +} + +static inline int separate_irq_context(struct task_struct *curr, + struct held_lock *hlock) +{ + return 0; +} + +#endif /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */ + /* * Initialize a lock instance's lock-class mapping info: */ -- cgit v1.2.3 From 9156e545765e467e6268c4814cfa609ebb16237e Mon Sep 17 00:00:00 2001 From: Kobe Wu Date: Mon, 24 Jun 2019 16:35:48 +0800 Subject: locking/lockdep: increase size of counters for lockdep statistics When system has been running for a long time, signed integer counters are not enough for some lockdep statistics. Using unsigned long counters can satisfy the requirement. Besides, most of lockdep statistics are unsigned. It is better to use unsigned int instead of int. Remove unused variables. - max_recursion_depth - nr_cyclic_check_recursions - nr_find_usage_forwards_recursions - nr_find_usage_backwards_recursions Signed-off-by: Kobe Wu Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Cc: Eason Lin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Link: https://lkml.kernel.org/r/1561365348-16050-1-git-send-email-kobe-cp.wu@mediatek.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep_internals.h | 36 ++++++++++++++++-------------------- 1 file changed, 16 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index 150ec3f0c5b5..cc83568d5012 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h @@ -131,7 +131,6 @@ extern unsigned int nr_hardirq_chains; extern unsigned int nr_softirq_chains; extern unsigned int nr_process_chains; extern unsigned int max_lockdep_depth; -extern unsigned int max_recursion_depth; extern unsigned int max_bfs_queue_depth; @@ -160,25 +159,22 @@ lockdep_count_backward_deps(struct lock_class *class) * and we want to avoid too much cache bouncing. */ struct lockdep_stats { - int chain_lookup_hits; - int chain_lookup_misses; - int hardirqs_on_events; - int hardirqs_off_events; - int redundant_hardirqs_on; - int redundant_hardirqs_off; - int softirqs_on_events; - int softirqs_off_events; - int redundant_softirqs_on; - int redundant_softirqs_off; - int nr_unused_locks; - int nr_redundant_checks; - int nr_redundant; - int nr_cyclic_checks; - int nr_cyclic_check_recursions; - int nr_find_usage_forwards_checks; - int nr_find_usage_forwards_recursions; - int nr_find_usage_backwards_checks; - int nr_find_usage_backwards_recursions; + unsigned long chain_lookup_hits; + unsigned int chain_lookup_misses; + unsigned long hardirqs_on_events; + unsigned long hardirqs_off_events; + unsigned long redundant_hardirqs_on; + unsigned long redundant_hardirqs_off; + unsigned long softirqs_on_events; + unsigned long softirqs_off_events; + unsigned long redundant_softirqs_on; + unsigned long redundant_softirqs_off; + int nr_unused_locks; + unsigned int nr_redundant_checks; + unsigned int nr_redundant; + unsigned int nr_cyclic_checks; + unsigned int nr_find_usage_forwards_checks; + unsigned int nr_find_usage_backwards_checks; /* * Per lock class locking operation stat counts -- cgit v1.2.3 From e7d4798960b3ebcd243ae6a59e04d4fe6518c96c Mon Sep 17 00:00:00 2001 From: Toshiaki Makita Date: Thu, 13 Jun 2019 18:39:58 +0900 Subject: xdp: Add tracepoint for bulk XDP_TX This is introduced for admins to check what is happening on XDP_TX when bulk XDP_TX is in use, which will be first introduced in veth in next commit. v3: - Add act field to be in line with other XDP tracepoints. Signed-off-by: Toshiaki Makita Acked-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann --- kernel/bpf/core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index ad3be85f1411..561ed07d3007 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2101,3 +2101,4 @@ EXPORT_SYMBOL(bpf_stats_enabled_key); #include EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception); +EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_bulk_tx); -- cgit v1.2.3 From c2f2124e0d447ad02a41a92361b3734366797680 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 14 Jun 2019 15:59:14 +0200 Subject: dma-direct: handle DMA_ATTR_NON_CONSISTENT in common code Only call into arch_dma_alloc if we require an uncached mapping, and remove the parisc code manually doing normal cached DMA_ATTR_NON_CONSISTENT allocations. Signed-off-by: Christoph Hellwig Acked-by: Helge Deller # parisc --- kernel/dma/direct.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index c2893713bf80..fc354f4f490b 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -191,7 +191,7 @@ void *dma_direct_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) { if (!IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) && - !dev_is_dma_coherent(dev)) + dma_alloc_need_uncached(dev, attrs)) return arch_dma_alloc(dev, size, dma_handle, gfp, attrs); return dma_direct_alloc_pages(dev, size, dma_handle, gfp, attrs); } @@ -200,7 +200,7 @@ void dma_direct_free(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs) { if (!IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) && - !dev_is_dma_coherent(dev)) + dma_alloc_need_uncached(dev, attrs)) arch_dma_free(dev, size, cpu_addr, dma_addr, attrs); else dma_direct_free_pages(dev, size, cpu_addr, dma_addr, attrs); -- cgit v1.2.3 From d98849aff87911013aadb730138ab728b52fc547 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 14 Jun 2019 16:17:27 +0200 Subject: dma-direct: handle DMA_ATTR_NO_KERNEL_MAPPING in common code DMA_ATTR_NO_KERNEL_MAPPING is generally implemented by allocating normal cacheable pages or CMA memory, and then returning the page pointer as the opaque handle. Lift that code from the xtensa and generic dma remapping implementations into the generic dma-direct code so that we don't even call arch_dma_alloc for these allocations. Signed-off-by: Christoph Hellwig --- kernel/dma/direct.c | 14 ++++++++++++++ kernel/dma/remap.c | 13 ++----------- 2 files changed, 16 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index fc354f4f490b..b90e1aede743 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -138,6 +138,14 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, if (!page) return NULL; + if (attrs & DMA_ATTR_NO_KERNEL_MAPPING) { + /* remove any dirty cache lines on the kernel alias */ + if (!PageHighMem(page)) + arch_dma_prep_coherent(page, size); + /* return the page pointer as the opaque cookie */ + return page; + } + if (PageHighMem(page)) { /* * Depending on the cma= arguments and per-arch setup @@ -178,6 +186,12 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, { unsigned int page_order = get_order(size); + if (attrs & DMA_ATTR_NO_KERNEL_MAPPING) { + /* cpu_addr is a struct page cookie, not a kernel address */ + __dma_direct_free_pages(dev, size, cpu_addr); + return; + } + if (force_dma_unencrypted()) set_memory_encrypted((unsigned long)cpu_addr, 1 << page_order); diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index 0207e3764d52..a594aec07882 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -202,8 +202,7 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, size = PAGE_ALIGN(size); - if (!gfpflags_allow_blocking(flags) && - !(attrs & DMA_ATTR_NO_KERNEL_MAPPING)) { + if (!gfpflags_allow_blocking(flags)) { ret = dma_alloc_from_pool(size, &page, flags); if (!ret) return NULL; @@ -217,11 +216,6 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, /* remove any dirty cache lines on the kernel alias */ arch_dma_prep_coherent(page, size); - if (attrs & DMA_ATTR_NO_KERNEL_MAPPING) { - ret = page; /* opaque cookie */ - goto done; - } - /* create a coherent mapping */ ret = dma_common_contiguous_remap(page, size, VM_USERMAP, arch_dma_mmap_pgprot(dev, PAGE_KERNEL, attrs), @@ -240,10 +234,7 @@ done: void arch_dma_free(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_handle, unsigned long attrs) { - if (attrs & DMA_ATTR_NO_KERNEL_MAPPING) { - /* vaddr is a struct page cookie, not a kernel address */ - __dma_direct_free_pages(dev, size, vaddr); - } else if (!dma_free_from_pool(vaddr, PAGE_ALIGN(size))) { + if (!dma_free_from_pool(vaddr, PAGE_ALIGN(size))) { phys_addr_t phys = dma_to_phys(dev, dma_handle); struct page *page = pfn_to_page(__phys_to_pfn(phys)); -- cgit v1.2.3 From 9db1ff0a415c7de8eb67df5b2c56ac409ccefc37 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 25 Jun 2019 17:35:03 -0700 Subject: bpf: fix compiler warning with CONFIG_MODULES=n MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With CONFIG_MODULES=n, the following compiler warning occurs: /data/users/yhs/work/net-next/kernel/trace/bpf_trace.c:605:13: warning: ‘do_bpf_send_signal’ defined but not used [-Wunused-function] static void do_bpf_send_signal(struct irq_work *entry) The __init function send_signal_irq_work_init(), which calls do_bpf_send_signal(), is defined under CONFIG_MODULES. Hence, when CONFIG_MODULES=n, nobody calls static function do_bpf_send_signal(), hence the warning. The init function send_signal_irq_work_init() should work without CONFIG_MODULES. Moving it out of CONFIG_MODULES code section fixed the compiler warning, and also make bpf_send_signal() helper work without CONFIG_MODULES. Fixes: 8b401f9ed244 ("bpf: implement bpf_send_signal() helper") Reported-By: Arnd Bergmann Signed-off-by: Yonghong Song Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- kernel/trace/bpf_trace.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index c102c240bb0b..ca1255d14576 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1431,6 +1431,20 @@ int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, return err; } +static int __init send_signal_irq_work_init(void) +{ + int cpu; + struct send_signal_irq_work *work; + + for_each_possible_cpu(cpu) { + work = per_cpu_ptr(&send_signal_work, cpu); + init_irq_work(&work->irq_work, do_bpf_send_signal); + } + return 0; +} + +subsys_initcall(send_signal_irq_work_init); + #ifdef CONFIG_MODULES static int bpf_event_notify(struct notifier_block *nb, unsigned long op, void *module) @@ -1478,18 +1492,5 @@ static int __init bpf_event_init(void) return 0; } -static int __init send_signal_irq_work_init(void) -{ - int cpu; - struct send_signal_irq_work *work; - - for_each_possible_cpu(cpu) { - work = per_cpu_ptr(&send_signal_work, cpu); - init_irq_work(&work->irq_work, do_bpf_send_signal); - } - return 0; -} - fs_initcall(bpf_event_init); -subsys_initcall(send_signal_irq_work_init); #endif /* CONFIG_MODULES */ -- cgit v1.2.3 From 75672dda27bd00109a84cd975c17949ad9c45663 Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Tue, 25 Jun 2019 17:41:50 +0100 Subject: bpf: fix BPF_ALU32 | BPF_ARSH on BE arches Yauheni reported the following code do not work correctly on BE arches: ALU_ARSH_X: DST = (u64) (u32) ((*(s32 *) &DST) >> SRC); CONT; ALU_ARSH_K: DST = (u64) (u32) ((*(s32 *) &DST) >> IMM); CONT; and are causing failure of test_verifier test 'arsh32 on imm 2' on BE arches. The code is taking address and interpreting memory directly, so is not endianness neutral. We should instead perform standard C type casting on the variable. A u64 to s32 conversion will drop the high 32-bit and reserve the low 32-bit as signed integer, this is all we want. Fixes: 2dc6b100f928 ("bpf: interpreter support BPF_ALU | BPF_ARSH") Reported-by: Yauheni Kaliuta Reviewed-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Jiong Wang Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- kernel/bpf/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 080e2bb644cc..f2148db91439 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1364,10 +1364,10 @@ select_insn: insn++; CONT; ALU_ARSH_X: - DST = (u64) (u32) ((*(s32 *) &DST) >> SRC); + DST = (u64) (u32) (((s32) DST) >> SRC); CONT; ALU_ARSH_K: - DST = (u64) (u32) ((*(s32 *) &DST) >> IMM); + DST = (u64) (u32) (((s32) DST) >> IMM); CONT; ALU64_ARSH_X: (*(s64 *) &DST) >>= SRC; -- cgit v1.2.3 From 93651f80dcb616b8c9115cdafc8e57a781af22d0 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Tue, 25 Jun 2019 17:40:28 +0800 Subject: modules: fix compile error if don't have strict module rwx If CONFIG_ARCH_HAS_STRICT_MODULE_RWX is not defined, we need stub for module_enable_nx() and module_enable_x(). If CONFIG_ARCH_HAS_STRICT_MODULE_RWX is defined, but CONFIG_STRICT_MODULE_RWX is disabled, we need stub for module_enable_nx. Move frob_text() outside of the CONFIG_STRICT_MODULE_RWX, because it is needed anyway. Fixes: 2eef1399a866 ("modules: fix BUG when load module with rodata=n") Signed-off-by: Yang Yingliang Signed-off-by: Jessica Yu --- kernel/module.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 537c456ce3ee..b51838325d08 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1898,7 +1898,7 @@ static void mod_sysfs_teardown(struct module *mod) mod_sysfs_fini(mod); } -#ifdef CONFIG_STRICT_MODULE_RWX +#ifdef CONFIG_ARCH_HAS_STRICT_MODULE_RWX /* * LKM RO/NX protection: protect module's text/ro-data * from modification and any data from execution. @@ -1921,6 +1921,7 @@ static void frob_text(const struct module_layout *layout, layout->text_size >> PAGE_SHIFT); } +#ifdef CONFIG_STRICT_MODULE_RWX static void frob_rodata(const struct module_layout *layout, int (*set_memory)(unsigned long start, int num_pages)) { @@ -2033,15 +2034,19 @@ void set_all_modules_text_ro(void) } mutex_unlock(&module_mutex); } -#else +#else /* !CONFIG_STRICT_MODULE_RWX */ static void module_enable_nx(const struct module *mod) { } -#endif - +#endif /* CONFIG_STRICT_MODULE_RWX */ static void module_enable_x(const struct module *mod) { frob_text(&mod->core_layout, set_memory_x); frob_text(&mod->init_layout, set_memory_x); } +#else /* !CONFIG_ARCH_HAS_STRICT_MODULE_RWX */ +static void module_enable_nx(const struct module *mod) { } +static void module_enable_x(const struct module *mod) { } +#endif /* CONFIG_ARCH_HAS_STRICT_MODULE_RWX */ + #ifdef CONFIG_LIVEPATCH /* -- cgit v1.2.3 From b206f281d0ee14969878469816a69db22d5838e8 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 26 Jun 2019 21:02:32 +0100 Subject: keys: Namespace keyring names Keyring names are held in a single global list that any process can pick from by means of keyctl_join_session_keyring (provided the keyring grants Search permission). This isn't very container friendly, however. Make the following changes: (1) Make default session, process and thread keyring names begin with a '.' instead of '_'. (2) Keyrings whose names begin with a '.' aren't added to the list. Such keyrings are system specials. (3) Replace the global list with per-user_namespace lists. A keyring adds its name to the list for the user_namespace that it is currently in. (4) When a user_namespace is deleted, it just removes itself from the keyring name list. The global keyring_name_lock is retained for accessing the name lists. This allows (4) to work. This can be tested by: # keyctl newring foo @s 995906392 # unshare -U $ keyctl show ... 995906392 --alswrv 65534 65534 \_ keyring: foo ... $ keyctl session foo Joined session keyring: 935622349 As can be seen, a new session keyring was created. The capability bit KEYCTL_CAPS1_NS_KEYRING_NAME is set if the kernel is employing this feature. Signed-off-by: David Howells cc: Eric W. Biederman --- kernel/user.c | 3 +++ kernel/user_namespace.c | 7 ++++--- 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/user.c b/kernel/user.c index 88b834f0eebc..50979fd1b7aa 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -62,6 +62,9 @@ struct user_namespace init_user_ns = { .ns.ops = &userns_operations, #endif .flags = USERNS_INIT_FLAGS, +#ifdef CONFIG_KEYS + .keyring_name_list = LIST_HEAD_INIT(init_user_ns.keyring_name_list), +#endif #ifdef CONFIG_PERSISTENT_KEYRINGS .persistent_keyring_register_sem = __RWSEM_INITIALIZER(init_user_ns.persistent_keyring_register_sem), diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 923414a246e9..bda6e890ad88 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -133,6 +133,9 @@ int create_user_ns(struct cred *new) ns->flags = parent_ns->flags; mutex_unlock(&userns_state_mutex); +#ifdef CONFIG_KEYS + INIT_LIST_HEAD(&ns->keyring_name_list); +#endif #ifdef CONFIG_PERSISTENT_KEYRINGS init_rwsem(&ns->persistent_keyring_register_sem); #endif @@ -196,9 +199,7 @@ static void free_user_ns(struct work_struct *work) kfree(ns->projid_map.reverse); } retire_userns_sysctls(ns); -#ifdef CONFIG_PERSISTENT_KEYRINGS - key_put(ns->persistent_keyring_register); -#endif + key_free_user_ns(ns); ns_free_inum(&ns->ns); kmem_cache_free(user_ns_cachep, ns); dec_user_namespaces(ucounts); -- cgit v1.2.3 From 0f44e4d976f96c6439da0d6717238efa4b91196e Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 26 Jun 2019 21:02:32 +0100 Subject: keys: Move the user and user-session keyrings to the user_namespace Move the user and user-session keyrings to the user_namespace struct rather than pinning them from the user_struct struct. This prevents these keyrings from propagating across user-namespaces boundaries with regard to the KEY_SPEC_* flags, thereby making them more useful in a containerised environment. The issue is that a single user_struct may be represent UIDs in several different namespaces. The way the patch does this is by attaching a 'register keyring' in each user_namespace and then sticking the user and user-session keyrings into that. It can then be searched to retrieve them. Signed-off-by: David Howells cc: Jann Horn --- kernel/user.c | 7 +------ kernel/user_namespace.c | 4 +--- 2 files changed, 2 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/user.c b/kernel/user.c index 50979fd1b7aa..f8519b62cf9a 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -64,10 +64,7 @@ struct user_namespace init_user_ns = { .flags = USERNS_INIT_FLAGS, #ifdef CONFIG_KEYS .keyring_name_list = LIST_HEAD_INIT(init_user_ns.keyring_name_list), -#endif -#ifdef CONFIG_PERSISTENT_KEYRINGS - .persistent_keyring_register_sem = - __RWSEM_INITIALIZER(init_user_ns.persistent_keyring_register_sem), + .keyring_sem = __RWSEM_INITIALIZER(init_user_ns.keyring_sem), #endif }; EXPORT_SYMBOL_GPL(init_user_ns); @@ -143,8 +140,6 @@ static void free_user(struct user_struct *up, unsigned long flags) { uid_hash_remove(up); spin_unlock_irqrestore(&uidhash_lock, flags); - key_put(up->uid_keyring); - key_put(up->session_keyring); kmem_cache_free(uid_cachep, up); } diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index bda6e890ad88..c87c2ecc7085 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -135,9 +135,7 @@ int create_user_ns(struct cred *new) #ifdef CONFIG_KEYS INIT_LIST_HEAD(&ns->keyring_name_list); -#endif -#ifdef CONFIG_PERSISTENT_KEYRINGS - init_rwsem(&ns->persistent_keyring_register_sem); + init_rwsem(&ns->keyring_sem); #endif ret = -ENOMEM; if (!setup_userns_sysctls(ns)) -- cgit v1.2.3 From 02e5ad973883c36c0868b301b8357d9c455bb91c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 26 Jun 2019 20:43:53 -0400 Subject: perf_event_get(): don't bother with fget_raw() ... since we immediately follow that with check that it *is* an opened perf file, with O_PATH ones ending with with the same -EBADF we'd get for descriptor that isn't opened at all. Signed-off-by: Al Viro --- kernel/events/core.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index abbd4b3b96c2..f9ff04c8d084 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -11554,9 +11554,7 @@ void perf_event_delayed_put(struct task_struct *task) struct file *perf_event_get(unsigned int fd) { - struct file *file; - - file = fget_raw(fd); + struct file *file = fget(fd); if (!file) return ERR_PTR(-EBADF); -- cgit v1.2.3 From e5c891a349d7c556b7b9dc231d6dd78e88a29e5c Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 25 Jun 2019 14:38:58 -0700 Subject: bpf: fix cgroup bpf release synchronization Since commit 4bfc0bb2c60e ("bpf: decouple the lifetime of cgroup_bpf from cgroup itself"), cgroup_bpf release occurs asynchronously (from a worker context), and before the release of the cgroup itself. This introduced a previously non-existing race between the release and update paths. E.g. if a leaf's cgroup_bpf is released and a new bpf program is attached to the one of ancestor cgroups at the same time. The race may result in double-free and other memory corruptions. To fix the problem, let's protect the body of cgroup_bpf_release() with cgroup_mutex, as it was effectively previously, when all this code was called from the cgroup release path with cgroup mutex held. Also let's skip cgroups, which have no chances to invoke a bpf program, on the update path. If the cgroup bpf refcnt reached 0, it means that the cgroup is offline (no attached processes), and there are no associated sockets left. It means there is no point in updating effective progs array! And it can lead to a leak, if it happens after the release. So, let's skip such cgroups. Big thanks for Tejun Heo for discovering and debugging of this problem! Fixes: 4bfc0bb2c60e ("bpf: decouple the lifetime of cgroup_bpf from cgroup itself") Reported-by: Tejun Heo Signed-off-by: Roman Gushchin Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/cgroup.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index c225c42e114a..077ed3a19848 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -16,6 +16,8 @@ #include #include +#include "../cgroup/cgroup-internal.h" + DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key); EXPORT_SYMBOL(cgroup_bpf_enabled_key); @@ -38,6 +40,8 @@ static void cgroup_bpf_release(struct work_struct *work) struct bpf_prog_array *old_array; unsigned int type; + mutex_lock(&cgroup_mutex); + for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) { struct list_head *progs = &cgrp->bpf.progs[type]; struct bpf_prog_list *pl, *tmp; @@ -54,10 +58,12 @@ static void cgroup_bpf_release(struct work_struct *work) } old_array = rcu_dereference_protected( cgrp->bpf.effective[type], - percpu_ref_is_dying(&cgrp->bpf.refcnt)); + lockdep_is_held(&cgroup_mutex)); bpf_prog_array_free(old_array); } + mutex_unlock(&cgroup_mutex); + percpu_ref_exit(&cgrp->bpf.refcnt); cgroup_put(cgrp); } @@ -229,6 +235,9 @@ static int update_effective_progs(struct cgroup *cgrp, css_for_each_descendant_pre(css, &cgrp->self) { struct cgroup *desc = container_of(css, struct cgroup, self); + if (percpu_ref_is_zero(&desc->bpf.refcnt)) + continue; + err = compute_effective_progs(desc, type, &desc->bpf.inactive); if (err) goto cleanup; @@ -238,6 +247,14 @@ static int update_effective_progs(struct cgroup *cgrp, css_for_each_descendant_pre(css, &cgrp->self) { struct cgroup *desc = container_of(css, struct cgroup, self); + if (percpu_ref_is_zero(&desc->bpf.refcnt)) { + if (unlikely(desc->bpf.inactive)) { + bpf_prog_array_free(desc->bpf.inactive); + desc->bpf.inactive = NULL; + } + continue; + } + activate_effective_progs(desc, type, desc->bpf.inactive); desc->bpf.inactive = NULL; } -- cgit v1.2.3 From 2c9858ecbeb1e68224290043445990e29337d4c0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 26 Jun 2019 16:52:37 +0200 Subject: workqueue: Make alloc/apply/free_workqueue_attrs() static None of those functions have any users outside of workqueue.c. Confine them. Signed-off-by: Thomas Gleixner Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Tejun Heo --- kernel/workqueue.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 95aea04ff722..b8fa7afe6e7d 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3329,7 +3329,7 @@ EXPORT_SYMBOL_GPL(execute_in_process_context); * * Undo alloc_workqueue_attrs(). */ -void free_workqueue_attrs(struct workqueue_attrs *attrs) +static void free_workqueue_attrs(struct workqueue_attrs *attrs) { if (attrs) { free_cpumask_var(attrs->cpumask); @@ -3346,7 +3346,7 @@ void free_workqueue_attrs(struct workqueue_attrs *attrs) * * Return: The allocated new workqueue_attr on success. %NULL on failure. */ -struct workqueue_attrs *alloc_workqueue_attrs(gfp_t gfp_mask) +static struct workqueue_attrs *alloc_workqueue_attrs(gfp_t gfp_mask) { struct workqueue_attrs *attrs; @@ -4033,7 +4033,7 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq, * * Return: 0 on success and -errno on failure. */ -int apply_workqueue_attrs(struct workqueue_struct *wq, +static int apply_workqueue_attrs(struct workqueue_struct *wq, const struct workqueue_attrs *attrs) { int ret; @@ -4044,7 +4044,6 @@ int apply_workqueue_attrs(struct workqueue_struct *wq, return ret; } -EXPORT_SYMBOL_GPL(apply_workqueue_attrs); /** * wq_update_unbound_numa - update NUMA affinity of a wq for CPU hot[un]plug -- cgit v1.2.3 From be69d00d9769575e35d83367f465a58dbf82748c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 26 Jun 2019 16:52:38 +0200 Subject: workqueue: Remove GPF argument from alloc_workqueue_attrs() All callers use GFP_KERNEL. No point in having that argument. Signed-off-by: Thomas Gleixner Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Tejun Heo --- kernel/workqueue.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index b8fa7afe6e7d..601d61150b65 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3339,21 +3339,20 @@ static void free_workqueue_attrs(struct workqueue_attrs *attrs) /** * alloc_workqueue_attrs - allocate a workqueue_attrs - * @gfp_mask: allocation mask to use * * Allocate a new workqueue_attrs, initialize with default settings and * return it. * * Return: The allocated new workqueue_attr on success. %NULL on failure. */ -static struct workqueue_attrs *alloc_workqueue_attrs(gfp_t gfp_mask) +static struct workqueue_attrs *alloc_workqueue_attrs(void) { struct workqueue_attrs *attrs; - attrs = kzalloc(sizeof(*attrs), gfp_mask); + attrs = kzalloc(sizeof(*attrs), GFP_KERNEL); if (!attrs) goto fail; - if (!alloc_cpumask_var(&attrs->cpumask, gfp_mask)) + if (!alloc_cpumask_var(&attrs->cpumask, GFP_KERNEL)) goto fail; cpumask_copy(attrs->cpumask, cpu_possible_mask); @@ -3431,7 +3430,7 @@ static int init_worker_pool(struct worker_pool *pool) pool->refcnt = 1; /* shouldn't fail above this point */ - pool->attrs = alloc_workqueue_attrs(GFP_KERNEL); + pool->attrs = alloc_workqueue_attrs(); if (!pool->attrs) return -ENOMEM; return 0; @@ -3896,8 +3895,8 @@ apply_wqattrs_prepare(struct workqueue_struct *wq, ctx = kzalloc(struct_size(ctx, pwq_tbl, nr_node_ids), GFP_KERNEL); - new_attrs = alloc_workqueue_attrs(GFP_KERNEL); - tmp_attrs = alloc_workqueue_attrs(GFP_KERNEL); + new_attrs = alloc_workqueue_attrs(); + tmp_attrs = alloc_workqueue_attrs(); if (!ctx || !new_attrs || !tmp_attrs) goto out_free; @@ -4241,7 +4240,7 @@ struct workqueue_struct *alloc_workqueue(const char *fmt, return NULL; if (flags & WQ_UNBOUND) { - wq->unbound_attrs = alloc_workqueue_attrs(GFP_KERNEL); + wq->unbound_attrs = alloc_workqueue_attrs(); if (!wq->unbound_attrs) goto err_free_wq; } @@ -5394,7 +5393,7 @@ static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq) lockdep_assert_held(&wq_pool_mutex); - attrs = alloc_workqueue_attrs(GFP_KERNEL); + attrs = alloc_workqueue_attrs(); if (!attrs) return NULL; @@ -5816,7 +5815,7 @@ static void __init wq_numa_init(void) return; } - wq_update_unbound_numa_attrs_buf = alloc_workqueue_attrs(GFP_KERNEL); + wq_update_unbound_numa_attrs_buf = alloc_workqueue_attrs(); BUG_ON(!wq_update_unbound_numa_attrs_buf); /* @@ -5891,7 +5890,7 @@ int __init workqueue_init_early(void) for (i = 0; i < NR_STD_WORKER_POOLS; i++) { struct workqueue_attrs *attrs; - BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL))); + BUG_ON(!(attrs = alloc_workqueue_attrs())); attrs->nice = std_nice[i]; unbound_std_wq_attrs[i] = attrs; @@ -5900,7 +5899,7 @@ int __init workqueue_init_early(void) * guaranteed by max_active which is enforced by pwqs. * Turn off NUMA so that dfl_pwq is used for all nodes. */ - BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL))); + BUG_ON(!(attrs = alloc_workqueue_attrs())); attrs->nice = std_nice[i]; attrs->no_numa = true; ordered_wq_attrs[i] = attrs; -- cgit v1.2.3 From 516337048fa40496ae5ca9863c367ec991a44d9a Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 24 Jun 2019 07:33:26 -0300 Subject: hrtimer: Use a bullet for the returns bullet list That gets rid of this warning: ./kernel/time/hrtimer.c:1119: WARNING: Block quote ends without a blank line; unexpected unindent. and displays nicely both at the source code and at the produced documentation. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Thomas Gleixner Cc: Linux Doc Mailing List Cc: Mauro Carvalho Chehab Cc: Jonathan Corbet Link: https://lkml.kernel.org/r/74ddad7dac331b4e5ce4a90e15c8a49e3a16d2ac.1561372382.git.mchehab+samsung@kernel.org --- kernel/time/hrtimer.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index edb230aba3d1..5ee77f1a8a92 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1114,9 +1114,10 @@ EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); * @timer: hrtimer to stop * * Returns: - * 0 when the timer was not active - * 1 when the timer was active - * -1 when the timer is currently executing the callback function and + * + * * 0 when the timer was not active + * * 1 when the timer was active + * * -1 when the timer is currently executing the callback function and * cannot be stopped */ int hrtimer_try_to_cancel(struct hrtimer *timer) -- cgit v1.2.3 From 0d01da6afc5402f60325c5da31b22f7d56689b49 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Thu, 27 Jun 2019 13:38:47 -0700 Subject: bpf: implement getsockopt and setsockopt hooks Implement new BPF_PROG_TYPE_CGROUP_SOCKOPT program type and BPF_CGROUP_{G,S}ETSOCKOPT cgroup hooks. BPF_CGROUP_SETSOCKOPT can modify user setsockopt arguments before passing them down to the kernel or bypass kernel completely. BPF_CGROUP_GETSOCKOPT can can inspect/modify getsockopt arguments that kernel returns. Both hooks reuse existing PTR_TO_PACKET{,_END} infrastructure. The buffer memory is pre-allocated (because I don't think there is a precedent for working with __user memory from bpf). This might be slow to do for each {s,g}etsockopt call, that's why I've added __cgroup_bpf_prog_array_is_empty that exits early if there is nothing attached to a cgroup. Note, however, that there is a race between __cgroup_bpf_prog_array_is_empty and BPF_PROG_RUN_ARRAY where cgroup program layout might have changed; this should not be a problem because in general there is a race between multiple calls to {s,g}etsocktop and user adding/removing bpf progs from a cgroup. The return code of the BPF program is handled as follows: * 0: EPERM * 1: success, continue with next BPF program in the cgroup chain v9: * allow overwriting setsockopt arguments (Alexei Starovoitov): * use set_fs (same as kernel_setsockopt) * buffer is always kzalloc'd (no small on-stack buffer) v8: * use s32 for optlen (Andrii Nakryiko) v7: * return only 0 or 1 (Alexei Starovoitov) * always run all progs (Alexei Starovoitov) * use optval=0 as kernel bypass in setsockopt (Alexei Starovoitov) (decided to use optval=-1 instead, optval=0 might be a valid input) * call getsockopt hook after kernel handlers (Alexei Starovoitov) v6: * rework cgroup chaining; stop as soon as bpf program returns 0 or 2; see patch with the documentation for the details * drop Andrii's and Martin's Acked-by (not sure they are comfortable with the new state of things) v5: * skip copy_to_user() and put_user() when ret == 0 (Martin Lau) v4: * don't export bpf_sk_fullsock helper (Martin Lau) * size != sizeof(__u64) for uapi pointers (Martin Lau) * offsetof instead of bpf_ctx_range when checking ctx access (Martin Lau) v3: * typos in BPF_PROG_CGROUP_SOCKOPT_RUN_ARRAY comments (Andrii Nakryiko) * reverse christmas tree in BPF_PROG_CGROUP_SOCKOPT_RUN_ARRAY (Andrii Nakryiko) * use __bpf_md_ptr instead of __u32 for optval{,_end} (Martin Lau) * use BPF_FIELD_SIZEOF() for consistency (Martin Lau) * new CG_SOCKOPT_ACCESS macro to wrap repeated parts v2: * moved bpf_sockopt_kern fields around to remove a hole (Martin Lau) * aligned bpf_sockopt_kern->buf to 8 bytes (Martin Lau) * bpf_prog_array_is_empty instead of bpf_prog_array_length (Martin Lau) * added [0,2] return code check to verifier (Martin Lau) * dropped unused buf[64] from the stack (Martin Lau) * use PTR_TO_SOCKET for bpf_sockopt->sk (Martin Lau) * dropped bpf_target_off from ctx rewrites (Martin Lau) * use return code for kernel bypass (Martin Lau & Andrii Nakryiko) Cc: Andrii Nakryiko Cc: Martin Lau Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov --- kernel/bpf/cgroup.c | 333 ++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/core.c | 9 ++ kernel/bpf/syscall.c | 19 +++ kernel/bpf/verifier.c | 8 ++ 4 files changed, 369 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 077ed3a19848..76fa0076f20d 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -15,6 +15,7 @@ #include #include #include +#include #include "../cgroup/cgroup-internal.h" @@ -938,6 +939,188 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sysctl); +static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp, + enum bpf_attach_type attach_type) +{ + struct bpf_prog_array *prog_array; + bool empty; + + rcu_read_lock(); + prog_array = rcu_dereference(cgrp->bpf.effective[attach_type]); + empty = bpf_prog_array_is_empty(prog_array); + rcu_read_unlock(); + + return empty; +} + +static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen) +{ + if (unlikely(max_optlen > PAGE_SIZE) || max_optlen < 0) + return -EINVAL; + + ctx->optval = kzalloc(max_optlen, GFP_USER); + if (!ctx->optval) + return -ENOMEM; + + ctx->optval_end = ctx->optval + max_optlen; + ctx->optlen = max_optlen; + + return 0; +} + +static void sockopt_free_buf(struct bpf_sockopt_kern *ctx) +{ + kfree(ctx->optval); +} + +int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, + int *optname, char __user *optval, + int *optlen, char **kernel_optval) +{ + struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + struct bpf_sockopt_kern ctx = { + .sk = sk, + .level = *level, + .optname = *optname, + }; + int ret; + + /* Opportunistic check to see whether we have any BPF program + * attached to the hook so we don't waste time allocating + * memory and locking the socket. + */ + if (!cgroup_bpf_enabled || + __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_SETSOCKOPT)) + return 0; + + ret = sockopt_alloc_buf(&ctx, *optlen); + if (ret) + return ret; + + if (copy_from_user(ctx.optval, optval, *optlen) != 0) { + ret = -EFAULT; + goto out; + } + + lock_sock(sk); + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_SETSOCKOPT], + &ctx, BPF_PROG_RUN); + release_sock(sk); + + if (!ret) { + ret = -EPERM; + goto out; + } + + if (ctx.optlen == -1) { + /* optlen set to -1, bypass kernel */ + ret = 1; + } else if (ctx.optlen > *optlen || ctx.optlen < -1) { + /* optlen is out of bounds */ + ret = -EFAULT; + } else { + /* optlen within bounds, run kernel handler */ + ret = 0; + + /* export any potential modifications */ + *level = ctx.level; + *optname = ctx.optname; + *optlen = ctx.optlen; + *kernel_optval = ctx.optval; + } + +out: + if (ret) + sockopt_free_buf(&ctx); + return ret; +} +EXPORT_SYMBOL(__cgroup_bpf_run_filter_setsockopt); + +int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, + int optname, char __user *optval, + int __user *optlen, int max_optlen, + int retval) +{ + struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + struct bpf_sockopt_kern ctx = { + .sk = sk, + .level = level, + .optname = optname, + .retval = retval, + }; + int ret; + + /* Opportunistic check to see whether we have any BPF program + * attached to the hook so we don't waste time allocating + * memory and locking the socket. + */ + if (!cgroup_bpf_enabled || + __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_GETSOCKOPT)) + return retval; + + ret = sockopt_alloc_buf(&ctx, max_optlen); + if (ret) + return ret; + + if (!retval) { + /* If kernel getsockopt finished successfully, + * copy whatever was returned to the user back + * into our temporary buffer. Set optlen to the + * one that kernel returned as well to let + * BPF programs inspect the value. + */ + + if (get_user(ctx.optlen, optlen)) { + ret = -EFAULT; + goto out; + } + + if (ctx.optlen > max_optlen) + ctx.optlen = max_optlen; + + if (copy_from_user(ctx.optval, optval, ctx.optlen) != 0) { + ret = -EFAULT; + goto out; + } + } + + lock_sock(sk); + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT], + &ctx, BPF_PROG_RUN); + release_sock(sk); + + if (!ret) { + ret = -EPERM; + goto out; + } + + if (ctx.optlen > max_optlen) { + ret = -EFAULT; + goto out; + } + + /* BPF programs only allowed to set retval to 0, not some + * arbitrary value. + */ + if (ctx.retval != 0 && ctx.retval != retval) { + ret = -EFAULT; + goto out; + } + + if (copy_to_user(optval, ctx.optval, ctx.optlen) || + put_user(ctx.optlen, optlen)) { + ret = -EFAULT; + goto out; + } + + ret = ctx.retval; + +out: + sockopt_free_buf(&ctx); + return ret; +} +EXPORT_SYMBOL(__cgroup_bpf_run_filter_getsockopt); + static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp, size_t *lenp) { @@ -1198,3 +1381,153 @@ const struct bpf_verifier_ops cg_sysctl_verifier_ops = { const struct bpf_prog_ops cg_sysctl_prog_ops = { }; + +static const struct bpf_func_proto * +cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_sk_storage_get: + return &bpf_sk_storage_get_proto; + case BPF_FUNC_sk_storage_delete: + return &bpf_sk_storage_delete_proto; +#ifdef CONFIG_INET + case BPF_FUNC_tcp_sock: + return &bpf_tcp_sock_proto; +#endif + default: + return cgroup_base_func_proto(func_id, prog); + } +} + +static bool cg_sockopt_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + const int size_default = sizeof(__u32); + + if (off < 0 || off >= sizeof(struct bpf_sockopt)) + return false; + + if (off % size != 0) + return false; + + if (type == BPF_WRITE) { + switch (off) { + case offsetof(struct bpf_sockopt, retval): + if (size != size_default) + return false; + return prog->expected_attach_type == + BPF_CGROUP_GETSOCKOPT; + case offsetof(struct bpf_sockopt, optname): + /* fallthrough */ + case offsetof(struct bpf_sockopt, level): + if (size != size_default) + return false; + return prog->expected_attach_type == + BPF_CGROUP_SETSOCKOPT; + case offsetof(struct bpf_sockopt, optlen): + return size == size_default; + default: + return false; + } + } + + switch (off) { + case offsetof(struct bpf_sockopt, sk): + if (size != sizeof(__u64)) + return false; + info->reg_type = PTR_TO_SOCKET; + break; + case offsetof(struct bpf_sockopt, optval): + if (size != sizeof(__u64)) + return false; + info->reg_type = PTR_TO_PACKET; + break; + case offsetof(struct bpf_sockopt, optval_end): + if (size != sizeof(__u64)) + return false; + info->reg_type = PTR_TO_PACKET_END; + break; + case offsetof(struct bpf_sockopt, retval): + if (size != size_default) + return false; + return prog->expected_attach_type == BPF_CGROUP_GETSOCKOPT; + default: + if (size != size_default) + return false; + break; + } + return true; +} + +#define CG_SOCKOPT_ACCESS_FIELD(T, F) \ + T(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F), \ + si->dst_reg, si->src_reg, \ + offsetof(struct bpf_sockopt_kern, F)) + +static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, + u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + + switch (si->off) { + case offsetof(struct bpf_sockopt, sk): + *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, sk); + break; + case offsetof(struct bpf_sockopt, level): + if (type == BPF_WRITE) + *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, level); + else + *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, level); + break; + case offsetof(struct bpf_sockopt, optname): + if (type == BPF_WRITE) + *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optname); + else + *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optname); + break; + case offsetof(struct bpf_sockopt, optlen): + if (type == BPF_WRITE) + *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optlen); + else + *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optlen); + break; + case offsetof(struct bpf_sockopt, retval): + if (type == BPF_WRITE) + *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, retval); + else + *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, retval); + break; + case offsetof(struct bpf_sockopt, optval): + *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval); + break; + case offsetof(struct bpf_sockopt, optval_end): + *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval_end); + break; + } + + return insn - insn_buf; +} + +static int cg_sockopt_get_prologue(struct bpf_insn *insn_buf, + bool direct_write, + const struct bpf_prog *prog) +{ + /* Nothing to do for sockopt argument. The data is kzalloc'ated. + */ + return 0; +} + +const struct bpf_verifier_ops cg_sockopt_verifier_ops = { + .get_func_proto = cg_sockopt_func_proto, + .is_valid_access = cg_sockopt_is_valid_access, + .convert_ctx_access = cg_sockopt_convert_ctx_access, + .gen_prologue = cg_sockopt_get_prologue, +}; + +const struct bpf_prog_ops cg_sockopt_prog_ops = { +}; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 561ed07d3007..e2c1b43728da 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1809,6 +1809,15 @@ int bpf_prog_array_length(struct bpf_prog_array *array) return cnt; } +bool bpf_prog_array_is_empty(struct bpf_prog_array *array) +{ + struct bpf_prog_array_item *item; + + for (item = array->items; item->prog; item++) + if (item->prog != &dummy_bpf_prog.prog) + return false; + return true; +} static bool bpf_prog_array_copy_core(struct bpf_prog_array *array, u32 *prog_ids, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 7713cf39795a..b0f545e07425 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1590,6 +1590,14 @@ bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type, default: return -EINVAL; } + case BPF_PROG_TYPE_CGROUP_SOCKOPT: + switch (expected_attach_type) { + case BPF_CGROUP_SETSOCKOPT: + case BPF_CGROUP_GETSOCKOPT: + return 0; + default: + return -EINVAL; + } default: return 0; } @@ -1840,6 +1848,7 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, switch (prog->type) { case BPF_PROG_TYPE_CGROUP_SOCK: case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: + case BPF_PROG_TYPE_CGROUP_SOCKOPT: return attach_type == prog->expected_attach_type ? 0 : -EINVAL; case BPF_PROG_TYPE_CGROUP_SKB: return prog->enforce_expected_attach_type && @@ -1912,6 +1921,10 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_CGROUP_SYSCTL: ptype = BPF_PROG_TYPE_CGROUP_SYSCTL; break; + case BPF_CGROUP_GETSOCKOPT: + case BPF_CGROUP_SETSOCKOPT: + ptype = BPF_PROG_TYPE_CGROUP_SOCKOPT; + break; default: return -EINVAL; } @@ -1995,6 +2008,10 @@ static int bpf_prog_detach(const union bpf_attr *attr) case BPF_CGROUP_SYSCTL: ptype = BPF_PROG_TYPE_CGROUP_SYSCTL; break; + case BPF_CGROUP_GETSOCKOPT: + case BPF_CGROUP_SETSOCKOPT: + ptype = BPF_PROG_TYPE_CGROUP_SOCKOPT; + break; default: return -EINVAL; } @@ -2031,6 +2048,8 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_SOCK_OPS: case BPF_CGROUP_DEVICE: case BPF_CGROUP_SYSCTL: + case BPF_CGROUP_GETSOCKOPT: + case BPF_CGROUP_SETSOCKOPT: break; case BPF_LIRC_MODE2: return lirc_prog_query(attr, uattr); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 0e079b2298f8..6b5623d320f9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2215,6 +2215,13 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, env->seen_direct_write = true; return true; + + case BPF_PROG_TYPE_CGROUP_SOCKOPT: + if (t == BPF_WRITE) + env->seen_direct_write = true; + + return true; + default: return false; } @@ -6066,6 +6073,7 @@ static int check_return_code(struct bpf_verifier_env *env) case BPF_PROG_TYPE_SOCK_OPS: case BPF_PROG_TYPE_CGROUP_DEVICE: case BPF_PROG_TYPE_CGROUP_SYSCTL: + case BPF_PROG_TYPE_CGROUP_SOCKOPT: break; default: return 0; -- cgit v1.2.3 From 2f02a7ecd512288c40bd72bdd4d87ab4f01c1615 Mon Sep 17 00:00:00 2001 From: Fuqian Huang Date: Fri, 28 Jun 2019 10:50:35 +0800 Subject: kernel: power: swap: use kzalloc() instead of kmalloc() followed by memset() Use zeroing allocator instead of using allocator followed with memset with 0 Signed-off-by: Fuqian Huang Acked-by: Pavel Machek [ rjw: Subject ] Signed-off-by: Rafael J. Wysocki --- kernel/power/swap.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/swap.c b/kernel/power/swap.c index e1912ad13bdc..ca0fcb5ced71 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -974,12 +974,11 @@ static int get_swap_reader(struct swap_map_handle *handle, last = handle->maps = NULL; offset = swsusp_header->image; while (offset) { - tmp = kmalloc(sizeof(*handle->maps), GFP_KERNEL); + tmp = kzalloc(sizeof(*handle->maps), GFP_KERNEL); if (!tmp) { release_swap_reader(handle); return -ENOMEM; } - memset(tmp, 0, sizeof(*tmp)); if (!handle->maps) handle->maps = tmp; if (last) -- cgit v1.2.3 From b53b0b9d9a613c418057f6cb921c2f40a6f78c24 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Tue, 30 Apr 2019 12:21:53 -0400 Subject: pidfd: add polling support This patch adds polling support to pidfd. Android low memory killer (LMK) needs to know when a process dies once it is sent the kill signal. It does so by checking for the existence of /proc/pid which is both racy and slow. For example, if a PID is reused between when LMK sends a kill signal and checks for existence of the PID, since the wrong PID is now possibly checked for existence. Using the polling support, LMK will be able to get notified when a process exists in race-free and fast way, and allows the LMK to do other things (such as by polling on other fds) while awaiting the process being killed to die. For notification to polling processes, we follow the same existing mechanism in the kernel used when the parent of the task group is to be notified of a child's death (do_notify_parent). This is precisely when the tasks waiting on a poll of pidfd are also awakened in this patch. We have decided to include the waitqueue in struct pid for the following reasons: 1. The wait queue has to survive for the lifetime of the poll. Including it in task_struct would not be option in this case because the task can be reaped and destroyed before the poll returns. 2. By including the struct pid for the waitqueue means that during de_thread(), the new thread group leader automatically gets the new waitqueue/pid even though its task_struct is different. Appropriate test cases are added in the second patch to provide coverage of all the cases the patch is handling. Cc: Andy Lutomirski Cc: Steven Rostedt Cc: Daniel Colascione Cc: Jann Horn Cc: Tim Murray Cc: Jonathan Kowalski Cc: Linus Torvalds Cc: Al Viro Cc: Kees Cook Cc: David Howells Cc: Oleg Nesterov Cc: kernel-team@android.com Reviewed-by: Oleg Nesterov Co-developed-by: Daniel Colascione Signed-off-by: Daniel Colascione Signed-off-by: Joel Fernandes (Google) Signed-off-by: Christian Brauner --- kernel/fork.c | 26 ++++++++++++++++++++++++++ kernel/pid.c | 2 ++ kernel/signal.c | 11 +++++++++++ 3 files changed, 39 insertions(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index b4cba953040a..2cdf295b72c7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1704,8 +1704,34 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) } #endif +/* + * Poll support for process exit notification. + */ +static unsigned int pidfd_poll(struct file *file, struct poll_table_struct *pts) +{ + struct task_struct *task; + struct pid *pid = file->private_data; + int poll_flags = 0; + + poll_wait(file, &pid->wait_pidfd, pts); + + rcu_read_lock(); + task = pid_task(pid, PIDTYPE_PID); + /* + * Inform pollers only when the whole thread group exits. + * If the thread group leader exits before all other threads in the + * group, then poll(2) should block, similar to the wait(2) family. + */ + if (!task || (task->exit_state && thread_group_empty(task))) + poll_flags = POLLIN | POLLRDNORM; + rcu_read_unlock(); + + return poll_flags; +} + const struct file_operations pidfd_fops = { .release = pidfd_release, + .poll = pidfd_poll, #ifdef CONFIG_PROC_FS .show_fdinfo = pidfd_show_fdinfo, #endif diff --git a/kernel/pid.c b/kernel/pid.c index 89548d35eefb..6ce3a95968f7 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -213,6 +213,8 @@ struct pid *alloc_pid(struct pid_namespace *ns) for (type = 0; type < PIDTYPE_MAX; ++type) INIT_HLIST_HEAD(&pid->tasks[type]); + init_waitqueue_head(&pid->wait_pidfd); + upid = pid->numbers + ns->level; spin_lock_irq(&pidmap_lock); if (!(ns->pid_allocated & PIDNS_ADDING)) diff --git a/kernel/signal.c b/kernel/signal.c index a1eb44dc9ff5..1c86b78a7597 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1803,6 +1803,14 @@ ret: return ret; } +static void do_notify_pidfd(struct task_struct *task) +{ + struct pid *pid; + + pid = task_pid(task); + wake_up_all(&pid->wait_pidfd); +} + /* * Let a parent know about the death of a child. * For a stopped/continued status change, use do_notify_parent_cldstop instead. @@ -1826,6 +1834,9 @@ bool do_notify_parent(struct task_struct *tsk, int sig) BUG_ON(!tsk->ptrace && (tsk->group_leader != tsk || !thread_group_empty(tsk))); + /* Wake up all pidfd waiters */ + do_notify_pidfd(tsk); + if (sig != SIGCHLD) { /* * This is only possible if parent == real_parent. -- cgit v1.2.3 From 32fcb426ec001cb6d5a4a195091a8486ea77e2df Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 24 May 2019 12:43:51 +0200 Subject: pid: add pidfd_open() This adds the pidfd_open() syscall. It allows a caller to retrieve pollable pidfds for a process which did not get created via CLONE_PIDFD, i.e. for a process that is created via traditional fork()/clone() calls that is only referenced by a PID: int pidfd = pidfd_open(1234, 0); ret = pidfd_send_signal(pidfd, SIGSTOP, NULL, 0); With the introduction of pidfds through CLONE_PIDFD it is possible to created pidfds at process creation time. However, a lot of processes get created with traditional PID-based calls such as fork() or clone() (without CLONE_PIDFD). For these processes a caller can currently not create a pollable pidfd. This is a problem for Android's low memory killer (LMK) and service managers such as systemd. Both are examples of tools that want to make use of pidfds to get reliable notification of process exit for non-parents (pidfd polling) and race-free signal sending (pidfd_send_signal()). They intend to switch to this API for process supervision/management as soon as possible. Having no way to get pollable pidfds from PID-only processes is one of the biggest blockers for them in adopting this api. With pidfd_open() making it possible to retrieve pidfds for PID-based processes we enable them to adopt this api. In line with Arnd's recent changes to consolidate syscall numbers across architectures, I have added the pidfd_open() syscall to all architectures at the same time. Signed-off-by: Christian Brauner Reviewed-by: David Howells Reviewed-by: Oleg Nesterov Acked-by: Arnd Bergmann Cc: "Eric W. Biederman" Cc: Kees Cook Cc: Joel Fernandes (Google) Cc: Thomas Gleixner Cc: Jann Horn Cc: Andy Lutomirsky Cc: Andrew Morton Cc: Aleksa Sarai Cc: Linus Torvalds Cc: Al Viro Cc: linux-api@vger.kernel.org --- kernel/pid.c | 69 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index 6ce3a95968f7..8e6f50053364 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -37,6 +37,8 @@ #include #include #include +#include +#include #include #include @@ -452,6 +454,73 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns) return idr_get_next(&ns->idr, &nr); } +/** + * pidfd_create() - Create a new pid file descriptor. + * + * @pid: struct pid that the pidfd will reference + * + * This creates a new pid file descriptor with the O_CLOEXEC flag set. + * + * Note, that this function can only be called after the fd table has + * been unshared to avoid leaking the pidfd to the new process. + * + * Return: On success, a cloexec pidfd is returned. + * On error, a negative errno number will be returned. + */ +static int pidfd_create(struct pid *pid) +{ + int fd; + + fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid), + O_RDWR | O_CLOEXEC); + if (fd < 0) + put_pid(pid); + + return fd; +} + +/** + * pidfd_open() - Open new pid file descriptor. + * + * @pid: pid for which to retrieve a pidfd + * @flags: flags to pass + * + * This creates a new pid file descriptor with the O_CLOEXEC flag set for + * the process identified by @pid. Currently, the process identified by + * @pid must be a thread-group leader. This restriction currently exists + * for all aspects of pidfds including pidfd creation (CLONE_PIDFD cannot + * be used with CLONE_THREAD) and pidfd polling (only supports thread group + * leaders). + * + * Return: On success, a cloexec pidfd is returned. + * On error, a negative errno number will be returned. + */ +SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags) +{ + int fd, ret; + struct pid *p; + + if (flags) + return -EINVAL; + + if (pid <= 0) + return -EINVAL; + + p = find_get_pid(pid); + if (!p) + return -ESRCH; + + ret = 0; + rcu_read_lock(); + if (!pid_task(p, PIDTYPE_TGID)) + ret = -EINVAL; + rcu_read_unlock(); + + fd = ret ?: pidfd_create(p); + put_pid(p); + return fd; +} + void __init pid_idr_init(void) { /* Verify no one has done anything silly: */ -- cgit v1.2.3 From c8af5cd75e2411d5a5aacf115f59a5ff6b87f3fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Fri, 28 Jun 2019 11:12:34 +0200 Subject: xskmap: Move non-standard list manipulation to helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a helper in list.h for the non-standard way of clearing a list that is used in xskmap. This makes it easier to reuse it in the other map types, and also makes sure this usage is not forgotten in any list refactorings in the future. Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Daniel Borkmann --- kernel/bpf/xskmap.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index ef7338cebd18..9bb96ace9fa1 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c @@ -145,8 +145,7 @@ void __xsk_map_flush(struct bpf_map *map) list_for_each_entry_safe(xs, tmp, flush_list, flush_node) { xsk_flush(xs); - __list_del(xs->flush_node.prev, xs->flush_node.next); - xs->flush_node.prev = NULL; + __list_del_clearprev(&xs->flush_node); } } -- cgit v1.2.3 From d5df2830ca9922d03a33940ea424c9a5f39f1162 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Fri, 28 Jun 2019 11:12:34 +0200 Subject: devmap/cpumap: Use flush list instead of bitmap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The socket map uses a linked list instead of a bitmap to keep track of which entries to flush. Do the same for devmap and cpumap, as this means we don't have to care about the map index when enqueueing things into the map (and so we can cache the map lookup). Signed-off-by: Toke Høiland-Jørgensen Acked-by: Jonathan Lemon Acked-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann --- kernel/bpf/cpumap.c | 105 +++++++++++++++++++++++---------------------------- kernel/bpf/devmap.c | 107 +++++++++++++++++++++++----------------------------- 2 files changed, 95 insertions(+), 117 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 8dff08768087..ef49e17ae47c 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -32,14 +32,19 @@ /* General idea: XDP packets getting XDP redirected to another CPU, * will maximum be stored/queued for one driver ->poll() call. It is - * guaranteed that setting flush bit and flush operation happen on + * guaranteed that queueing the frame and the flush operation happen on * same CPU. Thus, cpu_map_flush operation can deduct via this_cpu_ptr() * which queue in bpf_cpu_map_entry contains packets. */ #define CPU_MAP_BULK_SIZE 8 /* 8 == one cacheline on 64-bit archs */ +struct bpf_cpu_map_entry; +struct bpf_cpu_map; + struct xdp_bulk_queue { void *q[CPU_MAP_BULK_SIZE]; + struct list_head flush_node; + struct bpf_cpu_map_entry *obj; unsigned int count; }; @@ -52,6 +57,8 @@ struct bpf_cpu_map_entry { /* XDP can run multiple RX-ring queues, need __percpu enqueue store */ struct xdp_bulk_queue __percpu *bulkq; + struct bpf_cpu_map *cmap; + /* Queue with potential multi-producers, and single-consumer kthread */ struct ptr_ring *queue; struct task_struct *kthread; @@ -65,23 +72,17 @@ struct bpf_cpu_map { struct bpf_map map; /* Below members specific for map type */ struct bpf_cpu_map_entry **cpu_map; - unsigned long __percpu *flush_needed; + struct list_head __percpu *flush_list; }; -static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, - struct xdp_bulk_queue *bq, bool in_napi_ctx); - -static u64 cpu_map_bitmap_size(const union bpf_attr *attr) -{ - return BITS_TO_LONGS(attr->max_entries) * sizeof(unsigned long); -} +static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx); static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) { struct bpf_cpu_map *cmap; int err = -ENOMEM; + int ret, cpu; u64 cost; - int ret; if (!capable(CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); @@ -105,7 +106,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) /* make sure page count doesn't overflow */ cost = (u64) cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *); - cost += cpu_map_bitmap_size(attr) * num_possible_cpus(); + cost += sizeof(struct list_head) * num_possible_cpus(); /* Notice returns -EPERM on if map size is larger than memlock limit */ ret = bpf_map_charge_init(&cmap->map.memory, cost); @@ -114,12 +115,13 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) goto free_cmap; } - /* A per cpu bitfield with a bit per possible CPU in map */ - cmap->flush_needed = __alloc_percpu(cpu_map_bitmap_size(attr), - __alignof__(unsigned long)); - if (!cmap->flush_needed) + cmap->flush_list = alloc_percpu(struct list_head); + if (!cmap->flush_list) goto free_charge; + for_each_possible_cpu(cpu) + INIT_LIST_HEAD(per_cpu_ptr(cmap->flush_list, cpu)); + /* Alloc array for possible remote "destination" CPUs */ cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *), @@ -129,7 +131,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) return &cmap->map; free_percpu: - free_percpu(cmap->flush_needed); + free_percpu(cmap->flush_list); free_charge: bpf_map_charge_finish(&cmap->map.memory); free_cmap: @@ -334,7 +336,8 @@ static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, { gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; struct bpf_cpu_map_entry *rcpu; - int numa, err; + struct xdp_bulk_queue *bq; + int numa, err, i; /* Have map->numa_node, but choose node of redirect target CPU */ numa = cpu_to_node(cpu); @@ -349,6 +352,11 @@ static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, if (!rcpu->bulkq) goto free_rcu; + for_each_possible_cpu(i) { + bq = per_cpu_ptr(rcpu->bulkq, i); + bq->obj = rcpu; + } + /* Alloc queue */ rcpu->queue = kzalloc_node(sizeof(*rcpu->queue), gfp, numa); if (!rcpu->queue) @@ -405,7 +413,7 @@ static void __cpu_map_entry_free(struct rcu_head *rcu) struct xdp_bulk_queue *bq = per_cpu_ptr(rcpu->bulkq, cpu); /* No concurrent bq_enqueue can run at this point */ - bq_flush_to_queue(rcpu, bq, false); + bq_flush_to_queue(bq, false); } free_percpu(rcpu->bulkq); /* Cannot kthread_stop() here, last put free rcpu resources */ @@ -488,6 +496,7 @@ static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value, rcpu = __cpu_map_entry_alloc(qsize, key_cpu, map->id); if (!rcpu) return -ENOMEM; + rcpu->cmap = cmap; } rcu_read_lock(); __cpu_map_entry_replace(cmap, key_cpu, rcpu); @@ -514,14 +523,14 @@ static void cpu_map_free(struct bpf_map *map) synchronize_rcu(); /* To ensure all pending flush operations have completed wait for flush - * bitmap to indicate all flush_needed bits to be zero on _all_ cpus. - * Because the above synchronize_rcu() ensures the map is disconnected - * from the program we can assume no new bits will be set. + * list be empty on _all_ cpus. Because the above synchronize_rcu() + * ensures the map is disconnected from the program we can assume no new + * items will be added to the list. */ for_each_online_cpu(cpu) { - unsigned long *bitmap = per_cpu_ptr(cmap->flush_needed, cpu); + struct list_head *flush_list = per_cpu_ptr(cmap->flush_list, cpu); - while (!bitmap_empty(bitmap, cmap->map.max_entries)) + while (!list_empty(flush_list)) cond_resched(); } @@ -538,7 +547,7 @@ static void cpu_map_free(struct bpf_map *map) /* bq flush and cleanup happens after RCU graze-period */ __cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */ } - free_percpu(cmap->flush_needed); + free_percpu(cmap->flush_list); bpf_map_area_free(cmap->cpu_map); kfree(cmap); } @@ -590,9 +599,9 @@ const struct bpf_map_ops cpu_map_ops = { .map_check_btf = map_check_no_btf, }; -static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, - struct xdp_bulk_queue *bq, bool in_napi_ctx) +static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx) { + struct bpf_cpu_map_entry *rcpu = bq->obj; unsigned int processed = 0, drops = 0; const int to_cpu = rcpu->cpu; struct ptr_ring *q; @@ -621,6 +630,8 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, bq->count = 0; spin_unlock(&q->producer_lock); + __list_del_clearprev(&bq->flush_node); + /* Feedback loop via tracepoints */ trace_xdp_cpumap_enqueue(rcpu->map_id, processed, drops, to_cpu); return 0; @@ -631,10 +642,11 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, */ static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf) { + struct list_head *flush_list = this_cpu_ptr(rcpu->cmap->flush_list); struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq); if (unlikely(bq->count == CPU_MAP_BULK_SIZE)) - bq_flush_to_queue(rcpu, bq, true); + bq_flush_to_queue(bq, true); /* Notice, xdp_buff/page MUST be queued here, long enough for * driver to code invoking us to finished, due to driver @@ -646,6 +658,10 @@ static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf) * operation, when completing napi->poll call. */ bq->q[bq->count++] = xdpf; + + if (!bq->flush_node.prev) + list_add(&bq->flush_node, flush_list); + return 0; } @@ -665,41 +681,16 @@ int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, return 0; } -void __cpu_map_insert_ctx(struct bpf_map *map, u32 bit) -{ - struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); - unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed); - - __set_bit(bit, bitmap); -} - void __cpu_map_flush(struct bpf_map *map) { struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); - unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed); - u32 bit; - - /* The napi->poll softirq makes sure __cpu_map_insert_ctx() - * and __cpu_map_flush() happen on same CPU. Thus, the percpu - * bitmap indicate which percpu bulkq have packets. - */ - for_each_set_bit(bit, bitmap, map->max_entries) { - struct bpf_cpu_map_entry *rcpu = READ_ONCE(cmap->cpu_map[bit]); - struct xdp_bulk_queue *bq; - - /* This is possible if entry is removed by user space - * between xdp redirect and flush op. - */ - if (unlikely(!rcpu)) - continue; - - __clear_bit(bit, bitmap); + struct list_head *flush_list = this_cpu_ptr(cmap->flush_list); + struct xdp_bulk_queue *bq, *tmp; - /* Flush all frames in bulkq to real queue */ - bq = this_cpu_ptr(rcpu->bulkq); - bq_flush_to_queue(rcpu, bq, true); + list_for_each_entry_safe(bq, tmp, flush_list, flush_node) { + bq_flush_to_queue(bq, true); /* If already running, costs spin_lock_irqsave + smb_mb */ - wake_up_process(rcpu->kthread); + wake_up_process(bq->obj->kthread); } } diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 40e86a7e0ef0..a4dddc867cbf 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -17,9 +17,8 @@ * datapath always has a valid copy. However, the datapath does a "flush" * operation that pushes any pending packets in the driver outside the RCU * critical section. Each bpf_dtab_netdev tracks these pending operations using - * an atomic per-cpu bitmap. The bpf_dtab_netdev object will not be destroyed - * until all bits are cleared indicating outstanding flush operations have - * completed. + * a per-cpu flush list. The bpf_dtab_netdev object will not be destroyed until + * this list is empty, indicating outstanding flush operations have completed. * * BPF syscalls may race with BPF program calls on any of the update, delete * or lookup operations. As noted above the xchg() operation also keep the @@ -48,9 +47,13 @@ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) #define DEV_MAP_BULK_SIZE 16 +struct bpf_dtab_netdev; + struct xdp_bulk_queue { struct xdp_frame *q[DEV_MAP_BULK_SIZE]; + struct list_head flush_node; struct net_device *dev_rx; + struct bpf_dtab_netdev *obj; unsigned int count; }; @@ -65,23 +68,18 @@ struct bpf_dtab_netdev { struct bpf_dtab { struct bpf_map map; struct bpf_dtab_netdev **netdev_map; - unsigned long __percpu *flush_needed; + struct list_head __percpu *flush_list; struct list_head list; }; static DEFINE_SPINLOCK(dev_map_lock); static LIST_HEAD(dev_map_list); -static u64 dev_map_bitmap_size(const union bpf_attr *attr) -{ - return BITS_TO_LONGS((u64) attr->max_entries) * sizeof(unsigned long); -} - static struct bpf_map *dev_map_alloc(union bpf_attr *attr) { struct bpf_dtab *dtab; + int err, cpu; u64 cost; - int err; if (!capable(CAP_NET_ADMIN)) return ERR_PTR(-EPERM); @@ -99,7 +97,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) /* make sure page count doesn't overflow */ cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *); - cost += dev_map_bitmap_size(attr) * num_possible_cpus(); + cost += sizeof(struct list_head) * num_possible_cpus(); /* if map size is larger than memlock limit, reject it */ err = bpf_map_charge_init(&dtab->map.memory, cost); @@ -108,28 +106,30 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) err = -ENOMEM; - /* A per cpu bitfield with a bit per possible net device */ - dtab->flush_needed = __alloc_percpu_gfp(dev_map_bitmap_size(attr), - __alignof__(unsigned long), - GFP_KERNEL | __GFP_NOWARN); - if (!dtab->flush_needed) + dtab->flush_list = alloc_percpu(struct list_head); + if (!dtab->flush_list) goto free_charge; + for_each_possible_cpu(cpu) + INIT_LIST_HEAD(per_cpu_ptr(dtab->flush_list, cpu)); + dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *), dtab->map.numa_node); if (!dtab->netdev_map) - goto free_charge; + goto free_percpu; spin_lock(&dev_map_lock); list_add_tail_rcu(&dtab->list, &dev_map_list); spin_unlock(&dev_map_lock); return &dtab->map; + +free_percpu: + free_percpu(dtab->flush_list); free_charge: bpf_map_charge_finish(&dtab->map.memory); free_dtab: - free_percpu(dtab->flush_needed); kfree(dtab); return ERR_PTR(err); } @@ -158,14 +158,14 @@ static void dev_map_free(struct bpf_map *map) rcu_barrier(); /* To ensure all pending flush operations have completed wait for flush - * bitmap to indicate all flush_needed bits to be zero on _all_ cpus. + * list to empty on _all_ cpus. * Because the above synchronize_rcu() ensures the map is disconnected - * from the program we can assume no new bits will be set. + * from the program we can assume no new items will be added. */ for_each_online_cpu(cpu) { - unsigned long *bitmap = per_cpu_ptr(dtab->flush_needed, cpu); + struct list_head *flush_list = per_cpu_ptr(dtab->flush_list, cpu); - while (!bitmap_empty(bitmap, dtab->map.max_entries)) + while (!list_empty(flush_list)) cond_resched(); } @@ -181,7 +181,7 @@ static void dev_map_free(struct bpf_map *map) kfree(dev); } - free_percpu(dtab->flush_needed); + free_percpu(dtab->flush_list); bpf_map_area_free(dtab->netdev_map); kfree(dtab); } @@ -203,18 +203,10 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key) return 0; } -void __dev_map_insert_ctx(struct bpf_map *map, u32 bit) -{ - struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); - unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed); - - __set_bit(bit, bitmap); -} - -static int bq_xmit_all(struct bpf_dtab_netdev *obj, - struct xdp_bulk_queue *bq, u32 flags, +static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags, bool in_napi_ctx) { + struct bpf_dtab_netdev *obj = bq->obj; struct net_device *dev = obj->dev; int sent = 0, drops = 0, err = 0; int i; @@ -241,6 +233,7 @@ out: trace_xdp_devmap_xmit(&obj->dtab->map, obj->bit, sent, drops, bq->dev_rx, dev, err); bq->dev_rx = NULL; + __list_del_clearprev(&bq->flush_node); return 0; error: /* If ndo_xdp_xmit fails with an errno, no frames have been @@ -263,31 +256,18 @@ error: * from the driver before returning from its napi->poll() routine. The poll() * routine is called either from busy_poll context or net_rx_action signaled * from NET_RX_SOFTIRQ. Either way the poll routine must complete before the - * net device can be torn down. On devmap tear down we ensure the ctx bitmap - * is zeroed before completing to ensure all flush operations have completed. + * net device can be torn down. On devmap tear down we ensure the flush list + * is empty before completing to ensure all flush operations have completed. */ void __dev_map_flush(struct bpf_map *map) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); - unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed); - u32 bit; + struct list_head *flush_list = this_cpu_ptr(dtab->flush_list); + struct xdp_bulk_queue *bq, *tmp; rcu_read_lock(); - for_each_set_bit(bit, bitmap, map->max_entries) { - struct bpf_dtab_netdev *dev = READ_ONCE(dtab->netdev_map[bit]); - struct xdp_bulk_queue *bq; - - /* This is possible if the dev entry is removed by user space - * between xdp redirect and flush op. - */ - if (unlikely(!dev)) - continue; - - bq = this_cpu_ptr(dev->bulkq); - bq_xmit_all(dev, bq, XDP_XMIT_FLUSH, true); - - __clear_bit(bit, bitmap); - } + list_for_each_entry_safe(bq, tmp, flush_list, flush_node) + bq_xmit_all(bq, XDP_XMIT_FLUSH, true); rcu_read_unlock(); } @@ -314,10 +294,11 @@ static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf, struct net_device *dev_rx) { + struct list_head *flush_list = this_cpu_ptr(obj->dtab->flush_list); struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq); if (unlikely(bq->count == DEV_MAP_BULK_SIZE)) - bq_xmit_all(obj, bq, 0, true); + bq_xmit_all(bq, 0, true); /* Ingress dev_rx will be the same for all xdp_frame's in * bulk_queue, because bq stored per-CPU and must be flushed @@ -327,6 +308,10 @@ static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf, bq->dev_rx = dev_rx; bq->q[bq->count++] = xdpf; + + if (!bq->flush_node.prev) + list_add(&bq->flush_node, flush_list); + return 0; } @@ -377,17 +362,12 @@ static void dev_map_flush_old(struct bpf_dtab_netdev *dev) { if (dev->dev->netdev_ops->ndo_xdp_xmit) { struct xdp_bulk_queue *bq; - unsigned long *bitmap; - int cpu; rcu_read_lock(); for_each_online_cpu(cpu) { - bitmap = per_cpu_ptr(dev->dtab->flush_needed, cpu); - __clear_bit(dev->bit, bitmap); - bq = per_cpu_ptr(dev->bulkq, cpu); - bq_xmit_all(dev, bq, XDP_XMIT_FLUSH, false); + bq_xmit_all(bq, XDP_XMIT_FLUSH, false); } rcu_read_unlock(); } @@ -434,8 +414,10 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, struct net *net = current->nsproxy->net_ns; gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN; struct bpf_dtab_netdev *dev, *old_dev; - u32 i = *(u32 *)key; u32 ifindex = *(u32 *)value; + struct xdp_bulk_queue *bq; + u32 i = *(u32 *)key; + int cpu; if (unlikely(map_flags > BPF_EXIST)) return -EINVAL; @@ -458,6 +440,11 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, return -ENOMEM; } + for_each_possible_cpu(cpu) { + bq = per_cpu_ptr(dev->bulkq, cpu); + bq->obj = dev; + } + dev->dev = dev_get_by_index(net, ifindex); if (!dev->dev) { free_percpu(dev->bulkq); -- cgit v1.2.3 From 0cdbb4b09a0658b72c563638d476113aadd91afb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Fri, 28 Jun 2019 11:12:35 +0200 Subject: devmap: Allow map lookups from eBPF MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We don't currently allow lookups into a devmap from eBPF, because the map lookup returns a pointer directly to the dev->ifindex, which shouldn't be modifiable from eBPF. However, being able to do lookups in devmaps is useful to know (e.g.) whether forwarding to a specific interface is enabled. Currently, programs work around this by keeping a shadow map of another type which indicates whether a map index is valid. Since we now have a flag to make maps read-only from the eBPF side, we can simply lift the lookup restriction if we make sure this flag is always set. Signed-off-by: Toke Høiland-Jørgensen Acked-by: Jonathan Lemon Acked-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann --- kernel/bpf/devmap.c | 5 +++++ kernel/bpf/verifier.c | 7 ++----- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index a4dddc867cbf..d83cf8ccc872 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -89,6 +89,11 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK) return ERR_PTR(-EINVAL); + /* Lookup returns a pointer straight to dev->ifindex, so make sure the + * verifier prevents writes from the BPF side + */ + attr->map_flags |= BPF_F_RDONLY_PROG; + dtab = kzalloc(sizeof(*dtab), GFP_USER); if (!dtab) return ERR_PTR(-ENOMEM); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6b5623d320f9..9b6ee93d5a85 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3414,12 +3414,9 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, if (func_id != BPF_FUNC_get_local_storage) goto error; break; - /* devmap returns a pointer to a live net_device ifindex that we cannot - * allow to be modified from bpf side. So do not allow lookup elements - * for now. - */ case BPF_MAP_TYPE_DEVMAP: - if (func_id != BPF_FUNC_redirect_map) + if (func_id != BPF_FUNC_redirect_map && + func_id != BPF_FUNC_map_lookup_elem) goto error; break; /* Restrict bpf side of cpumap and xskmap, open when use-cases -- cgit v1.2.3 From 6a31fcd4cff84490bf5ac11dbeedfdca5b38b39a Mon Sep 17 00:00:00 2001 From: Prakhar Srivastava Date: Sun, 23 Jun 2019 23:23:31 -0700 Subject: KEXEC: Call ima_kexec_cmdline to measure the boot command line args During soft reboot(kexec_file_load) boot command line arguments are not measured. Call ima hook ima_kexec_cmdline to measure the boot command line arguments into IMA measurement list. - call ima_kexec_cmdline from kexec_file_load. - move the call ima_add_kexec_buffer after the cmdline args have been measured. Signed-off-by: Prakhar Srivastava Reviewed-by: James Morris Acked-by: Dave Young Signed-off-by: Mimi Zohar --- kernel/kexec_file.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 072b6ee55e3f..b0c724e5d86c 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -198,9 +198,6 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, return ret; image->kernel_buf_len = size; - /* IMA needs to pass the measurement list to the next kernel. */ - ima_add_kexec_buffer(image); - /* Call arch image probe handlers */ ret = arch_kexec_kernel_image_probe(image, image->kernel_buf, image->kernel_buf_len); @@ -241,8 +238,14 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, ret = -EINVAL; goto out; } + + ima_kexec_cmdline(image->cmdline_buf, + image->cmdline_buf_len - 1); } + /* IMA needs to pass the measurement list to the next kernel. */ + ima_add_kexec_buffer(image); + /* Call arch image load handlers */ ldata = arch_kexec_kernel_image_load(image); -- cgit v1.2.3 From 0092908d16c604b8207c2141ec64b0fa4473bb03 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 26 Jun 2019 14:27:06 +0200 Subject: mm: factor out a devm_request_free_mem_region helper Keep the physical address allocation that hmm_add_device does with the rest of the resource code, and allow future reuse of it without the hmm wrapper. Signed-off-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Reviewed-by: John Hubbard Reviewed-by: Dan Williams Signed-off-by: Jason Gunthorpe --- kernel/resource.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 158f04ec1d4f..d22423e85cf8 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1628,6 +1628,45 @@ void resource_list_free(struct list_head *head) } EXPORT_SYMBOL(resource_list_free); +#ifdef CONFIG_DEVICE_PRIVATE +/** + * devm_request_free_mem_region - find free region for device private memory + * + * @dev: device struct to bind the resource to + * @size: size in bytes of the device memory to add + * @base: resource tree to look in + * + * This function tries to find an empty range of physical address big enough to + * contain the new resource, so that it can later be hotplugged as ZONE_DEVICE + * memory, which in turn allocates struct pages. + */ +struct resource *devm_request_free_mem_region(struct device *dev, + struct resource *base, unsigned long size) +{ + resource_size_t end, addr; + struct resource *res; + + size = ALIGN(size, 1UL << PA_SECTION_SHIFT); + end = min_t(unsigned long, base->end, (1UL << MAX_PHYSMEM_BITS) - 1); + addr = end - size + 1UL; + + for (; addr > size && addr >= base->start; addr -= size) { + if (region_intersects(addr, size, 0, IORES_DESC_NONE) != + REGION_DISJOINT) + continue; + + res = devm_request_mem_region(dev, addr, size, dev_name(dev)); + if (!res) + return ERR_PTR(-ENOMEM); + res->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY; + return res; + } + + return ERR_PTR(-ERANGE); +} +EXPORT_SYMBOL_GPL(devm_request_free_mem_region); +#endif /* CONFIG_DEVICE_PRIVATE */ + static int __init strict_iomem(char *str) { if (strstr(str, "relaxed")) -- cgit v1.2.3 From 3ed2dcdf54d5bf1f9823b5faf1a702e7cee53982 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 26 Jun 2019 14:27:07 +0200 Subject: memremap: validate the pagemap type passed to devm_memremap_pages Most pgmap types are only supported when certain config options are enabled. Check for a type that is valid for the current configuration before setting up the pagemap. For this the usage of the 0 type for device dax gets replaced with an explicit MEMORY_DEVICE_DEVDAX type. Signed-off-by: Christoph Hellwig Reviewed-by: Ira Weiny Reviewed-by: Dan Williams Tested-by: Dan Williams Signed-off-by: Jason Gunthorpe --- kernel/memremap.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index 6e1970719dc2..abda62d1e5a3 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -157,6 +157,28 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) pgprot_t pgprot = PAGE_KERNEL; int error, nid, is_ram; + switch (pgmap->type) { + case MEMORY_DEVICE_PRIVATE: + if (!IS_ENABLED(CONFIG_DEVICE_PRIVATE)) { + WARN(1, "Device private memory not supported\n"); + return ERR_PTR(-EINVAL); + } + break; + case MEMORY_DEVICE_FS_DAX: + if (!IS_ENABLED(CONFIG_ZONE_DEVICE) || + IS_ENABLED(CONFIG_FS_DAX_LIMITED)) { + WARN(1, "File system DAX not supported\n"); + return ERR_PTR(-EINVAL); + } + break; + case MEMORY_DEVICE_DEVDAX: + case MEMORY_DEVICE_PCI_P2PDMA: + break; + default: + WARN(1, "Invalid pgmap type %d\n", pgmap->type); + break; + } + if (!pgmap->ref || !pgmap->kill || !pgmap->cleanup) { WARN(1, "Missing reference count teardown definition\n"); return ERR_PTR(-EINVAL); -- cgit v1.2.3 From 1e240e8d4a7d92232b6214e02a0a4197a53afd6c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 26 Jun 2019 14:27:08 +0200 Subject: memremap: move dev_pagemap callbacks into a separate structure The dev_pagemap is a growing too many callbacks. Move them into a separate ops structure so that they are not duplicated for multiple instances, and an attacker can't easily overwrite them. Signed-off-by: Christoph Hellwig Reviewed-by: Logan Gunthorpe Reviewed-by: Jason Gunthorpe Reviewed-by: Dan Williams Tested-by: Dan Williams Signed-off-by: Jason Gunthorpe --- kernel/memremap.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index abda62d1e5a3..0824237ef979 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -92,10 +92,10 @@ static void devm_memremap_pages_release(void *data) unsigned long pfn; int nid; - pgmap->kill(pgmap->ref); + pgmap->ops->kill(pgmap->ref); for_each_device_pfn(pfn, pgmap) put_page(pfn_to_page(pfn)); - pgmap->cleanup(pgmap->ref); + pgmap->ops->cleanup(pgmap->ref); /* pages are dead and unused, undo the arch mapping */ align_start = res->start & ~(SECTION_SIZE - 1); @@ -128,8 +128,8 @@ static void devm_memremap_pages_release(void *data) * @pgmap: pointer to a struct dev_pagemap * * Notes: - * 1/ At a minimum the res, ref and type members of @pgmap must be initialized - * by the caller before passing it to this function + * 1/ At a minimum the res, ref and type and ops members of @pgmap must be + * initialized by the caller before passing it to this function * * 2/ The altmap field may optionally be initialized, in which case altmap_valid * must be set to true @@ -179,7 +179,8 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) break; } - if (!pgmap->ref || !pgmap->kill || !pgmap->cleanup) { + if (!pgmap->ref || !pgmap->ops || !pgmap->ops->kill || + !pgmap->ops->cleanup) { WARN(1, "Missing reference count teardown definition\n"); return ERR_PTR(-EINVAL); } @@ -293,9 +294,8 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) err_pfn_remap: pgmap_array_delete(res); err_array: - pgmap->kill(pgmap->ref); - pgmap->cleanup(pgmap->ref); - + pgmap->ops->kill(pgmap->ref); + pgmap->ops->cleanup(pgmap->ref); return ERR_PTR(error); } EXPORT_SYMBOL_GPL(devm_memremap_pages); @@ -388,7 +388,7 @@ void __put_devmap_managed_page(struct page *page) mem_cgroup_uncharge(page); - page->pgmap->page_free(page, page->pgmap->data); + page->pgmap->ops->page_free(page, page->pgmap->data); } else if (!count) __put_page(page); } -- cgit v1.2.3 From d8668bb0451c3c45b59dbcde2654e0539aad1d2a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 26 Jun 2019 14:27:09 +0200 Subject: memremap: pass a struct dev_pagemap to ->kill and ->cleanup Passing the actual typed structure leads to more understandable code vs just passing the ref member. Reported-by: Logan Gunthorpe Signed-off-by: Christoph Hellwig Reviewed-by: Logan Gunthorpe Reviewed-by: Jason Gunthorpe Reviewed-by: Dan Williams Tested-by: Dan Williams Signed-off-by: Jason Gunthorpe --- kernel/memremap.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index 0824237ef979..00c1ceb60c19 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -92,10 +92,10 @@ static void devm_memremap_pages_release(void *data) unsigned long pfn; int nid; - pgmap->ops->kill(pgmap->ref); + pgmap->ops->kill(pgmap); for_each_device_pfn(pfn, pgmap) put_page(pfn_to_page(pfn)); - pgmap->ops->cleanup(pgmap->ref); + pgmap->ops->cleanup(pgmap); /* pages are dead and unused, undo the arch mapping */ align_start = res->start & ~(SECTION_SIZE - 1); @@ -294,8 +294,8 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) err_pfn_remap: pgmap_array_delete(res); err_array: - pgmap->ops->kill(pgmap->ref); - pgmap->ops->cleanup(pgmap->ref); + pgmap->ops->kill(pgmap); + pgmap->ops->cleanup(pgmap); return ERR_PTR(error); } EXPORT_SYMBOL_GPL(devm_memremap_pages); -- cgit v1.2.3 From f6a55e1a3fe6b3bb294a80a05437fcf86488d819 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 26 Jun 2019 14:27:10 +0200 Subject: memremap: lift the devmap_enable manipulation into devm_memremap_pages Just check if there is a ->page_free operation set and take care of the static key enable, as well as the put using device managed resources. Also check that a ->page_free is provided for the pgmaps types that require it, and check for a valid type as well while we are at it. Note that this also fixes the fact that hmm never called dev_pagemap_put_ops and thus would leave the slow path enabled forever, even after a device driver unload or disable. Signed-off-by: Christoph Hellwig Reviewed-by: Ira Weiny Reviewed-by: Dan Williams Tested-by: Dan Williams Signed-off-by: Jason Gunthorpe --- kernel/memremap.c | 59 ++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 37 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index 00c1ceb60c19..3219a4c91d07 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -17,6 +17,35 @@ static DEFINE_XARRAY(pgmap_array); #define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1) #define SECTION_SIZE (1UL << PA_SECTION_SHIFT) +#ifdef CONFIG_DEV_PAGEMAP_OPS +DEFINE_STATIC_KEY_FALSE(devmap_managed_key); +EXPORT_SYMBOL(devmap_managed_key); +static atomic_t devmap_managed_enable; + +static void devmap_managed_enable_put(void *data) +{ + if (atomic_dec_and_test(&devmap_managed_enable)) + static_branch_disable(&devmap_managed_key); +} + +static int devmap_managed_enable_get(struct device *dev, struct dev_pagemap *pgmap) +{ + if (!pgmap->ops->page_free) { + WARN(1, "Missing page_free method\n"); + return -EINVAL; + } + + if (atomic_inc_return(&devmap_managed_enable) == 1) + static_branch_enable(&devmap_managed_key); + return devm_add_action_or_reset(dev, devmap_managed_enable_put, NULL); +} +#else +static int devmap_managed_enable_get(struct device *dev, struct dev_pagemap *pgmap) +{ + return -EINVAL; +} +#endif /* CONFIG_DEV_PAGEMAP_OPS */ + #if IS_ENABLED(CONFIG_DEVICE_PRIVATE) vm_fault_t device_private_entry_fault(struct vm_area_struct *vma, unsigned long addr, @@ -156,6 +185,7 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) }; pgprot_t pgprot = PAGE_KERNEL; int error, nid, is_ram; + bool need_devmap_managed = true; switch (pgmap->type) { case MEMORY_DEVICE_PRIVATE: @@ -173,6 +203,7 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) break; case MEMORY_DEVICE_DEVDAX: case MEMORY_DEVICE_PCI_P2PDMA: + need_devmap_managed = false; break; default: WARN(1, "Invalid pgmap type %d\n", pgmap->type); @@ -185,6 +216,12 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) return ERR_PTR(-EINVAL); } + if (need_devmap_managed) { + error = devmap_managed_enable_get(dev, pgmap); + if (error) + return ERR_PTR(error); + } + align_start = res->start & ~(SECTION_SIZE - 1); align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE) - align_start; @@ -351,28 +388,6 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn, EXPORT_SYMBOL_GPL(get_dev_pagemap); #ifdef CONFIG_DEV_PAGEMAP_OPS -DEFINE_STATIC_KEY_FALSE(devmap_managed_key); -EXPORT_SYMBOL(devmap_managed_key); -static atomic_t devmap_enable; - -/* - * Toggle the static key for ->page_free() callbacks when dev_pagemap - * pages go idle. - */ -void dev_pagemap_get_ops(void) -{ - if (atomic_inc_return(&devmap_enable) == 1) - static_branch_enable(&devmap_managed_key); -} -EXPORT_SYMBOL_GPL(dev_pagemap_get_ops); - -void dev_pagemap_put_ops(void) -{ - if (atomic_dec_and_test(&devmap_enable)) - static_branch_disable(&devmap_managed_key); -} -EXPORT_SYMBOL_GPL(dev_pagemap_put_ops); - void __put_devmap_managed_page(struct page *page) { int count = page_ref_dec_return(page); -- cgit v1.2.3 From 897e6365cda6ba6356e83a3aaa68dec82ef4c548 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 26 Jun 2019 14:27:11 +0200 Subject: memremap: add a migrate_to_ram method to struct dev_pagemap_ops This replaces the hacky ->fault callback, which is currently directly called from common code through a hmm specific data structure as an exercise in layering violations. Signed-off-by: Christoph Hellwig Reviewed-by: Ralph Campbell Reviewed-by: Jason Gunthorpe Reviewed-by: Dan Williams Tested-by: Dan Williams Signed-off-by: Jason Gunthorpe --- kernel/memremap.c | 35 ++++------------------------------- 1 file changed, 4 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index 3219a4c91d07..c06a5487dda7 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -11,7 +11,6 @@ #include #include #include -#include static DEFINE_XARRAY(pgmap_array); #define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1) @@ -46,36 +45,6 @@ static int devmap_managed_enable_get(struct device *dev, struct dev_pagemap *pgm } #endif /* CONFIG_DEV_PAGEMAP_OPS */ -#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) -vm_fault_t device_private_entry_fault(struct vm_area_struct *vma, - unsigned long addr, - swp_entry_t entry, - unsigned int flags, - pmd_t *pmdp) -{ - struct page *page = device_private_entry_to_page(entry); - struct hmm_devmem *devmem; - - devmem = container_of(page->pgmap, typeof(*devmem), pagemap); - - /* - * The page_fault() callback must migrate page back to system memory - * so that CPU can access it. This might fail for various reasons - * (device issue, device was unsafely unplugged, ...). When such - * error conditions happen, the callback must return VM_FAULT_SIGBUS. - * - * Note that because memory cgroup charges are accounted to the device - * memory, this should never fail because of memory restrictions (but - * allocation of regular system page might still fail because we are - * out of memory). - * - * There is a more in-depth description of what that callback can and - * cannot do, in include/linux/memremap.h - */ - return devmem->page_fault(vma, addr, page, flags, pmdp); -} -#endif /* CONFIG_DEVICE_PRIVATE */ - static void pgmap_array_delete(struct resource *res) { xa_store_range(&pgmap_array, PHYS_PFN(res->start), PHYS_PFN(res->end), @@ -193,6 +162,10 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) WARN(1, "Device private memory not supported\n"); return ERR_PTR(-EINVAL); } + if (!pgmap->ops || !pgmap->ops->migrate_to_ram) { + WARN(1, "Missing migrate_to_ram method\n"); + return ERR_PTR(-EINVAL); + } break; case MEMORY_DEVICE_FS_DAX: if (!IS_ENABLED(CONFIG_ZONE_DEVICE) || -- cgit v1.2.3 From 80a72d0af05ae97a8b106c172e431072ba587492 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 26 Jun 2019 14:27:12 +0200 Subject: memremap: remove the data field in struct dev_pagemap struct dev_pagemap is always embedded into a containing structure, so there is no need to an additional private data field. Signed-off-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Reviewed-by: Dan Williams Tested-by: Dan Williams Signed-off-by: Jason Gunthorpe --- kernel/memremap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index c06a5487dda7..6c3dbb692037 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -376,7 +376,7 @@ void __put_devmap_managed_page(struct page *page) mem_cgroup_uncharge(page); - page->pgmap->ops->page_free(page, page->pgmap->data); + page->pgmap->ops->page_free(page); } else if (!count) __put_page(page); } -- cgit v1.2.3 From 514caf23a70fd697fa2ece238b2cd8dcc73fb16f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 26 Jun 2019 14:27:13 +0200 Subject: memremap: replace the altmap_valid field with a PGMAP_ALTMAP_VALID flag Add a flags field to struct dev_pagemap to replace the altmap_valid boolean to be a little more extensible. Also add a pgmap_altmap() helper to find the optional altmap and clean up the code using the altmap using it. Signed-off-by: Christoph Hellwig Reviewed-by: Ira Weiny Reviewed-by: Dan Williams Tested-by: Dan Williams Signed-off-by: Jason Gunthorpe --- kernel/memremap.c | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index 6c3dbb692037..eee490e7d7e1 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -54,14 +54,8 @@ static void pgmap_array_delete(struct resource *res) static unsigned long pfn_first(struct dev_pagemap *pgmap) { - const struct resource *res = &pgmap->res; - struct vmem_altmap *altmap = &pgmap->altmap; - unsigned long pfn; - - pfn = res->start >> PAGE_SHIFT; - if (pgmap->altmap_valid) - pfn += vmem_altmap_offset(altmap); - return pfn; + return (pgmap->res.start >> PAGE_SHIFT) + + vmem_altmap_offset(pgmap_altmap(pgmap)); } static unsigned long pfn_end(struct dev_pagemap *pgmap) @@ -109,7 +103,7 @@ static void devm_memremap_pages_release(void *data) align_size >> PAGE_SHIFT, NULL); } else { arch_remove_memory(nid, align_start, align_size, - pgmap->altmap_valid ? &pgmap->altmap : NULL); + pgmap_altmap(pgmap)); kasan_remove_zero_shadow(__va(align_start), align_size); } mem_hotplug_done(); @@ -129,8 +123,8 @@ static void devm_memremap_pages_release(void *data) * 1/ At a minimum the res, ref and type and ops members of @pgmap must be * initialized by the caller before passing it to this function * - * 2/ The altmap field may optionally be initialized, in which case altmap_valid - * must be set to true + * 2/ The altmap field may optionally be initialized, in which case + * PGMAP_ALTMAP_VALID must be set in pgmap->flags. * * 3/ pgmap->ref must be 'live' on entry and will be killed and reaped * at devm_memremap_pages_release() time, or if this routine fails. @@ -142,15 +136,13 @@ static void devm_memremap_pages_release(void *data) void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) { resource_size_t align_start, align_size, align_end; - struct vmem_altmap *altmap = pgmap->altmap_valid ? - &pgmap->altmap : NULL; struct resource *res = &pgmap->res; struct dev_pagemap *conflict_pgmap; struct mhp_restrictions restrictions = { /* * We do not want any optional features only our own memmap */ - .altmap = altmap, + .altmap = pgmap_altmap(pgmap), }; pgprot_t pgprot = PAGE_KERNEL; int error, nid, is_ram; @@ -274,7 +266,7 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) zone = &NODE_DATA(nid)->node_zones[ZONE_DEVICE]; move_pfn_range_to_zone(zone, align_start >> PAGE_SHIFT, - align_size >> PAGE_SHIFT, altmap); + align_size >> PAGE_SHIFT, pgmap_altmap(pgmap)); } mem_hotplug_done(); @@ -319,7 +311,9 @@ EXPORT_SYMBOL_GPL(devm_memunmap_pages); unsigned long vmem_altmap_offset(struct vmem_altmap *altmap) { /* number of pfns from base where pfn_to_page() is valid */ - return altmap->reserve + altmap->free; + if (altmap) + return altmap->reserve + altmap->free; + return 0; } void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns) -- cgit v1.2.3 From 24917f6b1041a73993178920656e13364f847995 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 26 Jun 2019 14:27:14 +0200 Subject: memremap: provide an optional internal refcount in struct dev_pagemap Provide an internal refcounting logic if no ->ref field is provided in the pagemap passed into devm_memremap_pages so that callers don't have to reinvent it poorly. Signed-off-by: Christoph Hellwig Reviewed-by: Ira Weiny Reviewed-by: Dan Williams Tested-by: Dan Williams Signed-off-by: Jason Gunthorpe --- kernel/memremap.c | 64 ++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 51 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index eee490e7d7e1..bea6f887adad 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -29,7 +29,7 @@ static void devmap_managed_enable_put(void *data) static int devmap_managed_enable_get(struct device *dev, struct dev_pagemap *pgmap) { - if (!pgmap->ops->page_free) { + if (!pgmap->ops || !pgmap->ops->page_free) { WARN(1, "Missing page_free method\n"); return -EINVAL; } @@ -75,6 +75,24 @@ static unsigned long pfn_next(unsigned long pfn) #define for_each_device_pfn(pfn, map) \ for (pfn = pfn_first(map); pfn < pfn_end(map); pfn = pfn_next(pfn)) +static void dev_pagemap_kill(struct dev_pagemap *pgmap) +{ + if (pgmap->ops && pgmap->ops->kill) + pgmap->ops->kill(pgmap); + else + percpu_ref_kill(pgmap->ref); +} + +static void dev_pagemap_cleanup(struct dev_pagemap *pgmap) +{ + if (pgmap->ops && pgmap->ops->cleanup) { + pgmap->ops->cleanup(pgmap); + } else { + wait_for_completion(&pgmap->done); + percpu_ref_exit(pgmap->ref); + } +} + static void devm_memremap_pages_release(void *data) { struct dev_pagemap *pgmap = data; @@ -84,10 +102,10 @@ static void devm_memremap_pages_release(void *data) unsigned long pfn; int nid; - pgmap->ops->kill(pgmap); + dev_pagemap_kill(pgmap); for_each_device_pfn(pfn, pgmap) put_page(pfn_to_page(pfn)); - pgmap->ops->cleanup(pgmap); + dev_pagemap_cleanup(pgmap); /* pages are dead and unused, undo the arch mapping */ align_start = res->start & ~(SECTION_SIZE - 1); @@ -114,20 +132,29 @@ static void devm_memremap_pages_release(void *data) "%s: failed to free all reserved pages\n", __func__); } +static void dev_pagemap_percpu_release(struct percpu_ref *ref) +{ + struct dev_pagemap *pgmap = + container_of(ref, struct dev_pagemap, internal_ref); + + complete(&pgmap->done); +} + /** * devm_memremap_pages - remap and provide memmap backing for the given resource * @dev: hosting device for @res * @pgmap: pointer to a struct dev_pagemap * * Notes: - * 1/ At a minimum the res, ref and type and ops members of @pgmap must be - * initialized by the caller before passing it to this function + * 1/ At a minimum the res and type members of @pgmap must be initialized + * by the caller before passing it to this function * * 2/ The altmap field may optionally be initialized, in which case * PGMAP_ALTMAP_VALID must be set in pgmap->flags. * - * 3/ pgmap->ref must be 'live' on entry and will be killed and reaped - * at devm_memremap_pages_release() time, or if this routine fails. + * 3/ The ref field may optionally be provided, in which pgmap->ref must be + * 'live' on entry and will be killed and reaped at + * devm_memremap_pages_release() time, or if this routine fails. * * 4/ res is expected to be a host memory range that could feasibly be * treated as a "System RAM" range, i.e. not a device mmio range, but @@ -175,10 +202,21 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) break; } - if (!pgmap->ref || !pgmap->ops || !pgmap->ops->kill || - !pgmap->ops->cleanup) { - WARN(1, "Missing reference count teardown definition\n"); - return ERR_PTR(-EINVAL); + if (!pgmap->ref) { + if (pgmap->ops && (pgmap->ops->kill || pgmap->ops->cleanup)) + return ERR_PTR(-EINVAL); + + init_completion(&pgmap->done); + error = percpu_ref_init(&pgmap->internal_ref, + dev_pagemap_percpu_release, 0, GFP_KERNEL); + if (error) + return ERR_PTR(error); + pgmap->ref = &pgmap->internal_ref; + } else { + if (!pgmap->ops || !pgmap->ops->kill || !pgmap->ops->cleanup) { + WARN(1, "Missing reference count teardown definition\n"); + return ERR_PTR(-EINVAL); + } } if (need_devmap_managed) { @@ -296,8 +334,8 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) err_pfn_remap: pgmap_array_delete(res); err_array: - pgmap->ops->kill(pgmap); - pgmap->ops->cleanup(pgmap); + dev_pagemap_kill(pgmap); + dev_pagemap_cleanup(pgmap); return ERR_PTR(error); } EXPORT_SYMBOL_GPL(devm_memremap_pages); -- cgit v1.2.3 From 7e8e6816c6495a1168f9a7a50125d82c23e59300 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 2 Jul 2019 17:53:35 +0200 Subject: stacktrace: Use PF_KTHREAD to check for kernel threads !current->mm is not a reliable indicator for kernel threads as they might temporarily use a user mm. Check for PF_KTHREAD instead. Signed-off-by: Thomas Gleixner Acked-by: Mark Rutland Cc: Josh Poimboeuf Cc: Peter Zijlstra Cc: Steven Rostedt Link: https://lkml.kernel.org/r/alpine.DEB.2.21.1907021750100.1802@nanos.tec.linutronix.de --- kernel/stacktrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index 36139de0a3c4..c8d0f05721a1 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c @@ -228,7 +228,7 @@ unsigned int stack_trace_save_user(unsigned long *store, unsigned int size) }; /* Trace user stack if not a kernel thread */ - if (!current->mm) + if (current->flags & PF_KTHREAD) return 0; arch_stack_walk_user(consume_entry, &c, task_pt_regs(current)); -- cgit v1.2.3 From 4001d8e8762f57d418b66e4e668601791900a1dd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 28 Jun 2019 13:11:49 +0200 Subject: genirq: Delay deactivation in free_irq() When interrupts are shutdown, they are immediately deactivated in the irqdomain hierarchy. While this looks obviously correct there is a subtle issue: There might be an interrupt in flight when free_irq() is invoking the shutdown. This is properly handled at the irq descriptor / primary handler level, but the deactivation might completely disable resources which are required to acknowledge the interrupt. Split the shutdown code and deactivate the interrupt after synchronization in free_irq(). Fixup all other usage sites where this is not an issue to invoke the combined shutdown_and_deactivate() function instead. This still might be an issue if the interrupt in flight servicing is delayed on a remote CPU beyond the invocation of synchronize_irq(), but that cannot be handled at that level and needs to be handled in the synchronize_irq() context. Fixes: f8264e34965a ("irqdomain: Introduce new interfaces to support hierarchy irqdomains") Reported-by: Robert Hodaszi Signed-off-by: Thomas Gleixner Reviewed-by: Marc Zyngier Link: https://lkml.kernel.org/r/20190628111440.098196390@linutronix.de --- kernel/irq/autoprobe.c | 6 +++--- kernel/irq/chip.c | 6 ++++++ kernel/irq/cpuhotplug.c | 2 +- kernel/irq/internals.h | 1 + kernel/irq/manage.c | 12 +++++++++++- 5 files changed, 22 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index 16cbf6beb276..ae60cae24e9a 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -90,7 +90,7 @@ unsigned long probe_irq_on(void) /* It triggered already - consider it spurious. */ if (!(desc->istate & IRQS_WAITING)) { desc->istate &= ~IRQS_AUTODETECT; - irq_shutdown(desc); + irq_shutdown_and_deactivate(desc); } else if (i < 32) mask |= 1 << i; @@ -127,7 +127,7 @@ unsigned int probe_irq_mask(unsigned long val) mask |= 1 << i; desc->istate &= ~IRQS_AUTODETECT; - irq_shutdown(desc); + irq_shutdown_and_deactivate(desc); } raw_spin_unlock_irq(&desc->lock); } @@ -169,7 +169,7 @@ int probe_irq_off(unsigned long val) nr_of_irqs++; } desc->istate &= ~IRQS_AUTODETECT; - irq_shutdown(desc); + irq_shutdown_and_deactivate(desc); } raw_spin_unlock_irq(&desc->lock); } diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 51128bea3846..04fe4f989bd8 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -314,6 +314,12 @@ void irq_shutdown(struct irq_desc *desc) } irq_state_clr_started(desc); } +} + + +void irq_shutdown_and_deactivate(struct irq_desc *desc) +{ + irq_shutdown(desc); /* * This must be called even if the interrupt was never started up, * because the activation can happen before the interrupt is diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c index 5b1072e394b2..6c7ca2e983a5 100644 --- a/kernel/irq/cpuhotplug.c +++ b/kernel/irq/cpuhotplug.c @@ -116,7 +116,7 @@ static bool migrate_one_irq(struct irq_desc *desc) */ if (irqd_affinity_is_managed(d)) { irqd_set_managed_shutdown(d); - irq_shutdown(desc); + irq_shutdown_and_deactivate(desc); return false; } affinity = cpu_online_mask; diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 70c3053bc1f6..9c957f8b1198 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -82,6 +82,7 @@ extern int irq_activate_and_startup(struct irq_desc *desc, bool resend); extern int irq_startup(struct irq_desc *desc, bool resend, bool force); extern void irq_shutdown(struct irq_desc *desc); +extern void irq_shutdown_and_deactivate(struct irq_desc *desc); extern void irq_enable(struct irq_desc *desc); extern void irq_disable(struct irq_desc *desc); extern void irq_percpu_enable(struct irq_desc *desc, unsigned int cpu); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 53a081392115..dc8b35f2d545 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -1699,6 +1700,7 @@ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id) /* If this was the last handler, shut down the IRQ line: */ if (!desc->action) { irq_settings_clr_disable_unlazy(desc); + /* Only shutdown. Deactivate after synchronize_hardirq() */ irq_shutdown(desc); } @@ -1768,6 +1770,14 @@ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id) * require it to deallocate resources over the slow bus. */ chip_bus_lock(desc); + /* + * There is no interrupt on the fly anymore. Deactivate it + * completely. + */ + raw_spin_lock_irqsave(&desc->lock, flags); + irq_domain_deactivate_irq(&desc->irq_data); + raw_spin_unlock_irqrestore(&desc->lock, flags); + irq_release_resources(desc); chip_bus_sync_unlock(desc); irq_remove_timings(desc); @@ -1855,7 +1865,7 @@ static const void *__cleanup_nmi(unsigned int irq, struct irq_desc *desc) } irq_settings_clr_disable_unlazy(desc); - irq_shutdown(desc); + irq_shutdown_and_deactivate(desc); irq_release_resources(desc); -- cgit v1.2.3 From 1d21f2af8571c6a6a44e7c1911780614847b0253 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 28 Jun 2019 13:11:50 +0200 Subject: genirq: Fix misleading synchronize_irq() documentation The function might sleep, so it cannot be called from interrupt context. Not even with care. Signed-off-by: Thomas Gleixner Cc: Marc Zyngier Link: https://lkml.kernel.org/r/20190628111440.189241552@linutronix.de --- kernel/irq/manage.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index dc8b35f2d545..44fc505815d6 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -96,7 +96,8 @@ EXPORT_SYMBOL(synchronize_hardirq); * to complete before returning. If you use this function while * holding a resource the IRQ handler may need you will deadlock. * - * This function may be called - with care - from IRQ context. + * Can only be called from preemptible code as it might sleep when + * an interrupt thread is associated to @irq. */ void synchronize_irq(unsigned int irq) { -- cgit v1.2.3 From 62e0468650c30f0298822c580f382b16328119f6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 28 Jun 2019 13:11:51 +0200 Subject: genirq: Add optional hardware synchronization for shutdown free_irq() ensures that no hardware interrupt handler is executing on a different CPU before actually releasing resources and deactivating the interrupt completely in a domain hierarchy. But that does not catch the case where the interrupt is on flight at the hardware level but not yet serviced by the target CPU. That creates an interesing race condition: CPU 0 CPU 1 IRQ CHIP interrupt is raised sent to CPU1 Unable to handle immediately (interrupts off, deep idle delay) mask() ... free() shutdown() synchronize_irq() release_resources() do_IRQ() -> resources are not available That might be harmless and just trigger a spurious interrupt warning, but some interrupt chips might get into a wedged state. Utilize the existing irq_get_irqchip_state() callback for the synchronization in free_irq(). synchronize_hardirq() is not using this mechanism as it might actually deadlock unter certain conditions, e.g. when called with interrupts disabled and the target CPU is the one on which the synchronization is invoked. synchronize_irq() uses it because that function cannot be called from non preemtible contexts as it might sleep. No functional change intended and according to Marc the existing GIC implementations where the driver supports the callback should be able to cope with that core change. Famous last words. Fixes: 464d12309e1b ("x86/vector: Switch IOAPIC to global reservation mode") Reported-by: Robert Hodaszi Signed-off-by: Thomas Gleixner Reviewed-by: Marc Zyngier Tested-by: Marc Zyngier Link: https://lkml.kernel.org/r/20190628111440.279463375@linutronix.de --- kernel/irq/internals.h | 4 +++ kernel/irq/manage.c | 75 +++++++++++++++++++++++++++++++++++++------------- 2 files changed, 60 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 9c957f8b1198..3a948f41ab00 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -97,6 +97,10 @@ static inline void irq_mark_irq(unsigned int irq) { } extern void irq_mark_irq(unsigned int irq); #endif +extern int __irq_get_irqchip_state(struct irq_data *data, + enum irqchip_irq_state which, + bool *state); + extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 44fc505815d6..fad61986f35c 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -35,8 +35,9 @@ static int __init setup_forced_irqthreads(char *arg) early_param("threadirqs", setup_forced_irqthreads); #endif -static void __synchronize_hardirq(struct irq_desc *desc) +static void __synchronize_hardirq(struct irq_desc *desc, bool sync_chip) { + struct irq_data *irqd = irq_desc_get_irq_data(desc); bool inprogress; do { @@ -52,6 +53,20 @@ static void __synchronize_hardirq(struct irq_desc *desc) /* Ok, that indicated we're done: double-check carefully. */ raw_spin_lock_irqsave(&desc->lock, flags); inprogress = irqd_irq_inprogress(&desc->irq_data); + + /* + * If requested and supported, check at the chip whether it + * is in flight at the hardware level, i.e. already pending + * in a CPU and waiting for service and acknowledge. + */ + if (!inprogress && sync_chip) { + /* + * Ignore the return code. inprogress is only updated + * when the chip supports it. + */ + __irq_get_irqchip_state(irqd, IRQCHIP_STATE_ACTIVE, + &inprogress); + } raw_spin_unlock_irqrestore(&desc->lock, flags); /* Oops, that failed? */ @@ -74,13 +89,18 @@ static void __synchronize_hardirq(struct irq_desc *desc) * Returns: false if a threaded handler is active. * * This function may be called - with care - from IRQ context. + * + * It does not check whether there is an interrupt in flight at the + * hardware level, but not serviced yet, as this might deadlock when + * called with interrupts disabled and the target CPU of the interrupt + * is the current CPU. */ bool synchronize_hardirq(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); if (desc) { - __synchronize_hardirq(desc); + __synchronize_hardirq(desc, false); return !atomic_read(&desc->threads_active); } @@ -98,13 +118,17 @@ EXPORT_SYMBOL(synchronize_hardirq); * * Can only be called from preemptible code as it might sleep when * an interrupt thread is associated to @irq. + * + * It optionally makes sure (when the irq chip supports that method) + * that the interrupt is not pending in any CPU and waiting for + * service. */ void synchronize_irq(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); if (desc) { - __synchronize_hardirq(desc); + __synchronize_hardirq(desc, true); /* * We made sure that no hardirq handler is * running. Now verify that no threaded handlers are @@ -1730,8 +1754,12 @@ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id) unregister_handler_proc(irq, action); - /* Make sure it's not being used on another CPU: */ - synchronize_hardirq(irq); + /* + * Make sure it's not being used on another CPU and if the chip + * supports it also make sure that there is no (not yet serviced) + * interrupt in flight at the hardware level. + */ + __synchronize_hardirq(desc, true); #ifdef CONFIG_DEBUG_SHIRQ /* @@ -2589,6 +2617,28 @@ out: irq_put_desc_unlock(desc, flags); } +int __irq_get_irqchip_state(struct irq_data *data, enum irqchip_irq_state which, + bool *state) +{ + struct irq_chip *chip; + int err = -EINVAL; + + do { + chip = irq_data_get_irq_chip(data); + if (chip->irq_get_irqchip_state) + break; +#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY + data = data->parent_data; +#else + data = NULL; +#endif + } while (data); + + if (data) + err = chip->irq_get_irqchip_state(data, which, state); + return err; +} + /** * irq_get_irqchip_state - returns the irqchip state of a interrupt. * @irq: Interrupt line that is forwarded to a VM @@ -2607,7 +2657,6 @@ int irq_get_irqchip_state(unsigned int irq, enum irqchip_irq_state which, { struct irq_desc *desc; struct irq_data *data; - struct irq_chip *chip; unsigned long flags; int err = -EINVAL; @@ -2617,19 +2666,7 @@ int irq_get_irqchip_state(unsigned int irq, enum irqchip_irq_state which, data = irq_desc_get_irq_data(desc); - do { - chip = irq_data_get_irq_chip(data); - if (chip->irq_get_irqchip_state) - break; -#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY - data = data->parent_data; -#else - data = NULL; -#endif - } while (data); - - if (data) - err = chip->irq_get_irqchip_state(data, which, state); + err = __irq_get_irqchip_state(data, which, state); irq_put_desc_busunlock(desc, flags); return err; -- cgit v1.2.3 From a3ce685dd01a786fa5bc388e47d0066a4f842591 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 28 Jun 2019 09:24:09 -0700 Subject: bpf: fix precision tracking When equivalent state is found the current state needs to propagate precision marks. Otherwise the verifier will prune the search incorrectly. There is a price for correctness: before before broken fixed cnst spill precise precise bpf_lb-DLB_L3.o 1923 8128 1863 1898 bpf_lb-DLB_L4.o 3077 6707 2468 2666 bpf_lb-DUNKNOWN.o 1062 1062 544 544 bpf_lxc-DDROP_ALL.o 166729 380712 22629 36823 bpf_lxc-DUNKNOWN.o 174607 440652 28805 45325 bpf_netdev.o 8407 31904 6801 7002 bpf_overlay.o 5420 23569 4754 4858 bpf_lxc_jit.o 39389 359445 50925 69631 Overall precision tracking is still very effective. Fixes: b5dc0163d8fd ("bpf: precise scalar_value tracking") Reported-by: Lawrence Brakmo Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Tested-by: Lawrence Brakmo Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 121 ++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 107 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9b6ee93d5a85..a2e763703c30 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1659,16 +1659,18 @@ static void mark_all_scalars_precise(struct bpf_verifier_env *env, } } -static int mark_chain_precision(struct bpf_verifier_env *env, int regno) +static int __mark_chain_precision(struct bpf_verifier_env *env, int regno, + int spi) { struct bpf_verifier_state *st = env->cur_state; int first_idx = st->first_insn_idx; int last_idx = env->insn_idx; struct bpf_func_state *func; struct bpf_reg_state *reg; - u32 reg_mask = 1u << regno; - u64 stack_mask = 0; + u32 reg_mask = regno >= 0 ? 1u << regno : 0; + u64 stack_mask = spi >= 0 ? 1ull << spi : 0; bool skip_first = true; + bool new_marks = false; int i, err; if (!env->allow_ptr_leaks) @@ -1676,18 +1678,43 @@ static int mark_chain_precision(struct bpf_verifier_env *env, int regno) return 0; func = st->frame[st->curframe]; - reg = &func->regs[regno]; - if (reg->type != SCALAR_VALUE) { - WARN_ONCE(1, "backtracing misuse"); - return -EFAULT; + if (regno >= 0) { + reg = &func->regs[regno]; + if (reg->type != SCALAR_VALUE) { + WARN_ONCE(1, "backtracing misuse"); + return -EFAULT; + } + if (!reg->precise) + new_marks = true; + else + reg_mask = 0; + reg->precise = true; } - if (reg->precise) - return 0; - func->regs[regno].precise = true; + while (spi >= 0) { + if (func->stack[spi].slot_type[0] != STACK_SPILL) { + stack_mask = 0; + break; + } + reg = &func->stack[spi].spilled_ptr; + if (reg->type != SCALAR_VALUE) { + stack_mask = 0; + break; + } + if (!reg->precise) + new_marks = true; + else + stack_mask = 0; + reg->precise = true; + break; + } + + if (!new_marks) + return 0; + if (!reg_mask && !stack_mask) + return 0; for (;;) { DECLARE_BITMAP(mask, 64); - bool new_marks = false; u32 history = st->jmp_history_cnt; if (env->log.level & BPF_LOG_LEVEL) @@ -1730,12 +1757,15 @@ static int mark_chain_precision(struct bpf_verifier_env *env, int regno) if (!st) break; + new_marks = false; func = st->frame[st->curframe]; bitmap_from_u64(mask, reg_mask); for_each_set_bit(i, mask, 32) { reg = &func->regs[i]; - if (reg->type != SCALAR_VALUE) + if (reg->type != SCALAR_VALUE) { + reg_mask &= ~(1u << i); continue; + } if (!reg->precise) new_marks = true; reg->precise = true; @@ -1756,11 +1786,15 @@ static int mark_chain_precision(struct bpf_verifier_env *env, int regno) return -EFAULT; } - if (func->stack[i].slot_type[0] != STACK_SPILL) + if (func->stack[i].slot_type[0] != STACK_SPILL) { + stack_mask &= ~(1ull << i); continue; + } reg = &func->stack[i].spilled_ptr; - if (reg->type != SCALAR_VALUE) + if (reg->type != SCALAR_VALUE) { + stack_mask &= ~(1ull << i); continue; + } if (!reg->precise) new_marks = true; reg->precise = true; @@ -1772,6 +1806,8 @@ static int mark_chain_precision(struct bpf_verifier_env *env, int regno) reg_mask, stack_mask); } + if (!reg_mask && !stack_mask) + break; if (!new_marks) break; @@ -1781,6 +1817,15 @@ static int mark_chain_precision(struct bpf_verifier_env *env, int regno) return 0; } +static int mark_chain_precision(struct bpf_verifier_env *env, int regno) +{ + return __mark_chain_precision(env, regno, -1); +} + +static int mark_chain_precision_stack(struct bpf_verifier_env *env, int spi) +{ + return __mark_chain_precision(env, -1, spi); +} static bool is_spillable_regtype(enum bpf_reg_type type) { @@ -7111,6 +7156,46 @@ static int propagate_liveness(struct bpf_verifier_env *env, return 0; } +/* find precise scalars in the previous equivalent state and + * propagate them into the current state + */ +static int propagate_precision(struct bpf_verifier_env *env, + const struct bpf_verifier_state *old) +{ + struct bpf_reg_state *state_reg; + struct bpf_func_state *state; + int i, err = 0; + + state = old->frame[old->curframe]; + state_reg = state->regs; + for (i = 0; i < BPF_REG_FP; i++, state_reg++) { + if (state_reg->type != SCALAR_VALUE || + !state_reg->precise) + continue; + if (env->log.level & BPF_LOG_LEVEL2) + verbose(env, "propagating r%d\n", i); + err = mark_chain_precision(env, i); + if (err < 0) + return err; + } + + for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { + if (state->stack[i].slot_type[0] != STACK_SPILL) + continue; + state_reg = &state->stack[i].spilled_ptr; + if (state_reg->type != SCALAR_VALUE || + !state_reg->precise) + continue; + if (env->log.level & BPF_LOG_LEVEL2) + verbose(env, "propagating fp%d\n", + (-i - 1) * BPF_REG_SIZE); + err = mark_chain_precision_stack(env, i); + if (err < 0) + return err; + } + return 0; +} + static bool states_maybe_looping(struct bpf_verifier_state *old, struct bpf_verifier_state *cur) { @@ -7203,6 +7288,14 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) * this state and will pop a new one. */ err = propagate_liveness(env, &sl->state, cur); + + /* if previous state reached the exit with precision and + * current state is equivalent to it (except precsion marks) + * the precision needs to be propagated back in + * the current state. + */ + err = err ? : push_jmp_history(env, cur); + err = err ? : propagate_precision(env, &sl->state); if (err) return err; return 1; -- cgit v1.2.3 From 1be51474f99bcfdecef3f34b9a9a8cf4393fd8f9 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 12 Jun 2019 16:43:14 +0200 Subject: swiotlb: no need to check return value of debugfs_create functions When calling debugfs functions, there is no need to ever check the return value. The function can work or not, but the code logic should never do something different based on this. Cc: Konrad Rzeszutek Wilk Cc: Christoph Hellwig Cc: Marek Szyprowski Cc: Robin Murphy Cc: iommu@lists.linux-foundation.org Signed-off-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20190612144314.GA16803@kroah.com Signed-off-by: Greg Kroah-Hartman --- kernel/dma/swiotlb.c | 25 ++++--------------------- 1 file changed, 4 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 13f0cb080a4d..62fa5a82a065 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -696,29 +696,12 @@ bool is_swiotlb_active(void) static int __init swiotlb_create_debugfs(void) { - struct dentry *d_swiotlb_usage; - struct dentry *ent; - - d_swiotlb_usage = debugfs_create_dir("swiotlb", NULL); - - if (!d_swiotlb_usage) - return -ENOMEM; - - ent = debugfs_create_ulong("io_tlb_nslabs", 0400, - d_swiotlb_usage, &io_tlb_nslabs); - if (!ent) - goto fail; - - ent = debugfs_create_ulong("io_tlb_used", 0400, - d_swiotlb_usage, &io_tlb_used); - if (!ent) - goto fail; + struct dentry *root; + root = debugfs_create_dir("swiotlb", NULL); + debugfs_create_ulong("io_tlb_nslabs", 0400, root, &io_tlb_nslabs); + debugfs_create_ulong("io_tlb_used", 0400, root, &io_tlb_used); return 0; - -fail: - debugfs_remove_recursive(d_swiotlb_usage); - return -ENOMEM; } late_initcall(swiotlb_create_debugfs); -- cgit v1.2.3 From c09cb1293523dd786ae54a12fd88001542cba2f6 Mon Sep 17 00:00:00 2001 From: Shijith Thotton Date: Fri, 5 Jul 2019 07:56:20 +0000 Subject: genirq: Update irq stats from NMI handlers The NMI handlers handle_percpu_devid_fasteoi_nmi() and handle_fasteoi_nmi() do not update the interrupt counts. Due to that the NMI interrupt count does not show up correctly in /proc/interrupts. Add the statistics and treat the NMI handlers in the same way as per cpu interrupts and prevent them from updating irq_desc::tot_count as this might be corrupted due to concurrency. [ tglx: Massaged changelog ] Fixes: 2dcf1fbcad35 ("genirq: Provide NMI handlers") Signed-off-by: Shijith Thotton Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/1562313336-11888-1-git-send-email-sthotton@marvell.com --- kernel/irq/chip.c | 4 ++++ kernel/irq/irqdesc.c | 8 +++++++- 2 files changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 29d6c7d070b4..04c850fb70cb 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -748,6 +748,8 @@ void handle_fasteoi_nmi(struct irq_desc *desc) unsigned int irq = irq_desc_get_irq(desc); irqreturn_t res; + __kstat_incr_irqs_this_cpu(desc); + trace_irq_handler_entry(irq, action); /* * NMIs cannot be shared, there is only one action. @@ -962,6 +964,8 @@ void handle_percpu_devid_fasteoi_nmi(struct irq_desc *desc) unsigned int irq = irq_desc_get_irq(desc); irqreturn_t res; + __kstat_incr_irqs_this_cpu(desc); + trace_irq_handler_entry(irq, action); res = action->handler(irq, raw_cpu_ptr(action->percpu_dev_id)); trace_irq_handler_exit(irq, action, res); diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index c52b737ab8e3..9149dde5a7b0 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -946,6 +946,11 @@ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) *per_cpu_ptr(desc->kstat_irqs, cpu) : 0; } +static bool irq_is_nmi(struct irq_desc *desc) +{ + return desc->istate & IRQS_NMI; +} + /** * kstat_irqs - Get the statistics for an interrupt * @irq: The interrupt number @@ -963,7 +968,8 @@ unsigned int kstat_irqs(unsigned int irq) if (!desc || !desc->kstat_irqs) return 0; if (!irq_settings_is_per_cpu_devid(desc) && - !irq_settings_is_per_cpu(desc)) + !irq_settings_is_per_cpu(desc) && + !irq_is_nmi(desc)) return desc->tot_count; for_each_possible_cpu(cpu) -- cgit v1.2.3 From 3a1d24ca9573fbc74a3d32c972c333b161e0e9dc Mon Sep 17 00:00:00 2001 From: Zenghui Yu Date: Sat, 6 Jul 2019 04:41:12 +0000 Subject: irq/irqdomain: Fix comment typo Fix typo in the comment on top of __irq_domain_add(). Signed-off-by: Zenghui Yu Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/1562388072-23492-1-git-send-email-yuzenghui@huawei.com --- kernel/irq/irqdomain.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index e7d17cc3a3d7..3078d0e48bba 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -123,7 +123,7 @@ EXPORT_SYMBOL_GPL(irq_domain_free_fwnode); * @ops: domain callbacks * @host_data: Controller private data pointer * - * Allocates and initialize and irq_domain structure. + * Allocates and initializes an irq_domain structure. * Returns pointer to IRQ domain, or NULL on failure. */ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size, -- cgit v1.2.3 From 9176ab1b848059a0cd9caf39f0cebaa1b7ec5ec2 Mon Sep 17 00:00:00 2001 From: zhengbin Date: Sun, 7 Jul 2019 08:51:41 +0800 Subject: time: Validate user input in compat_settimeofday() The user value is validated after converting the timeval to a timespec, but for a wide range of negative tv_usec values the multiplication overflow turns them in positive numbers. So the 'validated later' is not catching the invalid input. Signed-off-by: zhengbin Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/1562460701-113301-1-git-send-email-zhengbin13@huawei.com --- kernel/time/time.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/time/time.c b/kernel/time/time.c index 7f7d6914ddd5..5c54ca632d08 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -251,6 +251,10 @@ COMPAT_SYSCALL_DEFINE2(settimeofday, struct old_timeval32 __user *, tv, if (tv) { if (compat_get_timeval(&user_tv, tv)) return -EFAULT; + + if (!timeval_valid(&user_tv)) + return -EINVAL; + new_ts.tv_sec = user_tv.tv_sec; new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC; } -- cgit v1.2.3 From 6705fea0c799a4efb9a9ce2968a2f7a570e33dc2 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Wed, 3 Jul 2019 16:26:30 +0800 Subject: bpf: cgroup: Fix build error without CONFIG_NET If CONFIG_NET is not set and CONFIG_CGROUP_BPF=y, gcc building fails: kernel/bpf/cgroup.o: In function `cg_sockopt_func_proto': cgroup.c:(.text+0x237e): undefined reference to `bpf_sk_storage_get_proto' cgroup.c:(.text+0x2394): undefined reference to `bpf_sk_storage_delete_proto' kernel/bpf/cgroup.o: In function `__cgroup_bpf_run_filter_getsockopt': (.text+0x2a1f): undefined reference to `lock_sock_nested' (.text+0x2ca2): undefined reference to `release_sock' kernel/bpf/cgroup.o: In function `__cgroup_bpf_run_filter_setsockopt': (.text+0x3006): undefined reference to `lock_sock_nested' (.text+0x32bb): undefined reference to `release_sock' Reported-by: Hulk Robot Suggested-by: Stanislav Fomichev Fixes: 0d01da6afc54 ("bpf: implement getsockopt and setsockopt hooks") Signed-off-by: YueHaibing Acked-by: Yonghong Song Signed-off-by: Daniel Borkmann --- kernel/bpf/cgroup.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 76fa0076f20d..0a00eaca6fae 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -939,6 +939,7 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sysctl); +#ifdef CONFIG_NET static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp, enum bpf_attach_type attach_type) { @@ -1120,6 +1121,7 @@ out: return ret; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_getsockopt); +#endif static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp, size_t *lenp) @@ -1386,10 +1388,12 @@ static const struct bpf_func_proto * cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { +#ifdef CONFIG_NET case BPF_FUNC_sk_storage_get: return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; +#endif #ifdef CONFIG_INET case BPF_FUNC_tcp_sock: return &bpf_tcp_sock_proto; -- cgit v1.2.3 From b60b7c2ea9b7f854d457fefd592c77f621a86580 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 1 Jul 2019 09:58:43 +0900 Subject: kheaders: remove meaningless -R option of 'ls' The -R option of 'ls' is supposed to be used for directories. -R, --recursive list subdirectories recursively Since 'find ... -type f' only matches to regular files, we do not expect directories passed to the 'ls' command here. Giving -R is harmless at least, but unneeded. Signed-off-by: Masahiro Yamada Reviewed-by: Joel Fernandes (Google) --- kernel/gen_kheaders.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh index 9a34e1d9bd7f..86a666f5cb17 100755 --- a/kernel/gen_kheaders.sh +++ b/kernel/gen_kheaders.sh @@ -33,8 +33,8 @@ arch/$SRCARCH/include/ # Uncomment it for debugging. # if [ ! -f /tmp/iter ]; then iter=1; echo 1 > /tmp/iter; # else iter=$(($(cat /tmp/iter) + 1)); echo $iter > /tmp/iter; fi -# find $src_file_list -type f | xargs ls -lR > /tmp/src-ls-$iter -# find $obj_file_list -type f | xargs ls -lR > /tmp/obj-ls-$iter +# find $src_file_list -type f | xargs ls -l > /tmp/src-ls-$iter +# find $obj_file_list -type f | xargs ls -l > /tmp/obj-ls-$iter # include/generated/compile.h is ignored because it is touched even when none # of the source files changed. This causes pointless regeneration, so let us @@ -46,7 +46,7 @@ src_files_md5="$(find $src_file_list -type f | grep -v "include/config/auto.conf" | grep -v "include/config/auto.conf.cmd" | grep -v "include/config/tristate.conf" | - xargs ls -lR | md5sum | cut -d ' ' -f1)" + xargs ls -l | md5sum | cut -d ' ' -f1)" popd > /dev/null obj_files_md5="$(find $obj_file_list -type f | grep -v "include/generated/compile.h" | @@ -54,7 +54,7 @@ obj_files_md5="$(find $obj_file_list -type f | grep -v "include/config/auto.conf" | grep -v "include/config/auto.conf.cmd" | grep -v "include/config/tristate.conf" | - xargs ls -lR | md5sum | cut -d ' ' -f1)" + xargs ls -l | md5sum | cut -d ' ' -f1)" if [ -f $tarfile ]; then tarfile_md5="$(md5sum $tarfile | cut -d ' ' -f1)"; fi if [ -f kernel/kheaders.md5 ] && -- cgit v1.2.3 From 7199ff7d74003b5aad1e6328bf6128cd8ceea735 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 1 Jul 2019 09:58:44 +0900 Subject: kheaders: include only headers into kheaders_data.tar.xz Currently, kheaders_data.tar.xz contains some build scripts as well as headers. None of them is needed in the header archive. For ARCH=x86, this commit excludes the following from the archive: arch/x86/include/asm/Kbuild arch/x86/include/uapi/asm/Kbuild include/asm-generic/Kbuild include/config/auto.conf include/config/kernel.release include/config/tristate.conf include/uapi/asm-generic/Kbuild include/uapi/Kbuild kernel/gen_kheaders.sh This change is actually motivated for the planned header compile-testing because it will generate more build artifacts, which should not be included in the archive. Signed-off-by: Masahiro Yamada Reviewed-by: Joel Fernandes (Google) --- kernel/gen_kheaders.sh | 47 ++++++++++++++++------------------------------- 1 file changed, 16 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh index 86a666f5cb17..9ff449888d9c 100755 --- a/kernel/gen_kheaders.sh +++ b/kernel/gen_kheaders.sh @@ -4,24 +4,12 @@ # This script generates an archive consisting of kernel headers # for CONFIG_IKHEADERS. set -e -spath="$(dirname "$(readlink -f "$0")")" -kroot="$spath/.." +sfile="$(readlink -f "$0")" outdir="$(pwd)" tarfile=$1 cpio_dir=$outdir/$tarfile.tmp -# Script filename relative to the kernel source root -# We add it to the archive because it is small and any changes -# to this script will also cause a rebuild of the archive. -sfile="$(realpath --relative-to $kroot "$(readlink -f "$0")")" - -src_file_list=" -include/ -arch/$SRCARCH/include/ -$sfile -" - -obj_file_list=" +dir_list=" include/ arch/$SRCARCH/include/ " @@ -33,33 +21,29 @@ arch/$SRCARCH/include/ # Uncomment it for debugging. # if [ ! -f /tmp/iter ]; then iter=1; echo 1 > /tmp/iter; # else iter=$(($(cat /tmp/iter) + 1)); echo $iter > /tmp/iter; fi -# find $src_file_list -type f | xargs ls -l > /tmp/src-ls-$iter -# find $obj_file_list -type f | xargs ls -l > /tmp/obj-ls-$iter +# find $src_file_list -name "*.h" | xargs ls -l > /tmp/src-ls-$iter +# find $obj_file_list -name "*.h" | xargs ls -l > /tmp/obj-ls-$iter # include/generated/compile.h is ignored because it is touched even when none # of the source files changed. This causes pointless regeneration, so let us # ignore them for md5 calculation. -pushd $kroot > /dev/null -src_files_md5="$(find $src_file_list -type f | +pushd $srctree > /dev/null +src_files_md5="$(find $dir_list -name "*.h" | grep -v "include/generated/compile.h" | grep -v "include/generated/autoconf.h" | - grep -v "include/config/auto.conf" | - grep -v "include/config/auto.conf.cmd" | - grep -v "include/config/tristate.conf" | xargs ls -l | md5sum | cut -d ' ' -f1)" popd > /dev/null -obj_files_md5="$(find $obj_file_list -type f | +obj_files_md5="$(find $dir_list -name "*.h" | grep -v "include/generated/compile.h" | grep -v "include/generated/autoconf.h" | - grep -v "include/config/auto.conf" | - grep -v "include/config/auto.conf.cmd" | - grep -v "include/config/tristate.conf" | xargs ls -l | md5sum | cut -d ' ' -f1)" - +# Any changes to this script will also cause a rebuild of the archive. +this_file_md5="$(ls -l $sfile | md5sum | cut -d ' ' -f1)" if [ -f $tarfile ]; then tarfile_md5="$(md5sum $tarfile | cut -d ' ' -f1)"; fi if [ -f kernel/kheaders.md5 ] && [ "$(cat kernel/kheaders.md5|head -1)" == "$src_files_md5" ] && [ "$(cat kernel/kheaders.md5|head -2|tail -1)" == "$obj_files_md5" ] && + [ "$(cat kernel/kheaders.md5|head -3|tail -1)" == "$this_file_md5" ] && [ "$(cat kernel/kheaders.md5|tail -1)" == "$tarfile_md5" ]; then exit fi @@ -71,16 +55,16 @@ fi rm -rf $cpio_dir mkdir $cpio_dir -pushd $kroot > /dev/null -for f in $src_file_list; - do find "$f" ! -name "*.cmd" ! -name ".*"; +pushd $srctree > /dev/null +for f in $dir_list; + do find "$f" -name "*.h"; done | cpio --quiet -pd $cpio_dir popd > /dev/null # The second CPIO can complain if files already exist which can # happen with out of tree builds. Just silence CPIO for now. -for f in $obj_file_list; - do find "$f" ! -name "*.cmd" ! -name ".*"; +for f in $dir_list; + do find "$f" -name "*.h"; done | cpio --quiet -pd $cpio_dir >/dev/null 2>&1 # Remove comments except SDPX lines @@ -91,6 +75,7 @@ tar -Jcf $tarfile -C $cpio_dir/ . > /dev/null echo "$src_files_md5" > kernel/kheaders.md5 echo "$obj_files_md5" >> kernel/kheaders.md5 +echo "$this_file_md5" >> kernel/kheaders.md5 echo "$(md5sum $tarfile | cut -d ' ' -f1)" >> kernel/kheaders.md5 rm -rf $cpio_dir -- cgit v1.2.3 From e55a73251da335873a6e87d68fb17e5aabb8978e Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 27 Jun 2019 20:50:47 -0500 Subject: bpf: Fix ORC unwinding in non-JIT BPF code Objtool previously ignored ___bpf_prog_run() because it didn't understand the jump table. This resulted in the ORC unwinder not being able to unwind through non-JIT BPF code. Now that objtool knows how to read jump tables, remove the whitelist and annotate the jump table so objtool can recognize it. Also add an additional "const" to the jump table definition to clarify that the text pointers are constant. Otherwise GCC sets the section writable flag and the assembler spits out warnings. Fixes: d15d356887e7 ("perf/x86: Make perf callchains work without CONFIG_FRAME_POINTER") Reported-by: Song Liu Signed-off-by: Josh Poimboeuf Signed-off-by: Thomas Gleixner Acked-by: Alexei Starovoitov Cc: Peter Zijlstra Cc: Kairui Song Cc: Steven Rostedt Cc: Borislav Petkov Cc: Daniel Borkmann Link: https://lkml.kernel.org/r/881939122b88f32be4c374d248c09d7527a87e35.1561685471.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- kernel/bpf/core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 080e2bb644cc..1e12ac382a90 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1299,7 +1299,7 @@ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack) { #define BPF_INSN_2_LBL(x, y) [BPF_##x | BPF_##y] = &&x##_##y #define BPF_INSN_3_LBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = &&x##_##y##_##z - static const void *jumptable[256] = { + static const void * const jumptable[256] __annotate_jump_table = { [0 ... 255] = &&default_label, /* Now overwrite non-defaults ... */ BPF_INSN_MAP(BPF_INSN_2_LBL, BPF_INSN_3_LBL), @@ -1558,7 +1558,6 @@ out: BUG_ON(1); return 0; } -STACK_FRAME_NON_STANDARD(___bpf_prog_run); /* jump table */ #define PROG_NAME(stack_size) __bpf_prog_run##stack_size #define DEFINE_BPF_PROG_RUN(stack_size) \ -- cgit v1.2.3 From 0df1c9868c3a1916198ee09c323ca5932a0b8a11 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 10 Jul 2019 15:01:53 +0200 Subject: timekeeping/vsyscall: Use __iter_div_u64_rem() On 32-bit x86 when building with clang-9, the 'division' loop gets turned back into an inefficient division that causes a link error: kernel/time/vsyscall.o: In function `update_vsyscall': vsyscall.c:(.text+0xe3): undefined reference to `__udivdi3' Use the existing __iter_div_u64_rem() function which is used to address the same issue in other places. Fixes: 44f57d788e7d ("timekeeping: Provide a generic update_vsyscall() implementation") Signed-off-by: Arnd Bergmann Signed-off-by: Thomas Gleixner Reviewed-by: Nathan Chancellor Tested-by: Nathan Chancellor Link: https://lkml.kernel.org/r/20190710130206.1670830-1-arnd@arndb.de --- kernel/time/vsyscall.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c index a80893180826..8cf3596a4ce6 100644 --- a/kernel/time/vsyscall.c +++ b/kernel/time/vsyscall.c @@ -104,11 +104,7 @@ void update_vsyscall(struct timekeeper *tk) vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; nsec = nsec + tk->wall_to_monotonic.tv_nsec; - while (nsec >= NSEC_PER_SEC) { - nsec = nsec - NSEC_PER_SEC; - vdso_ts->sec++; - } - vdso_ts->nsec = nsec; + vdso_ts->sec += __iter_div_u64_rem(nsec, NSEC_PER_SEC, &vdso_ts->nsec); if (__arch_use_vsyscall(vdata)) update_vdso_data(vdata, tk); -- cgit v1.2.3 From b3b50f05dc501cc2cd90349a7bbfd932af0ceb31 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 8 Jul 2019 20:32:44 -0700 Subject: bpf: fix precision bit propagation for BPF_ST instructions When backtracking instructions to propagate precision bit for registers and stack slots, one class of instructions (BPF_ST) weren't handled causing extra stack slots to be propagated into parent state. Parent state might not have that much stack allocated, though, which causes warning on invalid stack slot usage. This patch adds handling of BPF_ST instructions: BPF_MEM | | BPF_ST: *(size *) (dst_reg + off) = imm32 Reported-by: syzbot+4da3ff23081bafe74fc2@syzkaller.appspotmail.com Fixes: b5dc0163d8fd ("bpf: precise scalar_value tracking") Cc: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a2e763703c30..def87e9cc9c7 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1519,9 +1519,9 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, return -EFAULT; } *stack_mask |= 1ull << spi; - } else if (class == BPF_STX) { + } else if (class == BPF_STX || class == BPF_ST) { if (*reg_mask & dreg) - /* stx shouldn't be using _scalar_ dst_reg + /* stx & st shouldn't be using _scalar_ dst_reg * to access memory. It means backtracking * encountered a case of pointer subtraction. */ @@ -1540,7 +1540,8 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, if (!(*stack_mask & (1ull << spi))) return 0; *stack_mask &= ~(1ull << spi); - *reg_mask |= sreg; + if (class == BPF_STX) + *reg_mask |= sreg; } else if (class == BPF_JMP || class == BPF_JMP32) { if (opcode == BPF_CALL) { if (insn->src_reg == BPF_PSEUDO_CALL) @@ -1569,10 +1570,6 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, if (mode == BPF_IND || mode == BPF_ABS) /* to be analyzed */ return -ENOTSUPP; - } else if (class == BPF_ST) { - if (*reg_mask & dreg) - /* likely pointer subtraction */ - return -ENOTSUPP; } return 0; } -- cgit v1.2.3 From ed4ed4043a127c5ca9a35339bb614693be9037a3 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 11 Jul 2019 11:22:33 -0500 Subject: bpf: verifier: avoid fall-through warnings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In preparation to enabling -Wimplicit-fallthrough, this patch silences the following warning: kernel/bpf/verifier.c: In function ‘check_return_code’: kernel/bpf/verifier.c:6106:6: warning: this statement may fall through [-Wimplicit-fallthrough=] if (env->prog->expected_attach_type == BPF_CGROUP_UDP4_RECVMSG || ^ kernel/bpf/verifier.c:6109:2: note: here case BPF_PROG_TYPE_CGROUP_SKB: ^~~~ Warning level 3 was used: -Wimplicit-fallthrough=3 Notice that is much clearer to explicitly add breaks in each case statement (that actually contains some code), rather than letting the code to fall through. This patch is part of the ongoing efforts to enable -Wimplicit-fallthrough. Signed-off-by: Gustavo A. R. Silva Acked-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index def87e9cc9c7..5900cbb966b1 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6103,11 +6103,13 @@ static int check_return_code(struct bpf_verifier_env *env) if (env->prog->expected_attach_type == BPF_CGROUP_UDP4_RECVMSG || env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG) range = tnum_range(1, 1); + break; case BPF_PROG_TYPE_CGROUP_SKB: if (env->prog->expected_attach_type == BPF_CGROUP_INET_EGRESS) { range = tnum_range(0, 3); enforce_attach_type_range = tnum_range(2, 3); } + break; case BPF_PROG_TYPE_CGROUP_SOCK: case BPF_PROG_TYPE_SOCK_OPS: case BPF_PROG_TYPE_CGROUP_DEVICE: -- cgit v1.2.3 From 9bd3bb6703d8c0a5fb8aec8e3287bd55b7341dcd Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 11 Jul 2019 20:52:08 -0700 Subject: mm/nvdimm: add is_ioremap_addr and use that to check ioremap address Architectures like powerpc use different address range to map ioremap and vmalloc range. The memunmap() check used by the nvdimm layer was wrongly using is_vmalloc_addr() to check for ioremap range which fails for ppc64. This result in ppc64 not freeing the ioremap mapping. The side effect of this is an unbind failure during module unload with papr_scm nvdimm driver Link: http://lkml.kernel.org/r/20190701134038.14165-1-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Fixes: b5beae5e224f ("powerpc/pseries: Add driver for PAPR SCM regions") Cc: Dan Williams Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/iomem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/iomem.c b/kernel/iomem.c index 93c264444510..62c92e43aa0d 100644 --- a/kernel/iomem.c +++ b/kernel/iomem.c @@ -121,7 +121,7 @@ EXPORT_SYMBOL(memremap); void memunmap(void *addr) { - if (is_vmalloc_addr(addr)) + if (is_ioremap_addr(addr)) iounmap((void __iomem *) addr); } EXPORT_SYMBOL(memunmap); -- cgit v1.2.3 From 1cf8dfe8a661f0462925df943140e9f6d1ea5233 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 13 Jul 2019 11:21:25 +0200 Subject: perf/core: Fix race between close() and fork() Syzcaller reported the following Use-after-Free bug: close() clone() copy_process() perf_event_init_task() perf_event_init_context() mutex_lock(parent_ctx->mutex) inherit_task_group() inherit_group() inherit_event() mutex_lock(event->child_mutex) // expose event on child list list_add_tail() mutex_unlock(event->child_mutex) mutex_unlock(parent_ctx->mutex) ... goto bad_fork_* bad_fork_cleanup_perf: perf_event_free_task() perf_release() perf_event_release_kernel() list_for_each_entry() mutex_lock(ctx->mutex) mutex_lock(event->child_mutex) // event is from the failing inherit // on the other CPU perf_remove_from_context() list_move() mutex_unlock(event->child_mutex) mutex_unlock(ctx->mutex) mutex_lock(ctx->mutex) list_for_each_entry_safe() // event already stolen mutex_unlock(ctx->mutex) delayed_free_task() free_task() list_for_each_entry_safe() list_del() free_event() _free_event() // and so event->hw.target // is the already freed failed clone() if (event->hw.target) put_task_struct(event->hw.target) // WHOOPSIE, already quite dead Which puts the lie to the the comment on perf_event_free_task(): 'unexposed, unused context' not so much. Which is a 'fun' confluence of fail; copy_process() doing an unconditional free_task() and not respecting refcounts, and perf having creative locking. In particular: 82d94856fa22 ("perf/core: Fix lock inversion between perf,trace,cpuhp") seems to have overlooked this 'fun' parade. Solve it by using the fact that detached events still have a reference count on their (previous) context. With this perf_event_free_task() can detect when events have escaped and wait for their destruction. Debugged-by: Alexander Shishkin Reported-by: syzbot+a24c397a29ad22d86c98@syzkaller.appspotmail.com Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mark Rutland Cc: Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Fixes: 82d94856fa22 ("perf/core: Fix lock inversion between perf,trace,cpuhp") Signed-off-by: Ingo Molnar --- kernel/events/core.c | 49 +++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 41 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 785d708f8553..5dd19bedbf64 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4465,12 +4465,20 @@ static void _free_event(struct perf_event *event) if (event->destroy) event->destroy(event); - if (event->ctx) - put_ctx(event->ctx); - + /* + * Must be after ->destroy(), due to uprobe_perf_close() using + * hw.target. + */ if (event->hw.target) put_task_struct(event->hw.target); + /* + * perf_event_free_task() relies on put_ctx() being 'last', in particular + * all task references must be cleaned up. + */ + if (event->ctx) + put_ctx(event->ctx); + exclusive_event_destroy(event); module_put(event->pmu->module); @@ -4650,8 +4658,17 @@ again: mutex_unlock(&event->child_mutex); list_for_each_entry_safe(child, tmp, &free_list, child_list) { + void *var = &child->ctx->refcount; + list_del(&child->child_list); free_event(child); + + /* + * Wake any perf_event_free_task() waiting for this event to be + * freed. + */ + smp_mb(); /* pairs with wait_var_event() */ + wake_up_var(var); } no_ctx: @@ -11527,11 +11544,11 @@ static void perf_free_event(struct perf_event *event, } /* - * Free an unexposed, unused context as created by inheritance by - * perf_event_init_task below, used by fork() in case of fail. + * Free a context as created by inheritance by perf_event_init_task() below, + * used by fork() in case of fail. * - * Not all locks are strictly required, but take them anyway to be nice and - * help out with the lockdep assertions. + * Even though the task has never lived, the context and events have been + * exposed through the child_list, so we must take care tearing it all down. */ void perf_event_free_task(struct task_struct *task) { @@ -11561,7 +11578,23 @@ void perf_event_free_task(struct task_struct *task) perf_free_event(event, ctx); mutex_unlock(&ctx->mutex); - put_ctx(ctx); + + /* + * perf_event_release_kernel() could've stolen some of our + * child events and still have them on its free_list. In that + * case we must wait for these events to have been freed (in + * particular all their references to this task must've been + * dropped). + * + * Without this copy_process() will unconditionally free this + * task (irrespective of its reference count) and + * _free_event()'s put_task_struct(event->hw.target) will be a + * use-after-free. + * + * Wait for all events to drop their context reference. + */ + wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1); + put_ctx(ctx); /* must be last */ } } -- cgit v1.2.3 From 8a58ddae23796c733c5dfbd717538d89d036c5bd Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Mon, 1 Jul 2019 14:07:55 +0300 Subject: perf/core: Fix exclusive events' grouping So far, we tried to disallow grouping exclusive events for the fear of complications they would cause with moving between contexts. Specifically, moving a software group to a hardware context would violate the exclusivity rules if both groups contain matching exclusive events. This attempt was, however, unsuccessful: the check that we have in the perf_event_open() syscall is both wrong (looks at wrong PMU) and insufficient (group leader may still be exclusive), as can be illustrated by running: $ perf record -e '{intel_pt//,cycles}' uname $ perf record -e '{cycles,intel_pt//}' uname ultimately successfully. Furthermore, we are completely free to trigger the exclusivity violation by: perf -e '{cycles,intel_pt//}' -e '{intel_pt//,instructions}' even though the helpful perf record will not allow that, the ABI will. The warning later in the perf_event_open() path will also not trigger, because it's also wrong. Fix all this by validating the original group before moving, getting rid of broken safeguards and placing a useful one to perf_install_in_context(). Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: mathieu.poirier@linaro.org Cc: will.deacon@arm.com Fixes: bed5b25ad9c8a ("perf: Add a pmu capability for "exclusive" events") Link: https://lkml.kernel.org/r/20190701110755.24646-1-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 5dd19bedbf64..eea9d52b010c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2553,6 +2553,9 @@ unlock: return ret; } +static bool exclusive_event_installable(struct perf_event *event, + struct perf_event_context *ctx); + /* * Attach a performance event to a context. * @@ -2567,6 +2570,8 @@ perf_install_in_context(struct perf_event_context *ctx, lockdep_assert_held(&ctx->mutex); + WARN_ON_ONCE(!exclusive_event_installable(event, ctx)); + if (event->cpu != -1) event->cpu = cpu; @@ -4360,7 +4365,7 @@ static int exclusive_event_init(struct perf_event *event) { struct pmu *pmu = event->pmu; - if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE)) + if (!is_exclusive_pmu(pmu)) return 0; /* @@ -4391,7 +4396,7 @@ static void exclusive_event_destroy(struct perf_event *event) { struct pmu *pmu = event->pmu; - if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE)) + if (!is_exclusive_pmu(pmu)) return; /* see comment in exclusive_event_init() */ @@ -4411,14 +4416,15 @@ static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2) return false; } -/* Called under the same ctx::mutex as perf_install_in_context() */ static bool exclusive_event_installable(struct perf_event *event, struct perf_event_context *ctx) { struct perf_event *iter_event; struct pmu *pmu = event->pmu; - if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE)) + lockdep_assert_held(&ctx->mutex); + + if (!is_exclusive_pmu(pmu)) return true; list_for_each_entry(iter_event, &ctx->event_list, event_entry) { @@ -10947,11 +10953,6 @@ SYSCALL_DEFINE5(perf_event_open, goto err_alloc; } - if ((pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && group_leader) { - err = -EBUSY; - goto err_context; - } - /* * Look up the group leader (we will attach this event to it): */ @@ -11039,6 +11040,18 @@ SYSCALL_DEFINE5(perf_event_open, move_group = 0; } } + + /* + * Failure to create exclusive events returns -EBUSY. + */ + err = -EBUSY; + if (!exclusive_event_installable(group_leader, ctx)) + goto err_locked; + + for_each_sibling_event(sibling, group_leader) { + if (!exclusive_event_installable(sibling, ctx)) + goto err_locked; + } } else { mutex_lock(&ctx->mutex); } @@ -11075,9 +11088,6 @@ SYSCALL_DEFINE5(perf_event_open, * because we need to serialize with concurrent event creation. */ if (!exclusive_event_installable(event, ctx)) { - /* exclusive and group stuff are assumed mutually exclusive */ - WARN_ON_ONCE(move_group); - err = -EBUSY; goto err_locked; } -- cgit v1.2.3 From e3d85487fba42206024bc3ed32e4b581c7cb46db Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 10 Jul 2019 12:57:36 +0200 Subject: sched/core: Fix preempt warning in ttwu John reported a DEBUG_PREEMPT warning caused by commit: aacedf26fb76 ("sched/core: Optimize try_to_wake_up() for local wakeups") I overlooked that ttwu_stat() requires preemption disabled. Reported-by: John Stultz Tested-by: John Stultz Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: aacedf26fb76 ("sched/core: Optimize try_to_wake_up() for local wakeups") Link: https://lkml.kernel.org/r/20190710105736.GK3402@hirez.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fa43ce3962e7..2b037f195473 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2399,6 +2399,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) unsigned long flags; int cpu, success = 0; + preempt_disable(); if (p == current) { /* * We're waking current, this means 'p->on_rq' and 'task_cpu(p) @@ -2412,7 +2413,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * it disabling IRQs (this allows not taking ->pi_lock). */ if (!(p->state & state)) - return false; + goto out; success = 1; cpu = task_cpu(p); @@ -2526,6 +2527,7 @@ unlock: out: if (success) ttwu_stat(p, cpu, wake_flags); + preempt_enable(); return success; } -- cgit v1.2.3 From 68d41d8c94a31dfb8233ab90b9baf41a2ed2da68 Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Tue, 9 Jul 2019 18:15:22 +0800 Subject: locking/lockdep: Fix lock used or unused stats error The stats variable nr_unused_locks is incremented every time a new lock class is register and decremented when the lock is first used in __lock_acquire(). And after all, it is shown and checked in lockdep_stats. However, under configurations that either CONFIG_TRACE_IRQFLAGS or CONFIG_PROVE_LOCKING is not defined: The commit: 091806515124b20 ("locking/lockdep: Consolidate lock usage bit initialization") missed marking the LOCK_USED flag at IRQ usage initialization because as mark_usage() is not called. And the commit: 886532aee3cd42d ("locking/lockdep: Move mark_lock() inside CONFIG_TRACE_IRQFLAGS && CONFIG_PROVE_LOCKING") further made mark_lock() not defined such that the LOCK_USED cannot be marked at all when the lock is first acquired. As a result, we fix this by not showing and checking the stats under such configurations for lockdep_stats. Reported-by: Qian Cai Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Cc: arnd@arndb.de Cc: frederic@kernel.org Link: https://lkml.kernel.org/r/20190709101522.9117-1-duyuyang@gmail.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep_proc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c index 9c49ec645d8b..65b6a1600c8f 100644 --- a/kernel/locking/lockdep_proc.c +++ b/kernel/locking/lockdep_proc.c @@ -210,6 +210,7 @@ static int lockdep_stats_show(struct seq_file *m, void *v) nr_hardirq_read_safe = 0, nr_hardirq_read_unsafe = 0, sum_forward_deps = 0; +#ifdef CONFIG_PROVE_LOCKING list_for_each_entry(class, &all_lock_classes, lock_entry) { if (class->usage_mask == 0) @@ -241,12 +242,12 @@ static int lockdep_stats_show(struct seq_file *m, void *v) if (class->usage_mask & LOCKF_ENABLED_HARDIRQ_READ) nr_hardirq_read_unsafe++; -#ifdef CONFIG_PROVE_LOCKING sum_forward_deps += lockdep_count_forward_deps(class); -#endif } #ifdef CONFIG_DEBUG_LOCKDEP DEBUG_LOCKS_WARN_ON(debug_atomic_read(nr_unused_locks) != nr_unused); +#endif + #endif seq_printf(m, " lock-classes: %11lu [max: %lu]\n", nr_lock_classes, MAX_LOCKDEP_KEYS); -- cgit v1.2.3 From 028b6e8a89de9133a869bb4cd1bc72445b1ec8ca Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Sun, 14 Jul 2019 19:20:47 +0300 Subject: clone: fix CLONE_PIDFD support The introduction of clone3 syscall accidentally broke CLONE_PIDFD support in traditional clone syscall on compat x86 and those architectures that use do_fork to implement clone syscall. This bug was found by strace test suite. Link: https://strace.io/logs/strace/2019-07-12 Fixes: 7f192e3cd316 ("fork: add clone3") Bisected-and-tested-by: Anatoly Pugachev Signed-off-by: Dmitry V. Levin Link: https://lore.kernel.org/r/20190714162047.GB10389@altlinux.org Signed-off-by: Christian Brauner --- kernel/fork.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 8f3e2d97d771..ef1e05a68827 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2406,6 +2406,16 @@ long _do_fork(struct kernel_clone_args *args) return nr; } +bool legacy_clone_args_valid(const struct kernel_clone_args *kargs) +{ + /* clone(CLONE_PIDFD) uses parent_tidptr to return a pidfd */ + if ((kargs->flags & CLONE_PIDFD) && + (kargs->flags & CLONE_PARENT_SETTID)) + return false; + + return true; +} + #ifndef CONFIG_HAVE_COPY_THREAD_TLS /* For compatibility with architectures that call do_fork directly rather than * using the syscall entry points below. */ @@ -2417,6 +2427,7 @@ long do_fork(unsigned long clone_flags, { struct kernel_clone_args args = { .flags = (clone_flags & ~CSIGNAL), + .pidfd = parent_tidptr, .child_tid = child_tidptr, .parent_tid = parent_tidptr, .exit_signal = (clone_flags & CSIGNAL), @@ -2424,6 +2435,9 @@ long do_fork(unsigned long clone_flags, .stack_size = stack_size, }; + if (!legacy_clone_args_valid(&args)) + return -EINVAL; + return _do_fork(&args); } #endif @@ -2505,8 +2519,7 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, .tls = tls, }; - /* clone(CLONE_PIDFD) uses parent_tidptr to return a pidfd */ - if ((clone_flags & CLONE_PIDFD) && (clone_flags & CLONE_PARENT_SETTID)) + if (!legacy_clone_args_valid(&args)) return -EINVAL; return _do_fork(&args); -- cgit v1.2.3 From 387b14684f94483cbbb72843db406ec9a8d0d6d2 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Wed, 10 Apr 2019 08:32:41 -0300 Subject: docs: locking: convert docs to ReST and rename to *.rst Convert the locking documents to ReST and add them to the kernel development book where it belongs. Most of the stuff here is just to make Sphinx to properly parse the text file, as they're already in good shape, not requiring massive changes in order to be parsed. The conversion is actually: - add blank lines and identation in order to identify paragraphs; - fix tables markups; - add some lists markups; - mark literal blocks; - adjust title markups. At its new index.rst, let's add a :orphan: while this is not linked to the main index.rst file, in order to avoid build warnings. Signed-off-by: Mauro Carvalho Chehab Acked-by: Federico Vaga --- kernel/locking/mutex.c | 2 +- kernel/locking/rtmutex.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 0c601ae072b3..edd1c082dbf5 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -16,7 +16,7 @@ * by Steven Rostedt, based on work by Gregory Haskins, Peter Morreale * and Sven Dietrich. * - * Also see Documentation/locking/mutex-design.txt. + * Also see Documentation/locking/mutex-design.rst. */ #include #include diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 38fbf9fa7f1b..fa83d36e30c6 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -9,7 +9,7 @@ * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt * Copyright (C) 2006 Esben Nielsen * - * See Documentation/locking/rt-mutex-design.txt for details. + * See Documentation/locking/rt-mutex-design.rst for details. */ #include #include -- cgit v1.2.3 From 53b9537509654a6267c3f56b4d2e7409b9089686 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 18 Apr 2019 18:35:54 -0300 Subject: docs: sysctl: convert to ReST Rename the /proc/sys/ documentation files to ReST, using the README file as a template for an index.rst, adding the other files there via TOC markup. Despite being written on different times with different styles, try to make them somewhat coherent with a similar look and feel, ensuring that they'll look nice as both raw text file and as via the html output produced by the Sphinx build system. At its new index.rst, let's add a :orphan: while this is not linked to the main index.rst file, in order to avoid build warnings. Signed-off-by: Mauro Carvalho Chehab --- kernel/panic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 4d9f55bf7d38..e0ea74bbb41d 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -372,7 +372,7 @@ const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = { /** * print_tainted - return a string to represent the kernel taint state. * - * For individual taint flag meanings, see Documentation/sysctl/kernel.txt + * For individual taint flag meanings, see Documentation/sysctl/kernel.rst * * The string is overwritten by the next call to print_tainted(), * but is always NULL terminated. -- cgit v1.2.3 From 570432470275c3da15b85362bc1461945b9c1919 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 22 Apr 2019 16:48:00 -0300 Subject: docs: admin-guide: move sysctl directory to it The stuff under sysctl describes /sys interface from userspace point of view. So, add it to the admin-guide and remove the :orphan: from its index file. Signed-off-by: Mauro Carvalho Chehab --- kernel/panic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index e0ea74bbb41d..057540b6eee9 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -372,7 +372,7 @@ const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = { /** * print_tainted - return a string to represent the kernel taint state. * - * For individual taint flag meanings, see Documentation/sysctl/kernel.rst + * For individual taint flag meanings, see Documentation/admin-guide/sysctl/kernel.rst * * The string is overwritten by the next call to print_tainted(), * but is always NULL terminated. -- cgit v1.2.3 From da82c92f1150f66afabf78d2c85ef9ac18dc6d38 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 27 Jun 2019 13:08:35 -0300 Subject: docs: cgroup-v1: add it to the admin-guide book Those files belong to the admin guide, so add them. Signed-off-by: Mauro Carvalho Chehab --- kernel/cgroup/cpuset.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index b3b02b9c4405..863e434a6020 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -729,7 +729,7 @@ static inline int nr_cpusets(void) * load balancing domains (sched domains) as specified by that partial * partition. * - * See "What is sched_load_balance" in Documentation/cgroup-v1/cpusets.rst + * See "What is sched_load_balance" in Documentation/admin-guide/cgroup-v1/cpusets.rst * for a background explanation of this. * * Does not return errors, on the theory that the callers of this -- cgit v1.2.3 From 1acc5d5c5832da9a98b22374a8fae08ffe31b3f8 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 12 Jul 2019 10:25:55 -0700 Subject: bpf: fix BTF verifier size resolution logic BTF verifier has a size resolution bug which in some circumstances leads to invalid size resolution for, e.g., TYPEDEF modifier. This happens if we have [1] PTR -> [2] TYPEDEF -> [3] ARRAY, in which case due to being in pointer context ARRAY size won't be resolved (because for pointer it doesn't matter, so it's a sink in pointer context), but it will be permanently remembered as zero for TYPEDEF and TYPEDEF will be marked as RESOLVED. Eventually ARRAY size will be resolved correctly, but TYPEDEF resolved_size won't be updated anymore. This, subsequently, will lead to erroneous map creation failure, if that TYPEDEF is specified as either key or value, as key_size/value_size won't correspond to resolved size of TYPEDEF (kernel will believe it's zero). Note, that if BTF was ordered as [1] ARRAY <- [2] TYPEDEF <- [3] PTR, this won't be a problem, as by the time we get to TYPEDEF, ARRAY's size is already calculated and stored. This bug manifests itself in rejecting BTF-defined maps that use array typedef as a value type: typedef int array_t[16]; struct { __uint(type, BPF_MAP_TYPE_ARRAY); __type(value, array_t); /* i.e., array_t *value; */ } test_map SEC(".maps"); The fix consists on not relying on modifier's resolved_size and instead using modifier's resolved_id (type ID for "concrete" type to which modifier eventually resolves) and doing size determination for that resolved type. This allow to preserve existing "early DFS termination" logic for PTR or STRUCT_OR_ARRAY contexts, but still do correct size determination for modifier types. Fixes: eb3f595dab40 ("bpf: btf: Validate type reference") Cc: Martin KaFai Lau Signed-off-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- kernel/bpf/btf.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 546ebee39e2a..5fcc7a17eb5a 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -1073,11 +1073,18 @@ const struct btf_type *btf_type_id_size(const struct btf *btf, !btf_type_is_var(size_type))) return NULL; - size = btf->resolved_sizes[size_type_id]; size_type_id = btf->resolved_ids[size_type_id]; size_type = btf_type_by_id(btf, size_type_id); if (btf_type_nosize_or_null(size_type)) return NULL; + else if (btf_type_has_size(size_type)) + size = size_type->size; + else if (btf_type_is_array(size_type)) + size = btf->resolved_sizes[size_type_id]; + else if (btf_type_is_ptr(size_type)) + size = sizeof(void *); + else + return NULL; } *type_id = size_type_id; @@ -1602,7 +1609,6 @@ static int btf_modifier_resolve(struct btf_verifier_env *env, const struct btf_type *next_type; u32 next_type_id = t->type; struct btf *btf = env->btf; - u32 next_type_size = 0; next_type = btf_type_by_id(btf, next_type_id); if (!next_type || btf_type_is_resolve_source_only(next_type)) { @@ -1620,7 +1626,7 @@ static int btf_modifier_resolve(struct btf_verifier_env *env, * save us a few type-following when we use it later (e.g. in * pretty print). */ - if (!btf_type_id_size(btf, &next_type_id, &next_type_size)) { + if (!btf_type_id_size(btf, &next_type_id, NULL)) { if (env_type_is_resolved(env, next_type_id)) next_type = btf_type_id_resolve(btf, &next_type_id); @@ -1633,7 +1639,7 @@ static int btf_modifier_resolve(struct btf_verifier_env *env, } } - env_stack_pop_resolved(env, next_type_id, next_type_size); + env_stack_pop_resolved(env, next_type_id, 0); return 0; } @@ -1645,7 +1651,6 @@ static int btf_var_resolve(struct btf_verifier_env *env, const struct btf_type *t = v->t; u32 next_type_id = t->type; struct btf *btf = env->btf; - u32 next_type_size; next_type = btf_type_by_id(btf, next_type_id); if (!next_type || btf_type_is_resolve_source_only(next_type)) { @@ -1675,12 +1680,12 @@ static int btf_var_resolve(struct btf_verifier_env *env, * forward types or similar that would resolve to size of * zero is allowed. */ - if (!btf_type_id_size(btf, &next_type_id, &next_type_size)) { + if (!btf_type_id_size(btf, &next_type_id, NULL)) { btf_verifier_log_type(env, v->t, "Invalid type_id"); return -EINVAL; } - env_stack_pop_resolved(env, next_type_id, next_type_size); + env_stack_pop_resolved(env, next_type_id, 0); return 0; } -- cgit v1.2.3 From 65fc965c708c90b8c8b2cea980db0618333dd7fe Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 3 Jun 2019 22:04:42 +0900 Subject: kprobes: Fix to init kprobes in subsys_initcall Since arm64 kernel initializes breakpoint trap vector in arch_initcall(), initializing kprobe (and run smoke test) in postcore_initcall() causes a kernel panic. To fix this issue, move the kprobe initialization in subsys_initcall() (which is called right afer the arch_initcall). In-kernel kprobe users (ftrace and bpf) are using fs_initcall() which is called after subsys_initcall(), so this shouldn't cause more problem. Link: http://lkml.kernel.org/r/155956708268.12228.10363800793132214198.stgit@devnote2 Link: http://lkml.kernel.org/r/20190709153755.GB10123@lakrids.cambridge.arm.com Reported-by: Anders Roxell Fixes: b5f8b32c93b2 ("kprobes: Initialize kprobes at postcore_initcall") Tested-by: Anders Roxell Tested-by: Mark Rutland Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/kprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 54aaaad00a47..5471efbeb937 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2289,7 +2289,7 @@ static int __init init_kprobes(void) init_test_probes(); return err; } -postcore_initcall(init_kprobes); +subsys_initcall(init_kprobes); #ifdef CONFIG_DEBUG_FS static void report_probe(struct seq_file *pi, struct kprobe *p, -- cgit v1.2.3 From f730e0f2da4d0035775ab3c85757fee37bb9cbbe Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 1 Jun 2019 00:16:46 +0900 Subject: tracing/kprobe: Set print format right after parsed command Set event call's print format right after parsed command for simplifying (un)register_kprobe_event(). Link: http://lkml.kernel.org/r/155931580625.28323.5158822928646225903.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 7958da2fd922..01fc49f08b70 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -272,6 +272,7 @@ static void free_trace_kprobe(struct trace_kprobe *tk) kfree(tk->tp.call.class->system); kfree(tk->tp.call.name); + kfree(tk->tp.call.print_fmt); kfree(tk->symbol); free_percpu(tk->nhit); kfree(tk); @@ -730,6 +731,10 @@ static int trace_kprobe_create(int argc, const char *argv[]) goto error; /* This can be -ENOMEM */ } + ret = traceprobe_set_print_fmt(&tk->tp, is_return); + if (ret < 0) + goto error; + ret = register_trace_kprobe(tk); if (ret) { trace_probe_log_set_index(1); @@ -1416,18 +1421,14 @@ static int register_kprobe_event(struct trace_kprobe *tk) init_trace_event_call(tk, call); - if (traceprobe_set_print_fmt(&tk->tp, trace_kprobe_is_return(tk)) < 0) - return -ENOMEM; ret = register_trace_event(&call->event); - if (!ret) { - kfree(call->print_fmt); + if (!ret) return -ENODEV; - } + ret = trace_add_event_call(call); if (ret) { pr_info("Failed to register kprobe event: %s\n", trace_event_name(call)); - kfree(call->print_fmt); unregister_trace_event(&call->event); } return ret; @@ -1435,13 +1436,8 @@ static int register_kprobe_event(struct trace_kprobe *tk) static int unregister_kprobe_event(struct trace_kprobe *tk) { - int ret; - /* tp->event is unregistered in trace_remove_event_call() */ - ret = trace_remove_event_call(&tk->tp.call); - if (!ret) - kfree(tk->tp.call.print_fmt); - return ret; + return trace_remove_event_call(&tk->tp.call); } #ifdef CONFIG_PERF_EVENTS @@ -1479,10 +1475,8 @@ create_local_trace_kprobe(char *func, void *addr, unsigned long offs, } ret = __register_trace_kprobe(tk); - if (ret < 0) { - kfree(tk->tp.call.print_fmt); + if (ret < 0) goto error; - } return &tk->tp.call; error: @@ -1503,7 +1497,6 @@ void destroy_local_trace_kprobe(struct trace_event_call *event_call) __unregister_trace_kprobe(tk); - kfree(tk->tp.call.print_fmt); free_trace_kprobe(tk); } #endif /* CONFIG_PERF_EVENTS */ -- cgit v1.2.3 From b4d4b96be89466049a0d383d019edc1403bf0ba9 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 1 Jun 2019 00:16:56 +0900 Subject: tracing/uprobe: Set print format when parsing command Set event call's print format right after parsed command for simplifying (un)register_uprobe_event(). Link: http://lkml.kernel.org/r/155931581659.28323.5404667166417404076.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_uprobe.c | 25 +++++++------------------ 1 file changed, 7 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 3d6b868830f3..34ce671b6080 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -345,6 +345,7 @@ static void free_trace_uprobe(struct trace_uprobe *tu) path_put(&tu->path); kfree(tu->tp.call.class->system); kfree(tu->tp.call.name); + kfree(tu->tp.call.print_fmt); kfree(tu->filename); kfree(tu); } @@ -592,6 +593,10 @@ static int trace_uprobe_create(int argc, const char **argv) goto error; } + ret = traceprobe_set_print_fmt(&tu->tp, is_ret_probe(tu)); + if (ret < 0) + goto error; + ret = register_trace_uprobe(tu); if (!ret) goto out; @@ -1362,21 +1367,15 @@ static int register_uprobe_event(struct trace_uprobe *tu) init_trace_event_call(tu, call); - if (traceprobe_set_print_fmt(&tu->tp, is_ret_probe(tu)) < 0) - return -ENOMEM; - ret = register_trace_event(&call->event); - if (!ret) { - kfree(call->print_fmt); + if (!ret) return -ENODEV; - } ret = trace_add_event_call(call); if (ret) { pr_info("Failed to register uprobe event: %s\n", trace_event_name(call)); - kfree(call->print_fmt); unregister_trace_event(&call->event); } @@ -1385,15 +1384,8 @@ static int register_uprobe_event(struct trace_uprobe *tu) static int unregister_uprobe_event(struct trace_uprobe *tu) { - int ret; - /* tu->event is unregistered in trace_remove_event_call() */ - ret = trace_remove_event_call(&tu->tp.call); - if (ret) - return ret; - kfree(tu->tp.call.print_fmt); - tu->tp.call.print_fmt = NULL; - return 0; + return trace_remove_event_call(&tu->tp.call); } #ifdef CONFIG_PERF_EVENTS @@ -1452,9 +1444,6 @@ void destroy_local_trace_uprobe(struct trace_event_call *event_call) tu = container_of(event_call, struct trace_uprobe, tp.call); - kfree(tu->tp.call.print_fmt); - tu->tp.call.print_fmt = NULL; - free_trace_uprobe(tu); } #endif /* CONFIG_PERF_EVENTS */ -- cgit v1.2.3 From 455b289973f7df350ea179c7eb8bfed0c766ec40 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 1 Jun 2019 00:17:06 +0900 Subject: tracing/probe: Add trace_probe init and free functions Add common trace_probe init and cleanup function in trace_probe.c, and use it from trace_kprobe.c and trace_uprobe.c Link: http://lkml.kernel.org/r/155931582664.28323.5934870189034740822.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 49 ++++++++++++--------------------------------- kernel/trace/trace_probe.c | 36 +++++++++++++++++++++++++++++++++ kernel/trace/trace_probe.h | 4 ++++ kernel/trace/trace_uprobe.c | 27 +++++-------------------- 4 files changed, 58 insertions(+), 58 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 01fc49f08b70..c43c2d419ded 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -197,6 +197,16 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs); static int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs); +static void free_trace_kprobe(struct trace_kprobe *tk) +{ + if (tk) { + trace_probe_cleanup(&tk->tp); + kfree(tk->symbol); + free_percpu(tk->nhit); + kfree(tk); + } +} + /* * Allocate new trace_probe and initialize it (including kprobes). */ @@ -235,49 +245,17 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group, tk->rp.maxactive = maxactive; - if (!event || !group) { - ret = -EINVAL; - goto error; - } - - tk->tp.call.class = &tk->tp.class; - tk->tp.call.name = kstrdup(event, GFP_KERNEL); - if (!tk->tp.call.name) - goto error; - - tk->tp.class.system = kstrdup(group, GFP_KERNEL); - if (!tk->tp.class.system) + ret = trace_probe_init(&tk->tp, event, group); + if (ret < 0) goto error; dyn_event_init(&tk->devent, &trace_kprobe_ops); - INIT_LIST_HEAD(&tk->tp.files); return tk; error: - kfree(tk->tp.call.name); - kfree(tk->symbol); - free_percpu(tk->nhit); - kfree(tk); + free_trace_kprobe(tk); return ERR_PTR(ret); } -static void free_trace_kprobe(struct trace_kprobe *tk) -{ - int i; - - if (!tk) - return; - - for (i = 0; i < tk->tp.nr_args; i++) - traceprobe_free_probe_arg(&tk->tp.args[i]); - - kfree(tk->tp.call.class->system); - kfree(tk->tp.call.name); - kfree(tk->tp.call.print_fmt); - kfree(tk->symbol); - free_percpu(tk->nhit); - kfree(tk); -} - static struct trace_kprobe *find_trace_kprobe(const char *event, const char *group) { @@ -1400,7 +1378,6 @@ static struct trace_event_functions kprobe_funcs = { static inline void init_trace_event_call(struct trace_kprobe *tk, struct trace_event_call *call) { - INIT_LIST_HEAD(&call->class->fields); if (trace_kprobe_is_return(tk)) { call->event.funcs = &kretprobe_funcs; call->class->define_fields = kretprobe_event_define_fields; diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index b6b0593844cd..fe4ee2e73d92 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -884,3 +884,39 @@ int traceprobe_define_arg_fields(struct trace_event_call *event_call, } return 0; } + + +void trace_probe_cleanup(struct trace_probe *tp) +{ + int i; + + for (i = 0; i < tp->nr_args; i++) + traceprobe_free_probe_arg(&tp->args[i]); + + kfree(tp->call.class->system); + kfree(tp->call.name); + kfree(tp->call.print_fmt); +} + +int trace_probe_init(struct trace_probe *tp, const char *event, + const char *group) +{ + if (!event || !group) + return -EINVAL; + + tp->call.class = &tp->class; + tp->call.name = kstrdup(event, GFP_KERNEL); + if (!tp->call.name) + return -ENOMEM; + + tp->class.system = kstrdup(group, GFP_KERNEL); + if (!tp->class.system) { + kfree(tp->call.name); + tp->call.name = NULL; + return -ENOMEM; + } + INIT_LIST_HEAD(&tp->files); + INIT_LIST_HEAD(&tp->class.fields); + + return 0; +} diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 42816358dd48..818b1d7693ba 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -248,6 +248,10 @@ static inline bool trace_probe_is_registered(struct trace_probe *tp) return !!(tp->flags & TP_FLAG_REGISTERED); } +int trace_probe_init(struct trace_probe *tp, const char *event, + const char *group); +void trace_probe_cleanup(struct trace_probe *tp); + /* Check the name is good for event/group/fields */ static inline bool is_good_name(const char *name) { diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 34ce671b6080..b18b7eb1a76f 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -300,25 +300,17 @@ static struct trace_uprobe * alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret) { struct trace_uprobe *tu; - - if (!event || !group) - return ERR_PTR(-EINVAL); + int ret; tu = kzalloc(SIZEOF_TRACE_UPROBE(nargs), GFP_KERNEL); if (!tu) return ERR_PTR(-ENOMEM); - tu->tp.call.class = &tu->tp.class; - tu->tp.call.name = kstrdup(event, GFP_KERNEL); - if (!tu->tp.call.name) - goto error; - - tu->tp.class.system = kstrdup(group, GFP_KERNEL); - if (!tu->tp.class.system) + ret = trace_probe_init(&tu->tp, event, group); + if (ret < 0) goto error; dyn_event_init(&tu->devent, &trace_uprobe_ops); - INIT_LIST_HEAD(&tu->tp.files); tu->consumer.handler = uprobe_dispatcher; if (is_ret) tu->consumer.ret_handler = uretprobe_dispatcher; @@ -326,26 +318,18 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret) return tu; error: - kfree(tu->tp.call.name); kfree(tu); - return ERR_PTR(-ENOMEM); + return ERR_PTR(ret); } static void free_trace_uprobe(struct trace_uprobe *tu) { - int i; - if (!tu) return; - for (i = 0; i < tu->tp.nr_args; i++) - traceprobe_free_probe_arg(&tu->tp.args[i]); - path_put(&tu->path); - kfree(tu->tp.call.class->system); - kfree(tu->tp.call.name); - kfree(tu->tp.call.print_fmt); + trace_probe_cleanup(&tu->tp); kfree(tu->filename); kfree(tu); } @@ -1351,7 +1335,6 @@ static struct trace_event_functions uprobe_funcs = { static inline void init_trace_event_call(struct trace_uprobe *tu, struct trace_event_call *call) { - INIT_LIST_HEAD(&call->class->fields); call->event.funcs = &uprobe_funcs; call->class->define_fields = uprobe_event_define_fields; -- cgit v1.2.3 From 46e5376d404d14cb321f5d4e446fe3fb6d8a93ab Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 1 Jun 2019 00:17:16 +0900 Subject: tracing/probe: Add trace_event_call register API for trace_probe Since trace_event_call is a field of trace_probe, these operations should be done in trace_probe.c. trace_kprobe and trace_uprobe use new functions to register/unregister trace_event_call. Link: http://lkml.kernel.org/r/155931583643.28323.14828411185591538876.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 20 +++----------------- kernel/trace/trace_probe.c | 16 ++++++++++++++++ kernel/trace/trace_probe.h | 6 ++++++ kernel/trace/trace_uprobe.c | 22 +++------------------- 4 files changed, 28 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index c43c2d419ded..7f802ee27266 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1393,28 +1393,14 @@ static inline void init_trace_event_call(struct trace_kprobe *tk, static int register_kprobe_event(struct trace_kprobe *tk) { - struct trace_event_call *call = &tk->tp.call; - int ret = 0; - - init_trace_event_call(tk, call); - - ret = register_trace_event(&call->event); - if (!ret) - return -ENODEV; + init_trace_event_call(tk, &tk->tp.call); - ret = trace_add_event_call(call); - if (ret) { - pr_info("Failed to register kprobe event: %s\n", - trace_event_name(call)); - unregister_trace_event(&call->event); - } - return ret; + return trace_probe_register_event_call(&tk->tp); } static int unregister_kprobe_event(struct trace_kprobe *tk) { - /* tp->event is unregistered in trace_remove_event_call() */ - return trace_remove_event_call(&tk->tp.call); + return trace_probe_unregister_event_call(&tk->tp); } #ifdef CONFIG_PERF_EVENTS diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index fe4ee2e73d92..509a26024b4f 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -920,3 +920,19 @@ int trace_probe_init(struct trace_probe *tp, const char *event, return 0; } + +int trace_probe_register_event_call(struct trace_probe *tp) +{ + struct trace_event_call *call = &tp->call; + int ret; + + ret = register_trace_event(&call->event); + if (!ret) + return -ENODEV; + + ret = trace_add_event_call(call); + if (ret) + unregister_trace_event(&call->event); + + return ret; +} diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 818b1d7693ba..01d7b222e004 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -251,6 +251,12 @@ static inline bool trace_probe_is_registered(struct trace_probe *tp) int trace_probe_init(struct trace_probe *tp, const char *event, const char *group); void trace_probe_cleanup(struct trace_probe *tp); +int trace_probe_register_event_call(struct trace_probe *tp); +static inline int trace_probe_unregister_event_call(struct trace_probe *tp) +{ + /* tp->event is unregistered in trace_remove_event_call() */ + return trace_remove_event_call(&tp->call); +} /* Check the name is good for event/group/fields */ static inline bool is_good_name(const char *name) diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index b18b7eb1a76f..c262494fa793 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1345,30 +1345,14 @@ static inline void init_trace_event_call(struct trace_uprobe *tu, static int register_uprobe_event(struct trace_uprobe *tu) { - struct trace_event_call *call = &tu->tp.call; - int ret = 0; - - init_trace_event_call(tu, call); - - ret = register_trace_event(&call->event); - if (!ret) - return -ENODEV; - - ret = trace_add_event_call(call); - - if (ret) { - pr_info("Failed to register uprobe event: %s\n", - trace_event_name(call)); - unregister_trace_event(&call->event); - } + init_trace_event_call(tu, &tu->tp.call); - return ret; + return trace_probe_register_event_call(&tu->tp); } static int unregister_uprobe_event(struct trace_uprobe *tu) { - /* tu->event is unregistered in trace_remove_event_call() */ - return trace_remove_event_call(&tu->tp.call); + return trace_probe_unregister_event_call(&tu->tp); } #ifdef CONFIG_PERF_EVENTS -- cgit v1.2.3 From b5f935ee133911b3ed2d4429dd86d2bd5385519d Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 1 Jun 2019 00:17:26 +0900 Subject: tracing/probe: Add trace_event_file access APIs for trace_probe Add trace_event_file access APIs for trace_probe data structure. This simplifies enabling/disabling operations in uprobe and kprobe events so that those don't touch deep inside the trace_probe. This also removing a redundant synchronization when the kprobe event is used from perf, since the perf itself uses tracepoint_synchronize_unregister() after disabling (ftrace- defined) event, thus we don't have to synchronize in that path. Also we don't need to identify local trace_kprobe too anymore. Link: http://lkml.kernel.org/r/155931584587.28323.372301976283354629.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 85 ++++++++++++++++----------------------------- kernel/trace/trace_probe.c | 47 +++++++++++++++++++++++++ kernel/trace/trace_probe.h | 36 ++++++++++--------- kernel/trace/trace_uprobe.c | 42 +++++++--------------- 4 files changed, 109 insertions(+), 101 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 7f802ee27266..87a52094378c 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -290,34 +290,27 @@ static inline int __enable_trace_kprobe(struct trace_kprobe *tk) static int enable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file) { - struct event_file_link *link; + bool enabled = trace_probe_is_enabled(&tk->tp); int ret = 0; if (file) { - link = kmalloc(sizeof(*link), GFP_KERNEL); - if (!link) { - ret = -ENOMEM; - goto out; - } - - link->file = file; - list_add_tail_rcu(&link->list, &tk->tp.files); + ret = trace_probe_add_file(&tk->tp, file); + if (ret) + return ret; + } else + tk->tp.flags |= TP_FLAG_PROFILE; - tk->tp.flags |= TP_FLAG_TRACE; - ret = __enable_trace_kprobe(tk); - if (ret) { - list_del_rcu(&link->list); - kfree(link); - tk->tp.flags &= ~TP_FLAG_TRACE; - } + if (enabled) + return 0; - } else { - tk->tp.flags |= TP_FLAG_PROFILE; - ret = __enable_trace_kprobe(tk); - if (ret) + ret = __enable_trace_kprobe(tk); + if (ret) { + if (file) + trace_probe_remove_file(&tk->tp, file); + else tk->tp.flags &= ~TP_FLAG_PROFILE; } - out: + return ret; } @@ -328,54 +321,34 @@ enable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file) static int disable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file) { - struct event_file_link *link = NULL; - int wait = 0; + struct trace_probe *tp = &tk->tp; int ret = 0; if (file) { - link = find_event_file_link(&tk->tp, file); - if (!link) { - ret = -EINVAL; - goto out; - } - - list_del_rcu(&link->list); - wait = 1; - if (!list_empty(&tk->tp.files)) + if (!trace_probe_get_file_link(tp, file)) + return -ENOENT; + if (!trace_probe_has_single_file(tp)) goto out; - - tk->tp.flags &= ~TP_FLAG_TRACE; + tp->flags &= ~TP_FLAG_TRACE; } else - tk->tp.flags &= ~TP_FLAG_PROFILE; + tp->flags &= ~TP_FLAG_PROFILE; - if (!trace_probe_is_enabled(&tk->tp) && trace_probe_is_registered(&tk->tp)) { + if (!trace_probe_is_enabled(tp) && trace_probe_is_registered(tp)) { if (trace_kprobe_is_return(tk)) disable_kretprobe(&tk->rp); else disable_kprobe(&tk->rp.kp); - wait = 1; } - /* - * if tk is not added to any list, it must be a local trace_kprobe - * created with perf_event_open. We don't need to wait for these - * trace_kprobes - */ - if (list_empty(&tk->devent.list)) - wait = 0; out: - if (wait) { + if (file) /* - * Synchronize with kprobe_trace_func/kretprobe_trace_func - * to ensure disabled (all running handlers are finished). - * This is not only for kfree(), but also the caller, - * trace_remove_event_call() supposes it for releasing - * event_call related objects, which will be accessed in - * the kprobe_trace_func/kretprobe_trace_func. + * Synchronization is done in below function. For perf event, + * file == NULL and perf_trace_event_unreg() calls + * tracepoint_synchronize_unregister() to ensure synchronize + * event. We don't need to care about it. */ - synchronize_rcu(); - kfree(link); /* Ignored if link == NULL */ - } + trace_probe_remove_file(tp, file); return ret; } @@ -1044,7 +1017,7 @@ kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs) { struct event_file_link *link; - list_for_each_entry_rcu(link, &tk->tp.files, list) + trace_probe_for_each_link_rcu(link, &tk->tp) __kprobe_trace_func(tk, regs, link->file); } NOKPROBE_SYMBOL(kprobe_trace_func); @@ -1094,7 +1067,7 @@ kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, { struct event_file_link *link; - list_for_each_entry_rcu(link, &tk->tp.files, list) + trace_probe_for_each_link_rcu(link, &tk->tp) __kretprobe_trace_func(tk, ri, regs, link->file); } NOKPROBE_SYMBOL(kretprobe_trace_func); diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 509a26024b4f..abb05608a09d 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -936,3 +936,50 @@ int trace_probe_register_event_call(struct trace_probe *tp) return ret; } + +int trace_probe_add_file(struct trace_probe *tp, struct trace_event_file *file) +{ + struct event_file_link *link; + + link = kmalloc(sizeof(*link), GFP_KERNEL); + if (!link) + return -ENOMEM; + + link->file = file; + INIT_LIST_HEAD(&link->list); + list_add_tail_rcu(&link->list, &tp->files); + tp->flags |= TP_FLAG_TRACE; + return 0; +} + +struct event_file_link *trace_probe_get_file_link(struct trace_probe *tp, + struct trace_event_file *file) +{ + struct event_file_link *link; + + trace_probe_for_each_link(link, tp) { + if (link->file == file) + return link; + } + + return NULL; +} + +int trace_probe_remove_file(struct trace_probe *tp, + struct trace_event_file *file) +{ + struct event_file_link *link; + + link = trace_probe_get_file_link(tp, file); + if (!link) + return -ENOENT; + + list_del_rcu(&link->list); + synchronize_rcu(); + kfree(link); + + if (list_empty(&tp->files)) + tp->flags &= ~TP_FLAG_TRACE; + + return 0; +} diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 01d7b222e004..ab02007e131d 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -248,16 +248,32 @@ static inline bool trace_probe_is_registered(struct trace_probe *tp) return !!(tp->flags & TP_FLAG_REGISTERED); } -int trace_probe_init(struct trace_probe *tp, const char *event, - const char *group); -void trace_probe_cleanup(struct trace_probe *tp); -int trace_probe_register_event_call(struct trace_probe *tp); static inline int trace_probe_unregister_event_call(struct trace_probe *tp) { /* tp->event is unregistered in trace_remove_event_call() */ return trace_remove_event_call(&tp->call); } +static inline bool trace_probe_has_single_file(struct trace_probe *tp) +{ + return !!list_is_singular(&tp->files); +} + +int trace_probe_init(struct trace_probe *tp, const char *event, + const char *group); +void trace_probe_cleanup(struct trace_probe *tp); +int trace_probe_register_event_call(struct trace_probe *tp); +int trace_probe_add_file(struct trace_probe *tp, struct trace_event_file *file); +int trace_probe_remove_file(struct trace_probe *tp, + struct trace_event_file *file); +struct event_file_link *trace_probe_get_file_link(struct trace_probe *tp, + struct trace_event_file *file); + +#define trace_probe_for_each_link(pos, tp) \ + list_for_each_entry(pos, &(tp)->files, list) +#define trace_probe_for_each_link_rcu(pos, tp) \ + list_for_each_entry_rcu(pos, &(tp)->files, list) + /* Check the name is good for event/group/fields */ static inline bool is_good_name(const char *name) { @@ -270,18 +286,6 @@ static inline bool is_good_name(const char *name) return true; } -static inline struct event_file_link * -find_event_file_link(struct trace_probe *tp, struct trace_event_file *file) -{ - struct event_file_link *link; - - list_for_each_entry(link, &tp->files, list) - if (link->file == file) - return link; - - return NULL; -} - #define TPARG_FL_RETURN BIT(0) #define TPARG_FL_KERNEL BIT(1) #define TPARG_FL_FENTRY BIT(2) diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index c262494fa793..a9f8045b6695 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -863,7 +863,7 @@ static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs, return 0; rcu_read_lock(); - list_for_each_entry_rcu(link, &tu->tp.files, list) + trace_probe_for_each_link_rcu(link, &tu->tp) __uprobe_trace_func(tu, 0, regs, ucb, dsize, link->file); rcu_read_unlock(); @@ -877,7 +877,7 @@ static void uretprobe_trace_func(struct trace_uprobe *tu, unsigned long func, struct event_file_link *link; rcu_read_lock(); - list_for_each_entry_rcu(link, &tu->tp.files, list) + trace_probe_for_each_link_rcu(link, &tu->tp) __uprobe_trace_func(tu, func, regs, ucb, dsize, link->file); rcu_read_unlock(); } @@ -924,21 +924,15 @@ probe_event_enable(struct trace_uprobe *tu, struct trace_event_file *file, filter_func_t filter) { bool enabled = trace_probe_is_enabled(&tu->tp); - struct event_file_link *link = NULL; int ret; if (file) { if (tu->tp.flags & TP_FLAG_PROFILE) return -EINTR; - link = kmalloc(sizeof(*link), GFP_KERNEL); - if (!link) - return -ENOMEM; - - link->file = file; - list_add_tail_rcu(&link->list, &tu->tp.files); - - tu->tp.flags |= TP_FLAG_TRACE; + ret = trace_probe_add_file(&tu->tp, file); + if (ret < 0) + return ret; } else { if (tu->tp.flags & TP_FLAG_TRACE) return -EINTR; @@ -973,13 +967,11 @@ probe_event_enable(struct trace_uprobe *tu, struct trace_event_file *file, uprobe_buffer_disable(); err_flags: - if (file) { - list_del(&link->list); - kfree(link); - tu->tp.flags &= ~TP_FLAG_TRACE; - } else { + if (file) + trace_probe_remove_file(&tu->tp, file); + else tu->tp.flags &= ~TP_FLAG_PROFILE; - } + return ret; } @@ -990,26 +982,18 @@ probe_event_disable(struct trace_uprobe *tu, struct trace_event_file *file) return; if (file) { - struct event_file_link *link; - - link = find_event_file_link(&tu->tp, file); - if (!link) + if (trace_probe_remove_file(&tu->tp, file) < 0) return; - list_del_rcu(&link->list); - /* synchronize with u{,ret}probe_trace_func */ - synchronize_rcu(); - kfree(link); - - if (!list_empty(&tu->tp.files)) + if (trace_probe_is_enabled(&tu->tp)) return; - } + } else + tu->tp.flags &= ~TP_FLAG_PROFILE; WARN_ON(!uprobe_filter_is_empty(&tu->filter)); uprobe_unregister(tu->inode, tu->offset, &tu->consumer); tu->inode = NULL; - tu->tp.flags &= file ? ~TP_FLAG_TRACE : ~TP_FLAG_PROFILE; uprobe_buffer_disable(); } -- cgit v1.2.3 From 747774d6b018ca02493fd3f321624dfce749da61 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 1 Jun 2019 00:17:37 +0900 Subject: tracing/probe: Add trace flag access APIs for trace_probe Add trace_probe_test/set/clear_flag() functions for accessing trace_probe.flag field. This flags field should not be accessed directly. Link: http://lkml.kernel.org/r/155931585683.28323.314290023236905988.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 20 ++++++++++---------- kernel/trace/trace_probe.c | 4 ++-- kernel/trace/trace_probe.h | 22 ++++++++++++++++++++-- kernel/trace/trace_uprobe.c | 18 +++++++++--------- 4 files changed, 41 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 87a52094378c..c3ab84cb25c8 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -298,7 +298,7 @@ enable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file) if (ret) return ret; } else - tk->tp.flags |= TP_FLAG_PROFILE; + trace_probe_set_flag(&tk->tp, TP_FLAG_PROFILE); if (enabled) return 0; @@ -308,7 +308,7 @@ enable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file) if (file) trace_probe_remove_file(&tk->tp, file); else - tk->tp.flags &= ~TP_FLAG_PROFILE; + trace_probe_clear_flag(&tk->tp, TP_FLAG_PROFILE); } return ret; @@ -329,9 +329,9 @@ disable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file) return -ENOENT; if (!trace_probe_has_single_file(tp)) goto out; - tp->flags &= ~TP_FLAG_TRACE; + trace_probe_clear_flag(tp, TP_FLAG_TRACE); } else - tp->flags &= ~TP_FLAG_PROFILE; + trace_probe_clear_flag(tp, TP_FLAG_PROFILE); if (!trace_probe_is_enabled(tp) && trace_probe_is_registered(tp)) { if (trace_kprobe_is_return(tk)) @@ -408,7 +408,7 @@ static int __register_trace_kprobe(struct trace_kprobe *tk) ret = register_kprobe(&tk->rp.kp); if (ret == 0) - tk->tp.flags |= TP_FLAG_REGISTERED; + trace_probe_set_flag(&tk->tp, TP_FLAG_REGISTERED); return ret; } @@ -420,7 +420,7 @@ static void __unregister_trace_kprobe(struct trace_kprobe *tk) unregister_kretprobe(&tk->rp); else unregister_kprobe(&tk->rp.kp); - tk->tp.flags &= ~TP_FLAG_REGISTERED; + trace_probe_clear_flag(&tk->tp, TP_FLAG_REGISTERED); /* Cleanup kprobe for reuse */ if (tk->rp.kp.symbol_name) tk->rp.kp.addr = NULL; @@ -1313,10 +1313,10 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) raw_cpu_inc(*tk->nhit); - if (tk->tp.flags & TP_FLAG_TRACE) + if (trace_probe_test_flag(&tk->tp, TP_FLAG_TRACE)) kprobe_trace_func(tk, regs); #ifdef CONFIG_PERF_EVENTS - if (tk->tp.flags & TP_FLAG_PROFILE) + if (trace_probe_test_flag(&tk->tp, TP_FLAG_PROFILE)) ret = kprobe_perf_func(tk, regs); #endif return ret; @@ -1330,10 +1330,10 @@ kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) raw_cpu_inc(*tk->nhit); - if (tk->tp.flags & TP_FLAG_TRACE) + if (trace_probe_test_flag(&tk->tp, TP_FLAG_TRACE)) kretprobe_trace_func(tk, ri, regs); #ifdef CONFIG_PERF_EVENTS - if (tk->tp.flags & TP_FLAG_PROFILE) + if (trace_probe_test_flag(&tk->tp, TP_FLAG_PROFILE)) kretprobe_perf_func(tk, ri, regs); #endif return 0; /* We don't tweek kernel, so just return 0 */ diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index abb05608a09d..323a11ad1dad 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -948,7 +948,7 @@ int trace_probe_add_file(struct trace_probe *tp, struct trace_event_file *file) link->file = file; INIT_LIST_HEAD(&link->list); list_add_tail_rcu(&link->list, &tp->files); - tp->flags |= TP_FLAG_TRACE; + trace_probe_set_flag(tp, TP_FLAG_TRACE); return 0; } @@ -979,7 +979,7 @@ int trace_probe_remove_file(struct trace_probe *tp, kfree(link); if (list_empty(&tp->files)) - tp->flags &= ~TP_FLAG_TRACE; + trace_probe_clear_flag(tp, TP_FLAG_TRACE); return 0; } diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index ab02007e131d..87d48d850b63 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -238,14 +238,32 @@ struct event_file_link { struct list_head list; }; +static inline bool trace_probe_test_flag(struct trace_probe *tp, + unsigned int flag) +{ + return !!(tp->flags & flag); +} + +static inline void trace_probe_set_flag(struct trace_probe *tp, + unsigned int flag) +{ + tp->flags |= flag; +} + +static inline void trace_probe_clear_flag(struct trace_probe *tp, + unsigned int flag) +{ + tp->flags &= ~flag; +} + static inline bool trace_probe_is_enabled(struct trace_probe *tp) { - return !!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE)); + return trace_probe_test_flag(tp, TP_FLAG_TRACE | TP_FLAG_PROFILE); } static inline bool trace_probe_is_registered(struct trace_probe *tp) { - return !!(tp->flags & TP_FLAG_REGISTERED); + return trace_probe_test_flag(tp, TP_FLAG_REGISTERED); } static inline int trace_probe_unregister_event_call(struct trace_probe *tp) diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index a9f8045b6695..f8e23ed47823 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -927,17 +927,17 @@ probe_event_enable(struct trace_uprobe *tu, struct trace_event_file *file, int ret; if (file) { - if (tu->tp.flags & TP_FLAG_PROFILE) + if (trace_probe_test_flag(&tu->tp, TP_FLAG_PROFILE)) return -EINTR; ret = trace_probe_add_file(&tu->tp, file); if (ret < 0) return ret; } else { - if (tu->tp.flags & TP_FLAG_TRACE) + if (trace_probe_test_flag(&tu->tp, TP_FLAG_TRACE)) return -EINTR; - tu->tp.flags |= TP_FLAG_PROFILE; + trace_probe_set_flag(&tu->tp, TP_FLAG_PROFILE); } WARN_ON(!uprobe_filter_is_empty(&tu->filter)); @@ -970,7 +970,7 @@ probe_event_enable(struct trace_uprobe *tu, struct trace_event_file *file, if (file) trace_probe_remove_file(&tu->tp, file); else - tu->tp.flags &= ~TP_FLAG_PROFILE; + trace_probe_clear_flag(&tu->tp, TP_FLAG_PROFILE); return ret; } @@ -988,7 +988,7 @@ probe_event_disable(struct trace_uprobe *tu, struct trace_event_file *file) if (trace_probe_is_enabled(&tu->tp)) return; } else - tu->tp.flags &= ~TP_FLAG_PROFILE; + trace_probe_clear_flag(&tu->tp, TP_FLAG_PROFILE); WARN_ON(!uprobe_filter_is_empty(&tu->filter)); @@ -1266,11 +1266,11 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) ucb = uprobe_buffer_get(); store_trace_args(ucb->buf, &tu->tp, regs, esize, dsize); - if (tu->tp.flags & TP_FLAG_TRACE) + if (trace_probe_test_flag(&tu->tp, TP_FLAG_TRACE)) ret |= uprobe_trace_func(tu, regs, ucb, dsize); #ifdef CONFIG_PERF_EVENTS - if (tu->tp.flags & TP_FLAG_PROFILE) + if (trace_probe_test_flag(&tu->tp, TP_FLAG_PROFILE)) ret |= uprobe_perf_func(tu, regs, ucb, dsize); #endif uprobe_buffer_put(ucb); @@ -1301,11 +1301,11 @@ static int uretprobe_dispatcher(struct uprobe_consumer *con, ucb = uprobe_buffer_get(); store_trace_args(ucb->buf, &tu->tp, regs, esize, dsize); - if (tu->tp.flags & TP_FLAG_TRACE) + if (trace_probe_test_flag(&tu->tp, TP_FLAG_TRACE)) uretprobe_trace_func(tu, func, regs, ucb, dsize); #ifdef CONFIG_PERF_EVENTS - if (tu->tp.flags & TP_FLAG_PROFILE) + if (trace_probe_test_flag(&tu->tp, TP_FLAG_PROFILE)) uretprobe_perf_func(tu, func, regs, ucb, dsize); #endif uprobe_buffer_put(ucb); -- cgit v1.2.3 From b55ce203a8f327b623688c8fb551ac3f9781edea Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 1 Jun 2019 00:17:47 +0900 Subject: tracing/probe: Add probe event name and group name accesses APIs Add trace_probe_name() and trace_probe_group_name() functions for accessing probe name and group name of trace_probe. Link: http://lkml.kernel.org/r/155931586717.28323.8738615064952254761.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 24 ++++++++++++------------ kernel/trace/trace_probe.h | 10 ++++++++++ kernel/trace/trace_uprobe.c | 22 +++++++++++----------- 3 files changed, 33 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index c3ab84cb25c8..3cf8cee4f276 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -142,8 +142,8 @@ static bool trace_kprobe_match(const char *system, const char *event, { struct trace_kprobe *tk = to_trace_kprobe(ev); - return strcmp(trace_event_name(&tk->tp.call), event) == 0 && - (!system || strcmp(tk->tp.call.class->system, system) == 0); + return strcmp(trace_probe_name(&tk->tp), event) == 0 && + (!system || strcmp(trace_probe_group_name(&tk->tp), system) == 0); } static nokprobe_inline unsigned long trace_kprobe_nhit(struct trace_kprobe *tk) @@ -263,8 +263,8 @@ static struct trace_kprobe *find_trace_kprobe(const char *event, struct trace_kprobe *tk; for_each_trace_kprobe(tk, pos) - if (strcmp(trace_event_name(&tk->tp.call), event) == 0 && - strcmp(tk->tp.call.class->system, group) == 0) + if (strcmp(trace_probe_name(&tk->tp), event) == 0 && + strcmp(trace_probe_group_name(&tk->tp), group) == 0) return tk; return NULL; } @@ -453,8 +453,8 @@ static int register_trace_kprobe(struct trace_kprobe *tk) mutex_lock(&event_mutex); /* Delete old (same name) event if exist */ - old_tk = find_trace_kprobe(trace_event_name(&tk->tp.call), - tk->tp.call.class->system); + old_tk = find_trace_kprobe(trace_probe_name(&tk->tp), + trace_probe_group_name(&tk->tp)); if (old_tk) { ret = unregister_trace_kprobe(old_tk); if (ret < 0) @@ -507,7 +507,7 @@ static int trace_kprobe_module_callback(struct notifier_block *nb, ret = __register_trace_kprobe(tk); if (ret) pr_warn("Failed to re-register probe %s on %s: %d\n", - trace_event_name(&tk->tp.call), + trace_probe_name(&tk->tp), mod->name, ret); } } @@ -737,8 +737,8 @@ static int trace_kprobe_show(struct seq_file *m, struct dyn_event *ev) int i; seq_putc(m, trace_kprobe_is_return(tk) ? 'r' : 'p'); - seq_printf(m, ":%s/%s", tk->tp.call.class->system, - trace_event_name(&tk->tp.call)); + seq_printf(m, ":%s/%s", trace_probe_group_name(&tk->tp), + trace_probe_name(&tk->tp)); if (!tk->symbol) seq_printf(m, " 0x%p", tk->rp.kp.addr); @@ -812,7 +812,7 @@ static int probes_profile_seq_show(struct seq_file *m, void *v) tk = to_trace_kprobe(ev); seq_printf(m, " %-44s %15lu %15lu\n", - trace_event_name(&tk->tp.call), + trace_probe_name(&tk->tp), trace_kprobe_nhit(tk), tk->rp.kp.nmissed); @@ -1084,7 +1084,7 @@ print_kprobe_event(struct trace_iterator *iter, int flags, field = (struct kprobe_trace_entry_head *)iter->ent; tp = container_of(event, struct trace_probe, call.event); - trace_seq_printf(s, "%s: (", trace_event_name(&tp->call)); + trace_seq_printf(s, "%s: (", trace_probe_name(tp)); if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET)) goto out; @@ -1111,7 +1111,7 @@ print_kretprobe_event(struct trace_iterator *iter, int flags, field = (struct kretprobe_trace_entry_head *)iter->ent; tp = container_of(event, struct trace_probe, call.event); - trace_seq_printf(s, "%s: (", trace_event_name(&tp->call)); + trace_seq_printf(s, "%s: (", trace_probe_name(tp)); if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET)) goto out; diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 87d48d850b63..67424cb5d5d6 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -266,6 +266,16 @@ static inline bool trace_probe_is_registered(struct trace_probe *tp) return trace_probe_test_flag(tp, TP_FLAG_REGISTERED); } +static inline const char *trace_probe_name(struct trace_probe *tp) +{ + return trace_event_name(&tp->call); +} + +static inline const char *trace_probe_group_name(struct trace_probe *tp) +{ + return tp->call.class->system; +} + static inline int trace_probe_unregister_event_call(struct trace_probe *tp) { /* tp->event is unregistered in trace_remove_event_call() */ diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index f8e23ed47823..09fdba3ee9d9 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -289,8 +289,8 @@ static bool trace_uprobe_match(const char *system, const char *event, { struct trace_uprobe *tu = to_trace_uprobe(ev); - return strcmp(trace_event_name(&tu->tp.call), event) == 0 && - (!system || strcmp(tu->tp.call.class->system, system) == 0); + return strcmp(trace_probe_name(&tu->tp), event) == 0 && + (!system || strcmp(trace_probe_group_name(&tu->tp), system) == 0); } /* @@ -340,8 +340,8 @@ static struct trace_uprobe *find_probe_event(const char *event, const char *grou struct trace_uprobe *tu; for_each_trace_uprobe(tu, pos) - if (strcmp(trace_event_name(&tu->tp.call), event) == 0 && - strcmp(tu->tp.call.class->system, group) == 0) + if (strcmp(trace_probe_name(&tu->tp), event) == 0 && + strcmp(trace_probe_group_name(&tu->tp), group) == 0) return tu; return NULL; @@ -376,8 +376,8 @@ static struct trace_uprobe *find_old_trace_uprobe(struct trace_uprobe *new) struct trace_uprobe *tmp, *old = NULL; struct inode *new_inode = d_real_inode(new->path.dentry); - old = find_probe_event(trace_event_name(&new->tp.call), - new->tp.call.class->system); + old = find_probe_event(trace_probe_name(&new->tp), + trace_probe_group_name(&new->tp)); for_each_trace_uprobe(tmp, pos) { if ((old ? old != tmp : true) && @@ -624,8 +624,8 @@ static int trace_uprobe_show(struct seq_file *m, struct dyn_event *ev) char c = is_ret_probe(tu) ? 'r' : 'p'; int i; - seq_printf(m, "%c:%s/%s %s:0x%0*lx", c, tu->tp.call.class->system, - trace_event_name(&tu->tp.call), tu->filename, + seq_printf(m, "%c:%s/%s %s:0x%0*lx", c, trace_probe_group_name(&tu->tp), + trace_probe_name(&tu->tp), tu->filename, (int)(sizeof(void *) * 2), tu->offset); if (tu->ref_ctr_offset) @@ -695,7 +695,7 @@ static int probes_profile_seq_show(struct seq_file *m, void *v) tu = to_trace_uprobe(ev); seq_printf(m, " %s %-44s %15lu\n", tu->filename, - trace_event_name(&tu->tp.call), tu->nhit); + trace_probe_name(&tu->tp), tu->nhit); return 0; } @@ -896,12 +896,12 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e if (is_ret_probe(tu)) { trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)", - trace_event_name(&tu->tp.call), + trace_probe_name(&tu->tp), entry->vaddr[1], entry->vaddr[0]); data = DATAOF_TRACE_ENTRY(entry, true); } else { trace_seq_printf(s, "%s: (0x%lx)", - trace_event_name(&tu->tp.call), + trace_probe_name(&tu->tp), entry->vaddr[0]); data = DATAOF_TRACE_ENTRY(entry, false); } -- cgit v1.2.3 From e3dc9f898ef9c6a1a96378517573ee2d04d0abcc Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 1 Jun 2019 00:17:57 +0900 Subject: tracing/probe: Add trace_event_call accesses APIs Add trace_event_call access APIs for trace_probe. Instead of accessing trace_probe.call directly, use those accesses by trace_probe_event_call() method. This hides the relationship of trace_event_call and trace_probe from trace_kprobe and trace_uprobe. Link: http://lkml.kernel.org/r/155931587711.28323.8335129014686133120.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 23 ++++++++++++----------- kernel/trace/trace_probe.c | 24 ++++++++++++++---------- kernel/trace/trace_probe.h | 6 ++++++ kernel/trace/trace_uprobe.c | 15 ++++++++------- 4 files changed, 40 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 3cf8cee4f276..62362ad1ad98 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -985,7 +985,7 @@ __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs, struct ring_buffer *buffer; int size, dsize, pc; unsigned long irq_flags; - struct trace_event_call *call = &tk->tp.call; + struct trace_event_call *call = trace_probe_event_call(&tk->tp); WARN_ON(call != trace_file->event_call); @@ -1033,7 +1033,7 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, struct ring_buffer *buffer; int size, pc, dsize; unsigned long irq_flags; - struct trace_event_call *call = &tk->tp.call; + struct trace_event_call *call = trace_probe_event_call(&tk->tp); WARN_ON(call != trace_file->event_call); @@ -1163,7 +1163,7 @@ static int kretprobe_event_define_fields(struct trace_event_call *event_call) static int kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) { - struct trace_event_call *call = &tk->tp.call; + struct trace_event_call *call = trace_probe_event_call(&tk->tp); struct kprobe_trace_entry_head *entry; struct hlist_head *head; int size, __size, dsize; @@ -1213,7 +1213,7 @@ static void kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, struct pt_regs *regs) { - struct trace_event_call *call = &tk->tp.call; + struct trace_event_call *call = trace_probe_event_call(&tk->tp); struct kretprobe_trace_entry_head *entry; struct hlist_head *head; int size, __size, dsize; @@ -1348,9 +1348,10 @@ static struct trace_event_functions kprobe_funcs = { .trace = print_kprobe_event }; -static inline void init_trace_event_call(struct trace_kprobe *tk, - struct trace_event_call *call) +static inline void init_trace_event_call(struct trace_kprobe *tk) { + struct trace_event_call *call = trace_probe_event_call(&tk->tp); + if (trace_kprobe_is_return(tk)) { call->event.funcs = &kretprobe_funcs; call->class->define_fields = kretprobe_event_define_fields; @@ -1366,7 +1367,7 @@ static inline void init_trace_event_call(struct trace_kprobe *tk, static int register_kprobe_event(struct trace_kprobe *tk) { - init_trace_event_call(tk, &tk->tp.call); + init_trace_event_call(tk); return trace_probe_register_event_call(&tk->tp); } @@ -1403,7 +1404,7 @@ create_local_trace_kprobe(char *func, void *addr, unsigned long offs, return ERR_CAST(tk); } - init_trace_event_call(tk, &tk->tp.call); + init_trace_event_call(tk); if (traceprobe_set_print_fmt(&tk->tp, trace_kprobe_is_return(tk)) < 0) { ret = -ENOMEM; @@ -1414,7 +1415,7 @@ create_local_trace_kprobe(char *func, void *addr, unsigned long offs, if (ret < 0) goto error; - return &tk->tp.call; + return trace_probe_event_call(&tk->tp); error: free_trace_kprobe(tk); return ERR_PTR(ret); @@ -1447,7 +1448,7 @@ static __init void enable_boot_kprobe_events(void) mutex_lock(&event_mutex); for_each_trace_kprobe(tk, pos) { list_for_each_entry(file, &tr->events, list) - if (file->event_call == &tk->tp.call) + if (file->event_call == trace_probe_event_call(&tk->tp)) trace_event_enable_disable(file, 1, 0); } mutex_unlock(&event_mutex); @@ -1523,7 +1524,7 @@ find_trace_probe_file(struct trace_kprobe *tk, struct trace_array *tr) struct trace_event_file *file; list_for_each_entry(file, &tr->events, list) - if (file->event_call == &tk->tp.call) + if (file->event_call == trace_probe_event_call(&tk->tp)) return file; return NULL; diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 323a11ad1dad..dbef0d135075 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -844,6 +844,7 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len, int traceprobe_set_print_fmt(struct trace_probe *tp, bool is_return) { + struct trace_event_call *call = trace_probe_event_call(tp); int len; char *print_fmt; @@ -855,7 +856,7 @@ int traceprobe_set_print_fmt(struct trace_probe *tp, bool is_return) /* Second: actually write the @print_fmt */ __set_print_fmt(tp, print_fmt, len + 1, is_return); - tp->call.print_fmt = print_fmt; + call->print_fmt = print_fmt; return 0; } @@ -888,31 +889,34 @@ int traceprobe_define_arg_fields(struct trace_event_call *event_call, void trace_probe_cleanup(struct trace_probe *tp) { + struct trace_event_call *call = trace_probe_event_call(tp); int i; for (i = 0; i < tp->nr_args; i++) traceprobe_free_probe_arg(&tp->args[i]); - kfree(tp->call.class->system); - kfree(tp->call.name); - kfree(tp->call.print_fmt); + kfree(call->class->system); + kfree(call->name); + kfree(call->print_fmt); } int trace_probe_init(struct trace_probe *tp, const char *event, const char *group) { + struct trace_event_call *call = trace_probe_event_call(tp); + if (!event || !group) return -EINVAL; - tp->call.class = &tp->class; - tp->call.name = kstrdup(event, GFP_KERNEL); - if (!tp->call.name) + call->class = &tp->class; + call->name = kstrdup(event, GFP_KERNEL); + if (!call->name) return -ENOMEM; tp->class.system = kstrdup(group, GFP_KERNEL); if (!tp->class.system) { - kfree(tp->call.name); - tp->call.name = NULL; + kfree(call->name); + call->name = NULL; return -ENOMEM; } INIT_LIST_HEAD(&tp->files); @@ -923,7 +927,7 @@ int trace_probe_init(struct trace_probe *tp, const char *event, int trace_probe_register_event_call(struct trace_probe *tp) { - struct trace_event_call *call = &tp->call; + struct trace_event_call *call = trace_probe_event_call(tp); int ret; ret = register_trace_event(&call->event); diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 67424cb5d5d6..6c33d4aa36c3 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -276,6 +276,12 @@ static inline const char *trace_probe_group_name(struct trace_probe *tp) return tp->call.class->system; } +static inline struct trace_event_call * + trace_probe_event_call(struct trace_probe *tp) +{ + return &tp->call; +} + static inline int trace_probe_unregister_event_call(struct trace_probe *tp) { /* tp->event is unregistered in trace_remove_event_call() */ diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 09fdba3ee9d9..7fb9353b47d9 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -821,7 +821,7 @@ static void __uprobe_trace_func(struct trace_uprobe *tu, struct ring_buffer *buffer; void *data; int size, esize; - struct trace_event_call *call = &tu->tp.call; + struct trace_event_call *call = trace_probe_event_call(&tu->tp); WARN_ON(call != trace_file->event_call); @@ -1113,7 +1113,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, unsigned long func, struct pt_regs *regs, struct uprobe_cpu_buffer *ucb, int dsize) { - struct trace_event_call *call = &tu->tp.call; + struct trace_event_call *call = trace_probe_event_call(&tu->tp); struct uprobe_trace_entry_head *entry; struct hlist_head *head; void *data; @@ -1316,9 +1316,10 @@ static struct trace_event_functions uprobe_funcs = { .trace = print_uprobe_event }; -static inline void init_trace_event_call(struct trace_uprobe *tu, - struct trace_event_call *call) +static inline void init_trace_event_call(struct trace_uprobe *tu) { + struct trace_event_call *call = trace_probe_event_call(&tu->tp); + call->event.funcs = &uprobe_funcs; call->class->define_fields = uprobe_event_define_fields; @@ -1329,7 +1330,7 @@ static inline void init_trace_event_call(struct trace_uprobe *tu, static int register_uprobe_event(struct trace_uprobe *tu) { - init_trace_event_call(tu, &tu->tp.call); + init_trace_event_call(tu); return trace_probe_register_event_call(&tu->tp); } @@ -1376,14 +1377,14 @@ create_local_trace_uprobe(char *name, unsigned long offs, tu->path = path; tu->ref_ctr_offset = ref_ctr_offset; tu->filename = kstrdup(name, GFP_KERNEL); - init_trace_event_call(tu, &tu->tp.call); + init_trace_event_call(tu); if (traceprobe_set_print_fmt(&tu->tp, is_ret_probe(tu)) < 0) { ret = -ENOMEM; goto error; } - return &tu->tp.call; + return trace_probe_event_call(&tu->tp); error: free_trace_uprobe(tu); return ERR_PTR(ret); -- cgit v1.2.3 From 715fa2fd4c6c3e9165659ac26a582b8a2e607b93 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 1 Jun 2019 00:18:07 +0900 Subject: tracing/kprobe: Check registered state using kprobe Change registered check only by trace_kprobe and remove TP_FLAG_REGISTERED from trace_probe, since this feature is only used for trace_kprobe. Link: http://lkml.kernel.org/r/155931588704.28323.4952266828256245833.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 23 +++++++++++++++-------- kernel/trace/trace_probe.h | 6 ------ 2 files changed, 15 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 62362ad1ad98..9d483ad9bb6c 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -157,6 +157,12 @@ static nokprobe_inline unsigned long trace_kprobe_nhit(struct trace_kprobe *tk) return nhit; } +static nokprobe_inline bool trace_kprobe_is_registered(struct trace_kprobe *tk) +{ + return !(list_empty(&tk->rp.kp.list) && + hlist_unhashed(&tk->rp.kp.hlist)); +} + /* Return 0 if it fails to find the symbol address */ static nokprobe_inline unsigned long trace_kprobe_address(struct trace_kprobe *tk) @@ -244,6 +250,8 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group, tk->rp.kp.pre_handler = kprobe_dispatcher; tk->rp.maxactive = maxactive; + INIT_HLIST_NODE(&tk->rp.kp.hlist); + INIT_LIST_HEAD(&tk->rp.kp.list); ret = trace_probe_init(&tk->tp, event, group); if (ret < 0) @@ -273,7 +281,7 @@ static inline int __enable_trace_kprobe(struct trace_kprobe *tk) { int ret = 0; - if (trace_probe_is_registered(&tk->tp) && !trace_kprobe_has_gone(tk)) { + if (trace_kprobe_is_registered(tk) && !trace_kprobe_has_gone(tk)) { if (trace_kprobe_is_return(tk)) ret = enable_kretprobe(&tk->rp); else @@ -333,7 +341,7 @@ disable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file) } else trace_probe_clear_flag(tp, TP_FLAG_PROFILE); - if (!trace_probe_is_enabled(tp) && trace_probe_is_registered(tp)) { + if (!trace_probe_is_enabled(tp) && trace_kprobe_is_registered(tk)) { if (trace_kprobe_is_return(tk)) disable_kretprobe(&tk->rp); else @@ -381,7 +389,7 @@ static int __register_trace_kprobe(struct trace_kprobe *tk) { int i, ret; - if (trace_probe_is_registered(&tk->tp)) + if (trace_kprobe_is_registered(tk)) return -EINVAL; if (within_notrace_func(tk)) { @@ -407,21 +415,20 @@ static int __register_trace_kprobe(struct trace_kprobe *tk) else ret = register_kprobe(&tk->rp.kp); - if (ret == 0) - trace_probe_set_flag(&tk->tp, TP_FLAG_REGISTERED); return ret; } /* Internal unregister function - just handle k*probes and flags */ static void __unregister_trace_kprobe(struct trace_kprobe *tk) { - if (trace_probe_is_registered(&tk->tp)) { + if (trace_kprobe_is_registered(tk)) { if (trace_kprobe_is_return(tk)) unregister_kretprobe(&tk->rp); else unregister_kprobe(&tk->rp.kp); - trace_probe_clear_flag(&tk->tp, TP_FLAG_REGISTERED); - /* Cleanup kprobe for reuse */ + /* Cleanup kprobe for reuse and mark it unregistered */ + INIT_HLIST_NODE(&tk->rp.kp.hlist); + INIT_LIST_HEAD(&tk->rp.kp.list); if (tk->rp.kp.symbol_name) tk->rp.kp.addr = NULL; } diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 6c33d4aa36c3..d1714820efe1 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -55,7 +55,6 @@ /* Flags for trace_probe */ #define TP_FLAG_TRACE 1 #define TP_FLAG_PROFILE 2 -#define TP_FLAG_REGISTERED 4 /* data_loc: data location, compatible with u32 */ #define make_data_loc(len, offs) \ @@ -261,11 +260,6 @@ static inline bool trace_probe_is_enabled(struct trace_probe *tp) return trace_probe_test_flag(tp, TP_FLAG_TRACE | TP_FLAG_PROFILE); } -static inline bool trace_probe_is_registered(struct trace_probe *tp) -{ - return trace_probe_test_flag(tp, TP_FLAG_REGISTERED); -} - static inline const char *trace_probe_name(struct trace_probe *tp) { return trace_event_name(&tp->call); -- cgit v1.2.3 From 46710f3a34b592ac5c51a95f696b2d2a2a0d9419 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Sat, 25 May 2019 09:57:59 -0700 Subject: tracing: Pass type into tracing_generic_entry_update() All callers of tracing_generic_entry_update() have to initialize entry->type, so let's just simply move it inside. Link: http://lkml.kernel.org/r/20190525165802.25944-2-xiyou.wangcong@gmail.com Cc: Ingo Molnar Signed-off-by: Cong Wang Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 8 ++++---- kernel/trace/trace_event_perf.c | 3 +-- 2 files changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 77b9c4ca5faa..6b62e1718548 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -743,8 +743,7 @@ trace_event_setup(struct ring_buffer_event *event, { struct trace_entry *ent = ring_buffer_event_data(event); - tracing_generic_entry_update(ent, flags, pc); - ent->type = type; + tracing_generic_entry_update(ent, type, flags, pc); } static __always_inline struct ring_buffer_event * @@ -2312,13 +2311,14 @@ enum print_line_t trace_handle_return(struct trace_seq *s) EXPORT_SYMBOL_GPL(trace_handle_return); void -tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, - int pc) +tracing_generic_entry_update(struct trace_entry *entry, unsigned short type, + unsigned long flags, int pc) { struct task_struct *tsk = current; entry->preempt_count = pc & 0xff; entry->pid = (tsk) ? tsk->pid : 0; + entry->type = type; entry->flags = #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 4629a6104474..0892e38ed6fb 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -416,8 +416,7 @@ void perf_trace_buf_update(void *record, u16 type) unsigned long flags; local_save_flags(flags); - tracing_generic_entry_update(entry, flags, pc); - entry->type = type; + tracing_generic_entry_update(entry, type, flags, pc); } NOKPROBE_SYMBOL(perf_trace_buf_update); -- cgit v1.2.3 From 5967bd5c4239be449744a1471daf60c866486c24 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Sat, 25 May 2019 09:58:00 -0700 Subject: tracing: Let filter_assign_type() detect FILTER_PTR_STRING filter_assign_type() could detect dynamic string and static string, but not string pointers. Teach filter_assign_type() to detect string pointers, and this will be needed by trace event injection code. BTW, trace event hist uses FILTER_PTR_STRING too. Link: http://lkml.kernel.org/r/20190525165802.25944-3-xiyou.wangcong@gmail.com Cc: Ingo Molnar Signed-off-by: Cong Wang Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_filter.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index d3e59312ef40..550e8a0d048a 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1080,6 +1080,9 @@ int filter_assign_type(const char *type) if (strchr(type, '[') && strstr(type, "char")) return FILTER_STATIC_STRING; + if (strcmp(type, "char *") == 0 || strcmp(type, "const char *") == 0) + return FILTER_PTR_STRING; + return FILTER_OTHER; } -- cgit v1.2.3 From 0aeb1def44169cbe7119f26cf10b974a2046142e Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Sat, 25 May 2019 09:58:01 -0700 Subject: tracing: Make trace_get_fields() global trace_get_fields() is the only way to read tracepoint fields at run time, as their fields are defined at compile-time with macros. Make this function visible to all users and it will be used by trace event injection code to calculate the size of a tracepoint entry. Link: http://lkml.kernel.org/r/20190525165802.25944-4-xiyou.wangcong@gmail.com Cc: Ingo Molnar Signed-off-by: Cong Wang Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index edc72f3b080c..c7506bc81b75 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -70,14 +70,6 @@ static int system_refcount_dec(struct event_subsystem *system) #define while_for_each_event_file() \ } -static struct list_head * -trace_get_fields(struct trace_event_call *event_call) -{ - if (!event_call->class->get_fields) - return &event_call->class->fields; - return event_call->class->get_fields(event_call); -} - static struct ftrace_event_field * __find_event_field(struct list_head *head, char *name) { -- cgit v1.2.3 From 9087c37584fb7d8315877bb55f85e4268cc0b4f4 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Wed, 10 Jul 2019 19:01:19 +0000 Subject: dma-direct: Force unencrypted DMA under SME for certain DMA masks If a device doesn't support DMA to a physical address that includes the encryption bit (currently bit 47, so 48-bit DMA), then the DMA must occur to unencrypted memory. SWIOTLB is used to satisfy that requirement if an IOMMU is not active (enabled or configured in passthrough mode). However, commit fafadcd16595 ("swiotlb: don't dip into swiotlb pool for coherent allocations") modified the coherent allocation support in SWIOTLB to use the DMA direct coherent allocation support. When an IOMMU is not active, this resulted in dma_alloc_coherent() failing for devices that didn't support DMA addresses that included the encryption bit. Addressing this requires changes to the force_dma_unencrypted() function in kernel/dma/direct.c. Since the function is now non-trivial and SME/SEV specific, update the DMA direct support to add an arch override for the force_dma_unencrypted() function. The arch override is selected when CONFIG_AMD_MEM_ENCRYPT is set. The arch override function resides in the arch/x86/mm/mem_encrypt.c file and forces unencrypted DMA when either SEV is active or SME is active and the device does not support DMA to physical addresses that include the encryption bit. Fixes: fafadcd16595 ("swiotlb: don't dip into swiotlb pool for coherent allocations") Suggested-by: Christoph Hellwig Signed-off-by: Tom Lendacky Acked-by: Thomas Gleixner [hch: moved the force_dma_unencrypted declaration to dma-mapping.h, fold the s390 fix from Halil Pasic] Signed-off-by: Christoph Hellwig --- kernel/dma/Kconfig | 3 +++ kernel/dma/direct.c | 16 ++++------------ 2 files changed, 7 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index 70f8f8d9200e..9decbba255fc 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -48,6 +48,9 @@ config ARCH_HAS_DMA_COHERENT_TO_PFN config ARCH_HAS_DMA_MMAP_PGPROT bool +config ARCH_HAS_FORCE_DMA_UNENCRYPTED + bool + config DMA_NONCOHERENT_CACHE_SYNC bool diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index b90e1aede743..d7cec866d16b 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -23,14 +23,6 @@ #define ARCH_ZONE_DMA_BITS 24 #endif -/* - * For AMD SEV all DMA must be to unencrypted addresses. - */ -static inline bool force_dma_unencrypted(void) -{ - return sev_active(); -} - static void report_addr(struct device *dev, dma_addr_t dma_addr, size_t size) { if (!dev->dma_mask) { @@ -46,7 +38,7 @@ static void report_addr(struct device *dev, dma_addr_t dma_addr, size_t size) static inline dma_addr_t phys_to_dma_direct(struct device *dev, phys_addr_t phys) { - if (force_dma_unencrypted()) + if (force_dma_unencrypted(dev)) return __phys_to_dma(dev, phys); return phys_to_dma(dev, phys); } @@ -67,7 +59,7 @@ static gfp_t __dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, if (dev->bus_dma_mask && dev->bus_dma_mask < dma_mask) dma_mask = dev->bus_dma_mask; - if (force_dma_unencrypted()) + if (force_dma_unencrypted(dev)) *phys_mask = __dma_to_phys(dev, dma_mask); else *phys_mask = dma_to_phys(dev, dma_mask); @@ -159,7 +151,7 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, } ret = page_address(page); - if (force_dma_unencrypted()) { + if (force_dma_unencrypted(dev)) { set_memory_decrypted((unsigned long)ret, 1 << get_order(size)); *dma_handle = __phys_to_dma(dev, page_to_phys(page)); } else { @@ -192,7 +184,7 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, return; } - if (force_dma_unencrypted()) + if (force_dma_unencrypted(dev)) set_memory_encrypted((unsigned long)cpu_addr, 1 << page_order); if (IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) && -- cgit v1.2.3 From 65f50f255349959f15f2761abd17ead8530b2f33 Mon Sep 17 00:00:00 2001 From: Weitao Hou Date: Tue, 16 Jul 2019 16:26:54 -0700 Subject: kernel: fix typos and some coding style in comments fix lenght to length Link: http://lkml.kernel.org/r/20190521050937.4370-1-houweitaoo@gmail.com Signed-off-by: Weitao Hou Acked-by: Kees Cook Cc: Colin Ian King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 1c1ad1e14f21..43186ccfa139 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -188,17 +188,17 @@ extern int no_unaligned_warning; * enum sysctl_writes_mode - supported sysctl write modes * * @SYSCTL_WRITES_LEGACY: each write syscall must fully contain the sysctl value - * to be written, and multiple writes on the same sysctl file descriptor - * will rewrite the sysctl value, regardless of file position. No warning - * is issued when the initial position is not 0. + * to be written, and multiple writes on the same sysctl file descriptor + * will rewrite the sysctl value, regardless of file position. No warning + * is issued when the initial position is not 0. * @SYSCTL_WRITES_WARN: same as above but warn when the initial file position is - * not 0. + * not 0. * @SYSCTL_WRITES_STRICT: writes to numeric sysctl entries must always be at - * file position 0 and the value must be fully contained in the buffer - * sent to the write syscall. If dealing with strings respect the file - * position, but restrict this to the max length of the buffer, anything - * passed the max lenght will be ignored. Multiple writes will append - * to the buffer. + * file position 0 and the value must be fully contained in the buffer + * sent to the write syscall. If dealing with strings respect the file + * position, but restrict this to the max length of the buffer, anything + * passed the max length will be ignored. Multiple writes will append + * to the buffer. * * These write modes control how current file position affects the behavior of * updating sysctl values through the proc interface on each write. -- cgit v1.2.3 From 201766a20e30f982ccfe36bebfad9602c3ff574a Mon Sep 17 00:00:00 2001 From: Elvira Khabirova Date: Tue, 16 Jul 2019 16:29:42 -0700 Subject: ptrace: add PTRACE_GET_SYSCALL_INFO request PTRACE_GET_SYSCALL_INFO is a generic ptrace API that lets ptracer obtain details of the syscall the tracee is blocked in. There are two reasons for a special syscall-related ptrace request. Firstly, with the current ptrace API there are cases when ptracer cannot retrieve necessary information about syscalls. Some examples include: * The notorious int-0x80-from-64-bit-task issue. See [1] for details. In short, if a 64-bit task performs a syscall through int 0x80, its tracer has no reliable means to find out that the syscall was, in fact, a compat syscall, and misidentifies it. * Syscall-enter-stop and syscall-exit-stop look the same for the tracer. Common practice is to keep track of the sequence of ptrace-stops in order not to mix the two syscall-stops up. But it is not as simple as it looks; for example, strace had a (just recently fixed) long-standing bug where attaching strace to a tracee that is performing the execve system call led to the tracer identifying the following syscall-exit-stop as syscall-enter-stop, which messed up all the state tracking. * Since the introduction of commit 84d77d3f06e7 ("ptrace: Don't allow accessing an undumpable mm"), both PTRACE_PEEKDATA and process_vm_readv become unavailable when the process dumpable flag is cleared. On such architectures as ia64 this results in all syscall arguments being unavailable for the tracer. Secondly, ptracers also have to support a lot of arch-specific code for obtaining information about the tracee. For some architectures, this requires a ptrace(PTRACE_PEEKUSER, ...) invocation for every syscall argument and return value. ptrace(2) man page: long ptrace(enum __ptrace_request request, pid_t pid, void *addr, void *data); ... PTRACE_GET_SYSCALL_INFO Retrieve information about the syscall that caused the stop. The information is placed into the buffer pointed by "data" argument, which should be a pointer to a buffer of type "struct ptrace_syscall_info". The "addr" argument contains the size of the buffer pointed to by "data" argument (i.e., sizeof(struct ptrace_syscall_info)). The return value contains the number of bytes available to be written by the kernel. If the size of data to be written by the kernel exceeds the size specified by "addr" argument, the output is truncated. [ldv@altlinux.org: selftests/seccomp/seccomp_bpf: update for PTRACE_GET_SYSCALL_INFO] Link: http://lkml.kernel.org/r/20190708182904.GA12332@altlinux.org Link: http://lkml.kernel.org/r/20190510152842.GF28558@altlinux.org Signed-off-by: Elvira Khabirova Co-developed-by: Dmitry V. Levin Signed-off-by: Dmitry V. Levin Reviewed-by: Oleg Nesterov Reviewed-by: Kees Cook Reviewed-by: Andy Lutomirski Cc: Eugene Syromyatnikov Cc: Benjamin Herrenschmidt Cc: Greentime Hu Cc: Helge Deller [parisc] Cc: James E.J. Bottomley Cc: James Hogan Cc: kbuild test robot Cc: Michael Ellerman Cc: Paul Burton Cc: Paul Mackerras Cc: Ralf Baechle Cc: Richard Kuo Cc: Shuah Khan Cc: Vincent Chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/ptrace.c | 101 +++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 100 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 83a531cea2f3..cb9ddcc08119 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -32,6 +32,8 @@ #include #include +#include /* for syscall_get_* */ + /* * Access another process' address space via ptrace. * Source/target buffer must be kernel space, @@ -897,7 +899,100 @@ static int ptrace_regset(struct task_struct *task, int req, unsigned int type, * to ensure no machine forgets it. */ EXPORT_SYMBOL_GPL(task_user_regset_view); -#endif + +static unsigned long +ptrace_get_syscall_info_entry(struct task_struct *child, struct pt_regs *regs, + struct ptrace_syscall_info *info) +{ + unsigned long args[ARRAY_SIZE(info->entry.args)]; + int i; + + info->op = PTRACE_SYSCALL_INFO_ENTRY; + info->entry.nr = syscall_get_nr(child, regs); + syscall_get_arguments(child, regs, args); + for (i = 0; i < ARRAY_SIZE(args); i++) + info->entry.args[i] = args[i]; + + /* args is the last field in struct ptrace_syscall_info.entry */ + return offsetofend(struct ptrace_syscall_info, entry.args); +} + +static unsigned long +ptrace_get_syscall_info_seccomp(struct task_struct *child, struct pt_regs *regs, + struct ptrace_syscall_info *info) +{ + /* + * As struct ptrace_syscall_info.entry is currently a subset + * of struct ptrace_syscall_info.seccomp, it makes sense to + * initialize that subset using ptrace_get_syscall_info_entry(). + * This can be reconsidered in the future if these structures + * diverge significantly enough. + */ + ptrace_get_syscall_info_entry(child, regs, info); + info->op = PTRACE_SYSCALL_INFO_SECCOMP; + info->seccomp.ret_data = child->ptrace_message; + + /* ret_data is the last field in struct ptrace_syscall_info.seccomp */ + return offsetofend(struct ptrace_syscall_info, seccomp.ret_data); +} + +static unsigned long +ptrace_get_syscall_info_exit(struct task_struct *child, struct pt_regs *regs, + struct ptrace_syscall_info *info) +{ + info->op = PTRACE_SYSCALL_INFO_EXIT; + info->exit.rval = syscall_get_error(child, regs); + info->exit.is_error = !!info->exit.rval; + if (!info->exit.is_error) + info->exit.rval = syscall_get_return_value(child, regs); + + /* is_error is the last field in struct ptrace_syscall_info.exit */ + return offsetofend(struct ptrace_syscall_info, exit.is_error); +} + +static int +ptrace_get_syscall_info(struct task_struct *child, unsigned long user_size, + void __user *datavp) +{ + struct pt_regs *regs = task_pt_regs(child); + struct ptrace_syscall_info info = { + .op = PTRACE_SYSCALL_INFO_NONE, + .arch = syscall_get_arch(child), + .instruction_pointer = instruction_pointer(regs), + .stack_pointer = user_stack_pointer(regs), + }; + unsigned long actual_size = offsetof(struct ptrace_syscall_info, entry); + unsigned long write_size; + + /* + * This does not need lock_task_sighand() to access + * child->last_siginfo because ptrace_freeze_traced() + * called earlier by ptrace_check_attach() ensures that + * the tracee cannot go away and clear its last_siginfo. + */ + switch (child->last_siginfo ? child->last_siginfo->si_code : 0) { + case SIGTRAP | 0x80: + switch (child->ptrace_message) { + case PTRACE_EVENTMSG_SYSCALL_ENTRY: + actual_size = ptrace_get_syscall_info_entry(child, regs, + &info); + break; + case PTRACE_EVENTMSG_SYSCALL_EXIT: + actual_size = ptrace_get_syscall_info_exit(child, regs, + &info); + break; + } + break; + case SIGTRAP | (PTRACE_EVENT_SECCOMP << 8): + actual_size = ptrace_get_syscall_info_seccomp(child, regs, + &info); + break; + } + + write_size = min(actual_size, user_size); + return copy_to_user(datavp, &info, write_size) ? -EFAULT : actual_size; +} +#endif /* CONFIG_HAVE_ARCH_TRACEHOOK */ int ptrace_request(struct task_struct *child, long request, unsigned long addr, unsigned long data) @@ -1114,6 +1209,10 @@ int ptrace_request(struct task_struct *child, long request, ret = __put_user(kiov.iov_len, &uiov->iov_len); break; } + + case PTRACE_GET_SYSCALL_INFO: + ret = ptrace_get_syscall_info(child, addr, datavp); + break; #endif case PTRACE_SECCOMP_GET_FILTER: -- cgit v1.2.3 From b772434be0891ed1081a08ae7cfd4666728f8e82 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 16 Jul 2019 16:29:53 -0700 Subject: signal: simplify set_user_sigmask/restore_user_sigmask task->saved_sigmask and ->restore_sigmask are only used in the ret-from- syscall paths. This means that set_user_sigmask() can save ->blocked in ->saved_sigmask and do set_restore_sigmask() to indicate that ->blocked was modified. This way the callers do not need 2 sigset_t's passed to set/restore and restore_user_sigmask() renamed to restore_saved_sigmask_unless() turns into the trivial helper which just calls restore_saved_sigmask(). Link: http://lkml.kernel.org/r/20190606113206.GA9464@redhat.com Signed-off-by: Oleg Nesterov Cc: Deepa Dinamani Cc: Arnd Bergmann Cc: Jens Axboe Cc: Davidlohr Bueso Cc: Eric Wong Cc: Jason Baron Cc: Thomas Gleixner Cc: Al Viro Cc: Eric W. Biederman Cc: David Laight Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 69 ++++++++++++++++----------------------------------------- 1 file changed, 19 insertions(+), 50 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index dabe100d2091..91b789dd6e72 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2951,80 +2951,49 @@ EXPORT_SYMBOL(sigprocmask); * * This is useful for syscalls such as ppoll, pselect, io_pgetevents and * epoll_pwait where a new sigmask is passed from userland for the syscalls. + * + * Note that it does set_restore_sigmask() in advance, so it must be always + * paired with restore_saved_sigmask_unless() before return from syscall. */ -int set_user_sigmask(const sigset_t __user *usigmask, sigset_t *set, - sigset_t *oldset, size_t sigsetsize) +int set_user_sigmask(const sigset_t __user *umask, size_t sigsetsize) { - if (!usigmask) - return 0; + sigset_t kmask; + if (!umask) + return 0; if (sigsetsize != sizeof(sigset_t)) return -EINVAL; - if (copy_from_user(set, usigmask, sizeof(sigset_t))) + if (copy_from_user(&kmask, umask, sizeof(sigset_t))) return -EFAULT; - *oldset = current->blocked; - set_current_blocked(set); + set_restore_sigmask(); + current->saved_sigmask = current->blocked; + set_current_blocked(&kmask); return 0; } -EXPORT_SYMBOL(set_user_sigmask); #ifdef CONFIG_COMPAT -int set_compat_user_sigmask(const compat_sigset_t __user *usigmask, - sigset_t *set, sigset_t *oldset, +int set_compat_user_sigmask(const compat_sigset_t __user *umask, size_t sigsetsize) { - if (!usigmask) - return 0; + sigset_t kmask; + if (!umask) + return 0; if (sigsetsize != sizeof(compat_sigset_t)) return -EINVAL; - if (get_compat_sigset(set, usigmask)) + if (get_compat_sigset(&kmask, umask)) return -EFAULT; - *oldset = current->blocked; - set_current_blocked(set); + set_restore_sigmask(); + current->saved_sigmask = current->blocked; + set_current_blocked(&kmask); return 0; } -EXPORT_SYMBOL(set_compat_user_sigmask); #endif -/* - * restore_user_sigmask: - * usigmask: sigmask passed in from userland. - * sigsaved: saved sigmask when the syscall started and changed the sigmask to - * usigmask. - * - * This is useful for syscalls such as ppoll, pselect, io_pgetevents and - * epoll_pwait where a new sigmask is passed in from userland for the syscalls. - */ -void restore_user_sigmask(const void __user *usigmask, sigset_t *sigsaved, - bool interrupted) -{ - - if (!usigmask) - return; - /* - * When signals are pending, do not restore them here. - * Restoring sigmask here can lead to delivering signals that the above - * syscalls are intended to block because of the sigmask passed in. - */ - if (interrupted) { - current->saved_sigmask = *sigsaved; - set_restore_sigmask(); - return; - } - - /* - * This is needed because the fast syscall return path does not restore - * saved_sigmask when signals are not pending. - */ - set_current_blocked(sigsaved); -} -EXPORT_SYMBOL(restore_user_sigmask); - /** * sys_rt_sigprocmask - change the list of currently blocked signals * @how: whether to add, remove, or set signals -- cgit v1.2.3 From f57e515a1b56325a28a0972c632a623a9c84590c Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Tue, 16 Jul 2019 16:30:06 -0700 Subject: kernel/pid.c: convert struct pid count to refcount_t struct pid's count is an atomic_t field used as a refcount. Use refcount_t for it which is basically atomic_t but does additional checking to prevent use-after-free bugs. For memory ordering, the only change is with the following: - if ((atomic_read(&pid->count) == 1) || - atomic_dec_and_test(&pid->count)) { + if (refcount_dec_and_test(&pid->count)) { kmem_cache_free(ns->pid_cachep, pid); Here the change is from: Fully ordered --> RELEASE + ACQUIRE (as per refcount-vs-atomic.rst) This ACQUIRE should take care of making sure the free happens after the refcount_dec_and_test(). The above hunk also removes atomic_read() since it is not needed for the code to work and it is unclear how beneficial it is. The removal lets refcount_dec_and_test() check for cases where get_pid() happened before the object was freed. Link: http://lkml.kernel.org/r/20190701183826.191936-1-joel@joelfernandes.org Signed-off-by: Joel Fernandes (Google) Reviewed-by: Andrea Parri Reviewed-by: Kees Cook Cc: Mathieu Desnoyers Cc: Matthew Wilcox Cc: Peter Zijlstra Cc: Will Deacon Cc: Paul E. McKenney Cc: Elena Reshetova Cc: Jann Horn Cc: Eric W. Biederman Cc: KJ Tsanaktsidis Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/pid.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index 16263b526560..0a9f2e437217 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -37,14 +37,14 @@ #include #include #include -#include +#include #include #include #include #include struct pid init_struct_pid = { - .count = ATOMIC_INIT(1), + .count = REFCOUNT_INIT(1), .tasks = { { .first = NULL }, { .first = NULL }, @@ -108,8 +108,7 @@ void put_pid(struct pid *pid) return; ns = pid->numbers[pid->level].ns; - if ((atomic_read(&pid->count) == 1) || - atomic_dec_and_test(&pid->count)) { + if (refcount_dec_and_test(&pid->count)) { kmem_cache_free(ns->pid_cachep, pid); put_pid_ns(ns); } @@ -212,7 +211,7 @@ struct pid *alloc_pid(struct pid_namespace *ns) } get_pid_ns(ns); - atomic_set(&pid->count, 1); + refcount_set(&pid->count, 1); for (type = 0; type < PIDTYPE_MAX; ++type) INIT_HLIST_HEAD(&pid->tasks[type]); -- cgit v1.2.3 From a5008b59cd9d8de12ab623cb5052bb4735330e5c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 16 Jul 2019 22:00:54 +0200 Subject: dma-direct: only limit the mapping size if swiotlb could be used Don't just check for a swiotlb buffer, but also if buffering might be required for this particular device. Fixes: 133d624b1cee ("dma: Introduce dma_max_mapping_size()") Reported-by: Benjamin Herrenschmidt Signed-off-by: Christoph Hellwig Tested-by: Benjamin Herrenschmidt --- kernel/dma/direct.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index d7cec866d16b..e269b6f9b444 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -399,11 +399,9 @@ int dma_direct_supported(struct device *dev, u64 mask) size_t dma_direct_max_mapping_size(struct device *dev) { - size_t size = SIZE_MAX; - /* If SWIOTLB is active, use its maximum mapping size */ - if (is_swiotlb_active()) - size = swiotlb_max_mapping_size(dev); - - return size; + if (is_swiotlb_active() && + (dma_addressing_limited(dev) || swiotlb_force == SWIOTLB_FORCE)) + return swiotlb_max_mapping_size(dev); + return SIZE_MAX; } -- cgit v1.2.3 From cf144f81a99d1a3928f90b0936accfd3f45c9a0a Mon Sep 17 00:00:00 2001 From: Daniel Jordan Date: Tue, 16 Jul 2019 12:32:53 -0400 Subject: padata: use smp_mb in padata_reorder to avoid orphaned padata jobs Testing padata with the tcrypt module on a 5.2 kernel... # modprobe tcrypt alg="pcrypt(rfc4106(gcm(aes)))" type=3 # modprobe tcrypt mode=211 sec=1 ...produces this splat: INFO: task modprobe:10075 blocked for more than 120 seconds. Not tainted 5.2.0-base+ #16 modprobe D 0 10075 10064 0x80004080 Call Trace: ? __schedule+0x4dd/0x610 ? ring_buffer_unlock_commit+0x23/0x100 schedule+0x6c/0x90 schedule_timeout+0x3b/0x320 ? trace_buffer_unlock_commit_regs+0x4f/0x1f0 wait_for_common+0x160/0x1a0 ? wake_up_q+0x80/0x80 { crypto_wait_req } # entries in braces added by hand { do_one_aead_op } { test_aead_jiffies } test_aead_speed.constprop.17+0x681/0xf30 [tcrypt] do_test+0x4053/0x6a2b [tcrypt] ? 0xffffffffa00f4000 tcrypt_mod_init+0x50/0x1000 [tcrypt] ... The second modprobe command never finishes because in padata_reorder, CPU0's load of reorder_objects is executed before the unlocking store in spin_unlock_bh(pd->lock), causing CPU0 to miss CPU1's increment: CPU0 CPU1 padata_reorder padata_do_serial LOAD reorder_objects // 0 INC reorder_objects // 1 padata_reorder TRYLOCK pd->lock // failed UNLOCK pd->lock CPU0 deletes the timer before returning from padata_reorder and since no other job is submitted to padata, modprobe waits indefinitely. Add a pair of full barriers to guarantee proper ordering: CPU0 CPU1 padata_reorder padata_do_serial UNLOCK pd->lock smp_mb() LOAD reorder_objects INC reorder_objects smp_mb__after_atomic() padata_reorder TRYLOCK pd->lock smp_mb__after_atomic is needed so the read part of the trylock operation comes after the INC, as Andrea points out. Thanks also to Andrea for help with writing a litmus test. Fixes: 16295bec6398 ("padata: Generic parallelization/serialization interface") Signed-off-by: Daniel Jordan Cc: Cc: Andrea Parri Cc: Boqun Feng Cc: Herbert Xu Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Steffen Klassert Cc: linux-arch@vger.kernel.org Cc: linux-crypto@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Herbert Xu --- kernel/padata.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index 2d2fddbb7a4c..15a8ad63f4ff 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -267,7 +267,12 @@ static void padata_reorder(struct parallel_data *pd) * The next object that needs serialization might have arrived to * the reorder queues in the meantime, we will be called again * from the timer function if no one else cares for it. + * + * Ensure reorder_objects is read after pd->lock is dropped so we see + * an increment from another task in padata_do_serial. Pairs with + * smp_mb__after_atomic in padata_do_serial. */ + smp_mb(); if (atomic_read(&pd->reorder_objects) && !(pinst->flags & PADATA_RESET)) mod_timer(&pd->timer, jiffies + HZ); @@ -387,6 +392,13 @@ void padata_do_serial(struct padata_priv *padata) list_add_tail(&padata->list, &pqueue->reorder.list); spin_unlock(&pqueue->reorder.lock); + /* + * Ensure the atomic_inc of reorder_objects above is ordered correctly + * with the trylock of pd->lock in padata_reorder. Pairs with smp_mb + * in padata_reorder. + */ + smp_mb__after_atomic(); + put_cpu(); /* If we're running on the wrong CPU, call padata_reorder() via a -- cgit v1.2.3 From cac9b9a4b08304f11daace03b8b48659355e44c1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 18 Jul 2019 10:47:47 +0200 Subject: stacktrace: Force USER_DS for stack_trace_save_user() When walking userspace stacks, USER_DS needs to be set, otherwise access_ok() will not function as expected. Reported-by: Vegard Nossum Reported-by: Eiichi Tsukata Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Tested-by: Vegard Nossum Reviewed-by: Joel Fernandes (Google) Link: https://lkml.kernel.org/r/20190718085754.GM3402@hirez.programming.kicks-ass.net --- kernel/stacktrace.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index e6a02b274b73..f5440abb7532 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c @@ -226,12 +226,17 @@ unsigned int stack_trace_save_user(unsigned long *store, unsigned int size) .store = store, .size = size, }; + mm_segment_t fs; /* Trace user stack if not a kernel thread */ if (current->flags & PF_KTHREAD) return 0; + fs = get_fs(); + set_fs(USER_DS); arch_stack_walk_user(consume_entry, &c, task_pt_regs(current)); + set_fs(fs); + return c.len; } #endif -- cgit v1.2.3 From 3193c0836f203a91bef96d88c64cccf0be090d9c Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 17 Jul 2019 20:36:45 -0500 Subject: bpf: Disable GCC -fgcse optimization for ___bpf_prog_run() On x86-64, with CONFIG_RETPOLINE=n, GCC's "global common subexpression elimination" optimization results in ___bpf_prog_run()'s jumptable code changing from this: select_insn: jmp *jumptable(, %rax, 8) ... ALU64_ADD_X: ... jmp *jumptable(, %rax, 8) ALU_ADD_X: ... jmp *jumptable(, %rax, 8) to this: select_insn: mov jumptable, %r12 jmp *(%r12, %rax, 8) ... ALU64_ADD_X: ... jmp *(%r12, %rax, 8) ALU_ADD_X: ... jmp *(%r12, %rax, 8) The jumptable address is placed in a register once, at the beginning of the function. The function execution can then go through multiple indirect jumps which rely on that same register value. This has a few issues: 1) Objtool isn't smart enough to be able to track such a register value across multiple recursive indirect jumps through the jump table. 2) With CONFIG_RETPOLINE enabled, this optimization actually results in a small slowdown. I measured a ~4.7% slowdown in the test_bpf "tcpdump port 22" selftest. This slowdown is actually predicted by the GCC manual: Note: When compiling a program using computed gotos, a GCC extension, you may get better run-time performance if you disable the global common subexpression elimination pass by adding -fno-gcse to the command line. So just disable the optimization for this function. Fixes: e55a73251da3 ("bpf: Fix ORC unwinding in non-JIT BPF code") Reported-by: Randy Dunlap Signed-off-by: Josh Poimboeuf Signed-off-by: Thomas Gleixner Acked-by: Alexei Starovoitov Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/30c3ca29ba037afcbd860a8672eef0021addf9fe.1563413318.git.jpoimboe@redhat.com --- kernel/bpf/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 7e98f36a14e2..8191a7db2777 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1295,7 +1295,7 @@ bool bpf_opcode_in_insntable(u8 code) * * Decode and execute eBPF instructions. */ -static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack) +static u64 __no_fgcse ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack) { #define BPF_INSN_2_LBL(x, y) [BPF_##x | BPF_##y] = &&x##_##y #define BPF_INSN_3_LBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = &&x##_##y##_##z -- cgit v1.2.3 From a50a3f4b6a313dc76912bd4ad3b8b4f4b479c801 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 17 Jul 2019 22:01:49 +0200 Subject: sched/rt, Kconfig: Introduce CONFIG_PREEMPT_RT Add a new entry to the preemption menu which enables the real-time support for the kernel. The choice is only enabled when an architecture supports it. It selects PREEMPT as the RT features depend on it. To achieve that the existing PREEMPT choice is renamed to PREEMPT_LL which select PREEMPT as well. No functional change. Signed-off-by: Thomas Gleixner Acked-by: Paul E. McKenney Acked-by: Steven Rostedt (VMware) Acked-by: Clark Williams Acked-by: Daniel Bristot de Oliveira Acked-by: Frederic Weisbecker Acked-by: Peter Zijlstra (Intel) Acked-by: Marc Zyngier Acked-by: Daniel Wagner Acked-by: Luis Claudio R. Goncalves Acked-by: Julia Cartwright Acked-by: Tom Zanussi Acked-by: Gratian Crisan Acked-by: Sebastian Siewior Cc: Andrew Morton Cc: Christoph Hellwig Cc: Greg Kroah-Hartman Cc: Linus Torvalds Cc: Lukas Bulwahn Cc: Mike Galbraith Cc: Tejun Heo Link: http://lkml.kernel.org/r/alpine.DEB.2.21.1907172200190.1778@nanos.tec.linutronix.de Signed-off-by: Ingo Molnar --- kernel/Kconfig.preempt | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index dc0b682ec2d9..fc020c09b7e8 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -35,10 +35,10 @@ config PREEMPT_VOLUNTARY Select this if you are building a kernel for a desktop system. -config PREEMPT +config PREEMPT_LL bool "Preemptible Kernel (Low-Latency Desktop)" depends on !ARCH_NO_PREEMPT - select PREEMPT_COUNT + select PREEMPT select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK help This option reduces the latency of the kernel by making @@ -55,7 +55,28 @@ config PREEMPT embedded system with latency requirements in the milliseconds range. +config PREEMPT_RT + bool "Fully Preemptible Kernel (Real-Time)" + depends on EXPERT && ARCH_SUPPORTS_RT + select PREEMPT + help + This option turns the kernel into a real-time kernel by replacing + various locking primitives (spinlocks, rwlocks, etc.) with + preemptible priority-inheritance aware variants, enforcing + interrupt threading and introducing mechanisms to break up long + non-preemptible sections. This makes the kernel, except for very + low level and critical code pathes (entry code, scheduler, low + level interrupt handling) fully preemptible and brings most + execution contexts under scheduler control. + + Select this if you are building a kernel for systems which + require real-time guarantees. + endchoice config PREEMPT_COUNT bool + +config PREEMPT + bool + select PREEMPT_COUNT -- cgit v1.2.3 From 49f17c26c123b60fd1c74629eef077740d16ffc2 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Thu, 18 Jul 2019 15:57:31 -0700 Subject: resource: fix locking in find_next_iomem_res() Since resources can be removed, locking should ensure that the resource is not removed while accessing it. However, find_next_iomem_res() does not hold the lock while copying the data of the resource. Keep holding the lock while the data is copied. While at it, change the return value to a more informative value. It is disregarded by the callers. [akpm@linux-foundation.org: fix find_next_iomem_res() documentation] Link: http://lkml.kernel.org/r/20190613045903.4922-2-namit@vmware.com Fixes: ff3cc952d3f00 ("resource: Add remove_resource interface") Signed-off-by: Nadav Amit Reviewed-by: Andrew Morton Reviewed-by: Dan Williams Cc: Borislav Petkov Cc: Toshi Kani Cc: Peter Zijlstra Cc: Dave Hansen Cc: Bjorn Helgaas Cc: Ingo Molnar Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/resource.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index d22423e85cf8..3ced0cd45bdd 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -326,7 +326,7 @@ EXPORT_SYMBOL(release_resource); * * If a resource is found, returns 0 and @*res is overwritten with the part * of the resource that's within [@start..@end]; if none is found, returns - * -1 or -EINVAL for other invalid parameters. + * -ENODEV. Returns -EINVAL for invalid parameters. * * This function walks the whole tree and not just first level children * unless @first_lvl is true. @@ -365,16 +365,16 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end, break; } + if (p) { + /* copy data */ + res->start = max(start, p->start); + res->end = min(end, p->end); + res->flags = p->flags; + res->desc = p->desc; + } + read_unlock(&resource_lock); - if (!p) - return -1; - - /* copy data */ - res->start = max(start, p->start); - res->end = min(end, p->end); - res->flags = p->flags; - res->desc = p->desc; - return 0; + return p ? 0 : -ENODEV; } static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end, -- cgit v1.2.3 From 756398750e11ade1e617cd2a8f8d66fe7ed637e1 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Thu, 18 Jul 2019 15:57:34 -0700 Subject: resource: avoid unnecessary lookups in find_next_iomem_res() find_next_iomem_res() shows up to be a source for overhead in dax benchmarks. Improve performance by not considering children of the tree if the top level does not match. Since the range of the parents should include the range of the children such check is redundant. Running sysbench on dax (pmem emulation, with write_cache disabled): sysbench fileio --file-total-size=3G --file-test-mode=rndwr \ --file-io-mode=mmap --threads=4 --file-fsync-mode=fdatasync run Provides the following results: events (avg/stddev) ------------------- 5.2-rc3: 1247669.0000/16075.39 w/patch: 1286320.5000/16402.72 (+3%) Link: http://lkml.kernel.org/r/20190613045903.4922-3-namit@vmware.com Signed-off-by: Nadav Amit Cc: Borislav Petkov Cc: Toshi Kani Cc: Peter Zijlstra Cc: Dave Hansen Cc: Dan Williams Cc: Bjorn Helgaas Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/resource.c | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 3ced0cd45bdd..7ea4306503c5 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -342,6 +342,7 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end, unsigned long flags, unsigned long desc, bool first_lvl, struct resource *res) { + bool siblings_only = true; struct resource *p; if (!res) @@ -352,17 +353,31 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end, read_lock(&resource_lock); - for (p = iomem_resource.child; p; p = next_resource(p, first_lvl)) { - if ((p->flags & flags) != flags) - continue; - if ((desc != IORES_DESC_NONE) && (desc != p->desc)) - continue; + for (p = iomem_resource.child; p; p = next_resource(p, siblings_only)) { + /* If we passed the resource we are looking for, stop */ if (p->start > end) { p = NULL; break; } - if ((p->end >= start) && (p->start <= end)) - break; + + /* Skip until we find a range that matches what we look for */ + if (p->end < start) + continue; + + /* + * Now that we found a range that matches what we look for, + * check the flags and the descriptor. If we were not asked to + * use only the first level, start looking at children as well. + */ + siblings_only = first_lvl; + + if ((p->flags & flags) != flags) + continue; + if ((desc != IORES_DESC_NONE) && (desc != p->desc)) + continue; + + /* Found a match, break */ + break; } if (p) { -- cgit v1.2.3 From 7cc7867fb06166ac113eda9cf20d3c15d95ff6f5 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 18 Jul 2019 15:58:33 -0700 Subject: mm/devm_memremap_pages: enable sub-section remap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Teach devm_memremap_pages() about the new sub-section capabilities of arch_{add,remove}_memory(). Effectively, just replace all usage of align_start, align_end, and align_size with res->start, res->end, and resource_size(res). The existing sanity check will still make sure that the two separate remap attempts do not collide within a sub-section (2MB on x86). Link: http://lkml.kernel.org/r/156092355542.979959.10060071713397030576.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams Tested-by: Aneesh Kumar K.V [ppc64] Cc: Michal Hocko Cc: Toshi Kani Cc: Jérôme Glisse Cc: Logan Gunthorpe Cc: Oscar Salvador Cc: Pavel Tatashin Cc: David Hildenbrand Cc: Jane Chu Cc: Jeff Moyer Cc: Jonathan Corbet Cc: Mike Rapoport Cc: Vlastimil Babka Cc: Wei Yang Cc: Jason Gunthorpe Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/memremap.c | 57 ++++++++++++++++++++++--------------------------------- 1 file changed, 23 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index bea6f887adad..6ee03a816d67 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -54,7 +54,7 @@ static void pgmap_array_delete(struct resource *res) static unsigned long pfn_first(struct dev_pagemap *pgmap) { - return (pgmap->res.start >> PAGE_SHIFT) + + return PHYS_PFN(pgmap->res.start) + vmem_altmap_offset(pgmap_altmap(pgmap)); } @@ -98,7 +98,6 @@ static void devm_memremap_pages_release(void *data) struct dev_pagemap *pgmap = data; struct device *dev = pgmap->dev; struct resource *res = &pgmap->res; - resource_size_t align_start, align_size; unsigned long pfn; int nid; @@ -108,25 +107,21 @@ static void devm_memremap_pages_release(void *data) dev_pagemap_cleanup(pgmap); /* pages are dead and unused, undo the arch mapping */ - align_start = res->start & ~(SECTION_SIZE - 1); - align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE) - - align_start; - - nid = page_to_nid(pfn_to_page(align_start >> PAGE_SHIFT)); + nid = page_to_nid(pfn_to_page(PHYS_PFN(res->start))); mem_hotplug_begin(); if (pgmap->type == MEMORY_DEVICE_PRIVATE) { - pfn = align_start >> PAGE_SHIFT; + pfn = PHYS_PFN(res->start); __remove_pages(page_zone(pfn_to_page(pfn)), pfn, - align_size >> PAGE_SHIFT, NULL); + PHYS_PFN(resource_size(res)), NULL); } else { - arch_remove_memory(nid, align_start, align_size, + arch_remove_memory(nid, res->start, resource_size(res), pgmap_altmap(pgmap)); - kasan_remove_zero_shadow(__va(align_start), align_size); + kasan_remove_zero_shadow(__va(res->start), resource_size(res)); } mem_hotplug_done(); - untrack_pfn(NULL, PHYS_PFN(align_start), align_size); + untrack_pfn(NULL, PHYS_PFN(res->start), resource_size(res)); pgmap_array_delete(res); dev_WARN_ONCE(dev, pgmap->altmap.alloc, "%s: failed to free all reserved pages\n", __func__); @@ -162,13 +157,12 @@ static void dev_pagemap_percpu_release(struct percpu_ref *ref) */ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) { - resource_size_t align_start, align_size, align_end; struct resource *res = &pgmap->res; struct dev_pagemap *conflict_pgmap; struct mhp_restrictions restrictions = { /* * We do not want any optional features only our own memmap - */ + */ .altmap = pgmap_altmap(pgmap), }; pgprot_t pgprot = PAGE_KERNEL; @@ -225,12 +219,7 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) return ERR_PTR(error); } - align_start = res->start & ~(SECTION_SIZE - 1); - align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE) - - align_start; - align_end = align_start + align_size - 1; - - conflict_pgmap = get_dev_pagemap(PHYS_PFN(align_start), NULL); + conflict_pgmap = get_dev_pagemap(PHYS_PFN(res->start), NULL); if (conflict_pgmap) { dev_WARN(dev, "Conflicting mapping in same section\n"); put_dev_pagemap(conflict_pgmap); @@ -238,7 +227,7 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) goto err_array; } - conflict_pgmap = get_dev_pagemap(PHYS_PFN(align_end), NULL); + conflict_pgmap = get_dev_pagemap(PHYS_PFN(res->end), NULL); if (conflict_pgmap) { dev_WARN(dev, "Conflicting mapping in same section\n"); put_dev_pagemap(conflict_pgmap); @@ -246,7 +235,7 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) goto err_array; } - is_ram = region_intersects(align_start, align_size, + is_ram = region_intersects(res->start, resource_size(res), IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE); if (is_ram != REGION_DISJOINT) { @@ -267,8 +256,8 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) if (nid < 0) nid = numa_mem_id(); - error = track_pfn_remap(NULL, &pgprot, PHYS_PFN(align_start), 0, - align_size); + error = track_pfn_remap(NULL, &pgprot, PHYS_PFN(res->start), 0, + resource_size(res)); if (error) goto err_pfn_remap; @@ -286,16 +275,16 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) * arch_add_memory(). */ if (pgmap->type == MEMORY_DEVICE_PRIVATE) { - error = add_pages(nid, align_start >> PAGE_SHIFT, - align_size >> PAGE_SHIFT, &restrictions); + error = add_pages(nid, PHYS_PFN(res->start), + PHYS_PFN(resource_size(res)), &restrictions); } else { - error = kasan_add_zero_shadow(__va(align_start), align_size); + error = kasan_add_zero_shadow(__va(res->start), resource_size(res)); if (error) { mem_hotplug_done(); goto err_kasan; } - error = arch_add_memory(nid, align_start, align_size, + error = arch_add_memory(nid, res->start, resource_size(res), &restrictions); } @@ -303,8 +292,8 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) struct zone *zone; zone = &NODE_DATA(nid)->node_zones[ZONE_DEVICE]; - move_pfn_range_to_zone(zone, align_start >> PAGE_SHIFT, - align_size >> PAGE_SHIFT, pgmap_altmap(pgmap)); + move_pfn_range_to_zone(zone, PHYS_PFN(res->start), + PHYS_PFN(resource_size(res)), restrictions.altmap); } mem_hotplug_done(); @@ -316,8 +305,8 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) * to allow us to do the work while not holding the hotplug lock. */ memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], - align_start >> PAGE_SHIFT, - align_size >> PAGE_SHIFT, pgmap); + PHYS_PFN(res->start), + PHYS_PFN(resource_size(res)), pgmap); percpu_ref_get_many(pgmap->ref, pfn_end(pgmap) - pfn_first(pgmap)); error = devm_add_action_or_reset(dev, devm_memremap_pages_release, @@ -328,9 +317,9 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) return __va(res->start); err_add_memory: - kasan_remove_zero_shadow(__va(align_start), align_size); + kasan_remove_zero_shadow(__va(res->start), resource_size(res)); err_kasan: - untrack_pfn(NULL, PHYS_PFN(align_start), align_size); + untrack_pfn(NULL, PHYS_PFN(res->start), resource_size(res)); err_pfn_remap: pgmap_array_delete(res); err_array: -- cgit v1.2.3 From eec4844fae7c033a0c1fc1eb3b8517aeb8b6cc49 Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Thu, 18 Jul 2019 15:58:50 -0700 Subject: proc/sysctl: add shared variables for range check In the sysctl code the proc_dointvec_minmax() function is often used to validate the user supplied value between an allowed range. This function uses the extra1 and extra2 members from struct ctl_table as minimum and maximum allowed value. On sysctl handler declaration, in every source file there are some readonly variables containing just an integer which address is assigned to the extra1 and extra2 members, so the sysctl range is enforced. The special values 0, 1 and INT_MAX are very often used as range boundary, leading duplication of variables like zero=0, one=1, int_max=INT_MAX in different source files: $ git grep -E '\.extra[12].*&(zero|one|int_max)' |wc -l 248 Add a const int array containing the most commonly used values, some macros to refer more easily to the correct array member, and use them instead of creating a local one for every object file. This is the bloat-o-meter output comparing the old and new binary compiled with the default Fedora config: # scripts/bloat-o-meter -d vmlinux.o.old vmlinux.o add/remove: 2/2 grow/shrink: 0/2 up/down: 24/-188 (-164) Data old new delta sysctl_vals - 12 +12 __kstrtab_sysctl_vals - 12 +12 max 14 10 -4 int_max 16 - -16 one 68 - -68 zero 128 28 -100 Total: Before=20583249, After=20583085, chg -0.00% [mcroce@redhat.com: tipc: remove two unused variables] Link: http://lkml.kernel.org/r/20190530091952.4108-1-mcroce@redhat.com [akpm@linux-foundation.org: fix net/ipv6/sysctl_net_ipv6.c] [arnd@arndb.de: proc/sysctl: make firmware loader table conditional] Link: http://lkml.kernel.org/r/20190617130014.1713870-1-arnd@arndb.de [akpm@linux-foundation.org: fix fs/eventpoll.c] Link: http://lkml.kernel.org/r/20190430180111.10688-1-mcroce@redhat.com Signed-off-by: Matteo Croce Signed-off-by: Arnd Bergmann Acked-by: Kees Cook Reviewed-by: Aaron Tomlin Cc: Matthew Wilcox Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/pid_namespace.c | 3 +- kernel/sysctl.c | 197 ++++++++++++++++++++++++------------------------- kernel/ucount.c | 6 +- 3 files changed, 100 insertions(+), 106 deletions(-) (limited to 'kernel') diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 6d726cef241c..a6a79f85c81a 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -291,14 +291,13 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write, } extern int pid_max; -static int zero = 0; static struct ctl_table pid_ns_ctl_table[] = { { .procname = "ns_last_pid", .maxlen = sizeof(int), .mode = 0666, /* permissions are checked in the handler */ .proc_handler = pid_ns_ctl_handler, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &pid_max, }, { } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 43186ccfa139..078950d9605b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -125,9 +125,6 @@ static int sixty = 60; #endif static int __maybe_unused neg_one = -1; - -static int zero; -static int __maybe_unused one = 1; static int __maybe_unused two = 2; static int __maybe_unused four = 4; static unsigned long zero_ul; @@ -385,8 +382,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = sysctl_schedstats, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, #endif /* CONFIG_SCHEDSTATS */ #endif /* CONFIG_SMP */ @@ -418,7 +415,7 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &one, + .extra1 = SYSCTL_ONE, }, { .procname = "numa_balancing", @@ -426,8 +423,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = sysctl_numa_balancing, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, #endif /* CONFIG_NUMA_BALANCING */ #endif /* CONFIG_SCHED_DEBUG */ @@ -475,8 +472,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, #endif #ifdef CONFIG_CFS_BANDWIDTH @@ -486,7 +483,7 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &one, + .extra1 = SYSCTL_ONE, }, #endif #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) @@ -496,8 +493,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = sched_energy_aware_handler, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, #endif #ifdef CONFIG_PROVE_LOCKING @@ -562,7 +559,7 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &neg_one, - .extra2 = &one, + .extra2 = SYSCTL_ONE, }, #endif #ifdef CONFIG_LATENCYTOP @@ -696,8 +693,8 @@ static struct ctl_table kern_table[] = { .mode = 0644, /* only handle a transition from default "0" to "1" */ .proc_handler = proc_dointvec_minmax, - .extra1 = &one, - .extra2 = &one, + .extra1 = SYSCTL_ONE, + .extra2 = SYSCTL_ONE, }, #endif #ifdef CONFIG_MODULES @@ -715,8 +712,8 @@ static struct ctl_table kern_table[] = { .mode = 0644, /* only handle a transition from default "0" to "1" */ .proc_handler = proc_dointvec_minmax, - .extra1 = &one, - .extra2 = &one, + .extra1 = SYSCTL_ONE, + .extra2 = SYSCTL_ONE, }, #endif #ifdef CONFIG_UEVENT_HELPER @@ -875,7 +872,7 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &ten_thousand, }, { @@ -891,8 +888,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax_sysadmin, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, { .procname = "kptr_restrict", @@ -900,7 +897,7 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax_sysadmin, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &two, }, #endif @@ -925,8 +922,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_watchdog, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, { .procname = "watchdog_thresh", @@ -934,7 +931,7 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_watchdog_thresh, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &sixty, }, { @@ -943,8 +940,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(int), .mode = NMI_WATCHDOG_SYSCTL_PERM, .proc_handler = proc_nmi_watchdog, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, { .procname = "watchdog_cpumask", @@ -960,8 +957,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_soft_watchdog, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, { .procname = "softlockup_panic", @@ -969,8 +966,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, #ifdef CONFIG_SMP { @@ -979,8 +976,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, #endif /* CONFIG_SMP */ #endif @@ -991,8 +988,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, #ifdef CONFIG_SMP { @@ -1001,8 +998,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, #endif /* CONFIG_SMP */ #endif @@ -1115,8 +1112,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, { .procname = "hung_task_check_count", @@ -1124,7 +1121,7 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, }, { .procname = "hung_task_timeout_secs", @@ -1201,7 +1198,7 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(sysctl_perf_event_sample_rate), .mode = 0644, .proc_handler = perf_proc_update_handler, - .extra1 = &one, + .extra1 = SYSCTL_ONE, }, { .procname = "perf_cpu_time_max_percent", @@ -1209,7 +1206,7 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(sysctl_perf_cpu_time_max_percent), .mode = 0644, .proc_handler = perf_cpu_time_max_percent_handler, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &one_hundred, }, { @@ -1218,7 +1215,7 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(sysctl_perf_event_max_stack), .mode = 0644, .proc_handler = perf_event_max_stack_handler, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &six_hundred_forty_kb, }, { @@ -1227,7 +1224,7 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(sysctl_perf_event_max_contexts_per_stack), .mode = 0644, .proc_handler = perf_event_max_stack_handler, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &one_thousand, }, #endif @@ -1237,8 +1234,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) { @@ -1247,8 +1244,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = timer_migration_handler, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, #endif #ifdef CONFIG_BPF_SYSCALL @@ -1259,8 +1256,8 @@ static struct ctl_table kern_table[] = { .mode = 0644, /* only handle a transition from default "0" to "1" */ .proc_handler = proc_dointvec_minmax, - .extra1 = &one, - .extra2 = &one, + .extra1 = SYSCTL_ONE, + .extra2 = SYSCTL_ONE, }, { .procname = "bpf_stats_enabled", @@ -1277,8 +1274,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(sysctl_panic_on_rcu_stall), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, #endif #ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE @@ -1288,8 +1285,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(int), .mode = 0600, .proc_handler = stack_erasing_sysctl, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, #endif { } @@ -1302,7 +1299,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(sysctl_overcommit_memory), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &two, }, { @@ -1311,7 +1308,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(sysctl_panic_on_oom), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &two, }, { @@ -1348,7 +1345,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, }, { .procname = "dirty_background_ratio", @@ -1356,7 +1353,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(dirty_background_ratio), .mode = 0644, .proc_handler = dirty_background_ratio_handler, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &one_hundred, }, { @@ -1373,7 +1370,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(vm_dirty_ratio), .mode = 0644, .proc_handler = dirty_ratio_handler, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &one_hundred, }, { @@ -1397,7 +1394,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(dirty_expire_interval), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, }, { .procname = "dirtytime_expire_seconds", @@ -1405,7 +1402,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(dirtytime_expire_interval), .mode = 0644, .proc_handler = dirtytime_interval_handler, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, }, { .procname = "swappiness", @@ -1413,7 +1410,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(vm_swappiness), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &one_hundred, }, #ifdef CONFIG_HUGETLB_PAGE @@ -1438,8 +1435,8 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = sysctl_vm_numa_stat_handler, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, #endif { @@ -1470,7 +1467,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = drop_caches_sysctl_handler, - .extra1 = &one, + .extra1 = SYSCTL_ONE, .extra2 = &four, }, #ifdef CONFIG_COMPACTION @@ -1496,8 +1493,8 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, #endif /* CONFIG_COMPACTION */ @@ -1507,7 +1504,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(min_free_kbytes), .mode = 0644, .proc_handler = min_free_kbytes_sysctl_handler, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, }, { .procname = "watermark_boost_factor", @@ -1515,7 +1512,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(watermark_boost_factor), .mode = 0644, .proc_handler = watermark_boost_factor_sysctl_handler, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, }, { .procname = "watermark_scale_factor", @@ -1523,7 +1520,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(watermark_scale_factor), .mode = 0644, .proc_handler = watermark_scale_factor_sysctl_handler, - .extra1 = &one, + .extra1 = SYSCTL_ONE, .extra2 = &one_thousand, }, { @@ -1532,7 +1529,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(percpu_pagelist_fraction), .mode = 0644, .proc_handler = percpu_pagelist_fraction_sysctl_handler, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, }, #ifdef CONFIG_MMU { @@ -1541,7 +1538,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(sysctl_max_map_count), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, }, #else { @@ -1550,7 +1547,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(sysctl_nr_trim_pages), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, }, #endif { @@ -1566,7 +1563,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(block_dump), .mode = 0644, .proc_handler = proc_dointvec, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, }, { .procname = "vfs_cache_pressure", @@ -1574,7 +1571,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(sysctl_vfs_cache_pressure), .mode = 0644, .proc_handler = proc_dointvec, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, }, #ifdef HAVE_ARCH_PICK_MMAP_LAYOUT { @@ -1583,7 +1580,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(sysctl_legacy_va_layout), .mode = 0644, .proc_handler = proc_dointvec, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, }, #endif #ifdef CONFIG_NUMA @@ -1593,7 +1590,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(node_reclaim_mode), .mode = 0644, .proc_handler = proc_dointvec, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, }, { .procname = "min_unmapped_ratio", @@ -1601,7 +1598,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(sysctl_min_unmapped_ratio), .mode = 0644, .proc_handler = sysctl_min_unmapped_ratio_sysctl_handler, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &one_hundred, }, { @@ -1610,7 +1607,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(sysctl_min_slab_ratio), .mode = 0644, .proc_handler = sysctl_min_slab_ratio_sysctl_handler, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &one_hundred, }, #endif @@ -1661,7 +1658,7 @@ static struct ctl_table vm_table[] = { #endif .mode = 0644, .proc_handler = proc_dointvec, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, }, #endif #ifdef CONFIG_HIGHMEM @@ -1671,8 +1668,8 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(vm_highmem_is_dirtyable), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, #endif #ifdef CONFIG_MEMORY_FAILURE @@ -1682,8 +1679,8 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(sysctl_memory_failure_early_kill), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, { .procname = "memory_failure_recovery", @@ -1691,8 +1688,8 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(sysctl_memory_failure_recovery), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, #endif { @@ -1738,8 +1735,8 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(sysctl_unprivileged_userfaultfd), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, #endif { } @@ -1875,8 +1872,8 @@ static struct ctl_table fs_table[] = { .maxlen = sizeof(int), .mode = 0600, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, { .procname = "protected_hardlinks", @@ -1884,8 +1881,8 @@ static struct ctl_table fs_table[] = { .maxlen = sizeof(int), .mode = 0600, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, { .procname = "protected_fifos", @@ -1893,7 +1890,7 @@ static struct ctl_table fs_table[] = { .maxlen = sizeof(int), .mode = 0600, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &two, }, { @@ -1902,7 +1899,7 @@ static struct ctl_table fs_table[] = { .maxlen = sizeof(int), .mode = 0600, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &two, }, { @@ -1911,7 +1908,7 @@ static struct ctl_table fs_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax_coredump, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &two, }, #if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE) @@ -1948,7 +1945,7 @@ static struct ctl_table fs_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &one, + .extra1 = SYSCTL_ONE, }, { } }; @@ -1970,8 +1967,8 @@ static struct ctl_table debug_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_kprobes_optimization_handler, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, #endif { } @@ -3395,8 +3392,8 @@ int proc_do_static_key(struct ctl_table *table, int write, .data = &val, .maxlen = sizeof(val), .mode = table->mode, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }; if (write && !capable(CAP_SYS_ADMIN)) diff --git a/kernel/ucount.c b/kernel/ucount.c index feb128c7b5d9..a53cc2b4179c 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -52,16 +52,14 @@ static struct ctl_table_root set_root = { .permissions = set_permissions, }; -static int zero = 0; -static int int_max = INT_MAX; #define UCOUNT_ENTRY(name) \ { \ .procname = name, \ .maxlen = sizeof(int), \ .mode = 0644, \ .proc_handler = proc_dointvec_minmax, \ - .extra1 = &zero, \ - .extra2 = &int_max, \ + .extra1 = SYSCTL_ZERO, \ + .extra2 = SYSCTL_INT_MAX, \ } static struct ctl_table user_table[] = { UCOUNT_ENTRY("max_user_namespaces"), -- cgit v1.2.3 From 449fa54d6815be8c2c1f68fa9dbbae9384a7c03e Mon Sep 17 00:00:00 2001 From: Fugang Duan Date: Fri, 19 Jul 2019 17:26:48 +0800 Subject: dma-direct: correct the physical addr in dma_direct_sync_sg_for_cpu/device dma_map_sg() may use swiotlb buffer when the kernel command line includes "swiotlb=force" or the dma_addr is out of dev->dma_mask range. After DMA complete the memory moving from device to memory, then user call dma_sync_sg_for_cpu() to sync with DMA buffer, and copy the original virtual buffer to other space. So dma_direct_sync_sg_for_cpu() should use swiotlb physical addr, not the original physical addr from sg_phys(sg). dma_direct_sync_sg_for_device() also has the same issue, correct it as well. Fixes: 55897af63091("dma-direct: merge swiotlb_dma_ops into the dma_direct code") Signed-off-by: Fugang Duan Reviewed-by: Robin Murphy Signed-off-by: Christoph Hellwig --- kernel/dma/direct.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index e269b6f9b444..59bdceea3737 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -234,12 +234,14 @@ void dma_direct_sync_sg_for_device(struct device *dev, int i; for_each_sg(sgl, sg, nents, i) { - if (unlikely(is_swiotlb_buffer(sg_phys(sg)))) - swiotlb_tbl_sync_single(dev, sg_phys(sg), sg->length, + phys_addr_t paddr = dma_to_phys(dev, sg_dma_address(sg)); + + if (unlikely(is_swiotlb_buffer(paddr))) + swiotlb_tbl_sync_single(dev, paddr, sg->length, dir, SYNC_FOR_DEVICE); if (!dev_is_dma_coherent(dev)) - arch_sync_dma_for_device(dev, sg_phys(sg), sg->length, + arch_sync_dma_for_device(dev, paddr, sg->length, dir); } } @@ -271,11 +273,13 @@ void dma_direct_sync_sg_for_cpu(struct device *dev, int i; for_each_sg(sgl, sg, nents, i) { + phys_addr_t paddr = dma_to_phys(dev, sg_dma_address(sg)); + if (!dev_is_dma_coherent(dev)) - arch_sync_dma_for_cpu(dev, sg_phys(sg), sg->length, dir); - - if (unlikely(is_swiotlb_buffer(sg_phys(sg)))) - swiotlb_tbl_sync_single(dev, sg_phys(sg), sg->length, dir, + arch_sync_dma_for_cpu(dev, paddr, sg->length, dir); + + if (unlikely(is_swiotlb_buffer(paddr))) + swiotlb_tbl_sync_single(dev, paddr, sg->length, dir, SYNC_FOR_CPU); } -- cgit v1.2.3 From 6d54ceb539aacc3df65c89500e8b045924f3ef81 Mon Sep 17 00:00:00 2001 From: Eiichi Tsukata Date: Sun, 30 Jun 2019 17:54:38 +0900 Subject: tracing: Fix user stack trace "??" output Commit c5c27a0a5838 ("x86/stacktrace: Remove the pointless ULONG_MAX marker") removes ULONG_MAX marker from user stack trace entries but trace_user_stack_print() still uses the marker and it outputs unnecessary "??". For example: less-1911 [001] d..2 34.758944: => <00007f16f2295910> => ?? => ?? => ?? => ?? => ?? => ?? => ?? The user stack trace code zeroes the storage before saving the stack, so if the trace is shorter than the maximum number of entries it can terminate the print loop if a zero entry is detected. Link: http://lkml.kernel.org/r/20190630085438.25545-1-devel@etsukata.com Cc: stable@vger.kernel.org Fixes: 4285f2fcef80 ("tracing: Remove the ULONG_MAX stack trace hackery") Signed-off-by: Eiichi Tsukata Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_output.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 54373d93e251..1d6178a188f4 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -1109,17 +1109,10 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter, for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { unsigned long ip = field->caller[i]; - if (ip == ULONG_MAX || trace_seq_has_overflowed(s)) + if (!ip || trace_seq_has_overflowed(s)) break; trace_seq_puts(s, " => "); - - if (!ip) { - trace_seq_puts(s, "??"); - trace_seq_putc(s, '\n'); - continue; - } - seq_print_user_ip(s, mm, ip, flags); trace_seq_putc(s, '\n'); } -- cgit v1.2.3 From 0c5f81dad46c90792e6c3c4797131323c9e96dcd Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Sat, 6 Jul 2019 09:26:51 +0800 Subject: KVM: LAPIC: Inject timer interrupt via posted interrupt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Dedicated instances are currently disturbed by unnecessary jitter due to the emulated lapic timers firing on the same pCPUs where the vCPUs reside. There is no hardware virtual timer on Intel for guest like ARM, so both programming timer in guest and the emulated timer fires incur vmexits. This patch tries to avoid vmexit when the emulated timer fires, at least in dedicated instance scenario when nohz_full is enabled. In that case, the emulated timers can be offload to the nearest busy housekeeping cpus since APICv has been found for several years in server processors. The guest timer interrupt can then be injected via posted interrupts, which are delivered by the housekeeping cpu once the emulated timer fires. The host should tuned so that vCPUs are placed on isolated physical processors, and with several pCPUs surplus for busy housekeeping. If disabled mwait/hlt/pause vmexits keep the vCPUs in non-root mode, ~3% redis performance benefit can be observed on Skylake server, and the number of external interrupt vmexits drops substantially. Without patch VM-EXIT Samples Samples% Time% Min Time Max Time Avg time EXTERNAL_INTERRUPT 42916 49.43% 39.30% 0.47us 106.09us 0.71us ( +- 1.09% ) While with patch: VM-EXIT Samples Samples% Time% Min Time Max Time Avg time EXTERNAL_INTERRUPT 6871 9.29% 2.96% 0.44us 57.88us 0.72us ( +- 4.02% ) Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Marcelo Tosatti Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- kernel/sched/isolation.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index 123ea07a3f3b..ccb28085b114 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -14,6 +14,12 @@ EXPORT_SYMBOL_GPL(housekeeping_overridden); static cpumask_var_t housekeeping_mask; static unsigned int housekeeping_flags; +bool housekeeping_enabled(enum hk_flags flags) +{ + return !!(housekeeping_flags & flags); +} +EXPORT_SYMBOL_GPL(housekeeping_enabled); + int housekeeping_any_cpu(enum hk_flags flags) { if (static_branch_unlikely(&housekeeping_overridden)) -- cgit v1.2.3 From 19dbdcb8039cff16669a05136a29180778d16d0a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 18 Jul 2019 11:20:09 +0200 Subject: smp: Warn on function calls from softirq context It's clearly documented that smp function calls cannot be invoked from softirq handling context. Unfortunately nothing enforces that or emits a warning. A single function call can be invoked from softirq context only via smp_call_function_single_async(). The only legit context is task context, so add a warning to that effect. Reported-by: luferry Signed-off-by: Peter Zijlstra Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20190718160601.GP3402@hirez.programming.kicks-ass.net --- kernel/smp.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index 616d4d114847..7dbcb402c2fc 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -291,6 +291,14 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info, WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() && !oops_in_progress); + /* + * When @wait we can deadlock when we interrupt between llist_add() and + * arch_send_call_function_ipi*(); when !@wait we can deadlock due to + * csd_lock() on because the interrupt context uses the same csd + * storage. + */ + WARN_ON_ONCE(!in_task()); + csd = &csd_stack; if (!wait) { csd = this_cpu_ptr(&csd_data); @@ -416,6 +424,14 @@ void smp_call_function_many(const struct cpumask *mask, WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() && !oops_in_progress && !early_boot_irqs_disabled); + /* + * When @wait we can deadlock when we interrupt between llist_add() and + * arch_send_call_function_ipi*(); when !@wait we can deadlock due to + * csd_lock() on because the interrupt context uses the same csd + * storage. + */ + WARN_ON_ONCE(!in_task()); + /* Try to fastpath. So, what's a CPU they want? Ignoring this one. */ cpu = cpumask_first_and(mask, cpu_online_mask); if (cpu == this_cpu) -- cgit v1.2.3 From b191d6491be67cef2b3fa83015561caca1394ab9 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Wed, 17 Jul 2019 13:21:00 -0400 Subject: pidfd: fix a poll race when setting exit_state There is a race between reading task->exit_state in pidfd_poll and writing it after do_notify_parent calls do_notify_pidfd. Expected sequence of events is: CPU 0 CPU 1 ------------------------------------------------ exit_notify do_notify_parent do_notify_pidfd tsk->exit_state = EXIT_DEAD pidfd_poll if (tsk->exit_state) However nothing prevents the following sequence: CPU 0 CPU 1 ------------------------------------------------ exit_notify do_notify_parent do_notify_pidfd pidfd_poll if (tsk->exit_state) tsk->exit_state = EXIT_DEAD This causes a polling task to wait forever, since poll blocks because exit_state is 0 and the waiting task is not notified again. A stress test continuously doing pidfd poll and process exits uncovered this bug. To fix it, we make sure that the task's exit_state is always set before calling do_notify_pidfd. Fixes: b53b0b9d9a6 ("pidfd: add polling support") Cc: kernel-team@android.com Cc: Oleg Nesterov Signed-off-by: Suren Baghdasaryan Signed-off-by: Joel Fernandes (Google) Link: https://lore.kernel.org/r/20190717172100.261204-1-joel@joelfernandes.org [christian@brauner.io: adapt commit message and drop unneeded changes from wait_task_zombie] Signed-off-by: Christian Brauner --- kernel/exit.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index a75b6a7f458a..4436158a6d30 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -720,6 +720,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead) if (group_dead) kill_orphaned_pgrp(tsk->group_leader, NULL); + tsk->exit_state = EXIT_ZOMBIE; if (unlikely(tsk->ptrace)) { int sig = thread_group_leader(tsk) && thread_group_empty(tsk) && -- cgit v1.2.3 From b8d3349803ba34afda429e87a837fd95a99b2349 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Jul 2019 17:59:19 +0200 Subject: sched/rt, Kconfig: Unbreak def/oldconfig with CONFIG_PREEMPT=y The merge of the CONFIG_PREEMPT_RT stub renamed CONFIG_PREEMPT to CONFIG_PREEMPT_LL which causes all defconfigs which have CONFIG_PREEMPT=y set to fall back to CONFIG_PREEMPT_NONE because CONFIG_PREEMPT depends on the preemption mode choice wich defaults to NONE. This also affects oldconfig builds. So rather than changing 114 defconfig files and being an annoyance to users, revert the rename and select a new config symbol PREEMPTION. That keeps everything working smoothly and the revelant ifdef's are going to be fixed up step by step. Reported-by: Mark Rutland Fixes: a50a3f4b6a31 ("sched/rt, Kconfig: Introduce CONFIG_PREEMPT_RT") Signed-off-by: Thomas Gleixner --- kernel/Kconfig.preempt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index fc020c09b7e8..deff97217496 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -35,10 +35,10 @@ config PREEMPT_VOLUNTARY Select this if you are building a kernel for a desktop system. -config PREEMPT_LL +config PREEMPT bool "Preemptible Kernel (Low-Latency Desktop)" depends on !ARCH_NO_PREEMPT - select PREEMPT + select PREEMPTION select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK help This option reduces the latency of the kernel by making @@ -58,7 +58,7 @@ config PREEMPT_LL config PREEMPT_RT bool "Fully Preemptible Kernel (Real-Time)" depends on EXPERT && ARCH_SUPPORTS_RT - select PREEMPT + select PREEMPTION help This option turns the kernel into a real-time kernel by replacing various locking primitives (spinlocks, rwlocks, etc.) with @@ -77,6 +77,6 @@ endchoice config PREEMPT_COUNT bool -config PREEMPT +config PREEMPTION bool select PREEMPT_COUNT -- cgit v1.2.3 From d9b8aadaffa65809d146cf0f8632a22a946367d7 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Fri, 19 Jul 2019 11:18:15 +0200 Subject: bpf: fix narrower loads on s390 The very first check in test_pkt_md_access is failing on s390, which happens because loading a part of a struct __sk_buff field produces an incorrect result. The preprocessed code of the check is: { __u8 tmp = *((volatile __u8 *)&skb->len + ((sizeof(skb->len) - sizeof(__u8)) / sizeof(__u8))); if (tmp != ((*(volatile __u32 *)&skb->len) & 0xFF)) return 2; }; clang generates the following code for it: 0: 71 21 00 03 00 00 00 00 r2 = *(u8 *)(r1 + 3) 1: 61 31 00 00 00 00 00 00 r3 = *(u32 *)(r1 + 0) 2: 57 30 00 00 00 00 00 ff r3 &= 255 3: 5d 23 00 1d 00 00 00 00 if r2 != r3 goto +29 Finally, verifier transforms it to: 0: (61) r2 = *(u32 *)(r1 +104) 1: (bc) w2 = w2 2: (74) w2 >>= 24 3: (bc) w2 = w2 4: (54) w2 &= 255 5: (bc) w2 = w2 The problem is that when verifier emits the code to replace a partial load of a struct __sk_buff field (*(u8 *)(r1 + 3)) with a full load of struct sk_buff field (*(u32 *)(r1 + 104)), an optional shift and a bitwise AND, it assumes that the machine is little endian and incorrectly decides to use a shift. Adjust shift count calculation to account for endianness. Fixes: 31fd85816dbe ("bpf: permits narrower load from bpf program context fields") Signed-off-by: Ilya Leoshkevich Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5900cbb966b1..c84d83f86141 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8616,8 +8616,8 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) } if (is_narrower_load && size < target_size) { - u8 shift = (off & (size_default - 1)) * 8; - + u8 shift = bpf_ctx_narrow_load_shift(off, size, + size_default); if (ctx_field_size <= 4) { if (shift) insn_buf[cnt++] = BPF_ALU32_IMM(BPF_RSH, -- cgit v1.2.3 From 66d7780f18eae0232827fcffeaded39a6a168236 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 8 Jul 2019 11:51:56 -0700 Subject: dma-mapping: check pfn validity in dma_common_{mmap,get_sgtable} Check that the pfn returned from arch_dma_coherent_to_pfn refers to a valid page and reject the mmap / get_sgtable requests otherwise. Based on the arm implementation of the mmap and get_sgtable methods. Signed-off-by: Christoph Hellwig Tested-by: Vignesh Raghavendra --- kernel/dma/mapping.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 1f628e7ac709..b945239621d8 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -116,11 +116,16 @@ int dma_common_get_sgtable(struct device *dev, struct sg_table *sgt, int ret; if (!dev_is_dma_coherent(dev)) { + unsigned long pfn; + if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_COHERENT_TO_PFN)) return -ENXIO; - page = pfn_to_page(arch_dma_coherent_to_pfn(dev, cpu_addr, - dma_addr)); + /* If the PFN is not valid, we do not have a struct page */ + pfn = arch_dma_coherent_to_pfn(dev, cpu_addr, dma_addr); + if (!pfn_valid(pfn)) + return -ENXIO; + page = pfn_to_page(pfn); } else { page = virt_to_page(cpu_addr); } @@ -170,7 +175,11 @@ int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, if (!dev_is_dma_coherent(dev)) { if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_COHERENT_TO_PFN)) return -ENXIO; + + /* If the PFN is not valid, we do not have a struct page */ pfn = arch_dma_coherent_to_pfn(dev, cpu_addr, dma_addr); + if (!pfn_valid(pfn)) + return -ENXIO; } else { pfn = page_to_pfn(virt_to_page(cpu_addr)); } -- cgit v1.2.3 From d7852fbd0f0423937fa287a598bfde188bb68c22 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 11 Jul 2019 09:54:40 -0700 Subject: access: avoid the RCU grace period for the temporary subjective credentials It turns out that 'access()' (and 'faccessat()') can cause a lot of RCU work because it installs a temporary credential that gets allocated and freed for each system call. The allocation and freeing overhead is mostly benign, but because credentials can be accessed under the RCU read lock, the freeing involves a RCU grace period. Which is not a huge deal normally, but if you have a lot of access() calls, this causes a fair amount of seconday damage: instead of having a nice alloc/free patterns that hits in hot per-CPU slab caches, you have all those delayed free's, and on big machines with hundreds of cores, the RCU overhead can end up being enormous. But it turns out that all of this is entirely unnecessary. Exactly because access() only installs the credential as the thread-local subjective credential, the temporary cred pointer doesn't actually need to be RCU free'd at all. Once we're done using it, we can just free it synchronously and avoid all the RCU overhead. So add a 'non_rcu' flag to 'struct cred', which can be set by users that know they only use it in non-RCU context (there are other potential users for this). We can make it a union with the rcu freeing list head that we need for the RCU case, so this doesn't need any extra storage. Note that this also makes 'get_current_cred()' clear the new non_rcu flag, in case we have filesystems that take a long-term reference to the cred and then expect the RCU delayed freeing afterwards. It's not entirely clear that this is required, but it makes for clear semantics: the subjective cred remains non-RCU as long as you only access it synchronously using the thread-local accessors, but you _can_ use it as a generic cred if you want to. It is possible that we should just remove the whole RCU markings for ->cred entirely. Only ->real_cred is really supposed to be accessed through RCU, and the long-term cred copies that nfs uses might want to explicitly re-enable RCU freeing if required, rather than have get_current_cred() do it implicitly. But this is a "minimal semantic changes" change for the immediate problem. Acked-by: Peter Zijlstra (Intel) Acked-by: Eric Dumazet Acked-by: Paul E. McKenney Cc: Oleg Nesterov Cc: Jan Glauber Cc: Jiri Kosina Cc: Jayachandran Chandrasekharan Nair Cc: Greg KH Cc: Kees Cook Cc: David Howells Cc: Miklos Szeredi Cc: Al Viro Signed-off-by: Linus Torvalds --- kernel/cred.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cred.c b/kernel/cred.c index c73a87a4df13..153ae369e024 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -144,7 +144,10 @@ void __put_cred(struct cred *cred) BUG_ON(cred == current->cred); BUG_ON(cred == current->real_cred); - call_rcu(&cred->rcu, put_cred_rcu); + if (cred->non_rcu) + put_cred_rcu(&cred->rcu); + else + call_rcu(&cred->rcu, put_cred_rcu); } EXPORT_SYMBOL(__put_cred); @@ -256,6 +259,7 @@ struct cred *prepare_creds(void) old = task->cred; memcpy(new, old, sizeof(struct cred)); + new->non_rcu = 0; atomic_set(&new->usage, 1); set_cred_subscribers(new, 0); get_group_info(new->group_info); @@ -535,7 +539,19 @@ const struct cred *override_creds(const struct cred *new) validate_creds(old); validate_creds(new); - get_cred(new); + + /* + * NOTE! This uses 'get_new_cred()' rather than 'get_cred()'. + * + * That means that we do not clear the 'non_rcu' flag, since + * we are only installing the cred into the thread-synchronous + * '->cred' pointer, not the '->real_cred' pointer that is + * visible to other threads under RCU. + * + * Also note that we did validate_creds() manually, not depending + * on the validation in 'get_cred()'. + */ + get_new_cred((struct cred *)new); alter_cred_subscribers(new, 1); rcu_assign_pointer(current->cred, new); alter_cred_subscribers(old, -1); @@ -672,6 +688,7 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) validate_creds(old); *new = *old; + new->non_rcu = 0; atomic_set(&new->usage, 1); set_cred_subscribers(new, 0); get_uid(new->user); -- cgit v1.2.3 From 16d51a590a8ce3befb1308e0e7ab77f3b661af33 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Tue, 16 Jul 2019 17:20:45 +0200 Subject: sched/fair: Don't free p->numa_faults with concurrent readers When going through execve(), zero out the NUMA fault statistics instead of freeing them. During execve, the task is reachable through procfs and the scheduler. A concurrent /proc/*/sched reader can read data from a freed ->numa_faults allocation (confirmed by KASAN) and write it back to userspace. I believe that it would also be possible for a use-after-free read to occur through a race between a NUMA fault and execve(): task_numa_fault() can lead to task_numa_compare(), which invokes task_weight() on the currently running task of a different CPU. Another way to fix this would be to make ->numa_faults RCU-managed or add extra locking, but it seems easier to wipe the NUMA fault statistics on execve. Signed-off-by: Jann Horn Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Petr Mladek Cc: Sergey Senozhatsky Cc: Thomas Gleixner Cc: Will Deacon Fixes: 82727018b0d3 ("sched/numa: Call task_numa_free() from do_execve()") Link: https://lkml.kernel.org/r/20190716152047.14424-1-jannh@google.com Signed-off-by: Ingo Molnar --- kernel/fork.c | 2 +- kernel/sched/fair.c | 24 ++++++++++++++++++++---- 2 files changed, 21 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index d8ae0f1b4148..2852d0e76ea3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -726,7 +726,7 @@ void __put_task_struct(struct task_struct *tsk) WARN_ON(tsk == current); cgroup_free(tsk); - task_numa_free(tsk); + task_numa_free(tsk, true); security_task_free(tsk); exit_creds(tsk); delayacct_tsk_free(tsk); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 036be95a87e9..6adb0e0f5feb 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2353,13 +2353,23 @@ no_join: return; } -void task_numa_free(struct task_struct *p) +/* + * Get rid of NUMA staticstics associated with a task (either current or dead). + * If @final is set, the task is dead and has reached refcount zero, so we can + * safely free all relevant data structures. Otherwise, there might be + * concurrent reads from places like load balancing and procfs, and we should + * reset the data back to default state without freeing ->numa_faults. + */ +void task_numa_free(struct task_struct *p, bool final) { struct numa_group *grp = p->numa_group; - void *numa_faults = p->numa_faults; + unsigned long *numa_faults = p->numa_faults; unsigned long flags; int i; + if (!numa_faults) + return; + if (grp) { spin_lock_irqsave(&grp->lock, flags); for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) @@ -2372,8 +2382,14 @@ void task_numa_free(struct task_struct *p) put_numa_group(grp); } - p->numa_faults = NULL; - kfree(numa_faults); + if (final) { + p->numa_faults = NULL; + kfree(numa_faults); + } else { + p->total_numa_faults = 0; + for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) + numa_faults[i] = 0; + } } /* -- cgit v1.2.3 From cb361d8cdef69990f6b4504dc1fd9a594d983c97 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Tue, 16 Jul 2019 17:20:47 +0200 Subject: sched/fair: Use RCU accessors consistently for ->numa_group The old code used RCU annotations and accessors inconsistently for ->numa_group, which can lead to use-after-frees and NULL dereferences. Let all accesses to ->numa_group use proper RCU helpers to prevent such issues. Signed-off-by: Jann Horn Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Petr Mladek Cc: Sergey Senozhatsky Cc: Thomas Gleixner Cc: Will Deacon Fixes: 8c8a743c5087 ("sched/numa: Use {cpu, pid} to create task groups for shared faults") Link: https://lkml.kernel.org/r/20190716152047.14424-3-jannh@google.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 120 +++++++++++++++++++++++++++++++++++----------------- 1 file changed, 81 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6adb0e0f5feb..bc9cfeaac8bd 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1086,6 +1086,21 @@ struct numa_group { unsigned long faults[0]; }; +/* + * For functions that can be called in multiple contexts that permit reading + * ->numa_group (see struct task_struct for locking rules). + */ +static struct numa_group *deref_task_numa_group(struct task_struct *p) +{ + return rcu_dereference_check(p->numa_group, p == current || + (lockdep_is_held(&task_rq(p)->lock) && !READ_ONCE(p->on_cpu))); +} + +static struct numa_group *deref_curr_numa_group(struct task_struct *p) +{ + return rcu_dereference_protected(p->numa_group, p == current); +} + static inline unsigned long group_faults_priv(struct numa_group *ng); static inline unsigned long group_faults_shared(struct numa_group *ng); @@ -1129,10 +1144,12 @@ static unsigned int task_scan_start(struct task_struct *p) { unsigned long smin = task_scan_min(p); unsigned long period = smin; + struct numa_group *ng; /* Scale the maximum scan period with the amount of shared memory. */ - if (p->numa_group) { - struct numa_group *ng = p->numa_group; + rcu_read_lock(); + ng = rcu_dereference(p->numa_group); + if (ng) { unsigned long shared = group_faults_shared(ng); unsigned long private = group_faults_priv(ng); @@ -1140,6 +1157,7 @@ static unsigned int task_scan_start(struct task_struct *p) period *= shared + 1; period /= private + shared + 1; } + rcu_read_unlock(); return max(smin, period); } @@ -1148,13 +1166,14 @@ static unsigned int task_scan_max(struct task_struct *p) { unsigned long smin = task_scan_min(p); unsigned long smax; + struct numa_group *ng; /* Watch for min being lower than max due to floor calculations */ smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p); /* Scale the maximum scan period with the amount of shared memory. */ - if (p->numa_group) { - struct numa_group *ng = p->numa_group; + ng = deref_curr_numa_group(p); + if (ng) { unsigned long shared = group_faults_shared(ng); unsigned long private = group_faults_priv(ng); unsigned long period = smax; @@ -1186,7 +1205,7 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p) p->numa_scan_period = sysctl_numa_balancing_scan_delay; p->numa_work.next = &p->numa_work; p->numa_faults = NULL; - p->numa_group = NULL; + RCU_INIT_POINTER(p->numa_group, NULL); p->last_task_numa_placement = 0; p->last_sum_exec_runtime = 0; @@ -1233,7 +1252,16 @@ static void account_numa_dequeue(struct rq *rq, struct task_struct *p) pid_t task_numa_group_id(struct task_struct *p) { - return p->numa_group ? p->numa_group->gid : 0; + struct numa_group *ng; + pid_t gid = 0; + + rcu_read_lock(); + ng = rcu_dereference(p->numa_group); + if (ng) + gid = ng->gid; + rcu_read_unlock(); + + return gid; } /* @@ -1258,11 +1286,13 @@ static inline unsigned long task_faults(struct task_struct *p, int nid) static inline unsigned long group_faults(struct task_struct *p, int nid) { - if (!p->numa_group) + struct numa_group *ng = deref_task_numa_group(p); + + if (!ng) return 0; - return p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 0)] + - p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 1)]; + return ng->faults[task_faults_idx(NUMA_MEM, nid, 0)] + + ng->faults[task_faults_idx(NUMA_MEM, nid, 1)]; } static inline unsigned long group_faults_cpu(struct numa_group *group, int nid) @@ -1400,12 +1430,13 @@ static inline unsigned long task_weight(struct task_struct *p, int nid, static inline unsigned long group_weight(struct task_struct *p, int nid, int dist) { + struct numa_group *ng = deref_task_numa_group(p); unsigned long faults, total_faults; - if (!p->numa_group) + if (!ng) return 0; - total_faults = p->numa_group->total_faults; + total_faults = ng->total_faults; if (!total_faults) return 0; @@ -1419,7 +1450,7 @@ static inline unsigned long group_weight(struct task_struct *p, int nid, bool should_numa_migrate_memory(struct task_struct *p, struct page * page, int src_nid, int dst_cpu) { - struct numa_group *ng = p->numa_group; + struct numa_group *ng = deref_curr_numa_group(p); int dst_nid = cpu_to_node(dst_cpu); int last_cpupid, this_cpupid; @@ -1600,13 +1631,14 @@ static bool load_too_imbalanced(long src_load, long dst_load, static void task_numa_compare(struct task_numa_env *env, long taskimp, long groupimp, bool maymove) { + struct numa_group *cur_ng, *p_ng = deref_curr_numa_group(env->p); struct rq *dst_rq = cpu_rq(env->dst_cpu); + long imp = p_ng ? groupimp : taskimp; struct task_struct *cur; long src_load, dst_load; - long load; - long imp = env->p->numa_group ? groupimp : taskimp; - long moveimp = imp; int dist = env->dist; + long moveimp = imp; + long load; if (READ_ONCE(dst_rq->numa_migrate_on)) return; @@ -1645,21 +1677,22 @@ static void task_numa_compare(struct task_numa_env *env, * If dst and source tasks are in the same NUMA group, or not * in any group then look only at task weights. */ - if (cur->numa_group == env->p->numa_group) { + cur_ng = rcu_dereference(cur->numa_group); + if (cur_ng == p_ng) { imp = taskimp + task_weight(cur, env->src_nid, dist) - task_weight(cur, env->dst_nid, dist); /* * Add some hysteresis to prevent swapping the * tasks within a group over tiny differences. */ - if (cur->numa_group) + if (cur_ng) imp -= imp / 16; } else { /* * Compare the group weights. If a task is all by itself * (not part of a group), use the task weight instead. */ - if (cur->numa_group && env->p->numa_group) + if (cur_ng && p_ng) imp += group_weight(cur, env->src_nid, dist) - group_weight(cur, env->dst_nid, dist); else @@ -1757,11 +1790,12 @@ static int task_numa_migrate(struct task_struct *p) .best_imp = 0, .best_cpu = -1, }; + unsigned long taskweight, groupweight; struct sched_domain *sd; + long taskimp, groupimp; + struct numa_group *ng; struct rq *best_rq; - unsigned long taskweight, groupweight; int nid, ret, dist; - long taskimp, groupimp; /* * Pick the lowest SD_NUMA domain, as that would have the smallest @@ -1807,7 +1841,8 @@ static int task_numa_migrate(struct task_struct *p) * multiple NUMA nodes; in order to better consolidate the group, * we need to check other locations. */ - if (env.best_cpu == -1 || (p->numa_group && p->numa_group->active_nodes > 1)) { + ng = deref_curr_numa_group(p); + if (env.best_cpu == -1 || (ng && ng->active_nodes > 1)) { for_each_online_node(nid) { if (nid == env.src_nid || nid == p->numa_preferred_nid) continue; @@ -1840,7 +1875,7 @@ static int task_numa_migrate(struct task_struct *p) * A task that migrated to a second choice node will be better off * trying for a better one later. Do not set the preferred node here. */ - if (p->numa_group) { + if (ng) { if (env.best_cpu == -1) nid = env.src_nid; else @@ -2135,6 +2170,7 @@ static void task_numa_placement(struct task_struct *p) unsigned long total_faults; u64 runtime, period; spinlock_t *group_lock = NULL; + struct numa_group *ng; /* * The p->mm->numa_scan_seq field gets updated without @@ -2152,8 +2188,9 @@ static void task_numa_placement(struct task_struct *p) runtime = numa_get_avg_runtime(p, &period); /* If the task is part of a group prevent parallel updates to group stats */ - if (p->numa_group) { - group_lock = &p->numa_group->lock; + ng = deref_curr_numa_group(p); + if (ng) { + group_lock = &ng->lock; spin_lock_irq(group_lock); } @@ -2194,7 +2231,7 @@ static void task_numa_placement(struct task_struct *p) p->numa_faults[cpu_idx] += f_diff; faults += p->numa_faults[mem_idx]; p->total_numa_faults += diff; - if (p->numa_group) { + if (ng) { /* * safe because we can only change our own group * @@ -2202,14 +2239,14 @@ static void task_numa_placement(struct task_struct *p) * nid and priv in a specific region because it * is at the beginning of the numa_faults array. */ - p->numa_group->faults[mem_idx] += diff; - p->numa_group->faults_cpu[mem_idx] += f_diff; - p->numa_group->total_faults += diff; - group_faults += p->numa_group->faults[mem_idx]; + ng->faults[mem_idx] += diff; + ng->faults_cpu[mem_idx] += f_diff; + ng->total_faults += diff; + group_faults += ng->faults[mem_idx]; } } - if (!p->numa_group) { + if (!ng) { if (faults > max_faults) { max_faults = faults; max_nid = nid; @@ -2220,8 +2257,8 @@ static void task_numa_placement(struct task_struct *p) } } - if (p->numa_group) { - numa_group_count_active_nodes(p->numa_group); + if (ng) { + numa_group_count_active_nodes(ng); spin_unlock_irq(group_lock); max_nid = preferred_group_nid(p, max_nid); } @@ -2255,7 +2292,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, int cpu = cpupid_to_cpu(cpupid); int i; - if (unlikely(!p->numa_group)) { + if (unlikely(!deref_curr_numa_group(p))) { unsigned int size = sizeof(struct numa_group) + 4*nr_node_ids*sizeof(unsigned long); @@ -2291,7 +2328,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, if (!grp) goto no_join; - my_grp = p->numa_group; + my_grp = deref_curr_numa_group(p); if (grp == my_grp) goto no_join; @@ -2362,7 +2399,8 @@ no_join: */ void task_numa_free(struct task_struct *p, bool final) { - struct numa_group *grp = p->numa_group; + /* safe: p either is current or is being freed by current */ + struct numa_group *grp = rcu_dereference_raw(p->numa_group); unsigned long *numa_faults = p->numa_faults; unsigned long flags; int i; @@ -2442,7 +2480,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) * actively using should be counted as local. This allows the * scan rate to slow down when a workload has settled down. */ - ng = p->numa_group; + ng = deref_curr_numa_group(p); if (!priv && !local && ng && ng->active_nodes > 1 && numa_is_active_node(cpu_node, ng) && numa_is_active_node(mem_node, ng)) @@ -10460,18 +10498,22 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m) { int node; unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0; + struct numa_group *ng; + rcu_read_lock(); + ng = rcu_dereference(p->numa_group); for_each_online_node(node) { if (p->numa_faults) { tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)]; tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)]; } - if (p->numa_group) { - gsf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 0)], - gpf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 1)]; + if (ng) { + gsf = ng->faults[task_faults_idx(NUMA_MEM, node, 0)], + gpf = ng->faults[task_faults_idx(NUMA_MEM, node, 1)]; } print_numa_stats(m, node, tsf, tpf, gsf, gpf); } + rcu_read_unlock(); } #endif /* CONFIG_NUMA_BALANCING */ #endif /* CONFIG_SCHED_DEBUG */ -- cgit v1.2.3 From 78134300579a45f527ca173ec8fdb4701b69f16e Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Sat, 20 Jul 2019 11:04:10 -0400 Subject: locking/rwsem: Don't call owner_on_cpu() on read-owner For writer, the owner value is cleared on unlock. For reader, it is left intact on unlock for providing better debugging aid on crash dump and the unlock of one reader may not mean the lock is free. As a result, the owner_on_cpu() shouldn't be used on read-owner as the task pointer value may not be valid and it might have been freed. That is the case in rwsem_spin_on_owner(), but not in rwsem_can_spin_on_owner(). This can lead to use-after-free error from KASAN. For example, BUG: KASAN: use-after-free in rwsem_down_write_slowpath (/home/miguel/kernel/linux/kernel/locking/rwsem.c:669 /home/miguel/kernel/linux/kernel/locking/rwsem.c:1125) Fix this by checking for RWSEM_READER_OWNED flag before calling owner_on_cpu(). Reported-by: Luis Henriques Tested-by: Luis Henriques Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: H. Peter Anvin Cc: Jeff Layton Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Cc: huang ying Fixes: 94a9717b3c40e ("locking/rwsem: Make rwsem->owner an atomic_long_t") Link: https://lkml.kernel.org/r/81e82d5b-5074-77e8-7204-28479bbe0df0@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/rwsem.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 37524a47f002..bc91aacaab58 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -666,7 +666,11 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem, preempt_disable(); rcu_read_lock(); owner = rwsem_owner_flags(sem, &flags); - if ((flags & nonspinnable) || (owner && !owner_on_cpu(owner))) + /* + * Don't check the read-owner as the entry may be stale. + */ + if ((flags & nonspinnable) || + (owner && !(flags & RWSEM_READER_OWNED) && !owner_on_cpu(owner))) ret = false; rcu_read_unlock(); preempt_enable(); -- cgit v1.2.3 From e1b98fa316648420d0434d9ff5b92ad6609ba6c3 Mon Sep 17 00:00:00 2001 From: Jan Stancek Date: Thu, 18 Jul 2019 10:51:25 +0200 Subject: locking/rwsem: Add missing ACQUIRE to read_slowpath exit when queue is empty LTP mtest06 has been observed to occasionally hit "still mapped when deleted" and following BUG_ON on arm64. The extra mapcount originated from pagefault handler, which handled pagefault for vma that has already been detached. vma is detached under mmap_sem write lock by detach_vmas_to_be_unmapped(), which also invalidates vmacache. When the pagefault handler (under mmap_sem read lock) calls find_vma(), vmacache_valid() wrongly reports vmacache as valid. After rwsem down_read() returns via 'queue empty' path (as of v5.2), it does so without an ACQUIRE on sem->count: down_read() __down_read() rwsem_down_read_failed() __rwsem_down_read_failed_common() raw_spin_lock_irq(&sem->wait_lock); if (list_empty(&sem->wait_list)) { if (atomic_long_read(&sem->count) >= 0) { raw_spin_unlock_irq(&sem->wait_lock); return sem; The problem can be reproduced by running LTP mtest06 in a loop and building the kernel (-j $NCPUS) in parallel. It does reproduces since v4.20 on arm64 HPE Apollo 70 (224 CPUs, 256GB RAM, 2 nodes). It triggers reliably in about an hour. The patched kernel ran fine for 10+ hours. Signed-off-by: Jan Stancek Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Will Deacon Acked-by: Waiman Long Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dbueso@suse.de Fixes: 4b486b535c33 ("locking/rwsem: Exit read lock slowpath if queue empty & no writer") Link: https://lkml.kernel.org/r/50b8914e20d1d62bb2dee42d342836c2c16ebee7.1563438048.git.jstancek@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/rwsem.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index bc91aacaab58..d3ce7c6c42a6 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -1036,6 +1036,8 @@ queue: */ if (adjustment && !(atomic_long_read(&sem->count) & (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) { + /* Provide lock ACQUIRE */ + smp_acquire__after_ctrl_dep(); raw_spin_unlock_irq(&sem->wait_lock); rwsem_set_reader_owned(sem); lockevent_inc(rwsem_rlock_fast); -- cgit v1.2.3 From 99143f82a255e7f054bead8443462fae76dd829e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 18 Jul 2019 14:56:17 +0200 Subject: lcoking/rwsem: Add missing ACQUIRE to read_slowpath sleep loop While reviewing another read_slowpath patch, both Will and I noticed another missing ACQUIRE, namely: X = 0; CPU0 CPU1 rwsem_down_read() for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); X = 1; rwsem_up_write(); rwsem_mark_wake() atomic_long_add(adjustment, &sem->count); smp_store_release(&waiter->task, NULL); if (!waiter.task) break; ... } r = X; Allows 'r == 0'. Reported-by: Peter Zijlstra (Intel) Reported-by: Will Deacon Signed-off-by: Peter Zijlstra (Intel) Acked-by: Will Deacon Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Ingo Molnar --- kernel/locking/rwsem.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index d3ce7c6c42a6..571938887cc8 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -1073,8 +1073,10 @@ queue: /* wait to be given the lock */ while (true) { set_current_state(state); - if (!waiter.task) + if (!smp_load_acquire(&waiter.task)) { + /* Orders against rwsem_mark_wake()'s smp_store_release() */ break; + } if (signal_pending_state(state, current)) { raw_spin_lock_irq(&sem->wait_lock); if (waiter.task) -- cgit v1.2.3 From 6ffddfb9e1de21c3d0c0cfa4fe4a20dd3291a812 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 18 Jul 2019 15:08:53 +0200 Subject: locking/rwsem: Add ACQUIRE comments Since we just reviewed read_slowpath for ACQUIRE correctness, add a few coments to retain our findings. Signed-off-by: Peter Zijlstra (Intel) Acked-by: Will Deacon Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/locking/rwsem.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 571938887cc8..bd0f0d05724c 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -1004,6 +1004,7 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, int state) atomic_long_add(-RWSEM_READER_BIAS, &sem->count); adjustment = 0; if (rwsem_optimistic_spin(sem, false)) { + /* rwsem_optimistic_spin() implies ACQUIRE on success */ /* * Wake up other readers in the wait list if the front * waiter is a reader. @@ -1018,6 +1019,7 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, int state) } return sem; } else if (rwsem_reader_phase_trylock(sem, waiter.last_rowner)) { + /* rwsem_reader_phase_trylock() implies ACQUIRE on success */ return sem; } @@ -1071,10 +1073,10 @@ queue: wake_up_q(&wake_q); /* wait to be given the lock */ - while (true) { + for (;;) { set_current_state(state); if (!smp_load_acquire(&waiter.task)) { - /* Orders against rwsem_mark_wake()'s smp_store_release() */ + /* Matches rwsem_mark_wake()'s smp_store_release(). */ break; } if (signal_pending_state(state, current)) { @@ -1082,6 +1084,7 @@ queue: if (waiter.task) goto out_nolock; raw_spin_unlock_irq(&sem->wait_lock); + /* Ordered by sem->wait_lock against rwsem_mark_wake(). */ break; } schedule(); @@ -1091,6 +1094,7 @@ queue: __set_current_state(TASK_RUNNING); lockevent_inc(rwsem_rlock); return sem; + out_nolock: list_del(&waiter.list); if (list_empty(&sem->wait_list)) { @@ -1131,8 +1135,10 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) /* do optimistic spinning and steal lock if possible */ if (rwsem_can_spin_on_owner(sem, RWSEM_WR_NONSPINNABLE) && - rwsem_optimistic_spin(sem, true)) + rwsem_optimistic_spin(sem, true)) { + /* rwsem_optimistic_spin() implies ACQUIRE on success */ return sem; + } /* * Disable reader optimistic spinning for this rwsem after @@ -1192,9 +1198,11 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) wait: /* wait until we successfully acquire the lock */ set_current_state(state); - while (true) { - if (rwsem_try_write_lock(sem, wstate)) + for (;;) { + if (rwsem_try_write_lock(sem, wstate)) { + /* rwsem_try_write_lock() implies ACQUIRE on success */ break; + } raw_spin_unlock_irq(&sem->wait_lock); -- cgit v1.2.3 From 68037aa78208f34bda4e5cd76c357f718b838cbb Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 15 Jul 2019 11:27:49 +0200 Subject: locking/lockdep: Hide unused 'class' variable The usage is now hidden in an #ifdef, so we need to move the variable itself in there as well to avoid this warning: kernel/locking/lockdep_proc.c:203:21: error: unused variable 'class' [-Werror,-Wunused-variable] Signed-off-by: Arnd Bergmann Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Bart Van Assche Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Qian Cai Cc: Thomas Gleixner Cc: Waiman Long Cc: Will Deacon Cc: Will Deacon Cc: Yuyang Du Cc: frederic@kernel.org Fixes: 68d41d8c94a3 ("locking/lockdep: Fix lock used or unused stats error") Link: https://lkml.kernel.org/r/20190715092809.736834-1-arnd@arndb.de Signed-off-by: Ingo Molnar --- kernel/locking/lockdep_proc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c index 65b6a1600c8f..bda006f8a88b 100644 --- a/kernel/locking/lockdep_proc.c +++ b/kernel/locking/lockdep_proc.c @@ -200,7 +200,6 @@ static void lockdep_stats_debug_show(struct seq_file *m) static int lockdep_stats_show(struct seq_file *m, void *v) { - struct lock_class *class; unsigned long nr_unused = 0, nr_uncategorized = 0, nr_irq_safe = 0, nr_irq_unsafe = 0, nr_softirq_safe = 0, nr_softirq_unsafe = 0, @@ -211,6 +210,8 @@ static int lockdep_stats_show(struct seq_file *m, void *v) sum_forward_deps = 0; #ifdef CONFIG_PROVE_LOCKING + struct lock_class *class; + list_for_each_entry(class, &all_lock_classes, lock_entry) { if (class->usage_mask == 0) -- cgit v1.2.3 From 30a35f79faadfeb1b89a7fdb3875f14063519041 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 28 Jun 2019 12:29:03 +0200 Subject: locking/lockdep: Clean up #ifdef checks As Will Deacon points out, CONFIG_PROVE_LOCKING implies TRACE_IRQFLAGS, so the conditions I added in the previous patch, and some others in the same file can be simplified by only checking for the former. No functional change. Signed-off-by: Arnd Bergmann Signed-off-by: Peter Zijlstra (Intel) Acked-by: Will Deacon Cc: Andrew Morton Cc: Bart Van Assche Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Waiman Long Cc: Yuyang Du Fixes: 886532aee3cd ("locking/lockdep: Move mark_lock() inside CONFIG_TRACE_IRQFLAGS && CONFIG_PROVE_LOCKING") Link: https://lkml.kernel.org/r/20190628102919.2345242-1-arnd@arndb.de Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 341f52117f88..4861cf8e274b 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -448,7 +448,7 @@ static void print_lockdep_off(const char *bug_msg) unsigned long nr_stack_trace_entries; -#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) +#ifdef CONFIG_PROVE_LOCKING /* * Stack-trace: tightly packed array of stack backtrace * addresses. Protected by the graph_lock. @@ -491,7 +491,7 @@ unsigned int max_lockdep_depth; DEFINE_PER_CPU(struct lockdep_stats, lockdep_stats); #endif -#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) +#ifdef CONFIG_PROVE_LOCKING /* * Locking printouts: */ @@ -2969,7 +2969,7 @@ static void check_chain_key(struct task_struct *curr) #endif } -#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) +#ifdef CONFIG_PROVE_LOCKING static int mark_lock(struct task_struct *curr, struct held_lock *this, enum lock_usage_bit new_bit); @@ -3608,7 +3608,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, return ret; } -#else /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */ +#else /* CONFIG_PROVE_LOCKING */ static inline int mark_usage(struct task_struct *curr, struct held_lock *hlock, int check) @@ -3627,7 +3627,7 @@ static inline int separate_irq_context(struct task_struct *curr, return 0; } -#endif /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */ +#endif /* CONFIG_PROVE_LOCKING */ /* * Initialize a lock instance's lock-class mapping info: @@ -4321,8 +4321,7 @@ static void __lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie cookie */ static void check_flags(unsigned long flags) { -#if defined(CONFIG_PROVE_LOCKING) && defined(CONFIG_DEBUG_LOCKDEP) && \ - defined(CONFIG_TRACE_IRQFLAGS) +#if defined(CONFIG_PROVE_LOCKING) && defined(CONFIG_DEBUG_LOCKDEP) if (!debug_locks) return; -- cgit v1.2.3 From 6c11c6e3d5e9e5caf8686cd6a5e4552cfc3ea326 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 3 Jul 2019 11:21:26 +0200 Subject: locking/mutex: Test for initialized mutex An uninitialized/ zeroed mutex will go unnoticed because there is no check for it. There is a magic check in the unlock's slowpath path which might go unnoticed if the unlock happens in the fastpath. Add a ->magic check early in the mutex_lock() and mutex_trylock() path. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Acked-by: Will Deacon Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20190703092125.lsdf4gpsh2plhavb@linutronix.de Signed-off-by: Ingo Molnar --- kernel/locking/mutex.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index edd1c082dbf5..5e069734363c 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -908,6 +908,10 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, might_sleep(); +#ifdef CONFIG_DEBUG_MUTEXES + DEBUG_LOCKS_WARN_ON(lock->magic != lock); +#endif + ww = container_of(lock, struct ww_mutex, base); if (use_ww_ctx && ww_ctx) { if (unlikely(ww_ctx == READ_ONCE(ww->ctx))) @@ -1379,8 +1383,13 @@ __ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock, */ int __sched mutex_trylock(struct mutex *lock) { - bool locked = __mutex_trylock(lock); + bool locked; + +#ifdef CONFIG_DEBUG_MUTEXES + DEBUG_LOCKS_WARN_ON(lock->magic != lock); +#endif + locked = __mutex_trylock(lock); if (locked) mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); -- cgit v1.2.3 From 4ce54af8b33d3e21ca935fc1b89b58cbba956051 Mon Sep 17 00:00:00 2001 From: Leonard Crestez Date: Wed, 24 Jul 2019 15:53:24 +0300 Subject: perf/core: Fix creating kernel counters for PMUs that override event->cpu Some hardware PMU drivers will override perf_event.cpu inside their event_init callback. This causes a lockdep splat when initialized through the kernel API: WARNING: CPU: 0 PID: 250 at kernel/events/core.c:2917 ctx_sched_out+0x78/0x208 pc : ctx_sched_out+0x78/0x208 Call trace: ctx_sched_out+0x78/0x208 __perf_install_in_context+0x160/0x248 remote_function+0x58/0x68 generic_exec_single+0x100/0x180 smp_call_function_single+0x174/0x1b8 perf_install_in_context+0x178/0x188 perf_event_create_kernel_counter+0x118/0x160 Fix this by calling perf_install_in_context with event->cpu, just like perf_event_open Signed-off-by: Leonard Crestez Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Mark Rutland Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Frank Li Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Link: https://lkml.kernel.org/r/c4ebe0503623066896d7046def4d6b1e06e0eb2e.1563972056.git.leonard.crestez@nxp.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 026a14541a38..0463c1151bae 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -11274,7 +11274,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, goto err_unlock; } - perf_install_in_context(ctx, event, cpu); + perf_install_in_context(ctx, event, event->cpu); perf_unpin_context(ctx); mutex_unlock(&ctx->mutex); -- cgit v1.2.3 From c6622a425acd1d2f3a443cd39b490a8777b622d7 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Fri, 26 Jul 2019 12:34:32 -0700 Subject: dma-contiguous: do not overwrite align in dma_alloc_contiguous() The dma_alloc_contiguous() limits align at CONFIG_CMA_ALIGNMENT for cma_alloc() however it does not restore it for the fallback routine. This will result in a size mismatch between the allocation and free when running into the fallback routines after cma_alloc() fails, if the align is larger than CONFIG_CMA_ALIGNMENT. This patch adds a cma_align to take care of cma_alloc() and prevent the align from being overwritten. Fixes: fdaeec198ada ("dma-contiguous: add dma_{alloc,free}_contiguous() helpers") Reported-by: Dafna Hirschfeld Signed-off-by: Nicolin Chen Signed-off-by: Christoph Hellwig --- kernel/dma/contiguous.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index bfc0c17f2a3d..ea8259f53eda 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -243,8 +243,9 @@ struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp) /* CMA can be used only in the context which permits sleeping */ if (cma && gfpflags_allow_blocking(gfp)) { - align = min_t(size_t, align, CONFIG_CMA_ALIGNMENT); - page = cma_alloc(cma, count, align, gfp & __GFP_NOWARN); + size_t cma_align = min_t(size_t, align, CONFIG_CMA_ALIGNMENT); + + page = cma_alloc(cma, count, cma_align, gfp & __GFP_NOWARN); } /* Fallback allocation of normal pages */ -- cgit v1.2.3 From f46cc0152501e46d1b3aa5e7eade61145070eab0 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Fri, 26 Jul 2019 12:34:33 -0700 Subject: dma-contiguous: page-align the size in dma_free_contiguous() According to the original dma_direct_alloc_pages() code: { unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT; if (!dma_release_from_contiguous(dev, page, count)) __free_pages(page, get_order(size)); } The count parameter for dma_release_from_contiguous() was page aligned before the right-shifting operation, while the new API dma_free_contiguous() forgets to have PAGE_ALIGN() at the size. So this patch simply adds it to prevent any corner case. Fixes: fdaeec198ada ("dma-contiguous: add dma_{alloc,free}_contiguous() helpers") Signed-off-by: Nicolin Chen Reviewed-by: Christoph Hellwig Signed-off-by: Christoph Hellwig --- kernel/dma/contiguous.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index ea8259f53eda..2bd410f934b3 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -267,7 +267,8 @@ struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp) */ void dma_free_contiguous(struct device *dev, struct page *page, size_t size) { - if (!cma_release(dev_get_cma_area(dev), page, size >> PAGE_SHIFT)) + if (!cma_release(dev_get_cma_area(dev), page, + PAGE_ALIGN(size) >> PAGE_SHIFT)) __free_pages(page, get_order(size)); } -- cgit v1.2.3 From 1caf7d50f46bd0388e38e653b146aa81700e8eb8 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Wed, 24 Jul 2019 12:48:16 -0400 Subject: pidfd: Add warning if exit_state is 0 during notification Previously a condition got missed where the pidfd waiters are awakened before the exit_state gets set. This can result in a missed notification [1] and the polling thread waiting forever. It is fixed now, however it would be nice to avoid this kind of issue going unnoticed in the future. So just add a warning to catch it in the future. /* References */ [1]: https://lore.kernel.org/lkml/20190717172100.261204-1-joel@joelfernandes.org/ Signed-off-by: Joel Fernandes (Google) Link: https://lore.kernel.org/r/20190724164816.201099-1-joel@joelfernandes.org Signed-off-by: Christian Brauner --- kernel/signal.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 91b789dd6e72..349f5a67f100 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1885,6 +1885,7 @@ static void do_notify_pidfd(struct task_struct *task) { struct pid *pid; + WARN_ON(task->exit_state == 0); pid = task_pid(task); wake_up_all(&pid->wait_pidfd); } -- cgit v1.2.3 From 30b692d3b390c6fe78a5064be0c4bbd44a41be59 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 29 Jul 2019 17:48:24 +0200 Subject: exit: make setting exit_state consistent Since commit b191d6491be6 ("pidfd: fix a poll race when setting exit_state") we unconditionally set exit_state to EXIT_ZOMBIE before calling into do_notify_parent(). This was done to eliminate a race when querying exit_state in do_notify_pidfd(). Back then we decided to do the absolute minimal thing to fix this and not touch the rest of the exit_notify() function where exit_state is set. Since this fix has not caused any issues change the setting of exit_state to EXIT_DEAD in the autoreap case to account for the fact hat exit_state is set to EXIT_ZOMBIE unconditionally. This fix was planned but also explicitly requested in [1] and makes the whole code more consistent. /* References */ [1]: https://lore.kernel.org/lkml/CAHk-=wigcxGFR2szue4wavJtH5cYTTeNES=toUBVGsmX0rzX+g@mail.gmail.com Signed-off-by: Christian Brauner Acked-by: Oleg Nesterov Cc: Linus Torvalds --- kernel/exit.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 4436158a6d30..5b4a5dcce8f8 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -734,9 +734,10 @@ static void exit_notify(struct task_struct *tsk, int group_dead) autoreap = true; } - tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE; - if (tsk->exit_state == EXIT_DEAD) + if (autoreap) { + tsk->exit_state = EXIT_DEAD; list_add(&tsk->ptrace_entry, &dead); + } /* mt-exec, de_thread() is waiting for group leader */ if (unlikely(tsk->signal->notify_count < 0)) -- cgit v1.2.3 From 6c77221df96177da0520847ce91e33f539fb8b2d Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Tue, 30 Jul 2019 22:08:50 +0800 Subject: fgraph: Remove redundant ftrace_graph_notrace_addr() test We already have tested it before. The second one should be removed. With this change, the performance should have little improvement. Link: http://lkml.kernel.org/r/20190730140850.7927-1-changbin.du@gmail.com Cc: stable@vger.kernel.org Fixes: 9cd2992f2d6c ("fgraph: Have set_graph_notrace only affect function_graph tracer") Signed-off-by: Changbin Du Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_functions_graph.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 69ebf3c2f1b5..78af97163147 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -137,6 +137,13 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) if (trace_recursion_test(TRACE_GRAPH_NOTRACE_BIT)) return 0; + /* + * Do not trace a function if it's filtered by set_graph_notrace. + * Make the index of ret stack negative to indicate that it should + * ignore further functions. But it needs its own ret stack entry + * to recover the original index in order to continue tracing after + * returning from the function. + */ if (ftrace_graph_notrace_addr(trace->func)) { trace_recursion_set(TRACE_GRAPH_NOTRACE_BIT); /* @@ -155,16 +162,6 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) if (ftrace_graph_ignore_irqs()) return 0; - /* - * Do not trace a function if it's filtered by set_graph_notrace. - * Make the index of ret stack negative to indicate that it should - * ignore further functions. But it needs its own ret stack entry - * to recover the original index in order to continue tracing after - * returning from the function. - */ - if (ftrace_graph_notrace_addr(trace->func)) - return 1; - /* * Stop here if tracing_threshold is set. We only write function return * events to the ring buffer. -- cgit v1.2.3 From 68d8681e97bd1c90259f341c1695af05002070ef Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 2 Aug 2019 21:48:33 -0700 Subject: kernel/signal.c: fix a kernel-doc markup The kernel-doc parser doesn't handle expressions with %foo*. Instead, when an asterisk should be part of a constant, it uses an alternative notation: `foo*`. Link: http://lkml.kernel.org/r/7f18c2e0b5e39e6b7eb55ddeb043b8b260b49f2d.1563361575.git.mchehab+samsung@kernel.org Signed-off-by: Mauro Carvalho Chehab Cc: Deepa Dinamani Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 349f5a67f100..e667be6907d7 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -349,7 +349,7 @@ void task_clear_jobctl_pending(struct task_struct *task, unsigned long mask) * @task has %JOBCTL_STOP_PENDING set and is participating in a group stop. * Group stop states are cleared and the group stop count is consumed if * %JOBCTL_STOP_CONSUME was set. If the consumption completes the group - * stop, the appropriate %SIGNAL_* flags are set. + * stop, the appropriate `SIGNAL_*` flags are set. * * CONTEXT: * Must be called with @task->sighand->siglock held. -- cgit v1.2.3 From 14c5cebad510c2875ca525f36605b47058769670 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 2 Aug 2019 21:49:26 -0700 Subject: memremap: move from kernel/ to mm/ memremap.c implements MM functionality for ZONE_DEVICE, so it really should be in the mm/ directory, not the kernel/ one. Link: http://lkml.kernel.org/r/20190722094143.18387-1-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Anshuman Khandual Acked-by: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/Makefile | 1 - kernel/memremap.c | 405 ------------------------------------------------------ 2 files changed, 406 deletions(-) delete mode 100644 kernel/memremap.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index a8d923b5481b..ef0d95a190b4 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -111,7 +111,6 @@ obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o obj-$(CONFIG_TORTURE_TEST) += torture.o obj-$(CONFIG_HAS_IOMEM) += iomem.o -obj-$(CONFIG_ZONE_DEVICE) += memremap.o obj-$(CONFIG_RSEQ) += rseq.o obj-$(CONFIG_GCC_PLUGIN_STACKLEAK) += stackleak.o diff --git a/kernel/memremap.c b/kernel/memremap.c deleted file mode 100644 index 6ee03a816d67..000000000000 --- a/kernel/memremap.c +++ /dev/null @@ -1,405 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright(c) 2015 Intel Corporation. All rights reserved. */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static DEFINE_XARRAY(pgmap_array); -#define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1) -#define SECTION_SIZE (1UL << PA_SECTION_SHIFT) - -#ifdef CONFIG_DEV_PAGEMAP_OPS -DEFINE_STATIC_KEY_FALSE(devmap_managed_key); -EXPORT_SYMBOL(devmap_managed_key); -static atomic_t devmap_managed_enable; - -static void devmap_managed_enable_put(void *data) -{ - if (atomic_dec_and_test(&devmap_managed_enable)) - static_branch_disable(&devmap_managed_key); -} - -static int devmap_managed_enable_get(struct device *dev, struct dev_pagemap *pgmap) -{ - if (!pgmap->ops || !pgmap->ops->page_free) { - WARN(1, "Missing page_free method\n"); - return -EINVAL; - } - - if (atomic_inc_return(&devmap_managed_enable) == 1) - static_branch_enable(&devmap_managed_key); - return devm_add_action_or_reset(dev, devmap_managed_enable_put, NULL); -} -#else -static int devmap_managed_enable_get(struct device *dev, struct dev_pagemap *pgmap) -{ - return -EINVAL; -} -#endif /* CONFIG_DEV_PAGEMAP_OPS */ - -static void pgmap_array_delete(struct resource *res) -{ - xa_store_range(&pgmap_array, PHYS_PFN(res->start), PHYS_PFN(res->end), - NULL, GFP_KERNEL); - synchronize_rcu(); -} - -static unsigned long pfn_first(struct dev_pagemap *pgmap) -{ - return PHYS_PFN(pgmap->res.start) + - vmem_altmap_offset(pgmap_altmap(pgmap)); -} - -static unsigned long pfn_end(struct dev_pagemap *pgmap) -{ - const struct resource *res = &pgmap->res; - - return (res->start + resource_size(res)) >> PAGE_SHIFT; -} - -static unsigned long pfn_next(unsigned long pfn) -{ - if (pfn % 1024 == 0) - cond_resched(); - return pfn + 1; -} - -#define for_each_device_pfn(pfn, map) \ - for (pfn = pfn_first(map); pfn < pfn_end(map); pfn = pfn_next(pfn)) - -static void dev_pagemap_kill(struct dev_pagemap *pgmap) -{ - if (pgmap->ops && pgmap->ops->kill) - pgmap->ops->kill(pgmap); - else - percpu_ref_kill(pgmap->ref); -} - -static void dev_pagemap_cleanup(struct dev_pagemap *pgmap) -{ - if (pgmap->ops && pgmap->ops->cleanup) { - pgmap->ops->cleanup(pgmap); - } else { - wait_for_completion(&pgmap->done); - percpu_ref_exit(pgmap->ref); - } -} - -static void devm_memremap_pages_release(void *data) -{ - struct dev_pagemap *pgmap = data; - struct device *dev = pgmap->dev; - struct resource *res = &pgmap->res; - unsigned long pfn; - int nid; - - dev_pagemap_kill(pgmap); - for_each_device_pfn(pfn, pgmap) - put_page(pfn_to_page(pfn)); - dev_pagemap_cleanup(pgmap); - - /* pages are dead and unused, undo the arch mapping */ - nid = page_to_nid(pfn_to_page(PHYS_PFN(res->start))); - - mem_hotplug_begin(); - if (pgmap->type == MEMORY_DEVICE_PRIVATE) { - pfn = PHYS_PFN(res->start); - __remove_pages(page_zone(pfn_to_page(pfn)), pfn, - PHYS_PFN(resource_size(res)), NULL); - } else { - arch_remove_memory(nid, res->start, resource_size(res), - pgmap_altmap(pgmap)); - kasan_remove_zero_shadow(__va(res->start), resource_size(res)); - } - mem_hotplug_done(); - - untrack_pfn(NULL, PHYS_PFN(res->start), resource_size(res)); - pgmap_array_delete(res); - dev_WARN_ONCE(dev, pgmap->altmap.alloc, - "%s: failed to free all reserved pages\n", __func__); -} - -static void dev_pagemap_percpu_release(struct percpu_ref *ref) -{ - struct dev_pagemap *pgmap = - container_of(ref, struct dev_pagemap, internal_ref); - - complete(&pgmap->done); -} - -/** - * devm_memremap_pages - remap and provide memmap backing for the given resource - * @dev: hosting device for @res - * @pgmap: pointer to a struct dev_pagemap - * - * Notes: - * 1/ At a minimum the res and type members of @pgmap must be initialized - * by the caller before passing it to this function - * - * 2/ The altmap field may optionally be initialized, in which case - * PGMAP_ALTMAP_VALID must be set in pgmap->flags. - * - * 3/ The ref field may optionally be provided, in which pgmap->ref must be - * 'live' on entry and will be killed and reaped at - * devm_memremap_pages_release() time, or if this routine fails. - * - * 4/ res is expected to be a host memory range that could feasibly be - * treated as a "System RAM" range, i.e. not a device mmio range, but - * this is not enforced. - */ -void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) -{ - struct resource *res = &pgmap->res; - struct dev_pagemap *conflict_pgmap; - struct mhp_restrictions restrictions = { - /* - * We do not want any optional features only our own memmap - */ - .altmap = pgmap_altmap(pgmap), - }; - pgprot_t pgprot = PAGE_KERNEL; - int error, nid, is_ram; - bool need_devmap_managed = true; - - switch (pgmap->type) { - case MEMORY_DEVICE_PRIVATE: - if (!IS_ENABLED(CONFIG_DEVICE_PRIVATE)) { - WARN(1, "Device private memory not supported\n"); - return ERR_PTR(-EINVAL); - } - if (!pgmap->ops || !pgmap->ops->migrate_to_ram) { - WARN(1, "Missing migrate_to_ram method\n"); - return ERR_PTR(-EINVAL); - } - break; - case MEMORY_DEVICE_FS_DAX: - if (!IS_ENABLED(CONFIG_ZONE_DEVICE) || - IS_ENABLED(CONFIG_FS_DAX_LIMITED)) { - WARN(1, "File system DAX not supported\n"); - return ERR_PTR(-EINVAL); - } - break; - case MEMORY_DEVICE_DEVDAX: - case MEMORY_DEVICE_PCI_P2PDMA: - need_devmap_managed = false; - break; - default: - WARN(1, "Invalid pgmap type %d\n", pgmap->type); - break; - } - - if (!pgmap->ref) { - if (pgmap->ops && (pgmap->ops->kill || pgmap->ops->cleanup)) - return ERR_PTR(-EINVAL); - - init_completion(&pgmap->done); - error = percpu_ref_init(&pgmap->internal_ref, - dev_pagemap_percpu_release, 0, GFP_KERNEL); - if (error) - return ERR_PTR(error); - pgmap->ref = &pgmap->internal_ref; - } else { - if (!pgmap->ops || !pgmap->ops->kill || !pgmap->ops->cleanup) { - WARN(1, "Missing reference count teardown definition\n"); - return ERR_PTR(-EINVAL); - } - } - - if (need_devmap_managed) { - error = devmap_managed_enable_get(dev, pgmap); - if (error) - return ERR_PTR(error); - } - - conflict_pgmap = get_dev_pagemap(PHYS_PFN(res->start), NULL); - if (conflict_pgmap) { - dev_WARN(dev, "Conflicting mapping in same section\n"); - put_dev_pagemap(conflict_pgmap); - error = -ENOMEM; - goto err_array; - } - - conflict_pgmap = get_dev_pagemap(PHYS_PFN(res->end), NULL); - if (conflict_pgmap) { - dev_WARN(dev, "Conflicting mapping in same section\n"); - put_dev_pagemap(conflict_pgmap); - error = -ENOMEM; - goto err_array; - } - - is_ram = region_intersects(res->start, resource_size(res), - IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE); - - if (is_ram != REGION_DISJOINT) { - WARN_ONCE(1, "%s attempted on %s region %pr\n", __func__, - is_ram == REGION_MIXED ? "mixed" : "ram", res); - error = -ENXIO; - goto err_array; - } - - pgmap->dev = dev; - - error = xa_err(xa_store_range(&pgmap_array, PHYS_PFN(res->start), - PHYS_PFN(res->end), pgmap, GFP_KERNEL)); - if (error) - goto err_array; - - nid = dev_to_node(dev); - if (nid < 0) - nid = numa_mem_id(); - - error = track_pfn_remap(NULL, &pgprot, PHYS_PFN(res->start), 0, - resource_size(res)); - if (error) - goto err_pfn_remap; - - mem_hotplug_begin(); - - /* - * For device private memory we call add_pages() as we only need to - * allocate and initialize struct page for the device memory. More- - * over the device memory is un-accessible thus we do not want to - * create a linear mapping for the memory like arch_add_memory() - * would do. - * - * For all other device memory types, which are accessible by - * the CPU, we do want the linear mapping and thus use - * arch_add_memory(). - */ - if (pgmap->type == MEMORY_DEVICE_PRIVATE) { - error = add_pages(nid, PHYS_PFN(res->start), - PHYS_PFN(resource_size(res)), &restrictions); - } else { - error = kasan_add_zero_shadow(__va(res->start), resource_size(res)); - if (error) { - mem_hotplug_done(); - goto err_kasan; - } - - error = arch_add_memory(nid, res->start, resource_size(res), - &restrictions); - } - - if (!error) { - struct zone *zone; - - zone = &NODE_DATA(nid)->node_zones[ZONE_DEVICE]; - move_pfn_range_to_zone(zone, PHYS_PFN(res->start), - PHYS_PFN(resource_size(res)), restrictions.altmap); - } - - mem_hotplug_done(); - if (error) - goto err_add_memory; - - /* - * Initialization of the pages has been deferred until now in order - * to allow us to do the work while not holding the hotplug lock. - */ - memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], - PHYS_PFN(res->start), - PHYS_PFN(resource_size(res)), pgmap); - percpu_ref_get_many(pgmap->ref, pfn_end(pgmap) - pfn_first(pgmap)); - - error = devm_add_action_or_reset(dev, devm_memremap_pages_release, - pgmap); - if (error) - return ERR_PTR(error); - - return __va(res->start); - - err_add_memory: - kasan_remove_zero_shadow(__va(res->start), resource_size(res)); - err_kasan: - untrack_pfn(NULL, PHYS_PFN(res->start), resource_size(res)); - err_pfn_remap: - pgmap_array_delete(res); - err_array: - dev_pagemap_kill(pgmap); - dev_pagemap_cleanup(pgmap); - return ERR_PTR(error); -} -EXPORT_SYMBOL_GPL(devm_memremap_pages); - -void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap) -{ - devm_release_action(dev, devm_memremap_pages_release, pgmap); -} -EXPORT_SYMBOL_GPL(devm_memunmap_pages); - -unsigned long vmem_altmap_offset(struct vmem_altmap *altmap) -{ - /* number of pfns from base where pfn_to_page() is valid */ - if (altmap) - return altmap->reserve + altmap->free; - return 0; -} - -void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns) -{ - altmap->alloc -= nr_pfns; -} - -/** - * get_dev_pagemap() - take a new live reference on the dev_pagemap for @pfn - * @pfn: page frame number to lookup page_map - * @pgmap: optional known pgmap that already has a reference - * - * If @pgmap is non-NULL and covers @pfn it will be returned as-is. If @pgmap - * is non-NULL but does not cover @pfn the reference to it will be released. - */ -struct dev_pagemap *get_dev_pagemap(unsigned long pfn, - struct dev_pagemap *pgmap) -{ - resource_size_t phys = PFN_PHYS(pfn); - - /* - * In the cached case we're already holding a live reference. - */ - if (pgmap) { - if (phys >= pgmap->res.start && phys <= pgmap->res.end) - return pgmap; - put_dev_pagemap(pgmap); - } - - /* fall back to slow path lookup */ - rcu_read_lock(); - pgmap = xa_load(&pgmap_array, PHYS_PFN(phys)); - if (pgmap && !percpu_ref_tryget_live(pgmap->ref)) - pgmap = NULL; - rcu_read_unlock(); - - return pgmap; -} -EXPORT_SYMBOL_GPL(get_dev_pagemap); - -#ifdef CONFIG_DEV_PAGEMAP_OPS -void __put_devmap_managed_page(struct page *page) -{ - int count = page_ref_dec_return(page); - - /* - * If refcount is 1 then page is freed and refcount is stable as nobody - * holds a reference on the page. - */ - if (count == 1) { - /* Clear Active bit in case of parallel mark_page_accessed */ - __ClearPageActive(page); - __ClearPageWaiters(page); - - mem_cgroup_uncharge(page); - - page->pgmap->ops->page_free(page); - } else if (!count) - __put_page(page); -} -EXPORT_SYMBOL(__put_devmap_managed_page); -#endif /* CONFIG_DEV_PAGEMAP_OPS */ -- cgit v1.2.3 From f4904815f97a934258445a8f763f6b6c48f007e7 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Fri, 2 Aug 2019 15:59:43 +0100 Subject: sched/deadline: Fix double accounting of rq/running bw in push & pull {push,pull}_dl_task() always calls {de,}activate_task() with .flags=0 which sets p->on_rq=TASK_ON_RQ_MIGRATING. {push,pull}_dl_task()->{de,}activate_task()->{de,en}queue_task()-> {de,en}queue_task_dl() calls {sub,add}_{running,rq}_bw() since p->on_rq==TASK_ON_RQ_MIGRATING. So {sub,add}_{running,rq}_bw() in {push,pull}_dl_task() is double-accounting for that task. Fix it by removing rq/running bw accounting in [push/pull]_dl_task(). Fixes: 7dd778841164 ("sched/core: Unify p->on_rq updates") Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Cc: Valentin Schneider Cc: Ingo Molnar Cc: Luca Abeni Cc: Daniel Bristot de Oliveira Cc: Juri Lelli Cc: Qais Yousef Link: https://lkml.kernel.org/r/20190802145945.18702-2-dietmar.eggemann@arm.com --- kernel/sched/deadline.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index ef5b9f6b1d42..46122edd8552 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2088,17 +2088,13 @@ retry: } deactivate_task(rq, next_task, 0); - sub_running_bw(&next_task->dl, &rq->dl); - sub_rq_bw(&next_task->dl, &rq->dl); set_task_cpu(next_task, later_rq->cpu); - add_rq_bw(&next_task->dl, &later_rq->dl); /* * Update the later_rq clock here, because the clock is used * by the cpufreq_update_util() inside __add_running_bw(). */ update_rq_clock(later_rq); - add_running_bw(&next_task->dl, &later_rq->dl); activate_task(later_rq, next_task, ENQUEUE_NOCLOCK); ret = 1; @@ -2186,11 +2182,7 @@ static void pull_dl_task(struct rq *this_rq) resched = true; deactivate_task(src_rq, p, 0); - sub_running_bw(&p->dl, &src_rq->dl); - sub_rq_bw(&p->dl, &src_rq->dl); set_task_cpu(p, this_cpu); - add_rq_bw(&p->dl, &this_rq->dl); - add_running_bw(&p->dl, &this_rq->dl); activate_task(this_rq, p, 0); dmin = p->dl.deadline; -- cgit v1.2.3 From 14f5c7b46a41a595fc61db37f55721714729e59e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 1 Aug 2019 12:41:31 +0200 Subject: sched/psi: Reduce psimon FIFO priority PSI defaults to a FIFO-99 thread, reduce this to FIFO-1. FIFO-99 is the very highest priority available to SCHED_FIFO and it not a suitable default; it would indicate the psi work is the most important work on the machine. Since Real-Time tasks will have pre-allocated memory and locked it in place, Real-Time tasks do not care about PSI. All it needs is to be above OTHER. Signed-off-by: Peter Zijlstra (Intel) Acked-by: Johannes Weiner Tested-by: Suren Baghdasaryan Cc: Thomas Gleixner --- kernel/sched/psi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 7acc632c3b82..7fe2c5fd26b5 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -1051,7 +1051,7 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group, if (!rcu_access_pointer(group->poll_kworker)) { struct sched_param param = { - .sched_priority = MAX_RT_PRIO - 1, + .sched_priority = 1, }; struct kthread_worker *kworker; -- cgit v1.2.3 From 04e048cf09d7b5fc995817cdc5ae1acd4482429c Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Mon, 29 Jul 2019 18:33:10 -0700 Subject: sched/psi: Do not require setsched permission from the trigger creator When a process creates a new trigger by writing into /proc/pressure/* files, permissions to write such a file should be used to determine whether the process is allowed to do so or not. Current implementation would also require such a process to have setsched capability. Setting of psi trigger thread's scheduling policy is an implementation detail and should not be exposed to the user level. Remove the permission check by using _nocheck version of the function. Suggested-by: Nick Kralevich Signed-off-by: Suren Baghdasaryan Signed-off-by: Peter Zijlstra (Intel) Cc: lizefan@huawei.com Cc: mingo@redhat.com Cc: akpm@linux-foundation.org Cc: kernel-team@android.com Cc: dennisszhou@gmail.com Cc: dennis@kernel.org Cc: hannes@cmpxchg.org Cc: axboe@kernel.dk Link: https://lkml.kernel.org/r/20190730013310.162367-1-surenb@google.com --- kernel/sched/psi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 7fe2c5fd26b5..23fbbcc414d5 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -1061,7 +1061,7 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group, mutex_unlock(&group->trigger_lock); return ERR_CAST(kworker); } - sched_setscheduler(kworker->task, SCHED_FIFO, ¶m); + sched_setscheduler_nocheck(kworker->task, SCHED_FIFO, ¶m); kthread_init_delayed_work(&group->poll_work, psi_poll_work); rcu_assign_pointer(group->poll_kworker, kworker); -- cgit v1.2.3 From 491beed3b102b6e6c0e7734200661242226e3933 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 5 Aug 2019 09:19:06 +0800 Subject: genirq/affinity: Create affinity mask for single vector Since commit c66d4bd110a1f8 ("genirq/affinity: Add new callback for (re)calculating interrupt sets"), irq_create_affinity_masks() returns NULL in case of single vector. This change has caused regression on some drivers, such as lpfc. The problem is that single vector requests can happen in some generic cases: 1) kdump kernel 2) irq vectors resource is close to exhaustion. If in that situation the affinity mask for a single vector is not created, every caller has to handle the special case. There is no reason why the mask cannot be created, so remove the check for a single vector and create the mask. Fixes: c66d4bd110a1f8 ("genirq/affinity: Add new callback for (re)calculating interrupt sets") Signed-off-by: Ming Lei Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20190805011906.5020-1-ming.lei@redhat.com --- kernel/irq/affinity.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index 4352b08ae48d..6fef48033f96 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -251,11 +251,9 @@ irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd) * Determine the number of vectors which need interrupt affinities * assigned. If the pre/post request exhausts the available vectors * then nothing to do here except for invoking the calc_sets() - * callback so the device driver can adjust to the situation. If there - * is only a single vector, then managing the queue is pointless as - * well. + * callback so the device driver can adjust to the situation. */ - if (nvecs > 1 && nvecs > affd->pre_vectors + affd->post_vectors) + if (nvecs > affd->pre_vectors + affd->post_vectors) affvecs = nvecs - affd->pre_vectors - affd->post_vectors; else affvecs = 0; -- cgit v1.2.3