From 476c5818c37a7828d558f34ae01f0c32f8bfadde Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 29 Aug 2020 22:03:13 +0900 Subject: llist: Add nonatomic __llist_add() and __llist_dell_all() We'll use these in the new, lockless kretprobes code. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Masami Hiramatsu Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/159870619318.1229682.13027387548510028721.stgit@devnote2 --- include/linux/llist.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'include/linux') diff --git a/include/linux/llist.h b/include/linux/llist.h index 2e9c7215882b..24f207b0190b 100644 --- a/include/linux/llist.h +++ b/include/linux/llist.h @@ -197,6 +197,16 @@ static inline struct llist_node *llist_next(struct llist_node *node) extern bool llist_add_batch(struct llist_node *new_first, struct llist_node *new_last, struct llist_head *head); + +static inline bool __llist_add_batch(struct llist_node *new_first, + struct llist_node *new_last, + struct llist_head *head) +{ + new_last->next = head->first; + head->first = new_first; + return new_last->next == NULL; +} + /** * llist_add - add a new entry * @new: new entry to be added @@ -209,6 +219,11 @@ static inline bool llist_add(struct llist_node *new, struct llist_head *head) return llist_add_batch(new, new, head); } +static inline bool __llist_add(struct llist_node *new, struct llist_head *head) +{ + return __llist_add_batch(new, new, head); +} + /** * llist_del_all - delete all entries from lock-less list * @head: the head of lock-less list to delete all entries @@ -222,6 +237,14 @@ static inline struct llist_node *llist_del_all(struct llist_head *head) return xchg(&head->first, NULL); } +static inline struct llist_node *__llist_del_all(struct llist_head *head) +{ + struct llist_node *first = head->first; + + head->first = NULL; + return first; +} + extern struct llist_node *llist_del_first(struct llist_head *head); struct llist_node *llist_reverse_order(struct llist_node *head); -- cgit v1.2.3 From d741bf41d7c7db4898bacfcb020353cddc032fd8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 29 Aug 2020 22:03:24 +0900 Subject: kprobes: Remove kretprobe hash The kretprobe hash is mostly superfluous, replace it with a per-task variable. This gets rid of the task hash and it's related locking. Note that this may change the kprobes module-exported API for kretprobe handlers. If any out-of-tree kretprobe user uses ri->rp, use get_kretprobe(ri) instead. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Masami Hiramatsu Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/159870620431.1229682.16325792502413731312.stgit@devnote2 --- include/linux/kprobes.h | 19 +++++++++++++++++-- include/linux/sched.h | 4 ++++ 2 files changed, 21 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 5c8c271fa1e9..00cf4421efd5 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -27,6 +27,7 @@ #include #include #include +#include #include #ifdef CONFIG_KPROBES @@ -144,6 +145,11 @@ static inline int kprobe_ftrace(struct kprobe *p) * ignored, due to maxactive being too low. * */ +struct kretprobe_holder { + struct kretprobe *rp; + refcount_t ref; +}; + struct kretprobe { struct kprobe kp; kretprobe_handler_t handler; @@ -152,17 +158,18 @@ struct kretprobe { int nmissed; size_t data_size; struct hlist_head free_instances; + struct kretprobe_holder *rph; raw_spinlock_t lock; }; struct kretprobe_instance { union { + struct llist_node llist; struct hlist_node hlist; struct rcu_head rcu; }; - struct kretprobe *rp; + struct kretprobe_holder *rph; kprobe_opcode_t *ret_addr; - struct task_struct *task; void *fp; char data[]; }; @@ -221,6 +228,14 @@ unsigned long kretprobe_trampoline_handler(struct pt_regs *regs, return ret; } +static nokprobe_inline struct kretprobe *get_kretprobe(struct kretprobe_instance *ri) +{ + RCU_LOCKDEP_WARN(!rcu_read_lock_any_held(), + "Kretprobe is accessed from instance under preemptive context"); + + return READ_ONCE(ri->rph->rp); +} + #else /* CONFIG_KRETPROBES */ static inline void arch_prepare_kretprobe(struct kretprobe *rp, struct pt_regs *regs) diff --git a/include/linux/sched.h b/include/linux/sched.h index afe01e232935..5911805cafde 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1315,6 +1315,10 @@ struct task_struct { struct callback_head mce_kill_me; #endif +#ifdef CONFIG_KRETPROBES + struct llist_head kretprobe_instances; +#endif + /* * New fields for task_struct should be added above here, so that * they are included in the randomized portion of task_struct. -- cgit v1.2.3 From 29f006fdefe6f88abde973a0b0f20d2704e93fd4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 29 Aug 2020 22:03:35 +0900 Subject: asm-generic/atomic: Add try_cmpxchg() fallbacks Only x86 provides try_cmpxchg() outside of the atomic_t interfaces, provide generic fallbacks to create this interface from the widely available cmpxchg() function. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Masami Hiramatsu Signed-off-by: Ingo Molnar Acked-by: Will Deacon Link: https://lore.kernel.org/r/159870621515.1229682.15506193091065001742.stgit@devnote2 --- include/linux/atomic-arch-fallback.h | 90 ++++++++++++++++++++++++++++++++---- include/linux/atomic-fallback.h | 90 ++++++++++++++++++++++++++++++++---- 2 files changed, 160 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/atomic-arch-fallback.h b/include/linux/atomic-arch-fallback.h index bcb6aa27cfa6..a3dba31df01e 100644 --- a/include/linux/atomic-arch-fallback.h +++ b/include/linux/atomic-arch-fallback.h @@ -9,9 +9,9 @@ #include #ifndef arch_xchg_relaxed -#define arch_xchg_relaxed arch_xchg -#define arch_xchg_acquire arch_xchg -#define arch_xchg_release arch_xchg +#define arch_xchg_acquire arch_xchg +#define arch_xchg_release arch_xchg +#define arch_xchg_relaxed arch_xchg #else /* arch_xchg_relaxed */ #ifndef arch_xchg_acquire @@ -32,9 +32,9 @@ #endif /* arch_xchg_relaxed */ #ifndef arch_cmpxchg_relaxed -#define arch_cmpxchg_relaxed arch_cmpxchg -#define arch_cmpxchg_acquire arch_cmpxchg -#define arch_cmpxchg_release arch_cmpxchg +#define arch_cmpxchg_acquire arch_cmpxchg +#define arch_cmpxchg_release arch_cmpxchg +#define arch_cmpxchg_relaxed arch_cmpxchg #else /* arch_cmpxchg_relaxed */ #ifndef arch_cmpxchg_acquire @@ -55,9 +55,9 @@ #endif /* arch_cmpxchg_relaxed */ #ifndef arch_cmpxchg64_relaxed -#define arch_cmpxchg64_relaxed arch_cmpxchg64 -#define arch_cmpxchg64_acquire arch_cmpxchg64 -#define arch_cmpxchg64_release arch_cmpxchg64 +#define arch_cmpxchg64_acquire arch_cmpxchg64 +#define arch_cmpxchg64_release arch_cmpxchg64 +#define arch_cmpxchg64_relaxed arch_cmpxchg64 #else /* arch_cmpxchg64_relaxed */ #ifndef arch_cmpxchg64_acquire @@ -77,6 +77,76 @@ #endif /* arch_cmpxchg64_relaxed */ +#ifndef arch_try_cmpxchg_relaxed +#ifdef arch_try_cmpxchg +#define arch_try_cmpxchg_acquire arch_try_cmpxchg +#define arch_try_cmpxchg_release arch_try_cmpxchg +#define arch_try_cmpxchg_relaxed arch_try_cmpxchg +#endif /* arch_try_cmpxchg */ + +#ifndef arch_try_cmpxchg +#define arch_try_cmpxchg(_ptr, _oldp, _new) \ +({ \ + typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ + ___r = arch_cmpxchg((_ptr), ___o, (_new)); \ + if (unlikely(___r != ___o)) \ + *___op = ___r; \ + likely(___r == ___o); \ +}) +#endif /* arch_try_cmpxchg */ + +#ifndef arch_try_cmpxchg_acquire +#define arch_try_cmpxchg_acquire(_ptr, _oldp, _new) \ +({ \ + typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ + ___r = arch_cmpxchg_acquire((_ptr), ___o, (_new)); \ + if (unlikely(___r != ___o)) \ + *___op = ___r; \ + likely(___r == ___o); \ +}) +#endif /* arch_try_cmpxchg_acquire */ + +#ifndef arch_try_cmpxchg_release +#define arch_try_cmpxchg_release(_ptr, _oldp, _new) \ +({ \ + typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ + ___r = arch_cmpxchg_release((_ptr), ___o, (_new)); \ + if (unlikely(___r != ___o)) \ + *___op = ___r; \ + likely(___r == ___o); \ +}) +#endif /* arch_try_cmpxchg_release */ + +#ifndef arch_try_cmpxchg_relaxed +#define arch_try_cmpxchg_relaxed(_ptr, _oldp, _new) \ +({ \ + typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ + ___r = arch_cmpxchg_relaxed((_ptr), ___o, (_new)); \ + if (unlikely(___r != ___o)) \ + *___op = ___r; \ + likely(___r == ___o); \ +}) +#endif /* arch_try_cmpxchg_relaxed */ + +#else /* arch_try_cmpxchg_relaxed */ + +#ifndef arch_try_cmpxchg_acquire +#define arch_try_cmpxchg_acquire(...) \ + __atomic_op_acquire(arch_try_cmpxchg, __VA_ARGS__) +#endif + +#ifndef arch_try_cmpxchg_release +#define arch_try_cmpxchg_release(...) \ + __atomic_op_release(arch_try_cmpxchg, __VA_ARGS__) +#endif + +#ifndef arch_try_cmpxchg +#define arch_try_cmpxchg(...) \ + __atomic_op_fence(arch_try_cmpxchg, __VA_ARGS__) +#endif + +#endif /* arch_try_cmpxchg_relaxed */ + #ifndef arch_atomic_read_acquire static __always_inline int arch_atomic_read_acquire(const atomic_t *v) @@ -2288,4 +2358,4 @@ arch_atomic64_dec_if_positive(atomic64_t *v) #endif #endif /* _LINUX_ATOMIC_FALLBACK_H */ -// 90cd26cfd69d2250303d654955a0cc12620fb91b +// cca554917d7ea73d5e3e7397dd70c484cad9b2c4 diff --git a/include/linux/atomic-fallback.h b/include/linux/atomic-fallback.h index fd525c71d676..2a3f55d98be9 100644 --- a/include/linux/atomic-fallback.h +++ b/include/linux/atomic-fallback.h @@ -9,9 +9,9 @@ #include #ifndef xchg_relaxed -#define xchg_relaxed xchg -#define xchg_acquire xchg -#define xchg_release xchg +#define xchg_acquire xchg +#define xchg_release xchg +#define xchg_relaxed xchg #else /* xchg_relaxed */ #ifndef xchg_acquire @@ -32,9 +32,9 @@ #endif /* xchg_relaxed */ #ifndef cmpxchg_relaxed -#define cmpxchg_relaxed cmpxchg -#define cmpxchg_acquire cmpxchg -#define cmpxchg_release cmpxchg +#define cmpxchg_acquire cmpxchg +#define cmpxchg_release cmpxchg +#define cmpxchg_relaxed cmpxchg #else /* cmpxchg_relaxed */ #ifndef cmpxchg_acquire @@ -55,9 +55,9 @@ #endif /* cmpxchg_relaxed */ #ifndef cmpxchg64_relaxed -#define cmpxchg64_relaxed cmpxchg64 -#define cmpxchg64_acquire cmpxchg64 -#define cmpxchg64_release cmpxchg64 +#define cmpxchg64_acquire cmpxchg64 +#define cmpxchg64_release cmpxchg64 +#define cmpxchg64_relaxed cmpxchg64 #else /* cmpxchg64_relaxed */ #ifndef cmpxchg64_acquire @@ -77,6 +77,76 @@ #endif /* cmpxchg64_relaxed */ +#ifndef try_cmpxchg_relaxed +#ifdef try_cmpxchg +#define try_cmpxchg_acquire try_cmpxchg +#define try_cmpxchg_release try_cmpxchg +#define try_cmpxchg_relaxed try_cmpxchg +#endif /* try_cmpxchg */ + +#ifndef try_cmpxchg +#define try_cmpxchg(_ptr, _oldp, _new) \ +({ \ + typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ + ___r = cmpxchg((_ptr), ___o, (_new)); \ + if (unlikely(___r != ___o)) \ + *___op = ___r; \ + likely(___r == ___o); \ +}) +#endif /* try_cmpxchg */ + +#ifndef try_cmpxchg_acquire +#define try_cmpxchg_acquire(_ptr, _oldp, _new) \ +({ \ + typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ + ___r = cmpxchg_acquire((_ptr), ___o, (_new)); \ + if (unlikely(___r != ___o)) \ + *___op = ___r; \ + likely(___r == ___o); \ +}) +#endif /* try_cmpxchg_acquire */ + +#ifndef try_cmpxchg_release +#define try_cmpxchg_release(_ptr, _oldp, _new) \ +({ \ + typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ + ___r = cmpxchg_release((_ptr), ___o, (_new)); \ + if (unlikely(___r != ___o)) \ + *___op = ___r; \ + likely(___r == ___o); \ +}) +#endif /* try_cmpxchg_release */ + +#ifndef try_cmpxchg_relaxed +#define try_cmpxchg_relaxed(_ptr, _oldp, _new) \ +({ \ + typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ + ___r = cmpxchg_relaxed((_ptr), ___o, (_new)); \ + if (unlikely(___r != ___o)) \ + *___op = ___r; \ + likely(___r == ___o); \ +}) +#endif /* try_cmpxchg_relaxed */ + +#else /* try_cmpxchg_relaxed */ + +#ifndef try_cmpxchg_acquire +#define try_cmpxchg_acquire(...) \ + __atomic_op_acquire(try_cmpxchg, __VA_ARGS__) +#endif + +#ifndef try_cmpxchg_release +#define try_cmpxchg_release(...) \ + __atomic_op_release(try_cmpxchg, __VA_ARGS__) +#endif + +#ifndef try_cmpxchg +#define try_cmpxchg(...) \ + __atomic_op_fence(try_cmpxchg, __VA_ARGS__) +#endif + +#endif /* try_cmpxchg_relaxed */ + #define arch_atomic_read atomic_read #define arch_atomic_read_acquire atomic_read_acquire @@ -2522,4 +2592,4 @@ atomic64_dec_if_positive(atomic64_t *v) #endif #endif /* _LINUX_ATOMIC_FALLBACK_H */ -// 9d95b56f98d82a2a26c7b79ccdd0c47572d50a6f +// d78e6c293c661c15188f0ec05bce45188c8d5892 -- cgit v1.2.3 From e563604a5f5a891283b6a8db4001cee833a7c6b8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 29 Aug 2020 22:03:46 +0900 Subject: freelist: Implement lockless freelist A simple CAS-based lock-free free list. Not the fastest thing in the world under heavy contention, but simple and correct (assuming nodes are never freed until after the free list is destroyed), and fairly speedy under low contention. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Masami Hiramatsu Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/159870622579.1229682.16729440870040944993.stgit@devnote2 --- include/linux/freelist.h | 129 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 129 insertions(+) create mode 100644 include/linux/freelist.h (limited to 'include/linux') diff --git a/include/linux/freelist.h b/include/linux/freelist.h new file mode 100644 index 000000000000..fc1842b96469 --- /dev/null +++ b/include/linux/freelist.h @@ -0,0 +1,129 @@ +/* SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause */ +#ifndef FREELIST_H +#define FREELIST_H + +#include + +/* + * Copyright: cameron@moodycamel.com + * + * A simple CAS-based lock-free free list. Not the fastest thing in the world + * under heavy contention, but simple and correct (assuming nodes are never + * freed until after the free list is destroyed), and fairly speedy under low + * contention. + * + * Adapted from: https://moodycamel.com/blog/2014/solving-the-aba-problem-for-lock-free-free-lists + */ + +struct freelist_node { + atomic_t refs; + struct freelist_node *next; +}; + +struct freelist_head { + struct freelist_node *head; +}; + +#define REFS_ON_FREELIST 0x80000000 +#define REFS_MASK 0x7FFFFFFF + +static inline void __freelist_add(struct freelist_node *node, struct freelist_head *list) +{ + /* + * Since the refcount is zero, and nobody can increase it once it's + * zero (except us, and we run only one copy of this method per node at + * a time, i.e. the single thread case), then we know we can safely + * change the next pointer of the node; however, once the refcount is + * back above zero, then other threads could increase it (happens under + * heavy contention, when the refcount goes to zero in between a load + * and a refcount increment of a node in try_get, then back up to + * something non-zero, then the refcount increment is done by the other + * thread) -- so if the CAS to add the node to the actual list fails, + * decrese the refcount and leave the add operation to the next thread + * who puts the refcount back to zero (which could be us, hence the + * loop). + */ + struct freelist_node *head = READ_ONCE(list->head); + + for (;;) { + WRITE_ONCE(node->next, head); + atomic_set_release(&node->refs, 1); + + if (!try_cmpxchg_release(&list->head, &head, node)) { + /* + * Hmm, the add failed, but we can only try again when + * the refcount goes back to zero. + */ + if (atomic_fetch_add_release(REFS_ON_FREELIST - 1, &node->refs) == 1) + continue; + } + return; + } +} + +static inline void freelist_add(struct freelist_node *node, struct freelist_head *list) +{ + /* + * We know that the should-be-on-freelist bit is 0 at this point, so + * it's safe to set it using a fetch_add. + */ + if (!atomic_fetch_add_release(REFS_ON_FREELIST, &node->refs)) { + /* + * Oh look! We were the last ones referencing this node, and we + * know we want to add it to the free list, so let's do it! + */ + __freelist_add(node, list); + } +} + +static inline struct freelist_node *freelist_try_get(struct freelist_head *list) +{ + struct freelist_node *prev, *next, *head = smp_load_acquire(&list->head); + unsigned int refs; + + while (head) { + prev = head; + refs = atomic_read(&head->refs); + if ((refs & REFS_MASK) == 0 || + !atomic_try_cmpxchg_acquire(&head->refs, &refs, refs+1)) { + head = smp_load_acquire(&list->head); + continue; + } + + /* + * Good, reference count has been incremented (it wasn't at + * zero), which means we can read the next and not worry about + * it changing between now and the time we do the CAS. + */ + next = READ_ONCE(head->next); + if (try_cmpxchg_acquire(&list->head, &head, next)) { + /* + * Yay, got the node. This means it was on the list, + * which means should-be-on-freelist must be false no + * matter the refcount (because nobody else knows it's + * been taken off yet, it can't have been put back on). + */ + WARN_ON_ONCE(atomic_read(&head->refs) & REFS_ON_FREELIST); + + /* + * Decrease refcount twice, once for our ref, and once + * for the list's ref. + */ + atomic_fetch_add(-2, &head->refs); + + return head; + } + + /* + * OK, the head must have changed on us, but we still need to decrement + * the refcount we increased. + */ + refs = atomic_fetch_add(-1, &prev->refs); + if (refs == REFS_ON_FREELIST + 1) + __freelist_add(prev, list); + } + + return NULL; +} + +#endif /* FREELIST_H */ -- cgit v1.2.3 From 6e426e0fcd20ce144bb93e00b70df51e9f2e08c3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 29 Aug 2020 22:03:56 +0900 Subject: kprobes: Replace rp->free_instance with freelist Gets rid of rp->lock, and as a result kretprobes are now fully lockless. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Masami Hiramatsu Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/159870623583.1229682.17472357584134058687.stgit@devnote2 --- include/linux/kprobes.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 00cf4421efd5..b7824e3f1ef5 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -28,6 +28,7 @@ #include #include #include +#include #include #ifdef CONFIG_KPROBES @@ -157,17 +158,16 @@ struct kretprobe { int maxactive; int nmissed; size_t data_size; - struct hlist_head free_instances; + struct freelist_head freelist; struct kretprobe_holder *rph; - raw_spinlock_t lock; }; struct kretprobe_instance { union { - struct llist_node llist; - struct hlist_node hlist; + struct freelist_node freelist; struct rcu_head rcu; }; + struct llist_node llist; struct kretprobe_holder *rph; kprobe_opcode_t *ret_addr; void *fp; -- cgit v1.2.3