From f1b499f029c5dde85d46a8811353c62f29157541 Mon Sep 17 00:00:00 2001 From: John Kacur Date: Thu, 5 Aug 2010 17:10:53 +0200 Subject: lockdep: Remove __debug_show_held_locks There is no longer any functional difference between __debug_show_held_locks() and debug_show_held_locks(), so remove the former. Signed-off-by: John Kacur Cc: Peter Zijlstra LKML-Reference: <1281021054-4228-1-git-send-email-jkacur@redhat.com> Signed-off-by: Ingo Molnar --- include/linux/debug_locks.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/debug_locks.h b/include/linux/debug_locks.h index 29b3ce3f2a1d..2833452ea01c 100644 --- a/include/linux/debug_locks.h +++ b/include/linux/debug_locks.h @@ -49,7 +49,6 @@ struct task_struct; #ifdef CONFIG_LOCKDEP extern void debug_show_all_locks(void); -extern void __debug_show_held_locks(struct task_struct *task); extern void debug_show_held_locks(struct task_struct *task); extern void debug_check_no_locks_freed(const void *from, unsigned long len); extern void debug_check_no_locks_held(struct task_struct *task); @@ -58,10 +57,6 @@ static inline void debug_show_all_locks(void) { } -static inline void __debug_show_held_locks(struct task_struct *task) -{ -} - static inline void debug_show_held_locks(struct task_struct *task) { } -- cgit v1.2.3 From ca5ecddfa8fcbd948c95530e7e817cee9fb43a3d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 28 Apr 2010 14:39:09 -0700 Subject: rcu: define __rcu address space modifier for sparse This commit provides definitions for the __rcu annotation defined earlier. This annotation permits sparse to check for correct use of RCU-protected pointers. If a pointer that is annotated with __rcu is accessed directly (as opposed to via rcu_dereference(), rcu_assign_pointer(), or one of their variants), sparse can be made to complain. To enable such complaints, use the new default-disabled CONFIG_SPARSE_RCU_POINTER kernel configuration option. Please note that these sparse complaints are intended to be a debugging aid, -not- a code-style-enforcement mechanism. There are special rcu_dereference_protected() and rcu_access_pointer() accessors for use when RCU read-side protection is not required, for example, when no other CPU has access to the data structure in question or while the current CPU hold the update-side lock. This patch also updates a number of docbook comments that were showing their age. Signed-off-by: Arnd Bergmann Signed-off-by: Paul E. McKenney Cc: Christopher Li Reviewed-by: Josh Triplett --- include/linux/compiler.h | 4 + include/linux/rcupdate.h | 352 ++++++++++++++++++++++++++++------------------- include/linux/srcu.h | 27 +++- 3 files changed, 240 insertions(+), 143 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index c1a62c56a660..320d6c94ff84 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -16,7 +16,11 @@ # define __release(x) __context__(x,-1) # define __cond_lock(x,c) ((c) ? ({ __acquire(x); 1; }) : 0) # define __percpu __attribute__((noderef, address_space(3))) +#ifdef CONFIG_SPARSE_RCU_POINTER +# define __rcu __attribute__((noderef, address_space(4))) +#else # define __rcu +#endif extern void __chk_user_ptr(const volatile void __user *); extern void __chk_io_ptr(const volatile void __iomem *); #else diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 9fbc54a2585d..b973dea2d6b0 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -41,6 +41,7 @@ #include #include #include +#include #ifdef CONFIG_RCU_TORTURE_TEST extern int rcutorture_runnable; /* for sysctl */ @@ -120,14 +121,15 @@ extern struct lockdep_map rcu_sched_lock_map; extern int debug_lockdep_rcu_enabled(void); /** - * rcu_read_lock_held - might we be in RCU read-side critical section? + * rcu_read_lock_held() - might we be in RCU read-side critical section? * * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an RCU * read-side critical section. In absence of CONFIG_DEBUG_LOCK_ALLOC, * this assumes we are in an RCU read-side critical section unless it can - * prove otherwise. + * prove otherwise. This is useful for debug checks in functions that + * require that they be called within an RCU read-side critical section. * - * Check debug_lockdep_rcu_enabled() to prevent false positives during boot + * Checks debug_lockdep_rcu_enabled() to prevent false positives during boot * and while lockdep is disabled. */ static inline int rcu_read_lock_held(void) @@ -144,14 +146,16 @@ static inline int rcu_read_lock_held(void) extern int rcu_read_lock_bh_held(void); /** - * rcu_read_lock_sched_held - might we be in RCU-sched read-side critical section? + * rcu_read_lock_sched_held() - might we be in RCU-sched read-side critical section? * * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an * RCU-sched read-side critical section. In absence of * CONFIG_DEBUG_LOCK_ALLOC, this assumes we are in an RCU-sched read-side * critical section unless it can prove otherwise. Note that disabling * of preemption (including disabling irqs) counts as an RCU-sched - * read-side critical section. + * read-side critical section. This is useful for debug checks in functions + * that required that they be called within an RCU-sched read-side + * critical section. * * Check debug_lockdep_rcu_enabled() to prevent false positives during boot * and while lockdep is disabled. @@ -220,41 +224,155 @@ extern int rcu_my_thread_group_empty(void); } \ } while (0) +#else /* #ifdef CONFIG_PROVE_RCU */ + +#define __do_rcu_dereference_check(c) do { } while (0) + +#endif /* #else #ifdef CONFIG_PROVE_RCU */ + +/* + * Helper functions for rcu_dereference_check(), rcu_dereference_protected() + * and rcu_assign_pointer(). Some of these could be folded into their + * callers, but they are left separate in order to ease introduction of + * multiple flavors of pointers to match the multiple flavors of RCU + * (e.g., __rcu_bh, * __rcu_sched, and __srcu), should this make sense in + * the future. + */ +#define __rcu_access_pointer(p, space) \ + ({ \ + typeof(*p) *_________p1 = (typeof(*p)*__force )ACCESS_ONCE(p); \ + (void) (((typeof (*p) space *)p) == p); \ + ((typeof(*p) __force __kernel *)(_________p1)); \ + }) +#define __rcu_dereference_check(p, c, space) \ + ({ \ + typeof(*p) *_________p1 = (typeof(*p)*__force )ACCESS_ONCE(p); \ + __do_rcu_dereference_check(c); \ + (void) (((typeof (*p) space *)p) == p); \ + smp_read_barrier_depends(); \ + ((typeof(*p) __force __kernel *)(_________p1)); \ + }) +#define __rcu_dereference_protected(p, c, space) \ + ({ \ + __do_rcu_dereference_check(c); \ + (void) (((typeof (*p) space *)p) == p); \ + ((typeof(*p) __force __kernel *)(p)); \ + }) + +#define __rcu_dereference_index_check(p, c) \ + ({ \ + typeof(p) _________p1 = ACCESS_ONCE(p); \ + __do_rcu_dereference_check(c); \ + smp_read_barrier_depends(); \ + (_________p1); \ + }) +#define __rcu_assign_pointer(p, v, space) \ + ({ \ + if (!__builtin_constant_p(v) || \ + ((v) != NULL)) \ + smp_wmb(); \ + (p) = (typeof(*v) __force space *)(v); \ + }) + + +/** + * rcu_access_pointer() - fetch RCU pointer with no dereferencing + * @p: The pointer to read + * + * Return the value of the specified RCU-protected pointer, but omit the + * smp_read_barrier_depends() and keep the ACCESS_ONCE(). This is useful + * when the value of this pointer is accessed, but the pointer is not + * dereferenced, for example, when testing an RCU-protected pointer against + * NULL. Although rcu_access_pointer() may also be used in cases where + * update-side locks prevent the value of the pointer from changing, you + * should instead use rcu_dereference_protected() for this use case. + */ +#define rcu_access_pointer(p) __rcu_access_pointer((p), __rcu) + /** - * rcu_dereference_check - rcu_dereference with debug checking + * rcu_dereference_check() - rcu_dereference with debug checking * @p: The pointer to read, prior to dereferencing * @c: The conditions under which the dereference will take place * * Do an rcu_dereference(), but check that the conditions under which the - * dereference will take place are correct. Typically the conditions indicate - * the various locking conditions that should be held at that point. The check - * should return true if the conditions are satisfied. + * dereference will take place are correct. Typically the conditions + * indicate the various locking conditions that should be held at that + * point. The check should return true if the conditions are satisfied. + * An implicit check for being in an RCU read-side critical section + * (rcu_read_lock()) is included. * * For example: * - * bar = rcu_dereference_check(foo->bar, rcu_read_lock_held() || - * lockdep_is_held(&foo->lock)); + * bar = rcu_dereference_check(foo->bar, lockdep_is_held(&foo->lock)); * * could be used to indicate to lockdep that foo->bar may only be dereferenced - * if either the RCU read lock is held, or that the lock required to replace + * if either rcu_read_lock() is held, or that the lock required to replace * the bar struct at foo->bar is held. * * Note that the list of conditions may also include indications of when a lock * need not be held, for example during initialisation or destruction of the * target struct: * - * bar = rcu_dereference_check(foo->bar, rcu_read_lock_held() || - * lockdep_is_held(&foo->lock) || + * bar = rcu_dereference_check(foo->bar, lockdep_is_held(&foo->lock) || * atomic_read(&foo->usage) == 0); + * + * Inserts memory barriers on architectures that require them + * (currently only the Alpha), prevents the compiler from refetching + * (and from merging fetches), and, more importantly, documents exactly + * which pointers are protected by RCU and checks that the pointer is + * annotated as __rcu. */ #define rcu_dereference_check(p, c) \ - ({ \ - __do_rcu_dereference_check(c); \ - rcu_dereference_raw(p); \ - }) + __rcu_dereference_check((p), rcu_read_lock_held() || (c), __rcu) + +/** + * rcu_dereference_bh_check() - rcu_dereference_bh with debug checking + * @p: The pointer to read, prior to dereferencing + * @c: The conditions under which the dereference will take place + * + * This is the RCU-bh counterpart to rcu_dereference_check(). + */ +#define rcu_dereference_bh_check(p, c) \ + __rcu_dereference_check((p), rcu_read_lock_bh_held() || (c), __rcu) /** - * rcu_dereference_protected - fetch RCU pointer when updates prevented + * rcu_dereference_sched_check() - rcu_dereference_sched with debug checking + * @p: The pointer to read, prior to dereferencing + * @c: The conditions under which the dereference will take place + * + * This is the RCU-sched counterpart to rcu_dereference_check(). + */ +#define rcu_dereference_sched_check(p, c) \ + __rcu_dereference_check((p), rcu_read_lock_sched_held() || (c), \ + __rcu) + +#define rcu_dereference_raw(p) rcu_dereference_check(p, 1) /*@@@ needed? @@@*/ + +/** + * rcu_dereference_index_check() - rcu_dereference for indices with debug checking + * @p: The pointer to read, prior to dereferencing + * @c: The conditions under which the dereference will take place + * + * Similar to rcu_dereference_check(), but omits the sparse checking. + * This allows rcu_dereference_index_check() to be used on integers, + * which can then be used as array indices. Attempting to use + * rcu_dereference_check() on an integer will give compiler warnings + * because the sparse address-space mechanism relies on dereferencing + * the RCU-protected pointer. Dereferencing integers is not something + * that even gcc will put up with. + * + * Note that this function does not implicitly check for RCU read-side + * critical sections. If this function gains lots of uses, it might + * make sense to provide versions for each flavor of RCU, but it does + * not make sense as of early 2010. + */ +#define rcu_dereference_index_check(p, c) \ + __rcu_dereference_index_check((p), (c)) + +/** + * rcu_dereference_protected() - fetch RCU pointer when updates prevented + * @p: The pointer to read, prior to dereferencing + * @c: The conditions under which the dereference will take place * * Return the value of the specified RCU-protected pointer, but omit * both the smp_read_barrier_depends() and the ACCESS_ONCE(). This @@ -263,35 +381,61 @@ extern int rcu_my_thread_group_empty(void); * prevent the compiler from repeating this reference or combining it * with other references, so it should not be used without protection * of appropriate locks. + * + * This function is only for update-side use. Using this function + * when protected only by rcu_read_lock() will result in infrequent + * but very ugly failures. */ #define rcu_dereference_protected(p, c) \ - ({ \ - __do_rcu_dereference_check(c); \ - (p); \ - }) + __rcu_dereference_protected((p), (c), __rcu) -#else /* #ifdef CONFIG_PROVE_RCU */ +/** + * rcu_dereference_bh_protected() - fetch RCU-bh pointer when updates prevented + * @p: The pointer to read, prior to dereferencing + * @c: The conditions under which the dereference will take place + * + * This is the RCU-bh counterpart to rcu_dereference_protected(). + */ +#define rcu_dereference_bh_protected(p, c) \ + __rcu_dereference_protected((p), (c), __rcu) -#define rcu_dereference_check(p, c) rcu_dereference_raw(p) -#define rcu_dereference_protected(p, c) (p) +/** + * rcu_dereference_sched_protected() - fetch RCU-sched pointer when updates prevented + * @p: The pointer to read, prior to dereferencing + * @c: The conditions under which the dereference will take place + * + * This is the RCU-sched counterpart to rcu_dereference_protected(). + */ +#define rcu_dereference_sched_protected(p, c) \ + __rcu_dereference_protected((p), (c), __rcu) -#endif /* #else #ifdef CONFIG_PROVE_RCU */ /** - * rcu_access_pointer - fetch RCU pointer with no dereferencing + * rcu_dereference() - fetch RCU-protected pointer for dereferencing + * @p: The pointer to read, prior to dereferencing * - * Return the value of the specified RCU-protected pointer, but omit the - * smp_read_barrier_depends() and keep the ACCESS_ONCE(). This is useful - * when the value of this pointer is accessed, but the pointer is not - * dereferenced, for example, when testing an RCU-protected pointer against - * NULL. This may also be used in cases where update-side locks prevent - * the value of the pointer from changing, but rcu_dereference_protected() - * is a lighter-weight primitive for this use case. + * This is a simple wrapper around rcu_dereference_check(). */ -#define rcu_access_pointer(p) ACCESS_ONCE(p) +#define rcu_dereference(p) rcu_dereference_check(p, 0) /** - * rcu_read_lock - mark the beginning of an RCU read-side critical section. + * rcu_dereference_bh() - fetch an RCU-bh-protected pointer for dereferencing + * @p: The pointer to read, prior to dereferencing + * + * Makes rcu_dereference_check() do the dirty work. + */ +#define rcu_dereference_bh(p) rcu_dereference_bh_check(p, 0) + +/** + * rcu_dereference_sched() - fetch RCU-sched-protected pointer for dereferencing + * @p: The pointer to read, prior to dereferencing + * + * Makes rcu_dereference_check() do the dirty work. + */ +#define rcu_dereference_sched(p) rcu_dereference_sched_check(p, 0) + +/** + * rcu_read_lock() - mark the beginning of an RCU read-side critical section * * When synchronize_rcu() is invoked on one CPU while other CPUs * are within RCU read-side critical sections, then the @@ -337,7 +481,7 @@ static inline void rcu_read_lock(void) */ /** - * rcu_read_unlock - marks the end of an RCU read-side critical section. + * rcu_read_unlock() - marks the end of an RCU read-side critical section. * * See rcu_read_lock() for more information. */ @@ -349,15 +493,16 @@ static inline void rcu_read_unlock(void) } /** - * rcu_read_lock_bh - mark the beginning of a softirq-only RCU critical section + * rcu_read_lock_bh() - mark the beginning of an RCU-bh critical section * * This is equivalent of rcu_read_lock(), but to be used when updates - * are being done using call_rcu_bh(). Since call_rcu_bh() callbacks - * consider completion of a softirq handler to be a quiescent state, - * a process in RCU read-side critical section must be protected by - * disabling softirqs. Read-side critical sections in interrupt context - * can use just rcu_read_lock(). - * + * are being done using call_rcu_bh() or synchronize_rcu_bh(). Since + * both call_rcu_bh() and synchronize_rcu_bh() consider completion of a + * softirq handler to be a quiescent state, a process in RCU read-side + * critical section must be protected by disabling softirqs. Read-side + * critical sections in interrupt context can use just rcu_read_lock(), + * though this should at least be commented to avoid confusing people + * reading the code. */ static inline void rcu_read_lock_bh(void) { @@ -379,13 +524,12 @@ static inline void rcu_read_unlock_bh(void) } /** - * rcu_read_lock_sched - mark the beginning of a RCU-classic critical section + * rcu_read_lock_sched() - mark the beginning of a RCU-sched critical section * - * Should be used with either - * - synchronize_sched() - * or - * - call_rcu_sched() and rcu_barrier_sched() - * on the write-side to insure proper synchronization. + * This is equivalent of rcu_read_lock(), but to be used when updates + * are being done using call_rcu_sched() or synchronize_rcu_sched(). + * Read-side critical sections can also be introduced by anything that + * disables preemption, including local_irq_disable() and friends. */ static inline void rcu_read_lock_sched(void) { @@ -420,54 +564,14 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) preempt_enable_notrace(); } - /** - * rcu_dereference_raw - fetch an RCU-protected pointer + * rcu_assign_pointer() - assign to RCU-protected pointer + * @p: pointer to assign to + * @v: value to assign (publish) * - * The caller must be within some flavor of RCU read-side critical - * section, or must be otherwise preventing the pointer from changing, - * for example, by holding an appropriate lock. This pointer may later - * be safely dereferenced. It is the caller's responsibility to have - * done the right thing, as this primitive does no checking of any kind. - * - * Inserts memory barriers on architectures that require them - * (currently only the Alpha), and, more importantly, documents - * exactly which pointers are protected by RCU. - */ -#define rcu_dereference_raw(p) ({ \ - typeof(p) _________p1 = ACCESS_ONCE(p); \ - smp_read_barrier_depends(); \ - (_________p1); \ - }) - -/** - * rcu_dereference - fetch an RCU-protected pointer, checking for RCU - * - * Makes rcu_dereference_check() do the dirty work. - */ -#define rcu_dereference(p) \ - rcu_dereference_check(p, rcu_read_lock_held()) - -/** - * rcu_dereference_bh - fetch an RCU-protected pointer, checking for RCU-bh - * - * Makes rcu_dereference_check() do the dirty work. - */ -#define rcu_dereference_bh(p) \ - rcu_dereference_check(p, rcu_read_lock_bh_held()) - -/** - * rcu_dereference_sched - fetch RCU-protected pointer, checking for RCU-sched - * - * Makes rcu_dereference_check() do the dirty work. - */ -#define rcu_dereference_sched(p) \ - rcu_dereference_check(p, rcu_read_lock_sched_held()) - -/** - * rcu_assign_pointer - assign (publicize) a pointer to a newly - * initialized structure that will be dereferenced by RCU read-side - * critical sections. Returns the value assigned. + * Assigns the specified value to the specified RCU-protected + * pointer, ensuring that any concurrent RCU readers will see + * any prior initialization. Returns the value assigned. * * Inserts memory barriers on architectures that require them * (pretty much all of them other than x86), and also prevents @@ -476,14 +580,17 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) * call documents which pointers will be dereferenced by RCU read-side * code. */ - #define rcu_assign_pointer(p, v) \ - ({ \ - if (!__builtin_constant_p(v) || \ - ((v) != NULL)) \ - smp_wmb(); \ - (p) = (v); \ - }) + __rcu_assign_pointer((p), (v), __rcu) + +/** + * RCU_INIT_POINTER() - initialize an RCU protected pointer + * + * Initialize an RCU-protected pointer in such a way to avoid RCU-lockdep + * splats. + */ +#define RCU_INIT_POINTER(p, v) \ + p = (typeof(*v) __force __rcu *)(v) /* Infrastructure to implement the synchronize_() primitives. */ @@ -495,7 +602,7 @@ struct rcu_synchronize { extern void wakeme_after_rcu(struct rcu_head *head); /** - * call_rcu - Queue an RCU callback for invocation after a grace period. + * call_rcu() - Queue an RCU callback for invocation after a grace period. * @head: structure to be used for queueing the RCU updates. * @func: actual update function to be invoked after the grace period * @@ -509,7 +616,7 @@ extern void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *head)); /** - * call_rcu_bh - Queue an RCU for invocation after a quicker grace period. + * call_rcu_bh() - Queue an RCU for invocation after a quicker grace period. * @head: structure to be used for queueing the RCU updates. * @func: actual update function to be invoked after the grace period * @@ -566,37 +673,4 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head) } #endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ -#ifndef CONFIG_PROVE_RCU -#define __do_rcu_dereference_check(c) do { } while (0) -#endif /* #ifdef CONFIG_PROVE_RCU */ - -#define __rcu_dereference_index_check(p, c) \ - ({ \ - typeof(p) _________p1 = ACCESS_ONCE(p); \ - __do_rcu_dereference_check(c); \ - smp_read_barrier_depends(); \ - (_________p1); \ - }) - -/** - * rcu_dereference_index_check() - rcu_dereference for indices with debug checking - * @p: The pointer to read, prior to dereferencing - * @c: The conditions under which the dereference will take place - * - * Similar to rcu_dereference_check(), but omits the sparse checking. - * This allows rcu_dereference_index_check() to be used on integers, - * which can then be used as array indices. Attempting to use - * rcu_dereference_check() on an integer will give compiler warnings - * because the sparse address-space mechanism relies on dereferencing - * the RCU-protected pointer. Dereferencing integers is not something - * that even gcc will put up with. - * - * Note that this function does not implicitly check for RCU read-side - * critical sections. If this function gains lots of uses, it might - * make sense to provide versions for each flavor of RCU, but it does - * not make sense as of early 2010. - */ -#define rcu_dereference_index_check(p, c) \ - __rcu_dereference_index_check((p), (c)) - #endif /* __LINUX_RCUPDATE_H */ diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 4d5d2f546dbf..6f456a720ff0 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -108,12 +108,31 @@ static inline int srcu_read_lock_held(struct srcu_struct *sp) #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ /** - * srcu_dereference - fetch SRCU-protected pointer with checking + * srcu_dereference_check - fetch SRCU-protected pointer for later dereferencing + * @p: the pointer to fetch and protect for later dereferencing + * @sp: pointer to the srcu_struct, which is used to check that we + * really are in an SRCU read-side critical section. + * @c: condition to check for update-side use * - * Makes rcu_dereference_check() do the dirty work. + * If PROVE_RCU is enabled, invoking this outside of an RCU read-side + * critical section will result in an RCU-lockdep splat, unless @c evaluates + * to 1. The @c argument will normally be a logical expression containing + * lockdep_is_held() calls. */ -#define srcu_dereference(p, sp) \ - rcu_dereference_check(p, srcu_read_lock_held(sp)) +#define srcu_dereference_check(p, sp, c) \ + __rcu_dereference_check((p), srcu_read_lock_held(sp) || (c), __rcu) + +/** + * srcu_dereference - fetch SRCU-protected pointer for later dereferencing + * @p: the pointer to fetch and protect for later dereferencing + * @sp: pointer to the srcu_struct, which is used to check that we + * really are in an SRCU read-side critical section. + * + * Makes rcu_dereference_check() do the dirty work. If PROVE_RCU + * is enabled, invoking this outside of an RCU read-side critical + * section will result in an RCU-lockdep splat. + */ +#define srcu_dereference(p, sp) srcu_dereference_check((p), (sp), 0) /** * srcu_read_lock - register a new reader for an SRCU-protected structure. -- cgit v1.2.3 From 67bdbffd696f29a0b68aa8daa285783a06651583 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 25 Feb 2010 16:55:13 +0100 Subject: rculist: avoid __rcu annotations This avoids warnings from missing __rcu annotations in the rculist implementation, making it possible to use the same lists in both RCU and non-RCU cases. We can add rculist annotations later, together with lockdep support for rculist, which is missing as well, but that may involve changing all the users. Signed-off-by: Arnd Bergmann Signed-off-by: Paul E. McKenney Cc: Pavel Emelyanov Cc: Sukadev Bhattiprolu Reviewed-by: Josh Triplett --- include/linux/rculist.h | 53 +++++++++++++++++++++++++++---------------- include/linux/rculist_nulls.h | 16 +++++++++---- 2 files changed, 45 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rculist.h b/include/linux/rculist.h index 4ec3b38ce9c5..c10b1050dbe6 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -9,6 +9,12 @@ #include #include +/* + * return the ->next pointer of a list_head in an rcu safe + * way, we must not access it directly + */ +#define list_next_rcu(list) (*((struct list_head __rcu **)(&(list)->next))) + /* * Insert a new entry between two known consecutive entries. * @@ -20,7 +26,7 @@ static inline void __list_add_rcu(struct list_head *new, { new->next = next; new->prev = prev; - rcu_assign_pointer(prev->next, new); + rcu_assign_pointer(list_next_rcu(prev), new); next->prev = new; } @@ -138,7 +144,7 @@ static inline void list_replace_rcu(struct list_head *old, { new->next = old->next; new->prev = old->prev; - rcu_assign_pointer(new->prev->next, new); + rcu_assign_pointer(list_next_rcu(new->prev), new); new->next->prev = new; old->prev = LIST_POISON2; } @@ -193,7 +199,7 @@ static inline void list_splice_init_rcu(struct list_head *list, */ last->next = at; - rcu_assign_pointer(head->next, first); + rcu_assign_pointer(list_next_rcu(head), first); first->prev = head; at->prev = last; } @@ -208,7 +214,9 @@ static inline void list_splice_init_rcu(struct list_head *list, * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock(). */ #define list_entry_rcu(ptr, type, member) \ - container_of(rcu_dereference_raw(ptr), type, member) + ({typeof (*ptr) __rcu *__ptr = (typeof (*ptr) __rcu __force *)ptr; \ + container_of((typeof(ptr))rcu_dereference_raw(__ptr), type, member); \ + }) /** * list_first_entry_rcu - get the first element from a list @@ -225,9 +233,9 @@ static inline void list_splice_init_rcu(struct list_head *list, list_entry_rcu((ptr)->next, type, member) #define __list_for_each_rcu(pos, head) \ - for (pos = rcu_dereference_raw((head)->next); \ + for (pos = rcu_dereference_raw(list_next_rcu(head)); \ pos != (head); \ - pos = rcu_dereference_raw(pos->next)) + pos = rcu_dereference_raw(list_next_rcu((pos))) /** * list_for_each_entry_rcu - iterate over rcu list of given type @@ -257,9 +265,9 @@ static inline void list_splice_init_rcu(struct list_head *list, * as long as the traversal is guarded by rcu_read_lock(). */ #define list_for_each_continue_rcu(pos, head) \ - for ((pos) = rcu_dereference_raw((pos)->next); \ + for ((pos) = rcu_dereference_raw(list_next_rcu(pos)); \ prefetch((pos)->next), (pos) != (head); \ - (pos) = rcu_dereference_raw((pos)->next)) + (pos) = rcu_dereference_raw(list_next_rcu(pos))) /** * list_for_each_entry_continue_rcu - continue iteration over list of given type @@ -314,12 +322,19 @@ static inline void hlist_replace_rcu(struct hlist_node *old, new->next = next; new->pprev = old->pprev; - rcu_assign_pointer(*new->pprev, new); + rcu_assign_pointer(*(struct hlist_node __rcu **)new->pprev, new); if (next) new->next->pprev = &new->next; old->pprev = LIST_POISON2; } +/* + * return the first or the next element in an RCU protected hlist + */ +#define hlist_first_rcu(head) (*((struct hlist_node __rcu **)(&(head)->first))) +#define hlist_next_rcu(node) (*((struct hlist_node __rcu **)(&(node)->next))) +#define hlist_pprev_rcu(node) (*((struct hlist_node __rcu **)((node)->pprev))) + /** * hlist_add_head_rcu * @n: the element to add to the hash list. @@ -346,7 +361,7 @@ static inline void hlist_add_head_rcu(struct hlist_node *n, n->next = first; n->pprev = &h->first; - rcu_assign_pointer(h->first, n); + rcu_assign_pointer(hlist_first_rcu(h), n); if (first) first->pprev = &n->next; } @@ -374,7 +389,7 @@ static inline void hlist_add_before_rcu(struct hlist_node *n, { n->pprev = next->pprev; n->next = next; - rcu_assign_pointer(*(n->pprev), n); + rcu_assign_pointer(hlist_pprev_rcu(n), n); next->pprev = &n->next; } @@ -401,15 +416,15 @@ static inline void hlist_add_after_rcu(struct hlist_node *prev, { n->next = prev->next; n->pprev = &prev->next; - rcu_assign_pointer(prev->next, n); + rcu_assign_pointer(hlist_next_rcu(prev), n); if (n->next) n->next->pprev = &n->next; } -#define __hlist_for_each_rcu(pos, head) \ - for (pos = rcu_dereference((head)->first); \ - pos && ({ prefetch(pos->next); 1; }); \ - pos = rcu_dereference(pos->next)) +#define __hlist_for_each_rcu(pos, head) \ + for (pos = rcu_dereference(hlist_first_rcu(head)); \ + pos && ({ prefetch(pos->next); 1; }); \ + pos = rcu_dereference(hlist_next_rcu(pos))) /** * hlist_for_each_entry_rcu - iterate over rcu list of given type @@ -422,11 +437,11 @@ static inline void hlist_add_after_rcu(struct hlist_node *prev, * the _rcu list-mutation primitives such as hlist_add_head_rcu() * as long as the traversal is guarded by rcu_read_lock(). */ -#define hlist_for_each_entry_rcu(tpos, pos, head, member) \ - for (pos = rcu_dereference_raw((head)->first); \ +#define hlist_for_each_entry_rcu(tpos, pos, head, member) \ + for (pos = rcu_dereference_raw(hlist_first_rcu(head)); \ pos && ({ prefetch(pos->next); 1; }) && \ ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; }); \ - pos = rcu_dereference_raw(pos->next)) + pos = rcu_dereference_raw(hlist_next_rcu(pos))) /** * hlist_for_each_entry_rcu_bh - iterate over rcu list of given type diff --git a/include/linux/rculist_nulls.h b/include/linux/rculist_nulls.h index b70ffe53cb9f..2ae13714828b 100644 --- a/include/linux/rculist_nulls.h +++ b/include/linux/rculist_nulls.h @@ -37,6 +37,12 @@ static inline void hlist_nulls_del_init_rcu(struct hlist_nulls_node *n) } } +#define hlist_nulls_first_rcu(head) \ + (*((struct hlist_nulls_node __rcu __force **)&(head)->first)) + +#define hlist_nulls_next_rcu(node) \ + (*((struct hlist_nulls_node __rcu __force **)&(node)->next)) + /** * hlist_nulls_del_rcu - deletes entry from hash list without re-initialization * @n: the element to delete from the hash list. @@ -88,7 +94,7 @@ static inline void hlist_nulls_add_head_rcu(struct hlist_nulls_node *n, n->next = first; n->pprev = &h->first; - rcu_assign_pointer(h->first, n); + rcu_assign_pointer(hlist_nulls_first_rcu(h), n); if (!is_a_nulls(first)) first->pprev = &n->next; } @@ -100,11 +106,11 @@ static inline void hlist_nulls_add_head_rcu(struct hlist_nulls_node *n, * @member: the name of the hlist_nulls_node within the struct. * */ -#define hlist_nulls_for_each_entry_rcu(tpos, pos, head, member) \ - for (pos = rcu_dereference_raw((head)->first); \ - (!is_a_nulls(pos)) && \ +#define hlist_nulls_for_each_entry_rcu(tpos, pos, head, member) \ + for (pos = rcu_dereference_raw(hlist_nulls_first_rcu(head)); \ + (!is_a_nulls(pos)) && \ ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1; }); \ - pos = rcu_dereference_raw(pos->next)) + pos = rcu_dereference_raw(hlist_nulls_next_rcu(pos))) #endif #endif -- cgit v1.2.3 From 2c392b8c3450ceb69ba1b93cb0cddb3998fb8cdc Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 24 Feb 2010 19:41:39 +0100 Subject: cgroups: __rcu annotations Signed-off-by: Arnd Bergmann Signed-off-by: Paul E. McKenney Acked-by: Paul Menage Cc: Li Zefan Reviewed-by: Josh Triplett --- include/linux/cgroup.h | 4 ++-- include/linux/sched.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index ed3e92e41c6e..3cb7d04308cd 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -75,7 +75,7 @@ struct cgroup_subsys_state { unsigned long flags; /* ID for this css, if possible */ - struct css_id *id; + struct css_id __rcu *id; }; /* bits in struct cgroup_subsys_state flags field */ @@ -205,7 +205,7 @@ struct cgroup { struct list_head children; /* my children */ struct cgroup *parent; /* my parent */ - struct dentry *dentry; /* cgroup fs entry, RCU protected */ + struct dentry __rcu *dentry; /* cgroup fs entry, RCU protected */ /* Private pointers for each registered subsystem */ struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; diff --git a/include/linux/sched.h b/include/linux/sched.h index 1e2a6db2d7dd..bbffd087476c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1418,7 +1418,7 @@ struct task_struct { #endif #ifdef CONFIG_CGROUPS /* Control Group info protected by css_set_lock */ - struct css_set *cgroups; + struct css_set __rcu *cgroups; /* cg_list protected by css_set_lock and tsk->alloc_lock */ struct list_head cg_list; #endif -- cgit v1.2.3 From 1b0ba1c9037b2265d6e5d0165d31e4c0269b603b Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 24 Feb 2010 19:45:09 +0100 Subject: credentials: rcu annotation Signed-off-by: Arnd Bergmann Signed-off-by: Paul E. McKenney Cc: Peter Zijlstra Cc: Ingo Molnar Acked-by: David Howells Reviewed-by: Josh Triplett --- include/linux/sched.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index bbffd087476c..2c756666c111 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1288,9 +1288,9 @@ struct task_struct { struct list_head cpu_timers[3]; /* process credentials */ - const struct cred *real_cred; /* objective and real subjective task + const struct cred __rcu *real_cred; /* objective and real subjective task * credentials (COW) */ - const struct cred *cred; /* effective (overridable) subjective task + const struct cred __rcu *cred; /* effective (overridable) subjective task * credentials (COW) */ struct mutex cred_guard_mutex; /* guard against foreign influences on * credential calculations -- cgit v1.2.3 From e63ba744a64d234c8a07c469ab1806443cb0a6ff Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 26 Feb 2010 18:01:20 +0100 Subject: keys: __rcu annotations Signed-off-by: Arnd Bergmann Signed-off-by: Paul E. McKenney Acked-by: David Howells Reviewed-by: Josh Triplett --- include/linux/cred.h | 2 +- include/linux/key.h | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cred.h b/include/linux/cred.h index 4d2c39573f36..4aaeab376446 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -84,7 +84,7 @@ struct thread_group_cred { atomic_t usage; pid_t tgid; /* thread group process ID */ spinlock_t lock; - struct key *session_keyring; /* keyring inherited over fork */ + struct key __rcu *session_keyring; /* keyring inherited over fork */ struct key *process_keyring; /* keyring private to this process */ struct rcu_head rcu; /* RCU deletion hook */ }; diff --git a/include/linux/key.h b/include/linux/key.h index cd50dfa1d4c2..3db0adce1fda 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -178,8 +178,9 @@ struct key { */ union { unsigned long value; + void __rcu *rcudata; void *data; - struct keyring_list *subscriptions; + struct keyring_list __rcu *subscriptions; } payload; }; -- cgit v1.2.3 From 5b22216e11f717adc344abc7f97b42e03127c6c0 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 3 Mar 2010 10:20:10 +0100 Subject: nfs: __rcu annotations Signed-off-by: Arnd Bergmann Signed-off-by: Paul E. McKenney Acked-by: Trond Myklebust --- include/linux/nfs_fs.h | 2 +- include/linux/sunrpc/auth_gss.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 508f8cf6da37..d0edf7d823ae 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -185,7 +185,7 @@ struct nfs_inode { struct nfs4_cached_acl *nfs4_acl; /* NFSv4 state */ struct list_head open_states; - struct nfs_delegation *delegation; + struct nfs_delegation __rcu *delegation; fmode_t delegation_state; struct rw_semaphore rwsem; #endif /* CONFIG_NFS_V4*/ diff --git a/include/linux/sunrpc/auth_gss.h b/include/linux/sunrpc/auth_gss.h index 671538d25bc1..8eee9dbbfe7a 100644 --- a/include/linux/sunrpc/auth_gss.h +++ b/include/linux/sunrpc/auth_gss.h @@ -69,7 +69,7 @@ struct gss_cl_ctx { enum rpc_gss_proc gc_proc; u32 gc_seq; spinlock_t gc_seq_lock; - struct gss_ctx *gc_gss_ctx; + struct gss_ctx __rcu *gc_gss_ctx; struct xdr_netobj gc_wire_ctx; u32 gc_win; unsigned long gc_expiry; @@ -80,7 +80,7 @@ struct gss_upcall_msg; struct gss_cred { struct rpc_cred gc_base; enum rpc_gss_svc gc_service; - struct gss_cl_ctx *gc_ctx; + struct gss_cl_ctx __rcu *gc_ctx; struct gss_upcall_msg *gc_upcall; unsigned long gc_upcall_timestamp; unsigned char gc_machine_cred : 1; -- cgit v1.2.3 From 2be85279281bafe7de808ca99de59af4fd474c49 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 4 Mar 2010 15:50:28 +0100 Subject: input: __rcu annotations Signed-off-by: Arnd Bergmann Signed-off-by: Paul E. McKenney Cc: Dmitry Torokhov Acked-by: Dmitry Torokhov Reviewed-by: Josh Triplett --- include/linux/input.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/input.h b/include/linux/input.h index 896a92227bc4..d6ae1761be97 100644 --- a/include/linux/input.h +++ b/include/linux/input.h @@ -1196,7 +1196,7 @@ struct input_dev { int (*flush)(struct input_dev *dev, struct file *file); int (*event)(struct input_dev *dev, unsigned int type, unsigned int code, int value); - struct input_handle *grab; + struct input_handle __rcu *grab; spinlock_t event_lock; struct mutex mutex; -- cgit v1.2.3 From 4b6a2872a2a00042ee50024822ab706e5456aad8 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 4 Mar 2010 15:59:23 +0100 Subject: kvm: add __rcu annotations Signed-off-by: Arnd Bergmann Signed-off-by: Paul E. McKenney Cc: Avi Kivity Cc: Marcelo Tosatti Reviewed-by: Josh Triplett --- include/linux/kvm_host.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index c13cc48697aa..ac740b26eb10 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -205,7 +205,7 @@ struct kvm { struct mutex irq_lock; #ifdef CONFIG_HAVE_KVM_IRQCHIP - struct kvm_irq_routing_table *irq_routing; + struct kvm_irq_routing_table __rcu *irq_routing; struct hlist_head mask_notifier_list; struct hlist_head irq_ack_notifier_list; #endif -- cgit v1.2.3 From 4221a9918e38b7494cee341dda7b7b4bb8c04bde Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Sat, 26 Jun 2010 01:08:19 +0900 Subject: Add RCU check for find_task_by_vpid(). find_task_by_vpid() says "Must be called under rcu_read_lock().". But due to commit 3120438 "rcu: Disable lockdep checking in RCU list-traversal primitives", we are currently unable to catch "find_task_by_vpid() with tasklist_lock held but RCU lock not held" errors due to the RCU-lockdep checks being suppressed in the RCU variants of the struct list_head traversals. This commit therefore places an explicit check for being in an RCU read-side critical section in find_task_by_pid_ns(). =================================================== [ INFO: suspicious rcu_dereference_check() usage. ] --------------------------------------------------- kernel/pid.c:386 invoked rcu_dereference_check() without protection! other info that might help us debug this: rcu_scheduler_active = 1, debug_locks = 1 1 lock held by rc.sysinit/1102: #0: (tasklist_lock){.+.+..}, at: [] sys_setpgid+0x40/0x160 stack backtrace: Pid: 1102, comm: rc.sysinit Not tainted 2.6.35-rc3-dirty #1 Call Trace: [] lockdep_rcu_dereference+0x94/0xb0 [] find_task_by_pid_ns+0x6d/0x70 [] find_task_by_vpid+0x18/0x20 [] sys_setpgid+0x47/0x160 [] sysenter_do_call+0x12/0x36 Commit updated to use a new rcu_lockdep_assert() exported API rather than the old internal __do_rcu_dereference(). Signed-off-by: Tetsuo Handa Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/rcupdate.h | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index b973dea2d6b0..b124bc6a75ad 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -215,7 +215,11 @@ static inline int rcu_read_lock_sched_held(void) extern int rcu_my_thread_group_empty(void); -#define __do_rcu_dereference_check(c) \ +/** + * rcu_lockdep_assert - emit lockdep splat if specified condition not met + * @c: condition to check + */ +#define rcu_lockdep_assert(c) \ do { \ static bool __warned; \ if (debug_lockdep_rcu_enabled() && !__warned && !(c)) { \ @@ -226,7 +230,7 @@ extern int rcu_my_thread_group_empty(void); #else /* #ifdef CONFIG_PROVE_RCU */ -#define __do_rcu_dereference_check(c) do { } while (0) +#define rcu_lockdep_assert(c) do { } while (0) #endif /* #else #ifdef CONFIG_PROVE_RCU */ @@ -247,14 +251,14 @@ extern int rcu_my_thread_group_empty(void); #define __rcu_dereference_check(p, c, space) \ ({ \ typeof(*p) *_________p1 = (typeof(*p)*__force )ACCESS_ONCE(p); \ - __do_rcu_dereference_check(c); \ + rcu_lockdep_assert(c); \ (void) (((typeof (*p) space *)p) == p); \ smp_read_barrier_depends(); \ ((typeof(*p) __force __kernel *)(_________p1)); \ }) #define __rcu_dereference_protected(p, c, space) \ ({ \ - __do_rcu_dereference_check(c); \ + rcu_lockdep_assert(c); \ (void) (((typeof (*p) space *)p) == p); \ ((typeof(*p) __force __kernel *)(p)); \ }) @@ -262,7 +266,7 @@ extern int rcu_my_thread_group_empty(void); #define __rcu_dereference_index_check(p, c) \ ({ \ typeof(p) _________p1 = ACCESS_ONCE(p); \ - __do_rcu_dereference_check(c); \ + rcu_lockdep_assert(c); \ smp_read_barrier_depends(); \ (_________p1); \ }) -- cgit v1.2.3 From 77d8485a8b5416c615b6acd95f01bfcacd7d81ff Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 8 Jul 2010 17:38:59 -0700 Subject: rcu: improve kerneldoc for rcu_read_lock(), call_rcu(), and synchronize_rcu() Make it explicit that new RCU read-side critical sections that start after call_rcu() and synchronize_rcu() start might still be running after the end of the relevant grace period. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/rcupdate.h | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index b124bc6a75ad..3e1b6625553b 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -450,7 +450,7 @@ extern int rcu_my_thread_group_empty(void); * until after the all the other CPUs exit their critical sections. * * Note, however, that RCU callbacks are permitted to run concurrently - * with RCU read-side critical sections. One way that this can happen + * with new RCU read-side critical sections. One way that this can happen * is via the following sequence of events: (1) CPU 0 enters an RCU * read-side critical section, (2) CPU 1 invokes call_rcu() to register * an RCU callback, (3) CPU 0 exits the RCU read-side critical section, @@ -608,11 +608,13 @@ extern void wakeme_after_rcu(struct rcu_head *head); /** * call_rcu() - Queue an RCU callback for invocation after a grace period. * @head: structure to be used for queueing the RCU updates. - * @func: actual update function to be invoked after the grace period + * @func: actual callback function to be invoked after the grace period * - * The update function will be invoked some time after a full grace - * period elapses, in other words after all currently executing RCU - * read-side critical sections have completed. RCU read-side critical + * The callback function will be invoked some time after a full grace + * period elapses, in other words after all pre-existing RCU read-side + * critical sections have completed. However, the callback function + * might well execute concurrently with RCU read-side critical sections + * that started after call_rcu() was invoked. RCU read-side critical * sections are delimited by rcu_read_lock() and rcu_read_unlock(), * and may be nested. */ @@ -622,9 +624,9 @@ extern void call_rcu(struct rcu_head *head, /** * call_rcu_bh() - Queue an RCU for invocation after a quicker grace period. * @head: structure to be used for queueing the RCU updates. - * @func: actual update function to be invoked after the grace period + * @func: actual callback function to be invoked after the grace period * - * The update function will be invoked some time after a full grace + * The callback function will be invoked some time after a full grace * period elapses, in other words after all currently executing RCU * read-side critical sections have completed. call_rcu_bh() assumes * that the read-side critical sections end on completion of a softirq -- cgit v1.2.3 From 374a8e0dc33c984fac284de7d57d77af3cfdbfb7 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 24 Feb 2010 20:00:13 +0100 Subject: notifiers: __rcu annotations Signed-off-by: Arnd Bergmann Signed-off-by: Paul E. McKenney Cc: Alan Cox Reviewed-by: Josh Triplett --- include/linux/notifier.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/notifier.h b/include/linux/notifier.h index b2f1a4d83550..2026f9e1ceb8 100644 --- a/include/linux/notifier.h +++ b/include/linux/notifier.h @@ -49,28 +49,28 @@ struct notifier_block { int (*notifier_call)(struct notifier_block *, unsigned long, void *); - struct notifier_block *next; + struct notifier_block __rcu *next; int priority; }; struct atomic_notifier_head { spinlock_t lock; - struct notifier_block *head; + struct notifier_block __rcu *head; }; struct blocking_notifier_head { struct rw_semaphore rwsem; - struct notifier_block *head; + struct notifier_block __rcu *head; }; struct raw_notifier_head { - struct notifier_block *head; + struct notifier_block __rcu *head; }; struct srcu_notifier_head { struct mutex mutex; struct srcu_struct srcu; - struct notifier_block *head; + struct notifier_block __rcu *head; }; #define ATOMIC_INIT_NOTIFIER_HEAD(name) do { \ -- cgit v1.2.3 From a1115570b31091f3e3ab9e6cf7ee8d320a42be84 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 25 Feb 2010 23:43:52 +0100 Subject: radix-tree: __rcu annotations Signed-off-by: Arnd Bergmann Signed-off-by: Paul E. McKenney Cc: Nick Piggin Reviewed-by: Josh Triplett --- include/linux/radix-tree.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 634b8e674ac5..a39cbed9ee17 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -47,6 +47,8 @@ static inline void *radix_tree_indirect_to_ptr(void *ptr) { return (void *)((unsigned long)ptr & ~RADIX_TREE_INDIRECT_PTR); } +#define radix_tree_indirect_to_ptr(ptr) \ + radix_tree_indirect_to_ptr((void __force *)(ptr)) static inline int radix_tree_is_indirect_ptr(void *ptr) { @@ -61,7 +63,7 @@ static inline int radix_tree_is_indirect_ptr(void *ptr) struct radix_tree_root { unsigned int height; gfp_t gfp_mask; - struct radix_tree_node *rnode; + struct radix_tree_node __rcu *rnode; }; #define RADIX_TREE_INIT(mask) { \ -- cgit v1.2.3 From d2c2486bc8e185548490e8edbc84d185de9eaff1 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 26 Feb 2010 14:53:26 +0100 Subject: idr: __rcu annotations Signed-off-by: Arnd Bergmann Signed-off-by: Paul E. McKenney Cc: Manfred Spraul Reviewed-by: Josh Triplett --- include/linux/idr.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/idr.h b/include/linux/idr.h index e968db71e33a..cdb715e58e3e 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -50,14 +50,14 @@ struct idr_layer { unsigned long bitmap; /* A zero bit means "space here" */ - struct idr_layer *ary[1< Date: Wed, 24 Feb 2010 20:01:56 +0100 Subject: kernel: __rcu annotations This adds annotations for RCU operations in core kernel components Signed-off-by: Arnd Bergmann Signed-off-by: Paul E. McKenney Cc: Al Viro Cc: Jens Axboe Cc: Andrew Morton Reviewed-by: Josh Triplett --- include/linux/fdtable.h | 6 +++--- include/linux/fs.h | 2 +- include/linux/genhd.h | 6 +++--- include/linux/init_task.h | 4 ++-- include/linux/iocontext.h | 2 +- include/linux/mm_types.h | 2 +- 6 files changed, 11 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h index f59ed297b661..133c0ba25e30 100644 --- a/include/linux/fdtable.h +++ b/include/linux/fdtable.h @@ -31,7 +31,7 @@ struct embedded_fd_set { struct fdtable { unsigned int max_fds; - struct file ** fd; /* current fd array */ + struct file __rcu **fd; /* current fd array */ fd_set *close_on_exec; fd_set *open_fds; struct rcu_head rcu; @@ -46,7 +46,7 @@ struct files_struct { * read mostly part */ atomic_t count; - struct fdtable *fdt; + struct fdtable __rcu *fdt; struct fdtable fdtab; /* * written part on a separate cache line in SMP @@ -55,7 +55,7 @@ struct files_struct { int next_fd; struct embedded_fd_set close_on_exec_init; struct embedded_fd_set open_fds_init; - struct file * fd_array[NR_OPEN_DEFAULT]; + struct file __rcu * fd_array[NR_OPEN_DEFAULT]; }; #define rcu_dereference_check_fdtable(files, fdtfd) \ diff --git a/include/linux/fs.h b/include/linux/fs.h index 76041b614758..aa3dc8d20436 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1380,7 +1380,7 @@ struct super_block { * Saved mount options for lazy filesystems using * generic_show_options() */ - char *s_options; + char __rcu *s_options; }; extern struct timespec current_fs_time(struct super_block *sb); diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 5f2f4c4d8fb0..af3f06b41dc1 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -129,8 +129,8 @@ struct blk_scsi_cmd_filter { struct disk_part_tbl { struct rcu_head rcu_head; int len; - struct hd_struct *last_lookup; - struct hd_struct *part[]; + struct hd_struct __rcu *last_lookup; + struct hd_struct __rcu *part[]; }; struct gendisk { @@ -149,7 +149,7 @@ struct gendisk { * non-critical accesses use RCU. Always access through * helpers. */ - struct disk_part_tbl *part_tbl; + struct disk_part_tbl __rcu *part_tbl; struct hd_struct part0; const struct block_device_operations *fops; diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 1f43fa56f600..6460fc65ed6b 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -137,8 +137,8 @@ extern struct cred init_cred; .children = LIST_HEAD_INIT(tsk.children), \ .sibling = LIST_HEAD_INIT(tsk.sibling), \ .group_leader = &tsk, \ - .real_cred = &init_cred, \ - .cred = &init_cred, \ + RCU_INIT_POINTER(.real_cred, &init_cred), \ + RCU_INIT_POINTER(.cred, &init_cred), \ .cred_guard_mutex = \ __MUTEX_INITIALIZER(tsk.cred_guard_mutex), \ .comm = "swapper", \ diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index 64d529133031..3e70b21884a9 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -53,7 +53,7 @@ struct io_context { struct radix_tree_root radix_root; struct hlist_head cic_list; - void *ioc_data; + void __rcu *ioc_data; }; static inline struct io_context *ioc_task_link(struct io_context *ioc) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index b8bb9a6a1f37..05537a5eb855 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -299,7 +299,7 @@ struct mm_struct { * new_owner->mm == mm * new_owner->alloc_lock is held */ - struct task_struct *owner; + struct task_struct __rcu *owner; #endif #ifdef CONFIG_PROC_FS -- cgit v1.2.3 From 5e8067adfdbaf97039a97540765b1e16eb8d61cc Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Sat, 17 Apr 2010 08:48:41 -0400 Subject: rcu head remove init RCU heads really don't need to be initialized. Their state before call_rcu() really does not matter. We need to keep init/destroy_rcu_head_on_stack() though, since we want debugobjects to be able to keep track of these objects. Signed-off-by: Alexey Dobriyan Signed-off-by: Mathieu Desnoyers CC: David S. Miller CC: "Paul E. McKenney" CC: akpm@linux-foundation.org CC: mingo@elte.hu CC: laijs@cn.fujitsu.com CC: dipankar@in.ibm.com CC: josh@joshtriplett.org CC: dvhltc@us.ibm.com CC: niv@us.ibm.com CC: tglx@linutronix.de CC: peterz@infradead.org CC: rostedt@goodmis.org CC: Valdis.Kletnieks@vt.edu CC: dhowells@redhat.com CC: eric.dumazet@gmail.com CC: Alexey Dobriyan Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/rcupdate.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 3e1b6625553b..27b44b3e3024 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -75,12 +75,6 @@ extern void rcu_init(void); #error "Unknown RCU implementation specified to kernel configuration" #endif -#define RCU_HEAD_INIT { .next = NULL, .func = NULL } -#define RCU_HEAD(head) struct rcu_head head = RCU_HEAD_INIT -#define INIT_RCU_HEAD(ptr) do { \ - (ptr)->next = NULL; (ptr)->func = NULL; \ -} while (0) - /* * init_rcu_head_on_stack()/destroy_rcu_head_on_stack() are needed for dynamic * initialization and destruction of rcu_head on the stack. rcu_head structures -- cgit v1.2.3 From a57eb940d130477a799dfb24a570ee04979c0f7f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 29 Jun 2010 16:49:16 -0700 Subject: rcu: Add a TINY_PREEMPT_RCU MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement a small-memory-footprint uniprocessor-only implementation of preemptible RCU. This implementation uses but a single blocked-tasks list rather than the combinatorial number used per leaf rcu_node by TREE_PREEMPT_RCU, which reduces memory consumption and greatly simplifies processing. This version also takes advantage of uniprocessor execution to accelerate grace periods in the case where there are no readers. The general design is otherwise broadly similar to that of TREE_PREEMPT_RCU. This implementation is a step towards having RCU implementation driven off of the SMP and PREEMPT kernel configuration variables, which can happen once this implementation has accumulated sufficient experience. Removed ACCESS_ONCE() from __rcu_read_unlock() and added barrier() as suggested by Steve Rostedt in order to avoid the compiler-reordering issue noted by Mathieu Desnoyers (http://lkml.org/lkml/2010/8/16/183). As can be seen below, CONFIG_TINY_PREEMPT_RCU represents almost 5Kbyte savings compared to CONFIG_TREE_PREEMPT_RCU. Of course, for non-real-time workloads, CONFIG_TINY_RCU is even better. CONFIG_TREE_PREEMPT_RCU text data bss dec filename 13 0 0 13 kernel/rcupdate.o 6170 825 28 7023 kernel/rcutree.o ---- 7026 Total CONFIG_TINY_PREEMPT_RCU text data bss dec filename 13 0 0 13 kernel/rcupdate.o 2081 81 8 2170 kernel/rcutiny.o ---- 2183 Total CONFIG_TINY_RCU (non-preemptible) text data bss dec filename 13 0 0 13 kernel/rcupdate.o 719 25 0 744 kernel/rcutiny.o --- 757 Total Requested-by: Loïc Minier Signed-off-by: Paul E. McKenney --- include/linux/hardirq.h | 2 +- include/linux/init_task.h | 10 +++- include/linux/rcupdate.h | 3 +- include/linux/rcutiny.h | 126 ++++++++++++++++++++++++++++++++-------------- include/linux/rcutree.h | 2 + include/linux/sched.h | 10 ++-- 6 files changed, 108 insertions(+), 45 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index d5b387669dab..1f4517d55b19 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -139,7 +139,7 @@ static inline void account_system_vtime(struct task_struct *tsk) #endif #if defined(CONFIG_NO_HZ) -#if defined(CONFIG_TINY_RCU) +#if defined(CONFIG_TINY_RCU) || defined(CONFIG_TINY_PREEMPT_RCU) extern void rcu_enter_nohz(void); extern void rcu_exit_nohz(void); diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 6460fc65ed6b..2fea6c8ef6ba 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -82,11 +82,17 @@ extern struct group_info init_groups; # define CAP_INIT_BSET CAP_FULL_SET #ifdef CONFIG_TREE_PREEMPT_RCU +#define INIT_TASK_RCU_TREE_PREEMPT() \ + .rcu_blocked_node = NULL, +#else +#define INIT_TASK_RCU_TREE_PREEMPT(tsk) +#endif +#ifdef CONFIG_PREEMPT_RCU #define INIT_TASK_RCU_PREEMPT(tsk) \ .rcu_read_lock_nesting = 0, \ .rcu_read_unlock_special = 0, \ - .rcu_blocked_node = NULL, \ - .rcu_node_entry = LIST_HEAD_INIT(tsk.rcu_node_entry), + .rcu_node_entry = LIST_HEAD_INIT(tsk.rcu_node_entry), \ + INIT_TASK_RCU_TREE_PREEMPT() #else #define INIT_TASK_RCU_PREEMPT(tsk) #endif diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 27b44b3e3024..24b896649384 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -58,7 +58,6 @@ struct rcu_head { }; /* Exported common interfaces */ -extern void rcu_barrier(void); extern void rcu_barrier_bh(void); extern void rcu_barrier_sched(void); extern void synchronize_sched_expedited(void); @@ -69,7 +68,7 @@ extern void rcu_init(void); #if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) #include -#elif defined(CONFIG_TINY_RCU) +#elif defined(CONFIG_TINY_RCU) || defined(CONFIG_TINY_PREEMPT_RCU) #include #else #error "Unknown RCU implementation specified to kernel configuration" diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index e2e893144a84..4cc5eba41616 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -29,66 +29,51 @@ void rcu_sched_qs(int cpu); void rcu_bh_qs(int cpu); -static inline void rcu_note_context_switch(int cpu) -{ - rcu_sched_qs(cpu); -} +#ifdef CONFIG_TINY_RCU #define __rcu_read_lock() preempt_disable() #define __rcu_read_unlock() preempt_enable() +#else /* #ifdef CONFIG_TINY_RCU */ +void __rcu_read_lock(void); +void __rcu_read_unlock(void); +#endif /* #else #ifdef CONFIG_TINY_RCU */ #define __rcu_read_lock_bh() local_bh_disable() #define __rcu_read_unlock_bh() local_bh_enable() -#define call_rcu_sched call_rcu +extern void call_rcu_sched(struct rcu_head *head, + void (*func)(struct rcu_head *rcu)); #define rcu_init_sched() do { } while (0) -extern void rcu_check_callbacks(int cpu, int user); -static inline int rcu_needs_cpu(int cpu) -{ - return 0; -} +extern void synchronize_sched(void); -/* - * Return the number of grace periods. - */ -static inline long rcu_batches_completed(void) -{ - return 0; -} +#ifdef CONFIG_TINY_RCU -/* - * Return the number of bottom-half grace periods. - */ -static inline long rcu_batches_completed_bh(void) -{ - return 0; -} +#define call_rcu call_rcu_sched -static inline void rcu_force_quiescent_state(void) +static inline void synchronize_rcu(void) { + synchronize_sched(); } -static inline void rcu_bh_force_quiescent_state(void) +static inline void synchronize_rcu_expedited(void) { + synchronize_sched(); /* Only one CPU, so pretty fast anyway!!! */ } -static inline void rcu_sched_force_quiescent_state(void) +static inline void rcu_barrier(void) { + rcu_barrier_sched(); /* Only one CPU, so only one list of callbacks! */ } -extern void synchronize_sched(void); +#else /* #ifdef CONFIG_TINY_RCU */ -static inline void synchronize_rcu(void) -{ - synchronize_sched(); -} +void synchronize_rcu(void); +void rcu_barrier(void); +void synchronize_rcu_expedited(void); -static inline void synchronize_rcu_bh(void) -{ - synchronize_sched(); -} +#endif /* #else #ifdef CONFIG_TINY_RCU */ -static inline void synchronize_rcu_expedited(void) +static inline void synchronize_rcu_bh(void) { synchronize_sched(); } @@ -117,15 +102,82 @@ static inline void rcu_exit_nohz(void) #endif /* #else #ifdef CONFIG_NO_HZ */ +#ifdef CONFIG_TINY_RCU + +static inline void rcu_preempt_note_context_switch(void) +{ +} + static inline void exit_rcu(void) { } +static inline int rcu_needs_cpu(int cpu) +{ + return 0; +} + static inline int rcu_preempt_depth(void) { return 0; } +#else /* #ifdef CONFIG_TINY_RCU */ + +void rcu_preempt_note_context_switch(void); +extern void exit_rcu(void); +int rcu_preempt_needs_cpu(void); + +static inline int rcu_needs_cpu(int cpu) +{ + return rcu_preempt_needs_cpu(); +} + +/* + * Defined as macro as it is a very low level header + * included from areas that don't even know about current + * FIXME: combine with include/linux/rcutree.h into rcupdate.h. + */ +#define rcu_preempt_depth() (current->rcu_read_lock_nesting) + +#endif /* #else #ifdef CONFIG_TINY_RCU */ + +static inline void rcu_note_context_switch(int cpu) +{ + rcu_sched_qs(cpu); + rcu_preempt_note_context_switch(); +} + +extern void rcu_check_callbacks(int cpu, int user); + +/* + * Return the number of grace periods. + */ +static inline long rcu_batches_completed(void) +{ + return 0; +} + +/* + * Return the number of bottom-half grace periods. + */ +static inline long rcu_batches_completed_bh(void) +{ + return 0; +} + +static inline void rcu_force_quiescent_state(void) +{ +} + +static inline void rcu_bh_force_quiescent_state(void) +{ +} + +static inline void rcu_sched_force_quiescent_state(void) +{ +} + #ifdef CONFIG_DEBUG_LOCK_ALLOC extern int rcu_scheduler_active __read_mostly; diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index c0ed1c056f29..c13b85dd22bc 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -95,6 +95,8 @@ static inline void synchronize_rcu_bh_expedited(void) synchronize_sched_expedited(); } +extern void rcu_barrier(void); + extern void rcu_check_callbacks(int cpu, int user); extern long rcu_batches_completed(void); diff --git a/include/linux/sched.h b/include/linux/sched.h index 2c756666c111..e18473f0eb78 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1202,11 +1202,13 @@ struct task_struct { unsigned int policy; cpumask_t cpus_allowed; -#ifdef CONFIG_TREE_PREEMPT_RCU +#ifdef CONFIG_PREEMPT_RCU int rcu_read_lock_nesting; char rcu_read_unlock_special; - struct rcu_node *rcu_blocked_node; struct list_head rcu_node_entry; +#endif /* #ifdef CONFIG_PREEMPT_RCU */ +#ifdef CONFIG_TREE_PREEMPT_RCU + struct rcu_node *rcu_blocked_node; #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) @@ -1740,7 +1742,7 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t * #define tsk_used_math(p) ((p)->flags & PF_USED_MATH) #define used_math() tsk_used_math(current) -#ifdef CONFIG_TREE_PREEMPT_RCU +#ifdef CONFIG_PREEMPT_RCU #define RCU_READ_UNLOCK_BLOCKED (1 << 0) /* blocked while in RCU read-side. */ #define RCU_READ_UNLOCK_NEED_QS (1 << 1) /* RCU core needs CPU response. */ @@ -1749,7 +1751,9 @@ static inline void rcu_copy_process(struct task_struct *p) { p->rcu_read_lock_nesting = 0; p->rcu_read_unlock_special = 0; +#ifdef CONFIG_TREE_PREEMPT_RCU p->rcu_blocked_node = NULL; +#endif INIT_LIST_HEAD(&p->rcu_node_entry); } -- cgit v1.2.3 From 9079fd7c2e06a92cf27d05224a1f478581916c5b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 7 Aug 2010 21:59:54 -0700 Subject: rcu: update obsolete rcu_read_lock() comment. The comment says that blocking is illegal in rcu_read_lock()-style RCU read-side critical sections, which is no longer entirely true given preemptible RCU. This commit provides a fix. Suggested-by: David Miller Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 24b896649384..d7af96ef6fcf 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -458,7 +458,20 @@ extern int rcu_my_thread_group_empty(void); * will be deferred until the outermost RCU read-side critical section * completes. * - * It is illegal to block while in an RCU read-side critical section. + * You can avoid reading and understanding the next paragraph by + * following this rule: don't put anything in an rcu_read_lock() RCU + * read-side critical section that would block in a !PREEMPT kernel. + * But if you want the full story, read on! + * + * In non-preemptible RCU implementations (TREE_RCU and TINY_RCU), it + * is illegal to block while in an RCU read-side critical section. In + * preemptible RCU implementations (TREE_PREEMPT_RCU and TINY_PREEMPT_RCU) + * in CONFIG_PREEMPT kernel builds, RCU read-side critical sections may + * be preempted, but explicit blocking is illegal. Finally, in preemptible + * RCU implementations in real-time (CONFIG_PREEMPT_RT) kernel builds, + * RCU read-side critical sections may be preempted and they may also + * block, but only when acquiring spinlocks that are subject to priority + * inheritance. */ static inline void rcu_read_lock(void) { -- cgit v1.2.3 From 53d84e004d5e8c018be395c4330dc72fd60bd13e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 10 Aug 2010 14:28:53 -0700 Subject: rcu: permit suppressing current grace period's CPU stall warnings When using a kernel debugger, a long sojourn in the debugger can get you lots of RCU CPU stall warnings once you resume. This might not be helpful, especially if you are using the system console. This patch therefore allows RCU CPU stall warnings to be suppressed, but only for the duration of the current set of grace periods. This differs from Jason's original patch in that it adds support for tiny RCU and preemptible RCU, and uses a slightly different method for suppressing the RCU CPU stall warning messages. Signed-off-by: Jason Wessel Signed-off-by: Paul E. McKenney Tested-by: Jason Wessel --- include/linux/rcutiny.h | 4 ++++ include/linux/rcutree.h | 1 + 2 files changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 4cc5eba41616..3fa179784e18 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -178,6 +178,10 @@ static inline void rcu_sched_force_quiescent_state(void) { } +static inline void rcu_cpu_stall_reset(void) +{ +} + #ifdef CONFIG_DEBUG_LOCK_ALLOC extern int rcu_scheduler_active __read_mostly; diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index c13b85dd22bc..0726809497ba 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -36,6 +36,7 @@ extern void rcu_sched_qs(int cpu); extern void rcu_bh_qs(int cpu); extern void rcu_note_context_switch(int cpu); extern int rcu_needs_cpu(int cpu); +extern void rcu_cpu_stall_reset(void); #ifdef CONFIG_TREE_PREEMPT_RCU -- cgit v1.2.3 From a3dc3fb161f9b4066c0fce22db72638af8baf83b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 13 Aug 2010 16:16:25 -0700 Subject: rcu: repair code-duplication FIXMEs Combine the duplicate definitions of ULONG_CMP_GE(), ULONG_CMP_LT(), and rcu_preempt_depth() into include/linux/rcupdate.h. Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 15 +++++++++++++++ include/linux/rcutiny.h | 7 ------- include/linux/rcutree.h | 6 ------ 3 files changed, 15 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index d7af96ef6fcf..325bad7bbca9 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -47,6 +47,9 @@ extern int rcutorture_runnable; /* for sysctl */ #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ +#define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b)) +#define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b)) + /** * struct rcu_head - callback structure for use with RCU * @next: next update requests in a list @@ -66,6 +69,18 @@ extern int sched_expedited_torture_stats(char *page); /* Internal to kernel */ extern void rcu_init(void); +#ifdef CONFIG_PREEMPT_RCU + +/* + * Defined as a macro as it is a very low level header included from + * areas that don't even know about current. This gives the rcu_read_lock() + * nesting depth, but makes sense only if CONFIG_PREEMPT_RCU -- in other + * types of kernel builds, the rcu_read_lock() nesting depth is unknowable. + */ +#define rcu_preempt_depth() (current->rcu_read_lock_nesting) + +#endif /* #ifdef CONFIG_PREEMPT_RCU */ + #if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) #include #elif defined(CONFIG_TINY_RCU) || defined(CONFIG_TINY_PREEMPT_RCU) diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 3fa179784e18..c6b11dc5ba0a 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -133,13 +133,6 @@ static inline int rcu_needs_cpu(int cpu) return rcu_preempt_needs_cpu(); } -/* - * Defined as macro as it is a very low level header - * included from areas that don't even know about current - * FIXME: combine with include/linux/rcutree.h into rcupdate.h. - */ -#define rcu_preempt_depth() (current->rcu_read_lock_nesting) - #endif /* #else #ifdef CONFIG_TINY_RCU */ static inline void rcu_note_context_switch(int cpu) diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 0726809497ba..54a20c11f98d 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -45,12 +45,6 @@ extern void __rcu_read_unlock(void); extern void synchronize_rcu(void); extern void exit_rcu(void); -/* - * Defined as macro as it is a very low level header - * included from areas that don't even know about current - */ -#define rcu_preempt_depth() (current->rcu_read_lock_nesting) - #else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ static inline void __rcu_read_lock(void) -- cgit v1.2.3 From 73d4da4d360136826b36f78f5cf72b29da82c8a6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 16 Aug 2010 10:50:54 -0700 Subject: rcu: Upgrade srcu_read_lock() docbook about SRCU grace periods It is illegal to wait for an SRCU grace period while within the corresponding flavor of SRCU read-side critical section. Therefore, this commit updates the srcu_read_lock() docbook accordingly. Signed-off-by: Paul E. McKenney --- include/linux/srcu.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 6f456a720ff0..58971e891f48 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -139,7 +139,12 @@ static inline int srcu_read_lock_held(struct srcu_struct *sp) * @sp: srcu_struct in which to register the new reader. * * Enter an SRCU read-side critical section. Note that SRCU read-side - * critical sections may be nested. + * critical sections may be nested. However, it is illegal to + * call anything that waits on an SRCU grace period for the same + * srcu_struct, whether directly or indirectly. Please note that + * one way to indirectly wait on an SRCU grace period is to acquire + * a mutex that is held elsewhere while calling synchronize_srcu() or + * synchronize_srcu_expedited(). */ static inline int srcu_read_lock(struct srcu_struct *sp) __acquires(sp) { -- cgit v1.2.3 From 7b0b759b65247cbc66384a912be9acf8d4800636 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 17 Aug 2010 14:18:46 -0700 Subject: rcu: combine duplicate code, courtesy of CONFIG_PREEMPT_RCU The CONFIG_PREEMPT_RCU kernel configuration parameter was recently re-introduced, but as an indication of the type of RCU (preemptible vs. non-preemptible) instead of as selecting a given implementation. This commit uses CONFIG_PREEMPT_RCU to combine duplicate code from include/linux/rcutiny.h and include/linux/rcutree.h into include/linux/rcupdate.h. This commit also combines a few other pieces of duplicate code that have accumulated. Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 75 ++++++++++++++++++++++++++++++++++++++++++++++-- include/linux/rcutiny.h | 51 -------------------------------- include/linux/rcutree.h | 50 -------------------------------- 3 files changed, 72 insertions(+), 104 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 325bad7bbca9..89414d67d961 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -61,16 +61,30 @@ struct rcu_head { }; /* Exported common interfaces */ +extern void call_rcu_sched(struct rcu_head *head, + void (*func)(struct rcu_head *rcu)); +extern void synchronize_sched(void); extern void rcu_barrier_bh(void); extern void rcu_barrier_sched(void); extern void synchronize_sched_expedited(void); extern int sched_expedited_torture_stats(char *page); -/* Internal to kernel */ -extern void rcu_init(void); +static inline void __rcu_read_lock_bh(void) +{ + local_bh_disable(); +} + +static inline void __rcu_read_unlock_bh(void) +{ + local_bh_enable(); +} #ifdef CONFIG_PREEMPT_RCU +extern void __rcu_read_lock(void); +extern void __rcu_read_unlock(void); +void synchronize_rcu(void); + /* * Defined as a macro as it is a very low level header included from * areas that don't even know about current. This gives the rcu_read_lock() @@ -79,7 +93,53 @@ extern void rcu_init(void); */ #define rcu_preempt_depth() (current->rcu_read_lock_nesting) -#endif /* #ifdef CONFIG_PREEMPT_RCU */ +#else /* #ifdef CONFIG_PREEMPT_RCU */ + +static inline void __rcu_read_lock(void) +{ + preempt_disable(); +} + +static inline void __rcu_read_unlock(void) +{ + preempt_enable(); +} + +static inline void synchronize_rcu(void) +{ + synchronize_sched(); +} + +static inline int rcu_preempt_depth(void) +{ + return 0; +} + +#endif /* #else #ifdef CONFIG_PREEMPT_RCU */ + +/* Internal to kernel */ +extern void rcu_init(void); +extern void rcu_sched_qs(int cpu); +extern void rcu_bh_qs(int cpu); +extern void rcu_check_callbacks(int cpu, int user); +struct notifier_block; + +#ifdef CONFIG_NO_HZ + +extern void rcu_enter_nohz(void); +extern void rcu_exit_nohz(void); + +#else /* #ifdef CONFIG_NO_HZ */ + +static inline void rcu_enter_nohz(void) +{ +} + +static inline void rcu_exit_nohz(void) +{ +} + +#endif /* #else #ifdef CONFIG_NO_HZ */ #if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) #include @@ -626,6 +686,8 @@ struct rcu_synchronize { extern void wakeme_after_rcu(struct rcu_head *head); +#ifdef CONFIG_PREEMPT_RCU + /** * call_rcu() - Queue an RCU callback for invocation after a grace period. * @head: structure to be used for queueing the RCU updates. @@ -642,6 +704,13 @@ extern void wakeme_after_rcu(struct rcu_head *head); extern void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *head)); +#else /* #ifdef CONFIG_PREEMPT_RCU */ + +/* In classic RCU, call_rcu() is just call_rcu_sched(). */ +#define call_rcu call_rcu_sched + +#endif /* #else #ifdef CONFIG_PREEMPT_RCU */ + /** * call_rcu_bh() - Queue an RCU for invocation after a quicker grace period. * @head: structure to be used for queueing the RCU updates. diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index c6b11dc5ba0a..13877cb93a60 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -27,34 +27,10 @@ #include -void rcu_sched_qs(int cpu); -void rcu_bh_qs(int cpu); - -#ifdef CONFIG_TINY_RCU -#define __rcu_read_lock() preempt_disable() -#define __rcu_read_unlock() preempt_enable() -#else /* #ifdef CONFIG_TINY_RCU */ -void __rcu_read_lock(void); -void __rcu_read_unlock(void); -#endif /* #else #ifdef CONFIG_TINY_RCU */ -#define __rcu_read_lock_bh() local_bh_disable() -#define __rcu_read_unlock_bh() local_bh_enable() -extern void call_rcu_sched(struct rcu_head *head, - void (*func)(struct rcu_head *rcu)); - #define rcu_init_sched() do { } while (0) -extern void synchronize_sched(void); - #ifdef CONFIG_TINY_RCU -#define call_rcu call_rcu_sched - -static inline void synchronize_rcu(void) -{ - synchronize_sched(); -} - static inline void synchronize_rcu_expedited(void) { synchronize_sched(); /* Only one CPU, so pretty fast anyway!!! */ @@ -67,7 +43,6 @@ static inline void rcu_barrier(void) #else /* #ifdef CONFIG_TINY_RCU */ -void synchronize_rcu(void); void rcu_barrier(void); void synchronize_rcu_expedited(void); @@ -83,25 +58,6 @@ static inline void synchronize_rcu_bh_expedited(void) synchronize_sched(); } -struct notifier_block; - -#ifdef CONFIG_NO_HZ - -extern void rcu_enter_nohz(void); -extern void rcu_exit_nohz(void); - -#else /* #ifdef CONFIG_NO_HZ */ - -static inline void rcu_enter_nohz(void) -{ -} - -static inline void rcu_exit_nohz(void) -{ -} - -#endif /* #else #ifdef CONFIG_NO_HZ */ - #ifdef CONFIG_TINY_RCU static inline void rcu_preempt_note_context_switch(void) @@ -117,11 +73,6 @@ static inline int rcu_needs_cpu(int cpu) return 0; } -static inline int rcu_preempt_depth(void) -{ - return 0; -} - #else /* #ifdef CONFIG_TINY_RCU */ void rcu_preempt_note_context_switch(void); @@ -141,8 +92,6 @@ static inline void rcu_note_context_switch(int cpu) rcu_preempt_note_context_switch(); } -extern void rcu_check_callbacks(int cpu, int user); - /* * Return the number of grace periods. */ diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 54a20c11f98d..95518e628794 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -30,59 +30,23 @@ #ifndef __LINUX_RCUTREE_H #define __LINUX_RCUTREE_H -struct notifier_block; - -extern void rcu_sched_qs(int cpu); -extern void rcu_bh_qs(int cpu); extern void rcu_note_context_switch(int cpu); extern int rcu_needs_cpu(int cpu); extern void rcu_cpu_stall_reset(void); #ifdef CONFIG_TREE_PREEMPT_RCU -extern void __rcu_read_lock(void); -extern void __rcu_read_unlock(void); -extern void synchronize_rcu(void); extern void exit_rcu(void); #else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ -static inline void __rcu_read_lock(void) -{ - preempt_disable(); -} - -static inline void __rcu_read_unlock(void) -{ - preempt_enable(); -} - -#define synchronize_rcu synchronize_sched - static inline void exit_rcu(void) { } -static inline int rcu_preempt_depth(void) -{ - return 0; -} - #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ -static inline void __rcu_read_lock_bh(void) -{ - local_bh_disable(); -} -static inline void __rcu_read_unlock_bh(void) -{ - local_bh_enable(); -} - -extern void call_rcu_sched(struct rcu_head *head, - void (*func)(struct rcu_head *rcu)); extern void synchronize_rcu_bh(void); -extern void synchronize_sched(void); extern void synchronize_rcu_expedited(void); static inline void synchronize_rcu_bh_expedited(void) @@ -92,8 +56,6 @@ static inline void synchronize_rcu_bh_expedited(void) extern void rcu_barrier(void); -extern void rcu_check_callbacks(int cpu, int user); - extern long rcu_batches_completed(void); extern long rcu_batches_completed_bh(void); extern long rcu_batches_completed_sched(void); @@ -101,18 +63,6 @@ extern void rcu_force_quiescent_state(void); extern void rcu_bh_force_quiescent_state(void); extern void rcu_sched_force_quiescent_state(void); -#ifdef CONFIG_NO_HZ -void rcu_enter_nohz(void); -void rcu_exit_nohz(void); -#else /* CONFIG_NO_HZ */ -static inline void rcu_enter_nohz(void) -{ -} -static inline void rcu_exit_nohz(void) -{ -} -#endif /* CONFIG_NO_HZ */ - /* A context switch is a grace period for RCU-sched and RCU-bh. */ static inline int rcu_blocking_is_gp(void) { -- cgit v1.2.3 From 65e6bf484c497f02d47a0faae69ee398cd59cfda Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 19 Aug 2010 21:43:09 -0700 Subject: rcu: add comment stating that list_empty() applies to RCU-protected lists Because list_empty() does not dereference any RCU-protected pointers, and further does not pass such pointers to the caller (so that the caller does not dereference them either), it is safe to use list_empty() on RCU-protected lists. There is no need for a list_empty_rcu(). This commit adds a comment stating this explicitly. Requested-by: Andrew Morton Signed-off-by: Paul E. McKenney --- include/linux/rculist.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rculist.h b/include/linux/rculist.h index c10b1050dbe6..f31ef61f1c65 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -9,6 +9,15 @@ #include #include +/* + * Why is there no list_empty_rcu()? Because list_empty() serves this + * purpose. The list_empty() function fetches the RCU-protected pointer + * and compares it to the address of the list head, but neither dereferences + * this pointer itself nor provides this pointer to the caller. Therefore, + * it is not necessary to use rcu_dereference(), so that list_empty() can + * be used anywhere you would want to use a list_empty_rcu(). + */ + /* * return the ->next pointer of a list_head in an rcu safe * way, we must not access it directly -- cgit v1.2.3 From 01a08546af311c065f34727787dd0cc8dc0c216f Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 31 Aug 2010 10:28:16 +0200 Subject: sched: Add book scheduling domain On top of the SMT and MC scheduling domains this adds the BOOK scheduling domain. This is useful for NUMA like machines which do not have an interface which tells which piece of memory is attached to which node or where the hardware performs striping. Signed-off-by: Heiko Carstens Signed-off-by: Peter Zijlstra LKML-Reference: <20100831082844.253053798@de.ibm.com> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + include/linux/topology.h | 6 ++++++ 2 files changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 1e2a6db2d7dd..b51c53c285b8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -875,6 +875,7 @@ enum sched_domain_level { SD_LV_NONE = 0, SD_LV_SIBLING, SD_LV_MC, + SD_LV_BOOK, SD_LV_CPU, SD_LV_NODE, SD_LV_ALLNODES, diff --git a/include/linux/topology.h b/include/linux/topology.h index 64e084ff5e5c..b91a40e847d2 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -201,6 +201,12 @@ int arch_update_cpu_topology(void); .balance_interval = 64, \ } +#ifdef CONFIG_SCHED_BOOK +#ifndef SD_BOOK_INIT +#error Please define an appropriate SD_BOOK_INIT in include/asm/topology.h!!! +#endif +#endif /* CONFIG_SCHED_BOOK */ + #ifdef CONFIG_NUMA #ifndef SD_NODE_INIT #error Please define an appropriate SD_NODE_INIT in include/asm/topology.h!!! -- cgit v1.2.3 From 637bbdc5b83615ef9f45f50399d1c7f27473c713 Mon Sep 17 00:00:00 2001 From: Dave Young Date: Mon, 13 Sep 2010 20:19:03 +0800 Subject: sched: Remove unused PF_ALIGNWARN flag PF_ALIGNWARN is not implemented and it is for 486 as the comment. It is not likely someone will implement this flag feature. So here remove this flag and leave the valuable 0x00000001 for future use. Signed-off-by: Dave Young Cc: Peter Zijlstra Cc: Linus Torvalds LKML-Reference: <20100913121903.GB22238@darkstar> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index b51c53c285b8..cdf56693ecbf 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1682,8 +1682,6 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t * /* * Per process flags */ -#define PF_ALIGNWARN 0x00000001 /* Print alignment warning msgs */ - /* Not implemented yet, only for 486*/ #define PF_STARTING 0x00000002 /* being created */ #define PF_EXITING 0x00000004 /* getting shut down */ #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */ -- cgit v1.2.3 From a3c74c52570c0c4ac90c9a0216de800c39089ba7 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 14 Sep 2010 21:43:47 +0900 Subject: futex: Mark restart_block.futex.uaddr[2] __user @uaddr and @uaddr2 fields in restart_block.futex are user pointers. Add __user and remove unnecessary casts. Signed-off-by: Namhyung Kim Cc: Peter Zijlstra Cc: Darren Hart LKML-Reference: <1284468228-8723-2-git-send-email-namhyung@gmail.com> Signed-off-by: Thomas Gleixner --- include/linux/thread_info.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index a8cc4e13434c..c90696544176 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -23,12 +23,12 @@ struct restart_block { }; /* For futex_wait and futex_wait_requeue_pi */ struct { - u32 *uaddr; + u32 __user *uaddr; u32 val; u32 flags; u32 bitset; u64 time; - u32 *uaddr2; + u32 __user *uaddr2; } futex; /* For nanosleep */ struct { -- cgit v1.2.3 From 53ecfba259f54b6967a35d19f4a564e3bc07997f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 13 Sep 2010 17:24:21 -0700 Subject: rcu: only one evaluation of arg in rcu_dereference_check() unless sparse The current version of the __rcu_access_pointer(), __rcu_dereference_check(), and __rcu_dereference_protected() macros evaluate their "p" argument three times, not counting typeof()s. This is bad news if that argument contains a side effect. This commit therefore evaluates this argument only once in normal kernel builds. However, the straightforward approach defeats sparse's RCU-pointer checking, so when __CHECKER__ is defined, the additional pair of evaluations of the "p" argument are performed in order to permit sparse to detect misuse of RCU-protected pointers. Signed-off-by: Paul E. McKenney Cc: Arnd Bergmann --- include/linux/rcupdate.h | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 89414d67d961..03cda7bed985 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -310,24 +310,32 @@ extern int rcu_my_thread_group_empty(void); * (e.g., __rcu_bh, * __rcu_sched, and __srcu), should this make sense in * the future. */ + +#ifdef __CHECKER__ +#define rcu_dereference_sparse(p, space) \ + ((void)(((typeof(*p) space *)p) == p)) +#else /* #ifdef __CHECKER__ */ +#define rcu_dereference_sparse(p, space) +#endif /* #else #ifdef __CHECKER__ */ + #define __rcu_access_pointer(p, space) \ ({ \ typeof(*p) *_________p1 = (typeof(*p)*__force )ACCESS_ONCE(p); \ - (void) (((typeof (*p) space *)p) == p); \ + rcu_dereference_sparse(p, space); \ ((typeof(*p) __force __kernel *)(_________p1)); \ }) #define __rcu_dereference_check(p, c, space) \ ({ \ typeof(*p) *_________p1 = (typeof(*p)*__force )ACCESS_ONCE(p); \ rcu_lockdep_assert(c); \ - (void) (((typeof (*p) space *)p) == p); \ + rcu_dereference_sparse(p, space); \ smp_read_barrier_depends(); \ ((typeof(*p) __force __kernel *)(_________p1)); \ }) #define __rcu_dereference_protected(p, c, space) \ ({ \ rcu_lockdep_assert(c); \ - (void) (((typeof (*p) space *)p) == p); \ + rcu_dereference_sparse(p, space); \ ((typeof(*p) __force __kernel *)(p)); \ }) -- cgit v1.2.3 From d1ea13c6e2cce0106531852daaa93dd97aec9580 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 23 Sep 2010 18:40:07 +0200 Subject: genirq: Cleanup irq_chip->typename leftovers 3 years transition phase is enough. Cleanup the last users and remove the cruft. Signed-off-by: Thomas Gleixner Cc: Leo Chen Cc: Hirokazu Takata Cc: Chris Metcalf Cc: Jeff Dike Cc: Chris Zankel --- include/linux/irq.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index c03243ad84b4..06273a2a17e7 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -106,7 +106,6 @@ struct msi_desc; * @bus_sync_unlock: function to sync and unlock slow bus (i2c) chips * * @release: release function solely used by UML - * @typename: obsoleted by name, kept as migration helper */ struct irq_chip { const char *name; @@ -135,11 +134,6 @@ struct irq_chip { #ifdef CONFIG_IRQ_RELEASE_METHOD void (*release)(unsigned int irq, void *dev_id); #endif - /* - * For compatibility, ->typename is copied into ->name. - * Will disappear. - */ - const char *typename; }; struct timer_rand_state; -- cgit v1.2.3 From 5c80cc78de46aef6cd5e714208da05c3f7f548f8 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Thu, 30 Sep 2010 14:43:16 +0200 Subject: x86, amd_nb: Enable GART support for AMD family 0x15 CPUs AMD CPU family 0x15 still supports GART for compatibility reasons. Signed-off-by: Andreas Herrmann LKML-Reference: <20100930124316.GG20545@loge.amd.com> Signed-off-by: H. Peter Anvin --- include/linux/pci_ids.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 10d33309e9a6..edc0279be72f 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -514,6 +514,7 @@ #define PCI_DEVICE_ID_AMD_11H_NB_DRAM 0x1302 #define PCI_DEVICE_ID_AMD_11H_NB_MISC 0x1303 #define PCI_DEVICE_ID_AMD_11H_NB_LINK 0x1304 +#define PCI_DEVICE_ID_AMD_15H_NB_MISC 0x1603 #define PCI_DEVICE_ID_AMD_LANCE 0x2000 #define PCI_DEVICE_ID_AMD_LANCE_HOME 0x2001 #define PCI_DEVICE_ID_AMD_SCSI 0x2020 -- cgit v1.2.3 From ff7dcd44dd446db2c3e13bdedf2d52b8e0127f16 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Sep 2010 12:44:25 +0000 Subject: genirq: Create irq_data Low level chip functions need access to irq_desc->handler_data, irq_desc->chip_data and irq_desc->msi_desc. We hand down the irq number to the low level functions, so they need to lookup irq_desc. With sparse irq this means a radix tree lookup. We could hand down irq_desc itself, but low level chip functions have no need to fiddle with it directly and we want to restrict access to irq_desc further. Preparatory patch for new chip functions. Note, that the ugly anon union/struct is there to avoid a full tree wide clean up for now. This is not going to last 3 years like __do_IRQ() Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra LKML-Reference: <20100927121841.645542300@linutronix.de> Reviewed-by: H. Peter Anvin Reviewed-by: Ingo Molnar --- include/linux/irq.h | 90 +++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 63 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index 06273a2a17e7..363c76ff82c8 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -83,6 +83,37 @@ typedef void (*irq_flow_handler_t)(unsigned int irq, struct proc_dir_entry; struct msi_desc; +/** + * struct irq_data - per irq and irq chip data passed down to chip functions + * @irq: interrupt number + * @node: node index useful for balancing + * @chip: low level interrupt hardware access + * @handler_data: per-IRQ data for the irq_chip methods + * @chip_data: platform-specific per-chip private data for the chip + * methods, to allow shared chip implementations + * @msi_desc: MSI descriptor + * @affinity: IRQ affinity on SMP + * @irq_2_iommu: iommu with this irq + * + * The fields here need to overlay the ones in irq_desc until we + * cleaned up the direct references and switched everything over to + * irq_data. + */ +struct irq_data { + unsigned int irq; + unsigned int node; + struct irq_chip *chip; + void *handler_data; + void *chip_data; + struct msi_desc *msi_desc; +#ifdef CONFIG_SMP + cpumask_var_t affinity; +#endif +#ifdef CONFIG_INTR_REMAP + struct irq_2_iommu *irq_2_iommu; +#endif +}; + /** * struct irq_chip - hardware interrupt chip descriptor * @@ -140,16 +171,10 @@ struct timer_rand_state; struct irq_2_iommu; /** * struct irq_desc - interrupt descriptor - * @irq: interrupt number for this descriptor + * @irq_data: per irq and chip data passed down to chip functions * @timer_rand_state: pointer to timer rand state struct * @kstat_irqs: irq stats per cpu - * @irq_2_iommu: iommu with this irq * @handle_irq: highlevel irq-events handler [if NULL, __do_IRQ()] - * @chip: low level interrupt hardware access - * @msi_desc: MSI descriptor - * @handler_data: per-IRQ data for the irq_chip methods - * @chip_data: platform-specific per-chip private data for the chip - * methods, to allow shared chip implementations * @action: the irq action chain * @status: status information * @depth: disable-depth, for nested irq_disable() calls @@ -158,8 +183,6 @@ struct irq_2_iommu; * @last_unhandled: aging timer for unhandled count * @irqs_unhandled: stats field for spurious unhandled interrupts * @lock: locking for SMP - * @affinity: IRQ affinity on SMP - * @node: node index useful for balancing * @pending_mask: pending rebalanced interrupts * @threads_active: number of irqaction threads currently running * @wait_for_threads: wait queue for sync_irq to wait for threaded handlers @@ -167,17 +190,32 @@ struct irq_2_iommu; * @name: flow handler name for /proc/interrupts output */ struct irq_desc { - unsigned int irq; - struct timer_rand_state *timer_rand_state; - unsigned int *kstat_irqs; + + /* + * This union will go away, once we fixed the direct access to + * irq_desc all over the place. The direct fields are a 1:1 + * overlay of irq_data. + */ + union { + struct irq_data irq_data; + struct { + unsigned int irq; + unsigned int node; + struct irq_chip *chip; + void *handler_data; + void *chip_data; + struct msi_desc *msi_desc; +#ifdef CONFIG_SMP + cpumask_var_t affinity; +#endif #ifdef CONFIG_INTR_REMAP - struct irq_2_iommu *irq_2_iommu; + struct irq_2_iommu *irq_2_iommu; #endif + }; + }; + struct timer_rand_state *timer_rand_state; + unsigned int *kstat_irqs; irq_flow_handler_t handle_irq; - struct irq_chip *chip; - struct msi_desc *msi_desc; - void *handler_data; - void *chip_data; struct irqaction *action; /* IRQ action list */ unsigned int status; /* IRQ status */ @@ -188,9 +226,7 @@ struct irq_desc { unsigned int irqs_unhandled; raw_spinlock_t lock; #ifdef CONFIG_SMP - cpumask_var_t affinity; const struct cpumask *affinity_hint; - unsigned int node; #ifdef CONFIG_GENERIC_PENDING_IRQ cpumask_var_t pending_mask; #endif @@ -406,15 +442,15 @@ extern int set_irq_chip_data(unsigned int irq, void *data); extern int set_irq_type(unsigned int irq, unsigned int type); extern int set_irq_msi(unsigned int irq, struct msi_desc *entry); -#define get_irq_chip(irq) (irq_to_desc(irq)->chip) -#define get_irq_chip_data(irq) (irq_to_desc(irq)->chip_data) -#define get_irq_data(irq) (irq_to_desc(irq)->handler_data) -#define get_irq_msi(irq) (irq_to_desc(irq)->msi_desc) +#define get_irq_chip(irq) (irq_to_desc(irq)->irq_data.chip) +#define get_irq_chip_data(irq) (irq_to_desc(irq)->irq_data.chip_data) +#define get_irq_data(irq) (irq_to_desc(irq)->irq_data.handler_data) +#define get_irq_msi(irq) (irq_to_desc(irq)->irq_data.msi_desc) -#define get_irq_desc_chip(desc) ((desc)->chip) -#define get_irq_desc_chip_data(desc) ((desc)->chip_data) -#define get_irq_desc_data(desc) ((desc)->handler_data) -#define get_irq_desc_msi(desc) ((desc)->msi_desc) +#define get_irq_desc_chip(desc) ((desc)->irq_data.chip) +#define get_irq_desc_chip_data(desc) ((desc)->irq_data.chip_data) +#define get_irq_desc_data(desc) ((desc)->irq_data.handler_data) +#define get_irq_desc_msi(desc) ((desc)->irq_data.msi_desc) #endif /* CONFIG_GENERIC_HARDIRQS */ -- cgit v1.2.3 From 6b8ff3120c758340505dddf08ad685ebb841d5d5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 1 Oct 2010 12:58:38 +0200 Subject: genirq: Convert core code to irq_data Convert all references in the core code to orq, chip, handler_data, chip_data, msi_desc, affinity to irq_data.* Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- include/linux/irq.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index 363c76ff82c8..002351d83c3f 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -475,12 +475,12 @@ static inline bool alloc_desc_masks(struct irq_desc *desc, int node, gfp = GFP_NOWAIT; #ifdef CONFIG_CPUMASK_OFFSTACK - if (!alloc_cpumask_var_node(&desc->affinity, gfp, node)) + if (!alloc_cpumask_var_node(&desc->irq_data.affinity, gfp, node)) return false; #ifdef CONFIG_GENERIC_PENDING_IRQ if (!alloc_cpumask_var_node(&desc->pending_mask, gfp, node)) { - free_cpumask_var(desc->affinity); + free_cpumask_var(desc->irq_data.affinity); return false; } #endif @@ -490,7 +490,7 @@ static inline bool alloc_desc_masks(struct irq_desc *desc, int node, static inline void init_desc_masks(struct irq_desc *desc) { - cpumask_setall(desc->affinity); + cpumask_setall(desc->irq_data.affinity); #ifdef CONFIG_GENERIC_PENDING_IRQ cpumask_clear(desc->pending_mask); #endif @@ -510,7 +510,7 @@ static inline void init_copy_desc_masks(struct irq_desc *old_desc, struct irq_desc *new_desc) { #ifdef CONFIG_CPUMASK_OFFSTACK - cpumask_copy(new_desc->affinity, old_desc->affinity); + cpumask_copy(new_desc->irq_data.affinity, old_desc->irq_data.affinity); #ifdef CONFIG_GENERIC_PENDING_IRQ cpumask_copy(new_desc->pending_mask, old_desc->pending_mask); @@ -521,7 +521,7 @@ static inline void init_copy_desc_masks(struct irq_desc *old_desc, static inline void free_desc_masks(struct irq_desc *old_desc, struct irq_desc *new_desc) { - free_cpumask_var(old_desc->affinity); + free_cpumask_var(old_desc->irq_data.affinity); #ifdef CONFIG_GENERIC_PENDING_IRQ free_cpumask_var(old_desc->pending_mask); -- cgit v1.2.3 From f8822657e799b02c55556c99a601261e207a299d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Sep 2010 12:44:32 +0000 Subject: genirq: Provide advanced irq chip functions The low level irq chip functions want access to irq_desc->irq_data. Provide new functions which hand down irq_data instead of the irq number so these functions avoid to call irq_to_desc() which is a radix tree lookup in case of sparse irq. This provides all the old functions except one: end(). end() is a relict of __do_IRQ() and will just go away with the __do_IRQ() code. The replacement for set_affinity() has an extra argument "bool force". The reason for this is to notify the low level code, that the move has to be done right away and cannot be delayed until the next interrupt happens. That's necessary to handle the irq fixup on cpu unplug in the generic code. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra LKML-Reference: <20100927121841.742126604@linutronix.de> Reviewed-by: H. Peter Anvin Reviewed-by: Ingo Molnar --- include/linux/irq.h | 66 ++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 50 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index 002351d83c3f..0c83cbd2df4e 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -118,23 +118,38 @@ struct irq_data { * struct irq_chip - hardware interrupt chip descriptor * * @name: name for /proc/interrupts - * @startup: start up the interrupt (defaults to ->enable if NULL) - * @shutdown: shut down the interrupt (defaults to ->disable if NULL) - * @enable: enable the interrupt (defaults to chip->unmask if NULL) - * @disable: disable the interrupt - * @ack: start of a new interrupt - * @mask: mask an interrupt source - * @mask_ack: ack and mask an interrupt source - * @unmask: unmask an interrupt source - * @eoi: end of interrupt - chip level - * @end: end of interrupt - flow level - * @set_affinity: set the CPU affinity on SMP machines - * @retrigger: resend an IRQ to the CPU - * @set_type: set the flow type (IRQ_TYPE_LEVEL/etc.) of an IRQ - * @set_wake: enable/disable power-management wake-on of an IRQ + * @startup: deprecated, replaced by irq_startup + * @shutdown: deprecated, replaced by irq_shutdown + * @enable: deprecated, replaced by irq_enable + * @disable: deprecated, replaced by irq_disable + * @ack: deprecated, replaced by irq_ack + * @mask: deprecated, replaced by irq_mask + * @mask_ack: deprecated, replaced by irq_mask_ack + * @unmask: deprecated, replaced by irq_unmask + * @eoi: deprecated, replaced by irq_eoi + * @end: deprecated, will go away with __do_IRQ() + * @set_affinity: deprecated, replaced by irq_set_affinity + * @retrigger: deprecated, replaced by irq_retrigger + * @set_type: deprecated, replaced by irq_set_type + * @set_wake: deprecated, replaced by irq_wake + * @bus_lock: deprecated, replaced by irq_bus_lock + * @bus_sync_unlock: deprecated, replaced by irq_bus_sync_unlock * - * @bus_lock: function to lock access to slow bus (i2c) chips - * @bus_sync_unlock: function to sync and unlock slow bus (i2c) chips + * @irq_startup: start up the interrupt (defaults to ->enable if NULL) + * @irq_shutdown: shut down the interrupt (defaults to ->disable if NULL) + * @irq_enable: enable the interrupt (defaults to chip->unmask if NULL) + * @irq_disable: disable the interrupt + * @irq_ack: start of a new interrupt + * @irq_mask: mask an interrupt source + * @irq_mask_ack: ack and mask an interrupt source + * @irq_unmask: unmask an interrupt source + * @irq_eoi: end of interrupt + * @irq_set_affinity: set the CPU affinity on SMP machines + * @irq_retrigger: resend an IRQ to the CPU + * @irq_set_type: set the flow type (IRQ_TYPE_LEVEL/etc.) of an IRQ + * @irq_set_wake: enable/disable power-management wake-on of an IRQ + * @irq_bus_lock: function to lock access to slow bus (i2c) chips + * @irq_bus_sync_unlock:function to sync and unlock slow bus (i2c) chips * * @release: release function solely used by UML */ @@ -161,6 +176,25 @@ struct irq_chip { void (*bus_lock)(unsigned int irq); void (*bus_sync_unlock)(unsigned int irq); + unsigned int (*irq_startup)(struct irq_data *data); + void (*irq_shutdown)(struct irq_data *data); + void (*irq_enable)(struct irq_data *data); + void (*irq_disable)(struct irq_data *data); + + void (*irq_ack)(struct irq_data *data); + void (*irq_mask)(struct irq_data *data); + void (*irq_mask_ack)(struct irq_data *data); + void (*irq_unmask)(struct irq_data *data); + void (*irq_eoi)(struct irq_data *data); + + int (*irq_set_affinity)(struct irq_data *data, const struct cpumask *dest, bool force); + int (*irq_retrigger)(struct irq_data *data); + int (*irq_set_type)(struct irq_data *data, unsigned int flow_type); + int (*irq_set_wake)(struct irq_data *data, unsigned int on); + + void (*irq_bus_lock)(struct irq_data *data); + void (*irq_bus_sync_unlock)(struct irq_data *data); + /* Currently used only by UML, might disappear one day.*/ #ifdef CONFIG_IRQ_RELEASE_METHOD void (*release)(unsigned int irq, void *dev_id); -- cgit v1.2.3 From bd151412263a67b5321e9dd1d5b4bf6d96fdebf3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 1 Oct 2010 15:17:14 +0200 Subject: genirq: Provide config option to disable deprecated code This option covers now the old chip functions and the irq_desc data fields which are moving to struct irq_data. More stuff will follow. Pretty handy for testing a conversion, whether something broke or not. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- include/linux/irq.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index 0c83cbd2df4e..82ed8231394a 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -155,6 +155,7 @@ struct irq_data { */ struct irq_chip { const char *name; +#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED unsigned int (*startup)(unsigned int irq); void (*shutdown)(unsigned int irq); void (*enable)(unsigned int irq); @@ -175,7 +176,7 @@ struct irq_chip { void (*bus_lock)(unsigned int irq); void (*bus_sync_unlock)(unsigned int irq); - +#endif unsigned int (*irq_startup)(struct irq_data *data); void (*irq_shutdown)(struct irq_data *data); void (*irq_enable)(struct irq_data *data); @@ -225,6 +226,9 @@ struct irq_2_iommu; */ struct irq_desc { +#ifdef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED + struct irq_data irq_data; +#else /* * This union will go away, once we fixed the direct access to * irq_desc all over the place. The direct fields are a 1:1 @@ -247,6 +251,8 @@ struct irq_desc { #endif }; }; +#endif + struct timer_rand_state *timer_rand_state; unsigned int *kstat_irqs; irq_flow_handler_t handle_irq; -- cgit v1.2.3 From 773e3f93577ffb493fb7c39b1a6ecf39b5748e87 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 5 Oct 2010 14:03:02 -0700 Subject: rcu: move check from rcu_dereference_bh to rcu_read_lock_bh_held As suggested by Linus, push the irqs_disabled() down to the rcu_read_lock_bh_held() level so that all callers get the benefit of the correct check. Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 83af1f8d8b74..9fbc54a2585d 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -454,7 +454,7 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) * Makes rcu_dereference_check() do the dirty work. */ #define rcu_dereference_bh(p) \ - rcu_dereference_check(p, rcu_read_lock_bh_held() || irqs_disabled()) + rcu_dereference_check(p, rcu_read_lock_bh_held()) /** * rcu_dereference_sched - fetch RCU-protected pointer, checking for RCU-sched -- cgit v1.2.3 From e144710b302525de5b90b9c3ba43562458d8957f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 1 Oct 2010 16:03:45 +0200 Subject: genirq: Distangle irq.h Move irq_desc and internal functions out of irq.h Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- include/linux/irq.h | 292 +++--------------------------------------------- include/linux/irqdesc.h | 171 ++++++++++++++++++++++++++++ 2 files changed, 184 insertions(+), 279 deletions(-) create mode 100644 include/linux/irqdesc.h (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index 82ed8231394a..f5827abbc034 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -80,7 +80,6 @@ typedef void (*irq_flow_handler_t)(unsigned int irq, # define IRQ_NO_BALANCING_MASK IRQ_NO_BALANCING #endif -struct proc_dir_entry; struct msi_desc; /** @@ -202,152 +201,36 @@ struct irq_chip { #endif }; -struct timer_rand_state; -struct irq_2_iommu; -/** - * struct irq_desc - interrupt descriptor - * @irq_data: per irq and chip data passed down to chip functions - * @timer_rand_state: pointer to timer rand state struct - * @kstat_irqs: irq stats per cpu - * @handle_irq: highlevel irq-events handler [if NULL, __do_IRQ()] - * @action: the irq action chain - * @status: status information - * @depth: disable-depth, for nested irq_disable() calls - * @wake_depth: enable depth, for multiple set_irq_wake() callers - * @irq_count: stats field to detect stalled irqs - * @last_unhandled: aging timer for unhandled count - * @irqs_unhandled: stats field for spurious unhandled interrupts - * @lock: locking for SMP - * @pending_mask: pending rebalanced interrupts - * @threads_active: number of irqaction threads currently running - * @wait_for_threads: wait queue for sync_irq to wait for threaded handlers - * @dir: /proc/irq/ procfs entry - * @name: flow handler name for /proc/interrupts output - */ -struct irq_desc { - -#ifdef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED - struct irq_data irq_data; -#else - /* - * This union will go away, once we fixed the direct access to - * irq_desc all over the place. The direct fields are a 1:1 - * overlay of irq_data. - */ - union { - struct irq_data irq_data; - struct { - unsigned int irq; - unsigned int node; - struct irq_chip *chip; - void *handler_data; - void *chip_data; - struct msi_desc *msi_desc; -#ifdef CONFIG_SMP - cpumask_var_t affinity; -#endif -#ifdef CONFIG_INTR_REMAP - struct irq_2_iommu *irq_2_iommu; -#endif - }; - }; -#endif - - struct timer_rand_state *timer_rand_state; - unsigned int *kstat_irqs; - irq_flow_handler_t handle_irq; - struct irqaction *action; /* IRQ action list */ - unsigned int status; /* IRQ status */ - - unsigned int depth; /* nested irq disables */ - unsigned int wake_depth; /* nested wake enables */ - unsigned int irq_count; /* For detecting broken IRQs */ - unsigned long last_unhandled; /* Aging timer for unhandled count */ - unsigned int irqs_unhandled; - raw_spinlock_t lock; -#ifdef CONFIG_SMP - const struct cpumask *affinity_hint; -#ifdef CONFIG_GENERIC_PENDING_IRQ - cpumask_var_t pending_mask; -#endif -#endif - atomic_t threads_active; - wait_queue_head_t wait_for_threads; -#ifdef CONFIG_PROC_FS - struct proc_dir_entry *dir; -#endif - const char *name; -} ____cacheline_internodealigned_in_smp; - -extern void arch_init_copy_chip_data(struct irq_desc *old_desc, - struct irq_desc *desc, int node); -extern void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc); - -#ifndef CONFIG_SPARSE_IRQ -extern struct irq_desc irq_desc[NR_IRQS]; -#endif - -#ifdef CONFIG_NUMA_IRQ_DESC -extern struct irq_desc *move_irq_desc(struct irq_desc *old_desc, int node); -#else -static inline struct irq_desc *move_irq_desc(struct irq_desc *desc, int node) -{ - return desc; -} -#endif - -extern struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node); +/* This include will go away once we isolated irq_desc usage to core code */ +#include /* * Pick up the arch-dependent methods: */ #include +struct irqaction; extern int setup_irq(unsigned int irq, struct irqaction *new); extern void remove_irq(unsigned int irq, struct irqaction *act); #ifdef CONFIG_GENERIC_HARDIRQS #ifdef CONFIG_SMP - -#ifdef CONFIG_GENERIC_PENDING_IRQ - +# ifdef CONFIG_GENERIC_PENDING_IRQ void move_native_irq(int irq); void move_masked_irq(int irq); - -#else /* CONFIG_GENERIC_PENDING_IRQ */ - -static inline void move_irq(int irq) -{ -} - -static inline void move_native_irq(int irq) -{ -} - -static inline void move_masked_irq(int irq) -{ -} - -#endif /* CONFIG_GENERIC_PENDING_IRQ */ - -#else /* CONFIG_SMP */ - -#define move_native_irq(x) -#define move_masked_irq(x) - -#endif /* CONFIG_SMP */ +# else +static inline void move_irq(int irq) { } +static inline void move_native_irq(int irq) { } +static inline void move_masked_irq(int irq) { } +# endif +#else +static inline void move_native_irq(int irq) { } +static inline void move_masked_irq(int irq) { } +#endif extern int no_irq_affinity; -static inline int irq_balancing_disabled(unsigned int irq) -{ - struct irq_desc *desc; - - desc = irq_to_desc(irq); - return desc->status & IRQ_NO_BALANCING_MASK; -} - /* Handle irq action chains: */ extern irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action); @@ -363,42 +246,10 @@ extern void handle_percpu_irq(unsigned int irq, struct irq_desc *desc); extern void handle_bad_irq(unsigned int irq, struct irq_desc *desc); extern void handle_nested_irq(unsigned int irq); -/* - * Monolithic do_IRQ implementation. - */ -#ifndef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ -extern unsigned int __do_IRQ(unsigned int irq); -#endif - -/* - * Architectures call this to let the generic IRQ layer - * handle an interrupt. If the descriptor is attached to an - * irqchip-style controller then we call the ->handle_irq() handler, - * and it calls __do_IRQ() if it's attached to an irqtype-style controller. - */ -static inline void generic_handle_irq_desc(unsigned int irq, struct irq_desc *desc) -{ -#ifdef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ - desc->handle_irq(irq, desc); -#else - if (likely(desc->handle_irq)) - desc->handle_irq(irq, desc); - else - __do_IRQ(irq); -#endif -} - -static inline void generic_handle_irq(unsigned int irq) -{ - generic_handle_irq_desc(irq, irq_to_desc(irq)); -} - /* Handling of unhandled and spurious interrupts: */ extern void note_interrupt(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret); -/* Resending of interrupts :*/ -void check_irq_resend(struct irq_desc *desc, unsigned int irq); /* Enable/disable irq debugging output: */ extern int noirqdebug_setup(char *str); @@ -421,16 +272,6 @@ extern void __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, const char *name); -/* caller has locked the irq_desc and both params are valid */ -static inline void __set_irq_handler_unlocked(int irq, - irq_flow_handler_t handler) -{ - struct irq_desc *desc; - - desc = irq_to_desc(irq); - desc->handle_irq = handler; -} - /* * Set a highlevel flow handler for a given IRQ: */ @@ -462,13 +303,6 @@ extern unsigned int create_irq_nr(unsigned int irq_want, int node); extern int create_irq(void); extern void destroy_irq(unsigned int irq); -/* Test to see if a driver has successfully requested an irq */ -static inline int irq_has_action(unsigned int irq) -{ - struct irq_desc *desc = irq_to_desc(irq); - return desc->action != NULL; -} - /* Dynamic irq helper functions */ extern void dynamic_irq_init(unsigned int irq); void dynamic_irq_init_keep_chip_data(unsigned int irq); @@ -487,108 +321,8 @@ extern int set_irq_msi(unsigned int irq, struct msi_desc *entry); #define get_irq_data(irq) (irq_to_desc(irq)->irq_data.handler_data) #define get_irq_msi(irq) (irq_to_desc(irq)->irq_data.msi_desc) -#define get_irq_desc_chip(desc) ((desc)->irq_data.chip) -#define get_irq_desc_chip_data(desc) ((desc)->irq_data.chip_data) -#define get_irq_desc_data(desc) ((desc)->irq_data.handler_data) -#define get_irq_desc_msi(desc) ((desc)->irq_data.msi_desc) - #endif /* CONFIG_GENERIC_HARDIRQS */ #endif /* !CONFIG_S390 */ -#ifdef CONFIG_SMP -/** - * alloc_desc_masks - allocate cpumasks for irq_desc - * @desc: pointer to irq_desc struct - * @node: node which will be handling the cpumasks - * @boot: true if need bootmem - * - * Allocates affinity and pending_mask cpumask if required. - * Returns true if successful (or not required). - */ -static inline bool alloc_desc_masks(struct irq_desc *desc, int node, - bool boot) -{ - gfp_t gfp = GFP_ATOMIC; - - if (boot) - gfp = GFP_NOWAIT; - -#ifdef CONFIG_CPUMASK_OFFSTACK - if (!alloc_cpumask_var_node(&desc->irq_data.affinity, gfp, node)) - return false; - -#ifdef CONFIG_GENERIC_PENDING_IRQ - if (!alloc_cpumask_var_node(&desc->pending_mask, gfp, node)) { - free_cpumask_var(desc->irq_data.affinity); - return false; - } -#endif -#endif - return true; -} - -static inline void init_desc_masks(struct irq_desc *desc) -{ - cpumask_setall(desc->irq_data.affinity); -#ifdef CONFIG_GENERIC_PENDING_IRQ - cpumask_clear(desc->pending_mask); -#endif -} - -/** - * init_copy_desc_masks - copy cpumasks for irq_desc - * @old_desc: pointer to old irq_desc struct - * @new_desc: pointer to new irq_desc struct - * - * Insures affinity and pending_masks are copied to new irq_desc. - * If !CONFIG_CPUMASKS_OFFSTACK the cpumasks are embedded in the - * irq_desc struct so the copy is redundant. - */ - -static inline void init_copy_desc_masks(struct irq_desc *old_desc, - struct irq_desc *new_desc) -{ -#ifdef CONFIG_CPUMASK_OFFSTACK - cpumask_copy(new_desc->irq_data.affinity, old_desc->irq_data.affinity); - -#ifdef CONFIG_GENERIC_PENDING_IRQ - cpumask_copy(new_desc->pending_mask, old_desc->pending_mask); -#endif -#endif -} - -static inline void free_desc_masks(struct irq_desc *old_desc, - struct irq_desc *new_desc) -{ - free_cpumask_var(old_desc->irq_data.affinity); - -#ifdef CONFIG_GENERIC_PENDING_IRQ - free_cpumask_var(old_desc->pending_mask); -#endif -} - -#else /* !CONFIG_SMP */ - -static inline bool alloc_desc_masks(struct irq_desc *desc, int node, - bool boot) -{ - return true; -} - -static inline void init_desc_masks(struct irq_desc *desc) -{ -} - -static inline void init_copy_desc_masks(struct irq_desc *old_desc, - struct irq_desc *new_desc) -{ -} - -static inline void free_desc_masks(struct irq_desc *old_desc, - struct irq_desc *new_desc) -{ -} -#endif /* CONFIG_SMP */ - #endif /* _LINUX_IRQ_H */ diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h new file mode 100644 index 000000000000..22e426fdd301 --- /dev/null +++ b/include/linux/irqdesc.h @@ -0,0 +1,171 @@ +#ifndef _LINUX_IRQDESC_H +#define _LINUX_IRQDESC_H + +/* + * Core internal functions to deal with irq descriptors + * + * This include will move to kernel/irq once we cleaned up the tree. + * For now it's included from + */ + +struct proc_dir_entry; +struct timer_rand_state; +struct irq_2_iommu; +/** + * struct irq_desc - interrupt descriptor + * @irq_data: per irq and chip data passed down to chip functions + * @timer_rand_state: pointer to timer rand state struct + * @kstat_irqs: irq stats per cpu + * @handle_irq: highlevel irq-events handler [if NULL, __do_IRQ()] + * @action: the irq action chain + * @status: status information + * @depth: disable-depth, for nested irq_disable() calls + * @wake_depth: enable depth, for multiple set_irq_wake() callers + * @irq_count: stats field to detect stalled irqs + * @last_unhandled: aging timer for unhandled count + * @irqs_unhandled: stats field for spurious unhandled interrupts + * @lock: locking for SMP + * @pending_mask: pending rebalanced interrupts + * @threads_active: number of irqaction threads currently running + * @wait_for_threads: wait queue for sync_irq to wait for threaded handlers + * @dir: /proc/irq/ procfs entry + * @name: flow handler name for /proc/interrupts output + */ +struct irq_desc { + +#ifdef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED + struct irq_data irq_data; +#else + /* + * This union will go away, once we fixed the direct access to + * irq_desc all over the place. The direct fields are a 1:1 + * overlay of irq_data. + */ + union { + struct irq_data irq_data; + struct { + unsigned int irq; + unsigned int node; + struct irq_chip *chip; + void *handler_data; + void *chip_data; + struct msi_desc *msi_desc; +#ifdef CONFIG_SMP + cpumask_var_t affinity; +#endif +#ifdef CONFIG_INTR_REMAP + struct irq_2_iommu *irq_2_iommu; +#endif + }; + }; +#endif + + struct timer_rand_state *timer_rand_state; + unsigned int *kstat_irqs; + irq_flow_handler_t handle_irq; + struct irqaction *action; /* IRQ action list */ + unsigned int status; /* IRQ status */ + + unsigned int depth; /* nested irq disables */ + unsigned int wake_depth; /* nested wake enables */ + unsigned int irq_count; /* For detecting broken IRQs */ + unsigned long last_unhandled; /* Aging timer for unhandled count */ + unsigned int irqs_unhandled; + raw_spinlock_t lock; +#ifdef CONFIG_SMP + const struct cpumask *affinity_hint; +#ifdef CONFIG_GENERIC_PENDING_IRQ + cpumask_var_t pending_mask; +#endif +#endif + atomic_t threads_active; + wait_queue_head_t wait_for_threads; +#ifdef CONFIG_PROC_FS + struct proc_dir_entry *dir; +#endif + const char *name; +} ____cacheline_internodealigned_in_smp; + +extern void arch_init_copy_chip_data(struct irq_desc *old_desc, + struct irq_desc *desc, int node); +extern void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc); + +#ifndef CONFIG_SPARSE_IRQ +extern struct irq_desc irq_desc[NR_IRQS]; +#endif + +#ifdef CONFIG_NUMA_IRQ_DESC +extern struct irq_desc *move_irq_desc(struct irq_desc *old_desc, int node); +#else +static inline struct irq_desc *move_irq_desc(struct irq_desc *desc, int node) +{ + return desc; +} +#endif + +extern struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node); + +#ifdef CONFIG_GENERIC_HARDIRQS + +#define get_irq_desc_chip(desc) ((desc)->irq_data.chip) +#define get_irq_desc_chip_data(desc) ((desc)->irq_data.chip_data) +#define get_irq_desc_data(desc) ((desc)->irq_data.handler_data) +#define get_irq_desc_msi(desc) ((desc)->irq_data.msi_desc) + +/* + * Monolithic do_IRQ implementation. + */ +#ifndef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ +extern unsigned int __do_IRQ(unsigned int irq); +#endif + +/* + * Architectures call this to let the generic IRQ layer + * handle an interrupt. If the descriptor is attached to an + * irqchip-style controller then we call the ->handle_irq() handler, + * and it calls __do_IRQ() if it's attached to an irqtype-style controller. + */ +static inline void generic_handle_irq_desc(unsigned int irq, struct irq_desc *desc) +{ +#ifdef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ + desc->handle_irq(irq, desc); +#else + if (likely(desc->handle_irq)) + desc->handle_irq(irq, desc); + else + __do_IRQ(irq); +#endif +} + +static inline void generic_handle_irq(unsigned int irq) +{ + generic_handle_irq_desc(irq, irq_to_desc(irq)); +} + +/* Test to see if a driver has successfully requested an irq */ +static inline int irq_has_action(unsigned int irq) +{ + struct irq_desc *desc = irq_to_desc(irq); + return desc->action != NULL; +} + +static inline int irq_balancing_disabled(unsigned int irq) +{ + struct irq_desc *desc; + + desc = irq_to_desc(irq); + return desc->status & IRQ_NO_BALANCING_MASK; +} + +/* caller has locked the irq_desc and both params are valid */ +static inline void __set_irq_handler_unlocked(int irq, + irq_flow_handler_t handler) +{ + struct irq_desc *desc; + + desc = irq_to_desc(irq); + desc->handle_irq = handler; +} +#endif + +#endif -- cgit v1.2.3 From 3a3856d00c74560a7b8d9f8a13c1ca94ee786b78 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Oct 2010 13:47:12 +0200 Subject: genirq: Remove unsused inline move_irq() has no users. Remove it and simplify the ifdef forrest while at it. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- include/linux/irq.h | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index f5827abbc034..80fdab208c13 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -215,15 +215,9 @@ extern void remove_irq(unsigned int irq, struct irqaction *act); #ifdef CONFIG_GENERIC_HARDIRQS -#ifdef CONFIG_SMP -# ifdef CONFIG_GENERIC_PENDING_IRQ +#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_PENDING_IRQ) void move_native_irq(int irq); void move_masked_irq(int irq); -# else -static inline void move_irq(int irq) { } -static inline void move_native_irq(int irq) { } -static inline void move_masked_irq(int irq) { } -# endif #else static inline void move_native_irq(int irq) { } static inline void move_masked_irq(int irq) { } -- cgit v1.2.3 From 442471848f5abb55b99cba1229301655f67492b4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 Sep 2010 10:40:18 +0200 Subject: genirq: Provide status modifier Provide a irq_desc.status modifier function to cleanup the direct access to irq_desc in arch and driver code. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- include/linux/irq.h | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index 80fdab208c13..e7e7ac83edd8 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -72,6 +72,10 @@ typedef void (*irq_flow_handler_t)(unsigned int irq, #define IRQ_ONESHOT 0x08000000 /* IRQ is not unmasked after hardirq */ #define IRQ_NESTED_THREAD 0x10000000 /* IRQ is nested into another, no own handler thread */ +#define IRQF_MODIFY_MASK \ + (IRQ_TYPE_SENSE_MASK | IRQ_NOPROBE | IRQ_NOREQUEST | \ + IRQ_NOAUTOEN | IRQ_MOVE_PCNTXT | IRQ_LEVEL) + #ifdef CONFIG_IRQ_PER_CPU # define CHECK_IRQ_PER_CPU(var) ((var) & IRQ_PER_CPU) # define IRQ_NO_BALANCING_MASK (IRQ_PER_CPU | IRQ_NO_BALANCING) @@ -289,8 +293,27 @@ set_irq_chained_handler(unsigned int irq, extern void set_irq_nested_thread(unsigned int irq, int nest); -extern void set_irq_noprobe(unsigned int irq); -extern void set_irq_probe(unsigned int irq); +void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set); + +static inline void irq_set_status_flags(unsigned int irq, unsigned long set) +{ + irq_modify_status(irq, 0, set); +} + +static inline void irq_clear_status_flags(unsigned int irq, unsigned long clr) +{ + irq_modify_status(irq, clr, 0); +} + +static inline void set_irq_noprobe(unsigned int irq) +{ + irq_modify_status(irq, 0, IRQ_NOPROBE); +} + +static inline void set_irq_probe(unsigned int irq) +{ + irq_modify_status(irq, IRQ_NOPROBE, 0); +} /* Handle dynamic irq creation and destruction */ extern unsigned int create_irq_nr(unsigned int irq_want, int node); -- cgit v1.2.3 From f303a6dd127b5ec6de90d1cd79ed19820c7e9658 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 Sep 2010 17:34:01 +0200 Subject: genirq: Sanitize irq_data accessors Get the data structure from the core and provide inline wrappers to access the irq_data members. Provide accessor inlines for irq_data as well. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- include/linux/irq.h | 62 +++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 58 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index e7e7ac83edd8..bea40556c5a6 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -85,6 +85,7 @@ typedef void (*irq_flow_handler_t)(unsigned int irq, #endif struct msi_desc; +struct irq_2_iommu; /** * struct irq_data - per irq and irq chip data passed down to chip functions @@ -332,11 +333,64 @@ extern int set_irq_data(unsigned int irq, void *data); extern int set_irq_chip_data(unsigned int irq, void *data); extern int set_irq_type(unsigned int irq, unsigned int type); extern int set_irq_msi(unsigned int irq, struct msi_desc *entry); +extern struct irq_data *irq_get_irq_data(unsigned int irq); -#define get_irq_chip(irq) (irq_to_desc(irq)->irq_data.chip) -#define get_irq_chip_data(irq) (irq_to_desc(irq)->irq_data.chip_data) -#define get_irq_data(irq) (irq_to_desc(irq)->irq_data.handler_data) -#define get_irq_msi(irq) (irq_to_desc(irq)->irq_data.msi_desc) +static inline struct irq_chip *get_irq_chip(unsigned int irq) +{ + struct irq_data *d = irq_get_irq_data(irq); + return d ? d->chip : NULL; +} + +static inline struct irq_chip *irq_data_get_irq_chip(struct irq_data *d) +{ + return d->chip; +} + +static inline void *get_irq_chip_data(unsigned int irq) +{ + struct irq_data *d = irq_get_irq_data(irq); + return d ? d->chip_data : NULL; +} + +static inline void *irq_data_get_irq_chip_data(struct irq_data *d) +{ + return d->chip_data; +} + +static inline void *get_irq_data(unsigned int irq) +{ + struct irq_data *d = irq_get_irq_data(irq); + return d ? d->handler_data : NULL; +} + +static inline void *irq_data_get_irq_data(struct irq_data *d) +{ + return d->handler_data; +} + +static inline struct msi_desc *get_irq_msi(unsigned int irq) +{ + struct irq_data *d = irq_get_irq_data(irq); + return d ? d->msi_desc : NULL; +} + +static inline struct msi_desc *irq_data_get_msi(struct irq_data *d) +{ + return d->msi_desc; +} + +#ifdef CONFIG_INTR_REMAP +static inline struct irq_2_iommu *get_irq_iommu(unsigned int irq) +{ + struct irq_data *d = irq_get_irq_data(irq); + return d ? d->irq_2_iommu : NULL; +} + +static inline struct irq_2_iommu *irq_data_get_iommu(struct irq_data *d) +{ + return d->irq_2_iommu; +} +#endif #endif /* CONFIG_GENERIC_HARDIRQS */ -- cgit v1.2.3 From 154cd387cdf0e5566ce523cbddf92dd2a062dfd6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 22 Sep 2010 15:58:45 +0200 Subject: genirq: Remove early_init_irq_lock_class() early_init_irq_lock_class() is called way before anything touches the irq descriptors. In case of SPARSE_IRQ=y this is a NOP operation because the radix tree is empty at this point. For the SPARSE_IRQ=n case it's sufficient to set the lock class in early_init_irq(). Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- include/linux/lockdep.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 06aed8305bf3..17d050ce7ab8 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -424,14 +424,6 @@ do { \ #endif /* CONFIG_LOCKDEP */ -#ifdef CONFIG_GENERIC_HARDIRQS -extern void early_init_irq_lock_class(void); -#else -static inline void early_init_irq_lock_class(void) -{ -} -#endif - #ifdef CONFIG_TRACE_IRQFLAGS extern void early_boot_irqs_off(void); extern void early_boot_irqs_on(void); -- cgit v1.2.3 From 1318a481fc37c503a901b96ae06b692ca2b21af5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Sep 2010 21:01:37 +0200 Subject: genirq: Provide default irq init flags Arch code sets it's own irq_desc.status flags right after boot and for dynamically allocated interrupts. That might involve iterating over a huge array. Allow ARCH_IRQ_INIT_FLAGS to set separate flags aside of IRQ_DISABLED which is the default. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- include/linux/irq.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index bea40556c5a6..30a300991ed4 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -214,6 +214,12 @@ struct irq_chip { */ #include +#ifndef ARCH_IRQ_INIT_FLAGS +# define ARCH_IRQ_INIT_FLAGS 0 +#endif + +#define IRQ_DEFAULT_INIT_FLAGS (IRQ_DISABLED | ARCH_IRQ_INIT_FLAGS) + struct irqaction; extern int setup_irq(unsigned int irq, struct irqaction *new); extern void remove_irq(unsigned int irq, struct irqaction *act); -- cgit v1.2.3 From 1f5a5b87f78fade3ae48dfd55e8765d1d622ea4e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Sep 2010 17:48:26 +0200 Subject: genirq: Implement a sane sparse_irq allocator The current sparse_irq allocator has several short comings due to failures in the design or the lack of it: - Requires iteration over the number of active irqs to find a free slot (Some architectures have grown their own workarounds for this) - Removal of entries is not possible - Racy between create_irq_nr and destroy_irq (plugged by horrible callbacks) - Migration of active irq descriptors is not possible - No bulk allocation of irq ranges - Sprinkeled irq_desc references all over the place outside of kernel/irq/ (The previous chip functions series is addressing this issue) Implement a sane allocator which fixes the above short comings (though migration of active descriptors needs a full tree wide cleanup of the direct and mostly unlocked access to irq_desc). The new allocator still uses a radix_tree, but uses a bitmap for keeping track of allocated irq numbers. That allows: - Fast lookup of a free slot - Allows the removal of descriptors - Prevents the create/destroy race - Bulk allocation of consecutive irq ranges - Basic design is ready for migration of life descriptors after further cleanups The bitmap is also used in the SPARSE_IRQ=n case for lookup and raceless (de)allocation of irq numbers. So it removes the requirement for looping through the descriptor array to find slots. Right now it uses sparse_irq_lock to protect the bitmap and the radix tree, but after cleaning up all users we should be able convert that to a mutex and to switch the radix_tree and decriptor allocations to GFP_KERNEL. [ Folded in a bugfix from Yinghai Lu ] Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- include/linux/irq.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index 30a300991ed4..cefacf928b33 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -398,6 +398,29 @@ static inline struct irq_2_iommu *irq_data_get_iommu(struct irq_data *d) } #endif +int irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node); +void irq_free_descs(unsigned int irq, unsigned int cnt); + +static inline int irq_alloc_desc(int node) +{ + return irq_alloc_descs(-1, 0, 1, node); +} + +static inline int irq_alloc_desc_at(unsigned int at, int node) +{ + return irq_alloc_descs(at, at, 1, node); +} + +static inline int irq_alloc_desc_from(unsigned int from, int node) +{ + return irq_alloc_descs(-1, from, 1, node); +} + +static inline void irq_free_desc(unsigned int irq) +{ + irq_free_descs(irq, 1); +} + #endif /* CONFIG_GENERIC_HARDIRQS */ #endif /* !CONFIG_S390 */ -- cgit v1.2.3 From a98d24b71b6e229965f18dc00d28dc71cb8fe324 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 30 Sep 2010 10:45:07 +0200 Subject: genirq: Implement sane enumeration Use the allocator bitmap to lookup active interrupts. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- include/linux/irqnr.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irqnr.h b/include/linux/irqnr.h index 7bf89bc8cbca..05aa8c23483f 100644 --- a/include/linux/irqnr.h +++ b/include/linux/irqnr.h @@ -25,6 +25,7 @@ extern int nr_irqs; extern struct irq_desc *irq_to_desc(unsigned int irq); +unsigned int irq_get_next_irq(unsigned int offset); # define for_each_irq_desc(irq, desc) \ for (irq = 0, desc = irq_to_desc(irq); irq < nr_irqs; \ @@ -47,6 +48,10 @@ extern struct irq_desc *irq_to_desc(unsigned int irq); #define irq_node(irq) 0 #endif +# define for_each_active_irq(irq) \ + for (irq = irq_get_next_irq(0); irq < nr_irqs; \ + irq = irq_get_next_irq(irq + 1)) + #endif /* CONFIG_GENERIC_HARDIRQS */ #define for_each_irq_nr(irq) \ -- cgit v1.2.3 From 06f6c3399e9f9ff6eafc200e80f9226c3cee0eaf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Oct 2010 12:31:46 +0200 Subject: genirq: Implement irq reservation Mark a range of interrupts as allocated. In the SPARSE_IRQ=n case we need this to update the bitmap for the legacy irqs so the enumerator via irq_get_next_irq() works. Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index cefacf928b33..096b74d5d0d7 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -400,6 +400,7 @@ static inline struct irq_2_iommu *irq_data_get_iommu(struct irq_data *d) int irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node); void irq_free_descs(unsigned int irq, unsigned int cnt); +int irq_reserve_irqs(unsigned int from, unsigned int cnt); static inline int irq_alloc_desc(int node) { -- cgit v1.2.3 From b683de2b3cb17bb10fa6fd4af614dc75b5749fe0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Sep 2010 20:55:03 +0200 Subject: genirq: Query arch for number of early descriptors sparse irq sets up NR_IRQS_LEGACY irq descriptors and archs then go ahead and allocate more. Use the unused return value of arch_probe_nr_irqs() to let the architecture return the number of early allocations. Fix up all users. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- include/linux/irq.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index 096b74d5d0d7..ef878823ee3b 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -214,6 +214,10 @@ struct irq_chip { */ #include +#ifndef NR_IRQS_LEGACY +# define NR_IRQS_LEGACY 0 +#endif + #ifndef ARCH_IRQ_INIT_FLAGS # define ARCH_IRQ_INIT_FLAGS 0 #endif -- cgit v1.2.3 From 1c9db52534a2c0e9776788cd34ccc193289fc18c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 Sep 2010 16:46:51 +0200 Subject: pci: Convert msi to new irq_chip functions Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Acked-by: Jesse Barnes Cc: Benjamin Herrenschmidt Cc: "David S. Miller" Cc: Tony Luck Cc: Russell King --- include/linux/msi.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index 91b05c171854..329d17c395a7 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -11,8 +11,9 @@ struct msi_msg { /* Helper functions */ struct irq_desc; -extern void mask_msi_irq(unsigned int irq); -extern void unmask_msi_irq(unsigned int irq); +struct irq_data; +extern void mask_msi_irq(struct irq_data *data); +extern void unmask_msi_irq(struct irq_data *data); extern void read_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg); extern void get_cached_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg); extern void write_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg); -- cgit v1.2.3 From 39431acb1a4c464e62471cb3058b8ffffb9244db Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 Sep 2010 19:09:51 +0200 Subject: pci: Cleanup the irq_desc mess in msi Handing down irq_desc to msi just so that msi can access irq_desc.irq_data.msi_desc is a pretty stupid idea. The calling code can hand down a pointer to msi_desc so msi code does not need to know about the irq descriptor at all. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Acked-by: Jesse Barnes --- include/linux/msi.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index 329d17c395a7..05acced439a3 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -10,13 +10,13 @@ struct msi_msg { }; /* Helper functions */ -struct irq_desc; struct irq_data; +struct msi_desc; extern void mask_msi_irq(struct irq_data *data); extern void unmask_msi_irq(struct irq_data *data); -extern void read_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg); -extern void get_cached_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg); -extern void write_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg); +extern void __read_msi_msg(struct msi_desc *entry, struct msi_msg *msg); +extern void __get_cached_msi_msg(struct msi_desc *entry, struct msi_msg *msg); +extern void __write_msi_msg(struct msi_desc *entry, struct msi_msg *msg); extern void read_msi_msg(unsigned int irq, struct msi_msg *msg); extern void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg); extern void write_msi_msg(unsigned int irq, struct msi_msg *msg); -- cgit v1.2.3 From 5c2837fbaa609e615ef9a1c58a4cd26ce90be35b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 Sep 2010 17:15:11 +0200 Subject: dmar: Convert to new irq chip functions Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Acked-by: David Woodhouse --- include/linux/dmar.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dmar.h b/include/linux/dmar.h index d7cecc90ed34..cb86aa1ca436 100644 --- a/include/linux/dmar.h +++ b/include/linux/dmar.h @@ -187,8 +187,9 @@ static inline int set_msi_sid(struct irte *irte, struct pci_dev *dev) /* Can't use the common MSI interrupt functions * since DMAR is not a pci device */ -extern void dmar_msi_unmask(unsigned int irq); -extern void dmar_msi_mask(unsigned int irq); +struct irq_data; +extern void dmar_msi_unmask(struct irq_data *data); +extern void dmar_msi_mask(struct irq_data *data); extern void dmar_msi_read(int irq, struct msi_msg *msg); extern void dmar_msi_write(int irq, struct msi_msg *msg); extern int dmar_set_interrupt(struct intel_iommu *iommu); -- cgit v1.2.3 From e9f7ac664bfc36685a8eb3315ec21c067d0cee36 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 Sep 2010 17:22:09 +0200 Subject: ht: Convert to new irq_chip functions Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Jesse Barnes --- include/linux/htirq.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/htirq.h b/include/linux/htirq.h index c96ea46737d0..70a1dbbf2093 100644 --- a/include/linux/htirq.h +++ b/include/linux/htirq.h @@ -9,8 +9,9 @@ struct ht_irq_msg { /* Helper functions.. */ void fetch_ht_irq_msg(unsigned int irq, struct ht_irq_msg *msg); void write_ht_irq_msg(unsigned int irq, struct ht_irq_msg *msg); -void mask_ht_irq(unsigned int irq); -void unmask_ht_irq(unsigned int irq); +struct irq_data; +void mask_ht_irq(struct irq_data *data); +void unmask_ht_irq(struct irq_data *data); /* The arch hook for getting things started */ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev); -- cgit v1.2.3 From d0ad63927c6d4d511e172c78ba4a623539ef6901 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Oct 2010 18:41:37 +0200 Subject: pci: intr_remap: Remove unused functions No users. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Acked-by: Suresh Siddha Cc: David Woodhouse Cc: Jesse Barnes --- include/linux/dmar.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dmar.h b/include/linux/dmar.h index cb86aa1ca436..200439ec7c49 100644 --- a/include/linux/dmar.h +++ b/include/linux/dmar.h @@ -119,8 +119,6 @@ extern int alloc_irte(struct intel_iommu *iommu, int irq, u16 count); extern int set_irte_irq(int irq, struct intel_iommu *iommu, u16 index, u16 sub_handle); extern int map_irq_to_irte_handle(int irq, u16 *sub_handle); -extern int clear_irte_irq(int irq, struct intel_iommu *iommu, u16 index); -extern int flush_irte(int irq); extern int free_irte(int irq); extern int irq_remapped(int irq); -- cgit v1.2.3 From 423f085952fd7253407cb92984cc2d495a564481 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 10 Oct 2010 11:39:09 +0200 Subject: x86: Embedd irq_2_iommu into irq_cfg That interrupt remapping code is x86 specific and tied to the io_apic code. No need for separate allocator functions in the interrupt remapping code. This allows to simplify the code and irq_2_iommu is small (13 bytes on 64bit) so it's not a real problem even if interrupt remapping is runtime disabled. If it's compile time disabled the impact is zero. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Acked-by: Suresh Siddha Cc: David Woodhouse Cc: Jesse Barnes --- include/linux/dmar.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/dmar.h b/include/linux/dmar.h index 200439ec7c49..4475f8cf7a62 100644 --- a/include/linux/dmar.h +++ b/include/linux/dmar.h @@ -106,6 +106,7 @@ struct irte { __u64 high; }; }; + #ifdef CONFIG_INTR_REMAP extern int intr_remapping_enabled; extern int intr_remapping_supported(void); -- cgit v1.2.3 From 1a0730d6649113c820217387a011a17dd4aff3ad Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 11 Oct 2010 11:55:37 +0200 Subject: x86: Speed up the irq_remapped check in hot pathes irq_2_iommu is in struct irq_cfg, so we can do the irq_remapped check based on irq_cfg instead of going through a lookup function. That's especially interesting in the eoi_ioapic_irq() hotpath. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Acked-by: Suresh Siddha Cc: David Woodhouse Cc: Jesse Barnes --- include/linux/dmar.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dmar.h b/include/linux/dmar.h index 4475f8cf7a62..51651b76d40f 100644 --- a/include/linux/dmar.h +++ b/include/linux/dmar.h @@ -122,7 +122,6 @@ extern int set_irte_irq(int irq, struct intel_iommu *iommu, u16 index, extern int map_irq_to_irte_handle(int irq, u16 *sub_handle); extern int free_irte(int irq); -extern int irq_remapped(int irq); extern struct intel_iommu *map_dev_to_ir(struct pci_dev *dev); extern struct intel_iommu *map_ioapic_to_ir(int apic); extern struct intel_iommu *map_hpet_to_ir(u8 id); @@ -176,7 +175,6 @@ static inline int set_msi_sid(struct irte *irte, struct pci_dev *dev) return 0; } -#define irq_remapped(irq) (0) #define enable_intr_remapping(mode) (-1) #define disable_intr_remapping() (0) #define reenable_intr_remapping(mode) (0) -- cgit v1.2.3 From 10ba1e0eeef6a3c9453d96364e28cb4d911e1ac3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 11 Oct 2010 12:21:18 +0200 Subject: genirq: Remove irq_2_iommu irq_2_iommu is now in the x86 code where it belongs. Remove all leftovers. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Acked-by: Suresh Siddha Cc: David Woodhouse Cc: Jesse Barnes --- include/linux/irq.h | 18 ------------------ include/linux/irqdesc.h | 4 ---- 2 files changed, 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index ef878823ee3b..49702b22883e 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -85,7 +85,6 @@ typedef void (*irq_flow_handler_t)(unsigned int irq, #endif struct msi_desc; -struct irq_2_iommu; /** * struct irq_data - per irq and irq chip data passed down to chip functions @@ -97,7 +96,6 @@ struct irq_2_iommu; * methods, to allow shared chip implementations * @msi_desc: MSI descriptor * @affinity: IRQ affinity on SMP - * @irq_2_iommu: iommu with this irq * * The fields here need to overlay the ones in irq_desc until we * cleaned up the direct references and switched everything over to @@ -113,9 +111,6 @@ struct irq_data { #ifdef CONFIG_SMP cpumask_var_t affinity; #endif -#ifdef CONFIG_INTR_REMAP - struct irq_2_iommu *irq_2_iommu; -#endif }; /** @@ -389,19 +384,6 @@ static inline struct msi_desc *irq_data_get_msi(struct irq_data *d) return d->msi_desc; } -#ifdef CONFIG_INTR_REMAP -static inline struct irq_2_iommu *get_irq_iommu(unsigned int irq) -{ - struct irq_data *d = irq_get_irq_data(irq); - return d ? d->irq_2_iommu : NULL; -} - -static inline struct irq_2_iommu *irq_data_get_iommu(struct irq_data *d) -{ - return d->irq_2_iommu; -} -#endif - int irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node); void irq_free_descs(unsigned int irq, unsigned int cnt); int irq_reserve_irqs(unsigned int from, unsigned int cnt); diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index 22e426fdd301..f77dc5618d7e 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -10,7 +10,6 @@ struct proc_dir_entry; struct timer_rand_state; -struct irq_2_iommu; /** * struct irq_desc - interrupt descriptor * @irq_data: per irq and chip data passed down to chip functions @@ -52,9 +51,6 @@ struct irq_desc { struct msi_desc *msi_desc; #ifdef CONFIG_SMP cpumask_var_t affinity; -#endif -#ifdef CONFIG_INTR_REMAP - struct irq_2_iommu *irq_2_iommu; #endif }; }; -- cgit v1.2.3 From b7d0d8258a9f71949b810e0f82a3d75088f4d364 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 29 Sep 2010 18:44:23 +0200 Subject: genirq: Remove arch_init_chip_data() This function should have not been there in the first place. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- include/linux/interrupt.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index a0384a4d1e6f..19988983aeac 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -641,11 +641,8 @@ static inline void init_irq_proc(void) struct seq_file; int show_interrupts(struct seq_file *p, void *v); -struct irq_desc; - extern int early_irq_init(void); extern int arch_probe_nr_irqs(void); extern int arch_early_irq_init(void); -extern int arch_init_chip_data(struct irq_desc *desc, int node); #endif -- cgit v1.2.3 From b7b29338dc7111ed8bd4d6555d84afae13ebe752 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 29 Sep 2010 18:46:55 +0200 Subject: genirq: Sanitize dynamic irq handling Use the cleanup functions of the dynamic allocator. No need to have separate implementations. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- include/linux/irq.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index 49702b22883e..e9639115dff1 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -326,11 +326,15 @@ extern unsigned int create_irq_nr(unsigned int irq_want, int node); extern int create_irq(void); extern void destroy_irq(unsigned int irq); -/* Dynamic irq helper functions */ -extern void dynamic_irq_init(unsigned int irq); -void dynamic_irq_init_keep_chip_data(unsigned int irq); +/* + * Dynamic irq helper functions. Obsolete. Use irq_alloc_desc* and + * irq_free_desc instead. + */ extern void dynamic_irq_cleanup(unsigned int irq); -void dynamic_irq_cleanup_keep_chip_data(unsigned int irq); +static inline void dynamic_irq_init(unsigned int irq) +{ + dynamic_irq_cleanup(irq); +} /* Set/get chip/data for an IRQ: */ extern int set_irq_chip(unsigned int irq, struct irq_chip *chip); -- cgit v1.2.3 From 78f90d91f395cd0dc1ef3f21e0c5cd6fd50d202c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 29 Sep 2010 17:18:47 +0200 Subject: genirq: Remove the now unused sparse irq leftovers The move_irq_desc() function was only used due to the problem that the allocator did not free the old descriptors. So the descriptors had to be moved in create_irq_nr(). That's history. The code would have never been able to move active interrupt descriptors on affinity settings. That can be done in a completely different way w/o all this horror. Remove all of it. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- include/linux/irqdesc.h | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index f77dc5618d7e..979c68cc7458 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -82,24 +82,16 @@ struct irq_desc { const char *name; } ____cacheline_internodealigned_in_smp; -extern void arch_init_copy_chip_data(struct irq_desc *old_desc, - struct irq_desc *desc, int node); -extern void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc); - #ifndef CONFIG_SPARSE_IRQ extern struct irq_desc irq_desc[NR_IRQS]; #endif -#ifdef CONFIG_NUMA_IRQ_DESC -extern struct irq_desc *move_irq_desc(struct irq_desc *old_desc, int node); -#else +/* Will be removed once the last users in power and sh are gone */ +extern struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node); static inline struct irq_desc *move_irq_desc(struct irq_desc *desc, int node) { return desc; } -#endif - -extern struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node); #ifdef CONFIG_GENERIC_HARDIRQS -- cgit v1.2.3 From 40ffa93791985ab300fd488072e9f37ccf72e88c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2010 21:08:14 +0200 Subject: x86: Remove stale pmtimer_64.c This file is unused since the apic unification in 2.6.29, but nobody noticed. Signed-off-by: Thomas Gleixner --- include/linux/acpi_pmtmr.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/acpi_pmtmr.h b/include/linux/acpi_pmtmr.h index 7e3d2859be50..1d0ef1ae8036 100644 --- a/include/linux/acpi_pmtmr.h +++ b/include/linux/acpi_pmtmr.h @@ -25,8 +25,6 @@ static inline u32 acpi_pm_read_early(void) return acpi_pm_read_verified() & ACPI_PM_MASK; } -extern void pmtimer_wait(unsigned); - #else static inline u32 acpi_pm_read_early(void) -- cgit v1.2.3 From 620162505e5d46bc4494b1761743e4b0b3bf8e16 Mon Sep 17 00:00:00 2001 From: Hitoshi Mitake Date: Tue, 5 Oct 2010 18:01:51 +0900 Subject: lockdep: Add improved subclass caching Current lockdep_map only caches one class with subclass == 0, and looks up hash table of classes when subclass != 0. It seems that this has no problem because the case of subclass != 0 is rare. But locks of struct rq are acquired with subclass == 1 when task migration is executed. Task migration is high frequent event, so I modified lockdep to cache subclasses. I measured the score of perf bench sched messaging. This patch has slightly but certain (order of milli seconds or 10 milli seconds) effect when lots of tasks are running. I'll show the result in the tail of this description. NR_LOCKDEP_CACHING_CLASSES specifies how many classes can be cached in the instances of lockdep_map. I discussed with Peter Zijlstra in LinuxCon Japan about this approach and he taught me that caching every subclasses(8) is cleary waste of memory. So number of cached classes should be configurable. === Score comparison of benchmarks === # "min" means best score, and "max" means worst score for i in `seq 1 10`; do ./perf bench -f simple sched messaging; done before: min: 0.565000, max: 0.583000, avg: 0.572500 after: min: 0.559000, max: 0.568000, avg: 0.563300 # with more processes for i in `seq 1 10`; do ./perf bench -f simple sched messaging -g 40; done before: min: 2.274000, max: 2.298000, avg: 2.286300 after: min: 2.242000, max: 2.270000, avg: 2.259700 Signed-off-by: Hitoshi Mitake Cc: Frederic Weisbecker Signed-off-by: Peter Zijlstra LKML-Reference: <1286269311-28336-2-git-send-email-mitake@dcl.info.waseda.ac.jp> Signed-off-by: Ingo Molnar --- include/linux/lockdep.h | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 06aed8305bf3..2186a64ee4b5 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -31,6 +31,17 @@ extern int lock_stat; #define MAX_LOCKDEP_SUBCLASSES 8UL +/* + * NR_LOCKDEP_CACHING_CLASSES ... Number of classes + * cached in the instance of lockdep_map + * + * Currently main class (subclass == 0) and signle depth subclass + * are cached in lockdep_map. This optimization is mainly targeting + * on rq->lock. double_rq_lock() acquires this highly competitive with + * single depth. + */ +#define NR_LOCKDEP_CACHING_CLASSES 2 + /* * Lock-classes are keyed via unique addresses, by embedding the * lockclass-key into the kernel (or module) .data section. (For @@ -138,7 +149,7 @@ void clear_lock_stats(struct lock_class *class); */ struct lockdep_map { struct lock_class_key *key; - struct lock_class *class_cache; + struct lock_class *class_cache[NR_LOCKDEP_CACHING_CLASSES]; const char *name; #ifdef CONFIG_LOCK_STAT int cpu; -- cgit v1.2.3 From 75e1056f5c57050415b64cb761a3acc35d91f013 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Mon, 4 Oct 2010 17:03:16 -0700 Subject: sched: Fix softirq time accounting Peter Zijlstra found a bug in the way softirq time is accounted in VIRT_CPU_ACCOUNTING on this thread: http://lkml.indiana.edu/hypermail//linux/kernel/1009.2/01366.html The problem is, softirq processing uses local_bh_disable internally. There is no way, later in the flow, to differentiate between whether softirq is being processed or is it just that bh has been disabled. So, a hardirq when bh is disabled results in time being wrongly accounted as softirq. Looking at the code a bit more, the problem exists in !VIRT_CPU_ACCOUNTING as well. As account_system_time() in normal tick based accouting also uses softirq_count, which will be set even when not in softirq with bh disabled. Peter also suggested solution of using 2*SOFTIRQ_OFFSET as irq count for local_bh_{disable,enable} and using just SOFTIRQ_OFFSET while softirq processing. The patch below does that and adds API in_serving_softirq() which returns whether we are currently processing softirq or not. Also changes one of the usages of softirq_count in net/sched/cls_cgroup.c to in_serving_softirq. Looks like many usages of in_softirq really want in_serving_softirq. Those changes can be made individually on a case by case basis. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Peter Zijlstra LKML-Reference: <1286237003-12406-2-git-send-email-venki@google.com> Signed-off-by: Ingo Molnar --- include/linux/hardirq.h | 5 +++++ include/linux/sched.h | 6 +++--- 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index d5b387669dab..e37a77cbd588 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -64,6 +64,8 @@ #define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT) #define NMI_OFFSET (1UL << NMI_SHIFT) +#define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET) + #ifndef PREEMPT_ACTIVE #define PREEMPT_ACTIVE_BITS 1 #define PREEMPT_ACTIVE_SHIFT (NMI_SHIFT + NMI_BITS) @@ -82,10 +84,13 @@ /* * Are we doing bottom half or hardware interrupt processing? * Are we in a softirq context? Interrupt context? + * in_softirq - Are we currently processing softirq or have bh disabled? + * in_serving_softirq - Are we currently processing softirq? */ #define in_irq() (hardirq_count()) #define in_softirq() (softirq_count()) #define in_interrupt() (irq_count()) +#define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET) /* * Are we in NMI context? diff --git a/include/linux/sched.h b/include/linux/sched.h index cdf56693ecbf..8744e50cb083 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2366,9 +2366,9 @@ extern int __cond_resched_lock(spinlock_t *lock); extern int __cond_resched_softirq(void); -#define cond_resched_softirq() ({ \ - __might_sleep(__FILE__, __LINE__, SOFTIRQ_OFFSET); \ - __cond_resched_softirq(); \ +#define cond_resched_softirq() ({ \ + __might_sleep(__FILE__, __LINE__, SOFTIRQ_DISABLE_OFFSET); \ + __cond_resched_softirq(); \ }) /* -- cgit v1.2.3 From e1e10a265d28273ab8c70be19d43dcbdeead6c5a Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Mon, 4 Oct 2010 17:03:17 -0700 Subject: sched: Consolidate account_system_vtime extern declaration Just a minor cleanup patch that makes things easier to the following patches. No functionality change in this patch. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Peter Zijlstra LKML-Reference: <1286237003-12406-3-git-send-email-venki@google.com> Signed-off-by: Ingo Molnar --- include/linux/hardirq.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index e37a77cbd588..41367c5c3c68 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -141,6 +141,8 @@ struct task_struct; static inline void account_system_vtime(struct task_struct *tsk) { } +#else +extern void account_system_vtime(struct task_struct *tsk); #endif #if defined(CONFIG_NO_HZ) -- cgit v1.2.3 From 6cdd5199daf0cb7b0fcc8dca941af08492612887 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Mon, 4 Oct 2010 17:03:18 -0700 Subject: sched: Add a PF flag for ksoftirqd identification To account softirq time cleanly in scheduler, we need to identify whether softirq is invoked in ksoftirqd context or softirq at hardirq tail context. Add PF_KSOFTIRQD for that purpose. As all PF flag bits are currently taken, create space by moving one of the infrequently used bits (PF_THREAD_BOUND) down in task_struct to be along with some other state fields. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Peter Zijlstra LKML-Reference: <1286237003-12406-4-git-send-email-venki@google.com> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 8744e50cb083..aca0ce675939 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1682,6 +1682,7 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t * /* * Per process flags */ +#define PF_KSOFTIRQD 0x00000001 /* I am ksoftirqd */ #define PF_STARTING 0x00000002 /* being created */ #define PF_EXITING 0x00000004 /* getting shut down */ #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */ -- cgit v1.2.3 From b52bfee445d315549d41eacf2fa7c156e7d153d5 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Mon, 4 Oct 2010 17:03:19 -0700 Subject: sched: Add IRQ_TIME_ACCOUNTING, finer accounting of irq time s390/powerpc/ia64 have support for CONFIG_VIRT_CPU_ACCOUNTING which does the fine granularity accounting of user, system, hardirq, softirq times. Adding that option on archs like x86 will be challenging however, given the state of TSC reliability on various platforms and also the overhead it will add in syscall entry exit. Instead, add a lighter variant that only does finer accounting of hardirq and softirq times, providing precise irq times (instead of timer tick based samples). This accounting is added with a new config option CONFIG_IRQ_TIME_ACCOUNTING so that there won't be any overhead for users not interested in paying the perf penalty. This accounting is based on sched_clock, with the code being generic. So, other archs may find it useful as well. This patch just adds the core logic and does not enable this logic yet. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Peter Zijlstra LKML-Reference: <1286237003-12406-5-git-send-email-venki@google.com> Signed-off-by: Ingo Molnar --- include/linux/hardirq.h | 2 +- include/linux/sched.h | 13 +++++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index 41367c5c3c68..ff43e9268449 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -137,7 +137,7 @@ extern void synchronize_irq(unsigned int irq); struct task_struct; -#ifndef CONFIG_VIRT_CPU_ACCOUNTING +#if !defined(CONFIG_VIRT_CPU_ACCOUNTING) && !defined(CONFIG_IRQ_TIME_ACCOUNTING) static inline void account_system_vtime(struct task_struct *tsk) { } diff --git a/include/linux/sched.h b/include/linux/sched.h index aca0ce675939..2cca9a92f5e5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1826,6 +1826,19 @@ extern void sched_clock_idle_sleep_event(void); extern void sched_clock_idle_wakeup_event(u64 delta_ns); #endif +#ifdef CONFIG_IRQ_TIME_ACCOUNTING +/* + * An i/f to runtime opt-in for irq time accounting based off of sched_clock. + * The reason for this explicit opt-in is not to have perf penalty with + * slow sched_clocks. + */ +extern void enable_sched_clock_irqtime(void); +extern void disable_sched_clock_irqtime(void); +#else +static inline void enable_sched_clock_irqtime(void) {} +static inline void disable_sched_clock_irqtime(void) {} +#endif + extern unsigned long long task_sched_runtime(struct task_struct *task); extern unsigned long long thread_group_sched_runtime(struct task_struct *task); -- cgit v1.2.3 From c957ef2c59e952803766ddc22e89981ab534606f Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 20 Oct 2010 11:07:02 +0800 Subject: percpu: Introduce a read-mostly percpu API Add a new readmostly percpu section and API. This can be used to avoid dirtying data lines which are generally not written to, which is especially important for data which may be accessed by processors other than the one for which the percpu area belongs to. [ hpa: moved it *after* the page-aligned section, for obvious reasons. ] Signed-off-by: Shaohua Li LKML-Reference: <1287544022.4571.7.camel@sli10-conroe.sh.intel.com> Cc: Eric Dumazet Signed-off-by: H. Peter Anvin --- include/linux/percpu-defs.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h index ce2dc655cd1d..27ef6b190ea6 100644 --- a/include/linux/percpu-defs.h +++ b/include/linux/percpu-defs.h @@ -138,6 +138,15 @@ DEFINE_PER_CPU_SECTION(type, name, "..page_aligned") \ __aligned(PAGE_SIZE) +/* + * Declaration/definition used for per-CPU variables that must be read mostly. + */ +#define DECLARE_PER_CPU_READ_MOSTLY(type, name) \ + DECLARE_PER_CPU_SECTION(type, name, "..readmostly") + +#define DEFINE_PER_CPU_READ_MOSTLY(type, name) \ + DEFINE_PER_CPU_SECTION(type, name, "..readmostly") + /* * Intermodule exports for per-CPU variables. sparse forgets about * address space across EXPORT_SYMBOL(), change EXPORT_SYMBOL() to -- cgit v1.2.3 From 3d14c5d2b6e15c21d8e5467dc62d33127c23a644 Mon Sep 17 00:00:00 2001 From: Yehuda Sadeh Date: Tue, 6 Apr 2010 15:14:15 -0700 Subject: ceph: factor out libceph from Ceph file system This factors out protocol and low-level storage parts of ceph into a separate libceph module living in net/ceph and include/linux/ceph. This is mostly a matter of moving files around. However, a few key pieces of the interface change as well: - ceph_client becomes ceph_fs_client and ceph_client, where the latter captures the mon and osd clients, and the fs_client gets the mds client and file system specific pieces. - Mount option parsing and debugfs setup is correspondingly broken into two pieces. - The mon client gets a generic handler callback for otherwise unknown messages (mds map, in this case). - The basic supported/required feature bits can be expanded (and are by ceph_fs_client). No functional change, aside from some subtle error handling cases that got cleaned up in the refactoring process. Signed-off-by: Sage Weil --- include/linux/ceph/auth.h | 92 +++++ include/linux/ceph/buffer.h | 39 +++ include/linux/ceph/ceph_debug.h | 38 +++ include/linux/ceph/ceph_frag.h | 109 ++++++ include/linux/ceph/ceph_fs.h | 728 ++++++++++++++++++++++++++++++++++++++++ include/linux/ceph/ceph_hash.h | 13 + include/linux/ceph/debugfs.h | 33 ++ include/linux/ceph/decode.h | 201 +++++++++++ include/linux/ceph/libceph.h | 249 ++++++++++++++ include/linux/ceph/mdsmap.h | 62 ++++ include/linux/ceph/messenger.h | 261 ++++++++++++++ include/linux/ceph/mon_client.h | 122 +++++++ include/linux/ceph/msgpool.h | 25 ++ include/linux/ceph/msgr.h | 175 ++++++++++ include/linux/ceph/osd_client.h | 234 +++++++++++++ include/linux/ceph/osdmap.h | 130 +++++++ include/linux/ceph/pagelist.h | 54 +++ include/linux/ceph/rados.h | 405 ++++++++++++++++++++++ include/linux/ceph/types.h | 29 ++ include/linux/crush/crush.h | 180 ++++++++++ include/linux/crush/hash.h | 17 + include/linux/crush/mapper.h | 20 ++ 22 files changed, 3216 insertions(+) create mode 100644 include/linux/ceph/auth.h create mode 100644 include/linux/ceph/buffer.h create mode 100644 include/linux/ceph/ceph_debug.h create mode 100644 include/linux/ceph/ceph_frag.h create mode 100644 include/linux/ceph/ceph_fs.h create mode 100644 include/linux/ceph/ceph_hash.h create mode 100644 include/linux/ceph/debugfs.h create mode 100644 include/linux/ceph/decode.h create mode 100644 include/linux/ceph/libceph.h create mode 100644 include/linux/ceph/mdsmap.h create mode 100644 include/linux/ceph/messenger.h create mode 100644 include/linux/ceph/mon_client.h create mode 100644 include/linux/ceph/msgpool.h create mode 100644 include/linux/ceph/msgr.h create mode 100644 include/linux/ceph/osd_client.h create mode 100644 include/linux/ceph/osdmap.h create mode 100644 include/linux/ceph/pagelist.h create mode 100644 include/linux/ceph/rados.h create mode 100644 include/linux/ceph/types.h create mode 100644 include/linux/crush/crush.h create mode 100644 include/linux/crush/hash.h create mode 100644 include/linux/crush/mapper.h (limited to 'include/linux') diff --git a/include/linux/ceph/auth.h b/include/linux/ceph/auth.h new file mode 100644 index 000000000000..7fff521d7eb5 --- /dev/null +++ b/include/linux/ceph/auth.h @@ -0,0 +1,92 @@ +#ifndef _FS_CEPH_AUTH_H +#define _FS_CEPH_AUTH_H + +#include +#include + +/* + * Abstract interface for communicating with the authenticate module. + * There is some handshake that takes place between us and the monitor + * to acquire the necessary keys. These are used to generate an + * 'authorizer' that we use when connecting to a service (mds, osd). + */ + +struct ceph_auth_client; +struct ceph_authorizer; + +struct ceph_auth_client_ops { + const char *name; + + /* + * true if we are authenticated and can connect to + * services. + */ + int (*is_authenticated)(struct ceph_auth_client *ac); + + /* + * true if we should (re)authenticate, e.g., when our tickets + * are getting old and crusty. + */ + int (*should_authenticate)(struct ceph_auth_client *ac); + + /* + * build requests and process replies during monitor + * handshake. if handle_reply returns -EAGAIN, we build + * another request. + */ + int (*build_request)(struct ceph_auth_client *ac, void *buf, void *end); + int (*handle_reply)(struct ceph_auth_client *ac, int result, + void *buf, void *end); + + /* + * Create authorizer for connecting to a service, and verify + * the response to authenticate the service. + */ + int (*create_authorizer)(struct ceph_auth_client *ac, int peer_type, + struct ceph_authorizer **a, + void **buf, size_t *len, + void **reply_buf, size_t *reply_len); + int (*verify_authorizer_reply)(struct ceph_auth_client *ac, + struct ceph_authorizer *a, size_t len); + void (*destroy_authorizer)(struct ceph_auth_client *ac, + struct ceph_authorizer *a); + void (*invalidate_authorizer)(struct ceph_auth_client *ac, + int peer_type); + + /* reset when we (re)connect to a monitor */ + void (*reset)(struct ceph_auth_client *ac); + + void (*destroy)(struct ceph_auth_client *ac); +}; + +struct ceph_auth_client { + u32 protocol; /* CEPH_AUTH_* */ + void *private; /* for use by protocol implementation */ + const struct ceph_auth_client_ops *ops; /* null iff protocol==0 */ + + bool negotiating; /* true if negotiating protocol */ + const char *name; /* entity name */ + u64 global_id; /* our unique id in system */ + const char *secret; /* our secret key */ + unsigned want_keys; /* which services we want */ +}; + +extern struct ceph_auth_client *ceph_auth_init(const char *name, + const char *secret); +extern void ceph_auth_destroy(struct ceph_auth_client *ac); + +extern void ceph_auth_reset(struct ceph_auth_client *ac); + +extern int ceph_auth_build_hello(struct ceph_auth_client *ac, + void *buf, size_t len); +extern int ceph_handle_auth_reply(struct ceph_auth_client *ac, + void *buf, size_t len, + void *reply_buf, size_t reply_len); +extern int ceph_entity_name_encode(const char *name, void **p, void *end); + +extern int ceph_build_auth(struct ceph_auth_client *ac, + void *msg_buf, size_t msg_len); + +extern int ceph_auth_is_authenticated(struct ceph_auth_client *ac); + +#endif diff --git a/include/linux/ceph/buffer.h b/include/linux/ceph/buffer.h new file mode 100644 index 000000000000..58d19014068f --- /dev/null +++ b/include/linux/ceph/buffer.h @@ -0,0 +1,39 @@ +#ifndef __FS_CEPH_BUFFER_H +#define __FS_CEPH_BUFFER_H + +#include +#include +#include +#include +#include + +/* + * a simple reference counted buffer. + * + * use kmalloc for small sizes (<= one page), vmalloc for larger + * sizes. + */ +struct ceph_buffer { + struct kref kref; + struct kvec vec; + size_t alloc_len; + bool is_vmalloc; +}; + +extern struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp); +extern void ceph_buffer_release(struct kref *kref); + +static inline struct ceph_buffer *ceph_buffer_get(struct ceph_buffer *b) +{ + kref_get(&b->kref); + return b; +} + +static inline void ceph_buffer_put(struct ceph_buffer *b) +{ + kref_put(&b->kref, ceph_buffer_release); +} + +extern int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end); + +#endif diff --git a/include/linux/ceph/ceph_debug.h b/include/linux/ceph/ceph_debug.h new file mode 100644 index 000000000000..aa2e19182d99 --- /dev/null +++ b/include/linux/ceph/ceph_debug.h @@ -0,0 +1,38 @@ +#ifndef _FS_CEPH_DEBUG_H +#define _FS_CEPH_DEBUG_H + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#ifdef CONFIG_CEPH_LIB_PRETTYDEBUG + +/* + * wrap pr_debug to include a filename:lineno prefix on each line. + * this incurs some overhead (kernel size and execution time) due to + * the extra function call at each call site. + */ + +# if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG) +extern const char *ceph_file_part(const char *s, int len); +# define dout(fmt, ...) \ + pr_debug("%.*s %12.12s:%-4d : " fmt, \ + 8 - (int)sizeof(KBUILD_MODNAME), " ", \ + ceph_file_part(__FILE__, sizeof(__FILE__)), \ + __LINE__, ##__VA_ARGS__) +# else +/* faux printk call just to see any compiler warnings. */ +# define dout(fmt, ...) do { \ + if (0) \ + printk(KERN_DEBUG fmt, ##__VA_ARGS__); \ + } while (0) +# endif + +#else + +/* + * or, just wrap pr_debug + */ +# define dout(fmt, ...) pr_debug(" " fmt, ##__VA_ARGS__) + +#endif + +#endif diff --git a/include/linux/ceph/ceph_frag.h b/include/linux/ceph/ceph_frag.h new file mode 100644 index 000000000000..5babb8e95352 --- /dev/null +++ b/include/linux/ceph/ceph_frag.h @@ -0,0 +1,109 @@ +#ifndef FS_CEPH_FRAG_H +#define FS_CEPH_FRAG_H + +/* + * "Frags" are a way to describe a subset of a 32-bit number space, + * using a mask and a value to match against that mask. Any given frag + * (subset of the number space) can be partitioned into 2^n sub-frags. + * + * Frags are encoded into a 32-bit word: + * 8 upper bits = "bits" + * 24 lower bits = "value" + * (We could go to 5+27 bits, but who cares.) + * + * We use the _most_ significant bits of the 24 bit value. This makes + * values logically sort. + * + * Unfortunately, because the "bits" field is still in the high bits, we + * can't sort encoded frags numerically. However, it does allow you + * to feed encoded frags as values into frag_contains_value. + */ +static inline __u32 ceph_frag_make(__u32 b, __u32 v) +{ + return (b << 24) | + (v & (0xffffffu << (24-b)) & 0xffffffu); +} +static inline __u32 ceph_frag_bits(__u32 f) +{ + return f >> 24; +} +static inline __u32 ceph_frag_value(__u32 f) +{ + return f & 0xffffffu; +} +static inline __u32 ceph_frag_mask(__u32 f) +{ + return (0xffffffu << (24-ceph_frag_bits(f))) & 0xffffffu; +} +static inline __u32 ceph_frag_mask_shift(__u32 f) +{ + return 24 - ceph_frag_bits(f); +} + +static inline int ceph_frag_contains_value(__u32 f, __u32 v) +{ + return (v & ceph_frag_mask(f)) == ceph_frag_value(f); +} +static inline int ceph_frag_contains_frag(__u32 f, __u32 sub) +{ + /* is sub as specific as us, and contained by us? */ + return ceph_frag_bits(sub) >= ceph_frag_bits(f) && + (ceph_frag_value(sub) & ceph_frag_mask(f)) == ceph_frag_value(f); +} + +static inline __u32 ceph_frag_parent(__u32 f) +{ + return ceph_frag_make(ceph_frag_bits(f) - 1, + ceph_frag_value(f) & (ceph_frag_mask(f) << 1)); +} +static inline int ceph_frag_is_left_child(__u32 f) +{ + return ceph_frag_bits(f) > 0 && + (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 0; +} +static inline int ceph_frag_is_right_child(__u32 f) +{ + return ceph_frag_bits(f) > 0 && + (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 1; +} +static inline __u32 ceph_frag_sibling(__u32 f) +{ + return ceph_frag_make(ceph_frag_bits(f), + ceph_frag_value(f) ^ (0x1000000 >> ceph_frag_bits(f))); +} +static inline __u32 ceph_frag_left_child(__u32 f) +{ + return ceph_frag_make(ceph_frag_bits(f)+1, ceph_frag_value(f)); +} +static inline __u32 ceph_frag_right_child(__u32 f) +{ + return ceph_frag_make(ceph_frag_bits(f)+1, + ceph_frag_value(f) | (0x1000000 >> (1+ceph_frag_bits(f)))); +} +static inline __u32 ceph_frag_make_child(__u32 f, int by, int i) +{ + int newbits = ceph_frag_bits(f) + by; + return ceph_frag_make(newbits, + ceph_frag_value(f) | (i << (24 - newbits))); +} +static inline int ceph_frag_is_leftmost(__u32 f) +{ + return ceph_frag_value(f) == 0; +} +static inline int ceph_frag_is_rightmost(__u32 f) +{ + return ceph_frag_value(f) == ceph_frag_mask(f); +} +static inline __u32 ceph_frag_next(__u32 f) +{ + return ceph_frag_make(ceph_frag_bits(f), + ceph_frag_value(f) + (0x1000000 >> ceph_frag_bits(f))); +} + +/* + * comparator to sort frags logically, as when traversing the + * number space in ascending order... + */ +int ceph_frag_compare(__u32 a, __u32 b); + +#endif diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h new file mode 100644 index 000000000000..d5619ac86711 --- /dev/null +++ b/include/linux/ceph/ceph_fs.h @@ -0,0 +1,728 @@ +/* + * ceph_fs.h - Ceph constants and data types to share between kernel and + * user space. + * + * Most types in this file are defined as little-endian, and are + * primarily intended to describe data structures that pass over the + * wire or that are stored on disk. + * + * LGPL2 + */ + +#ifndef CEPH_FS_H +#define CEPH_FS_H + +#include "msgr.h" +#include "rados.h" + +/* + * subprotocol versions. when specific messages types or high-level + * protocols change, bump the affected components. we keep rev + * internal cluster protocols separately from the public, + * client-facing protocol. + */ +#define CEPH_OSD_PROTOCOL 8 /* cluster internal */ +#define CEPH_MDS_PROTOCOL 12 /* cluster internal */ +#define CEPH_MON_PROTOCOL 5 /* cluster internal */ +#define CEPH_OSDC_PROTOCOL 24 /* server/client */ +#define CEPH_MDSC_PROTOCOL 32 /* server/client */ +#define CEPH_MONC_PROTOCOL 15 /* server/client */ + + +#define CEPH_INO_ROOT 1 +#define CEPH_INO_CEPH 2 /* hidden .ceph dir */ + +/* arbitrary limit on max # of monitors (cluster of 3 is typical) */ +#define CEPH_MAX_MON 31 + + +/* + * feature bits + */ +#define CEPH_FEATURE_UID (1<<0) +#define CEPH_FEATURE_NOSRCADDR (1<<1) +#define CEPH_FEATURE_MONCLOCKCHECK (1<<2) +#define CEPH_FEATURE_FLOCK (1<<3) + + +/* + * ceph_file_layout - describe data layout for a file/inode + */ +struct ceph_file_layout { + /* file -> object mapping */ + __le32 fl_stripe_unit; /* stripe unit, in bytes. must be multiple + of page size. */ + __le32 fl_stripe_count; /* over this many objects */ + __le32 fl_object_size; /* until objects are this big, then move to + new objects */ + __le32 fl_cas_hash; /* 0 = none; 1 = sha256 */ + + /* pg -> disk layout */ + __le32 fl_object_stripe_unit; /* for per-object parity, if any */ + + /* object -> pg layout */ + __le32 fl_pg_preferred; /* preferred primary for pg (-1 for none) */ + __le32 fl_pg_pool; /* namespace, crush ruleset, rep level */ +} __attribute__ ((packed)); + +#define CEPH_MIN_STRIPE_UNIT 65536 + +int ceph_file_layout_is_valid(const struct ceph_file_layout *layout); + + +/* crypto algorithms */ +#define CEPH_CRYPTO_NONE 0x0 +#define CEPH_CRYPTO_AES 0x1 + +#define CEPH_AES_IV "cephsageyudagreg" + +/* security/authentication protocols */ +#define CEPH_AUTH_UNKNOWN 0x0 +#define CEPH_AUTH_NONE 0x1 +#define CEPH_AUTH_CEPHX 0x2 + +#define CEPH_AUTH_UID_DEFAULT ((__u64) -1) + + +/********************************************* + * message layer + */ + +/* + * message types + */ + +/* misc */ +#define CEPH_MSG_SHUTDOWN 1 +#define CEPH_MSG_PING 2 + +/* client <-> monitor */ +#define CEPH_MSG_MON_MAP 4 +#define CEPH_MSG_MON_GET_MAP 5 +#define CEPH_MSG_STATFS 13 +#define CEPH_MSG_STATFS_REPLY 14 +#define CEPH_MSG_MON_SUBSCRIBE 15 +#define CEPH_MSG_MON_SUBSCRIBE_ACK 16 +#define CEPH_MSG_AUTH 17 +#define CEPH_MSG_AUTH_REPLY 18 + +/* client <-> mds */ +#define CEPH_MSG_MDS_MAP 21 + +#define CEPH_MSG_CLIENT_SESSION 22 +#define CEPH_MSG_CLIENT_RECONNECT 23 + +#define CEPH_MSG_CLIENT_REQUEST 24 +#define CEPH_MSG_CLIENT_REQUEST_FORWARD 25 +#define CEPH_MSG_CLIENT_REPLY 26 +#define CEPH_MSG_CLIENT_CAPS 0x310 +#define CEPH_MSG_CLIENT_LEASE 0x311 +#define CEPH_MSG_CLIENT_SNAP 0x312 +#define CEPH_MSG_CLIENT_CAPRELEASE 0x313 + +/* pool ops */ +#define CEPH_MSG_POOLOP_REPLY 48 +#define CEPH_MSG_POOLOP 49 + + +/* osd */ +#define CEPH_MSG_OSD_MAP 41 +#define CEPH_MSG_OSD_OP 42 +#define CEPH_MSG_OSD_OPREPLY 43 + +/* pool operations */ +enum { + POOL_OP_CREATE = 0x01, + POOL_OP_DELETE = 0x02, + POOL_OP_AUID_CHANGE = 0x03, + POOL_OP_CREATE_SNAP = 0x11, + POOL_OP_DELETE_SNAP = 0x12, + POOL_OP_CREATE_UNMANAGED_SNAP = 0x21, + POOL_OP_DELETE_UNMANAGED_SNAP = 0x22, +}; + +struct ceph_mon_request_header { + __le64 have_version; + __le16 session_mon; + __le64 session_mon_tid; +} __attribute__ ((packed)); + +struct ceph_mon_statfs { + struct ceph_mon_request_header monhdr; + struct ceph_fsid fsid; +} __attribute__ ((packed)); + +struct ceph_statfs { + __le64 kb, kb_used, kb_avail; + __le64 num_objects; +} __attribute__ ((packed)); + +struct ceph_mon_statfs_reply { + struct ceph_fsid fsid; + __le64 version; + struct ceph_statfs st; +} __attribute__ ((packed)); + +const char *ceph_pool_op_name(int op); + +struct ceph_mon_poolop { + struct ceph_mon_request_header monhdr; + struct ceph_fsid fsid; + __le32 pool; + __le32 op; + __le64 auid; + __le64 snapid; + __le32 name_len; +} __attribute__ ((packed)); + +struct ceph_mon_poolop_reply { + struct ceph_mon_request_header monhdr; + struct ceph_fsid fsid; + __le32 reply_code; + __le32 epoch; + char has_data; + char data[0]; +} __attribute__ ((packed)); + +struct ceph_mon_unmanaged_snap { + __le64 snapid; +} __attribute__ ((packed)); + +struct ceph_osd_getmap { + struct ceph_mon_request_header monhdr; + struct ceph_fsid fsid; + __le32 start; +} __attribute__ ((packed)); + +struct ceph_mds_getmap { + struct ceph_mon_request_header monhdr; + struct ceph_fsid fsid; +} __attribute__ ((packed)); + +struct ceph_client_mount { + struct ceph_mon_request_header monhdr; +} __attribute__ ((packed)); + +struct ceph_mon_subscribe_item { + __le64 have_version; __le64 have; + __u8 onetime; +} __attribute__ ((packed)); + +struct ceph_mon_subscribe_ack { + __le32 duration; /* seconds */ + struct ceph_fsid fsid; +} __attribute__ ((packed)); + +/* + * mds states + * > 0 -> in + * <= 0 -> out + */ +#define CEPH_MDS_STATE_DNE 0 /* down, does not exist. */ +#define CEPH_MDS_STATE_STOPPED -1 /* down, once existed, but no subtrees. + empty log. */ +#define CEPH_MDS_STATE_BOOT -4 /* up, boot announcement. */ +#define CEPH_MDS_STATE_STANDBY -5 /* up, idle. waiting for assignment. */ +#define CEPH_MDS_STATE_CREATING -6 /* up, creating MDS instance. */ +#define CEPH_MDS_STATE_STARTING -7 /* up, starting previously stopped mds */ +#define CEPH_MDS_STATE_STANDBY_REPLAY -8 /* up, tailing active node's journal */ + +#define CEPH_MDS_STATE_REPLAY 8 /* up, replaying journal. */ +#define CEPH_MDS_STATE_RESOLVE 9 /* up, disambiguating distributed + operations (import, rename, etc.) */ +#define CEPH_MDS_STATE_RECONNECT 10 /* up, reconnect to clients */ +#define CEPH_MDS_STATE_REJOIN 11 /* up, rejoining distributed cache */ +#define CEPH_MDS_STATE_CLIENTREPLAY 12 /* up, replaying client operations */ +#define CEPH_MDS_STATE_ACTIVE 13 /* up, active */ +#define CEPH_MDS_STATE_STOPPING 14 /* up, but exporting metadata */ + +extern const char *ceph_mds_state_name(int s); + + +/* + * metadata lock types. + * - these are bitmasks.. we can compose them + * - they also define the lock ordering by the MDS + * - a few of these are internal to the mds + */ +#define CEPH_LOCK_DVERSION 1 +#define CEPH_LOCK_DN 2 +#define CEPH_LOCK_ISNAP 16 +#define CEPH_LOCK_IVERSION 32 /* mds internal */ +#define CEPH_LOCK_IFILE 64 +#define CEPH_LOCK_IAUTH 128 +#define CEPH_LOCK_ILINK 256 +#define CEPH_LOCK_IDFT 512 /* dir frag tree */ +#define CEPH_LOCK_INEST 1024 /* mds internal */ +#define CEPH_LOCK_IXATTR 2048 +#define CEPH_LOCK_IFLOCK 4096 /* advisory file locks */ +#define CEPH_LOCK_INO 8192 /* immutable inode bits; not a lock */ + +/* client_session ops */ +enum { + CEPH_SESSION_REQUEST_OPEN, + CEPH_SESSION_OPEN, + CEPH_SESSION_REQUEST_CLOSE, + CEPH_SESSION_CLOSE, + CEPH_SESSION_REQUEST_RENEWCAPS, + CEPH_SESSION_RENEWCAPS, + CEPH_SESSION_STALE, + CEPH_SESSION_RECALL_STATE, +}; + +extern const char *ceph_session_op_name(int op); + +struct ceph_mds_session_head { + __le32 op; + __le64 seq; + struct ceph_timespec stamp; + __le32 max_caps, max_leases; +} __attribute__ ((packed)); + +/* client_request */ +/* + * metadata ops. + * & 0x001000 -> write op + * & 0x010000 -> follow symlink (e.g. stat(), not lstat()). + & & 0x100000 -> use weird ino/path trace + */ +#define CEPH_MDS_OP_WRITE 0x001000 +enum { + CEPH_MDS_OP_LOOKUP = 0x00100, + CEPH_MDS_OP_GETATTR = 0x00101, + CEPH_MDS_OP_LOOKUPHASH = 0x00102, + CEPH_MDS_OP_LOOKUPPARENT = 0x00103, + + CEPH_MDS_OP_SETXATTR = 0x01105, + CEPH_MDS_OP_RMXATTR = 0x01106, + CEPH_MDS_OP_SETLAYOUT = 0x01107, + CEPH_MDS_OP_SETATTR = 0x01108, + CEPH_MDS_OP_SETFILELOCK= 0x01109, + CEPH_MDS_OP_GETFILELOCK= 0x00110, + + CEPH_MDS_OP_MKNOD = 0x01201, + CEPH_MDS_OP_LINK = 0x01202, + CEPH_MDS_OP_UNLINK = 0x01203, + CEPH_MDS_OP_RENAME = 0x01204, + CEPH_MDS_OP_MKDIR = 0x01220, + CEPH_MDS_OP_RMDIR = 0x01221, + CEPH_MDS_OP_SYMLINK = 0x01222, + + CEPH_MDS_OP_CREATE = 0x01301, + CEPH_MDS_OP_OPEN = 0x00302, + CEPH_MDS_OP_READDIR = 0x00305, + + CEPH_MDS_OP_LOOKUPSNAP = 0x00400, + CEPH_MDS_OP_MKSNAP = 0x01400, + CEPH_MDS_OP_RMSNAP = 0x01401, + CEPH_MDS_OP_LSSNAP = 0x00402, +}; + +extern const char *ceph_mds_op_name(int op); + + +#define CEPH_SETATTR_MODE 1 +#define CEPH_SETATTR_UID 2 +#define CEPH_SETATTR_GID 4 +#define CEPH_SETATTR_MTIME 8 +#define CEPH_SETATTR_ATIME 16 +#define CEPH_SETATTR_SIZE 32 +#define CEPH_SETATTR_CTIME 64 + +union ceph_mds_request_args { + struct { + __le32 mask; /* CEPH_CAP_* */ + } __attribute__ ((packed)) getattr; + struct { + __le32 mode; + __le32 uid; + __le32 gid; + struct ceph_timespec mtime; + struct ceph_timespec atime; + __le64 size, old_size; /* old_size needed by truncate */ + __le32 mask; /* CEPH_SETATTR_* */ + } __attribute__ ((packed)) setattr; + struct { + __le32 frag; /* which dir fragment */ + __le32 max_entries; /* how many dentries to grab */ + __le32 max_bytes; + } __attribute__ ((packed)) readdir; + struct { + __le32 mode; + __le32 rdev; + } __attribute__ ((packed)) mknod; + struct { + __le32 mode; + } __attribute__ ((packed)) mkdir; + struct { + __le32 flags; + __le32 mode; + __le32 stripe_unit; /* layout for newly created file */ + __le32 stripe_count; /* ... */ + __le32 object_size; + __le32 file_replication; + __le32 preferred; + } __attribute__ ((packed)) open; + struct { + __le32 flags; + } __attribute__ ((packed)) setxattr; + struct { + struct ceph_file_layout layout; + } __attribute__ ((packed)) setlayout; + struct { + __u8 rule; /* currently fcntl or flock */ + __u8 type; /* shared, exclusive, remove*/ + __le64 pid; /* process id requesting the lock */ + __le64 pid_namespace; + __le64 start; /* initial location to lock */ + __le64 length; /* num bytes to lock from start */ + __u8 wait; /* will caller wait for lock to become available? */ + } __attribute__ ((packed)) filelock_change; +} __attribute__ ((packed)); + +#define CEPH_MDS_FLAG_REPLAY 1 /* this is a replayed op */ +#define CEPH_MDS_FLAG_WANT_DENTRY 2 /* want dentry in reply */ + +struct ceph_mds_request_head { + __le64 oldest_client_tid; + __le32 mdsmap_epoch; /* on client */ + __le32 flags; /* CEPH_MDS_FLAG_* */ + __u8 num_retry, num_fwd; /* count retry, fwd attempts */ + __le16 num_releases; /* # include cap/lease release records */ + __le32 op; /* mds op code */ + __le32 caller_uid, caller_gid; + __le64 ino; /* use this ino for openc, mkdir, mknod, + etc. (if replaying) */ + union ceph_mds_request_args args; +} __attribute__ ((packed)); + +/* cap/lease release record */ +struct ceph_mds_request_release { + __le64 ino, cap_id; /* ino and unique cap id */ + __le32 caps, wanted; /* new issued, wanted */ + __le32 seq, issue_seq, mseq; + __le32 dname_seq; /* if releasing a dentry lease, a */ + __le32 dname_len; /* string follows. */ +} __attribute__ ((packed)); + +/* client reply */ +struct ceph_mds_reply_head { + __le32 op; + __le32 result; + __le32 mdsmap_epoch; + __u8 safe; /* true if committed to disk */ + __u8 is_dentry, is_target; /* true if dentry, target inode records + are included with reply */ +} __attribute__ ((packed)); + +/* one for each node split */ +struct ceph_frag_tree_split { + __le32 frag; /* this frag splits... */ + __le32 by; /* ...by this many bits */ +} __attribute__ ((packed)); + +struct ceph_frag_tree_head { + __le32 nsplits; /* num ceph_frag_tree_split records */ + struct ceph_frag_tree_split splits[]; +} __attribute__ ((packed)); + +/* capability issue, for bundling with mds reply */ +struct ceph_mds_reply_cap { + __le32 caps, wanted; /* caps issued, wanted */ + __le64 cap_id; + __le32 seq, mseq; + __le64 realm; /* snap realm */ + __u8 flags; /* CEPH_CAP_FLAG_* */ +} __attribute__ ((packed)); + +#define CEPH_CAP_FLAG_AUTH 1 /* cap is issued by auth mds */ + +/* inode record, for bundling with mds reply */ +struct ceph_mds_reply_inode { + __le64 ino; + __le64 snapid; + __le32 rdev; + __le64 version; /* inode version */ + __le64 xattr_version; /* version for xattr blob */ + struct ceph_mds_reply_cap cap; /* caps issued for this inode */ + struct ceph_file_layout layout; + struct ceph_timespec ctime, mtime, atime; + __le32 time_warp_seq; + __le64 size, max_size, truncate_size; + __le32 truncate_seq; + __le32 mode, uid, gid; + __le32 nlink; + __le64 files, subdirs, rbytes, rfiles, rsubdirs; /* dir stats */ + struct ceph_timespec rctime; + struct ceph_frag_tree_head fragtree; /* (must be at end of struct) */ +} __attribute__ ((packed)); +/* followed by frag array, then symlink string, then xattr blob */ + +/* reply_lease follows dname, and reply_inode */ +struct ceph_mds_reply_lease { + __le16 mask; /* lease type(s) */ + __le32 duration_ms; /* lease duration */ + __le32 seq; +} __attribute__ ((packed)); + +struct ceph_mds_reply_dirfrag { + __le32 frag; /* fragment */ + __le32 auth; /* auth mds, if this is a delegation point */ + __le32 ndist; /* number of mds' this is replicated on */ + __le32 dist[]; +} __attribute__ ((packed)); + +#define CEPH_LOCK_FCNTL 1 +#define CEPH_LOCK_FLOCK 2 + +#define CEPH_LOCK_SHARED 1 +#define CEPH_LOCK_EXCL 2 +#define CEPH_LOCK_UNLOCK 4 + +struct ceph_filelock { + __le64 start;/* file offset to start lock at */ + __le64 length; /* num bytes to lock; 0 for all following start */ + __le64 client; /* which client holds the lock */ + __le64 pid; /* process id holding the lock on the client */ + __le64 pid_namespace; + __u8 type; /* shared lock, exclusive lock, or unlock */ +} __attribute__ ((packed)); + + +/* file access modes */ +#define CEPH_FILE_MODE_PIN 0 +#define CEPH_FILE_MODE_RD 1 +#define CEPH_FILE_MODE_WR 2 +#define CEPH_FILE_MODE_RDWR 3 /* RD | WR */ +#define CEPH_FILE_MODE_LAZY 4 /* lazy io */ +#define CEPH_FILE_MODE_NUM 8 /* bc these are bit fields.. mostly */ + +int ceph_flags_to_mode(int flags); + + +/* capability bits */ +#define CEPH_CAP_PIN 1 /* no specific capabilities beyond the pin */ + +/* generic cap bits */ +#define CEPH_CAP_GSHARED 1 /* client can reads */ +#define CEPH_CAP_GEXCL 2 /* client can read and update */ +#define CEPH_CAP_GCACHE 4 /* (file) client can cache reads */ +#define CEPH_CAP_GRD 8 /* (file) client can read */ +#define CEPH_CAP_GWR 16 /* (file) client can write */ +#define CEPH_CAP_GBUFFER 32 /* (file) client can buffer writes */ +#define CEPH_CAP_GWREXTEND 64 /* (file) client can extend EOF */ +#define CEPH_CAP_GLAZYIO 128 /* (file) client can perform lazy io */ + +/* per-lock shift */ +#define CEPH_CAP_SAUTH 2 +#define CEPH_CAP_SLINK 4 +#define CEPH_CAP_SXATTR 6 +#define CEPH_CAP_SFILE 8 +#define CEPH_CAP_SFLOCK 20 + +#define CEPH_CAP_BITS 22 + +/* composed values */ +#define CEPH_CAP_AUTH_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SAUTH) +#define CEPH_CAP_AUTH_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SAUTH) +#define CEPH_CAP_LINK_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SLINK) +#define CEPH_CAP_LINK_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SLINK) +#define CEPH_CAP_XATTR_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SXATTR) +#define CEPH_CAP_XATTR_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SXATTR) +#define CEPH_CAP_FILE(x) (x << CEPH_CAP_SFILE) +#define CEPH_CAP_FILE_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SFILE) +#define CEPH_CAP_FILE_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SFILE) +#define CEPH_CAP_FILE_CACHE (CEPH_CAP_GCACHE << CEPH_CAP_SFILE) +#define CEPH_CAP_FILE_RD (CEPH_CAP_GRD << CEPH_CAP_SFILE) +#define CEPH_CAP_FILE_WR (CEPH_CAP_GWR << CEPH_CAP_SFILE) +#define CEPH_CAP_FILE_BUFFER (CEPH_CAP_GBUFFER << CEPH_CAP_SFILE) +#define CEPH_CAP_FILE_WREXTEND (CEPH_CAP_GWREXTEND << CEPH_CAP_SFILE) +#define CEPH_CAP_FILE_LAZYIO (CEPH_CAP_GLAZYIO << CEPH_CAP_SFILE) +#define CEPH_CAP_FLOCK_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SFLOCK) +#define CEPH_CAP_FLOCK_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SFLOCK) + + +/* cap masks (for getattr) */ +#define CEPH_STAT_CAP_INODE CEPH_CAP_PIN +#define CEPH_STAT_CAP_TYPE CEPH_CAP_PIN /* mode >> 12 */ +#define CEPH_STAT_CAP_SYMLINK CEPH_CAP_PIN +#define CEPH_STAT_CAP_UID CEPH_CAP_AUTH_SHARED +#define CEPH_STAT_CAP_GID CEPH_CAP_AUTH_SHARED +#define CEPH_STAT_CAP_MODE CEPH_CAP_AUTH_SHARED +#define CEPH_STAT_CAP_NLINK CEPH_CAP_LINK_SHARED +#define CEPH_STAT_CAP_LAYOUT CEPH_CAP_FILE_SHARED +#define CEPH_STAT_CAP_MTIME CEPH_CAP_FILE_SHARED +#define CEPH_STAT_CAP_SIZE CEPH_CAP_FILE_SHARED +#define CEPH_STAT_CAP_ATIME CEPH_CAP_FILE_SHARED /* fixme */ +#define CEPH_STAT_CAP_XATTR CEPH_CAP_XATTR_SHARED +#define CEPH_STAT_CAP_INODE_ALL (CEPH_CAP_PIN | \ + CEPH_CAP_AUTH_SHARED | \ + CEPH_CAP_LINK_SHARED | \ + CEPH_CAP_FILE_SHARED | \ + CEPH_CAP_XATTR_SHARED) + +#define CEPH_CAP_ANY_SHARED (CEPH_CAP_AUTH_SHARED | \ + CEPH_CAP_LINK_SHARED | \ + CEPH_CAP_XATTR_SHARED | \ + CEPH_CAP_FILE_SHARED) +#define CEPH_CAP_ANY_RD (CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_RD | \ + CEPH_CAP_FILE_CACHE) + +#define CEPH_CAP_ANY_EXCL (CEPH_CAP_AUTH_EXCL | \ + CEPH_CAP_LINK_EXCL | \ + CEPH_CAP_XATTR_EXCL | \ + CEPH_CAP_FILE_EXCL) +#define CEPH_CAP_ANY_FILE_WR (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER | \ + CEPH_CAP_FILE_EXCL) +#define CEPH_CAP_ANY_WR (CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_FILE_WR) +#define CEPH_CAP_ANY (CEPH_CAP_ANY_RD | CEPH_CAP_ANY_EXCL | \ + CEPH_CAP_ANY_FILE_WR | CEPH_CAP_FILE_LAZYIO | \ + CEPH_CAP_PIN) + +#define CEPH_CAP_LOCKS (CEPH_LOCK_IFILE | CEPH_LOCK_IAUTH | CEPH_LOCK_ILINK | \ + CEPH_LOCK_IXATTR) + +int ceph_caps_for_mode(int mode); + +enum { + CEPH_CAP_OP_GRANT, /* mds->client grant */ + CEPH_CAP_OP_REVOKE, /* mds->client revoke */ + CEPH_CAP_OP_TRUNC, /* mds->client trunc notify */ + CEPH_CAP_OP_EXPORT, /* mds has exported the cap */ + CEPH_CAP_OP_IMPORT, /* mds has imported the cap */ + CEPH_CAP_OP_UPDATE, /* client->mds update */ + CEPH_CAP_OP_DROP, /* client->mds drop cap bits */ + CEPH_CAP_OP_FLUSH, /* client->mds cap writeback */ + CEPH_CAP_OP_FLUSH_ACK, /* mds->client flushed */ + CEPH_CAP_OP_FLUSHSNAP, /* client->mds flush snapped metadata */ + CEPH_CAP_OP_FLUSHSNAP_ACK, /* mds->client flushed snapped metadata */ + CEPH_CAP_OP_RELEASE, /* client->mds release (clean) cap */ + CEPH_CAP_OP_RENEW, /* client->mds renewal request */ +}; + +extern const char *ceph_cap_op_name(int op); + +/* + * caps message, used for capability callbacks, acks, requests, etc. + */ +struct ceph_mds_caps { + __le32 op; /* CEPH_CAP_OP_* */ + __le64 ino, realm; + __le64 cap_id; + __le32 seq, issue_seq; + __le32 caps, wanted, dirty; /* latest issued/wanted/dirty */ + __le32 migrate_seq; + __le64 snap_follows; + __le32 snap_trace_len; + + /* authlock */ + __le32 uid, gid, mode; + + /* linklock */ + __le32 nlink; + + /* xattrlock */ + __le32 xattr_len; + __le64 xattr_version; + + /* filelock */ + __le64 size, max_size, truncate_size; + __le32 truncate_seq; + struct ceph_timespec mtime, atime, ctime; + struct ceph_file_layout layout; + __le32 time_warp_seq; +} __attribute__ ((packed)); + +/* cap release msg head */ +struct ceph_mds_cap_release { + __le32 num; /* number of cap_items that follow */ +} __attribute__ ((packed)); + +struct ceph_mds_cap_item { + __le64 ino; + __le64 cap_id; + __le32 migrate_seq, seq; +} __attribute__ ((packed)); + +#define CEPH_MDS_LEASE_REVOKE 1 /* mds -> client */ +#define CEPH_MDS_LEASE_RELEASE 2 /* client -> mds */ +#define CEPH_MDS_LEASE_RENEW 3 /* client <-> mds */ +#define CEPH_MDS_LEASE_REVOKE_ACK 4 /* client -> mds */ + +extern const char *ceph_lease_op_name(int o); + +/* lease msg header */ +struct ceph_mds_lease { + __u8 action; /* CEPH_MDS_LEASE_* */ + __le16 mask; /* which lease */ + __le64 ino; + __le64 first, last; /* snap range */ + __le32 seq; + __le32 duration_ms; /* duration of renewal */ +} __attribute__ ((packed)); +/* followed by a __le32+string for dname */ + +/* client reconnect */ +struct ceph_mds_cap_reconnect { + __le64 cap_id; + __le32 wanted; + __le32 issued; + __le64 snaprealm; + __le64 pathbase; /* base ino for our path to this ino */ + __le32 flock_len; /* size of flock state blob, if any */ +} __attribute__ ((packed)); +/* followed by flock blob */ + +struct ceph_mds_cap_reconnect_v1 { + __le64 cap_id; + __le32 wanted; + __le32 issued; + __le64 size; + struct ceph_timespec mtime, atime; + __le64 snaprealm; + __le64 pathbase; /* base ino for our path to this ino */ +} __attribute__ ((packed)); + +struct ceph_mds_snaprealm_reconnect { + __le64 ino; /* snap realm base */ + __le64 seq; /* snap seq for this snap realm */ + __le64 parent; /* parent realm */ +} __attribute__ ((packed)); + +/* + * snaps + */ +enum { + CEPH_SNAP_OP_UPDATE, /* CREATE or DESTROY */ + CEPH_SNAP_OP_CREATE, + CEPH_SNAP_OP_DESTROY, + CEPH_SNAP_OP_SPLIT, +}; + +extern const char *ceph_snap_op_name(int o); + +/* snap msg header */ +struct ceph_mds_snap_head { + __le32 op; /* CEPH_SNAP_OP_* */ + __le64 split; /* ino to split off, if any */ + __le32 num_split_inos; /* # inos belonging to new child realm */ + __le32 num_split_realms; /* # child realms udner new child realm */ + __le32 trace_len; /* size of snap trace blob */ +} __attribute__ ((packed)); +/* followed by split ino list, then split realms, then the trace blob */ + +/* + * encode info about a snaprealm, as viewed by a client + */ +struct ceph_mds_snap_realm { + __le64 ino; /* ino */ + __le64 created; /* snap: when created */ + __le64 parent; /* ino: parent realm */ + __le64 parent_since; /* snap: same parent since */ + __le64 seq; /* snap: version */ + __le32 num_snaps; + __le32 num_prior_parent_snaps; +} __attribute__ ((packed)); +/* followed by my snap list, then prior parent snap list */ + +#endif diff --git a/include/linux/ceph/ceph_hash.h b/include/linux/ceph/ceph_hash.h new file mode 100644 index 000000000000..d099c3f90236 --- /dev/null +++ b/include/linux/ceph/ceph_hash.h @@ -0,0 +1,13 @@ +#ifndef FS_CEPH_HASH_H +#define FS_CEPH_HASH_H + +#define CEPH_STR_HASH_LINUX 0x1 /* linux dcache hash */ +#define CEPH_STR_HASH_RJENKINS 0x2 /* robert jenkins' */ + +extern unsigned ceph_str_hash_linux(const char *s, unsigned len); +extern unsigned ceph_str_hash_rjenkins(const char *s, unsigned len); + +extern unsigned ceph_str_hash(int type, const char *s, unsigned len); +extern const char *ceph_str_hash_name(int type); + +#endif diff --git a/include/linux/ceph/debugfs.h b/include/linux/ceph/debugfs.h new file mode 100644 index 000000000000..2a79702e092b --- /dev/null +++ b/include/linux/ceph/debugfs.h @@ -0,0 +1,33 @@ +#ifndef _FS_CEPH_DEBUGFS_H +#define _FS_CEPH_DEBUGFS_H + +#include "ceph_debug.h" +#include "types.h" + +#define CEPH_DEFINE_SHOW_FUNC(name) \ +static int name##_open(struct inode *inode, struct file *file) \ +{ \ + struct seq_file *sf; \ + int ret; \ + \ + ret = single_open(file, name, NULL); \ + sf = file->private_data; \ + sf->private = inode->i_private; \ + return ret; \ +} \ + \ +static const struct file_operations name##_fops = { \ + .open = name##_open, \ + .read = seq_read, \ + .llseek = seq_lseek, \ + .release = single_release, \ +}; + +/* debugfs.c */ +extern int ceph_debugfs_init(void); +extern void ceph_debugfs_cleanup(void); +extern int ceph_debugfs_client_init(struct ceph_client *client); +extern void ceph_debugfs_client_cleanup(struct ceph_client *client); + +#endif + diff --git a/include/linux/ceph/decode.h b/include/linux/ceph/decode.h new file mode 100644 index 000000000000..c5b6939fb32a --- /dev/null +++ b/include/linux/ceph/decode.h @@ -0,0 +1,201 @@ +#ifndef __CEPH_DECODE_H +#define __CEPH_DECODE_H + +#include +#include + +#include "types.h" + +/* + * in all cases, + * void **p pointer to position pointer + * void *end pointer to end of buffer (last byte + 1) + */ + +static inline u64 ceph_decode_64(void **p) +{ + u64 v = get_unaligned_le64(*p); + *p += sizeof(u64); + return v; +} +static inline u32 ceph_decode_32(void **p) +{ + u32 v = get_unaligned_le32(*p); + *p += sizeof(u32); + return v; +} +static inline u16 ceph_decode_16(void **p) +{ + u16 v = get_unaligned_le16(*p); + *p += sizeof(u16); + return v; +} +static inline u8 ceph_decode_8(void **p) +{ + u8 v = *(u8 *)*p; + (*p)++; + return v; +} +static inline void ceph_decode_copy(void **p, void *pv, size_t n) +{ + memcpy(pv, *p, n); + *p += n; +} + +/* + * bounds check input. + */ +#define ceph_decode_need(p, end, n, bad) \ + do { \ + if (unlikely(*(p) + (n) > (end))) \ + goto bad; \ + } while (0) + +#define ceph_decode_64_safe(p, end, v, bad) \ + do { \ + ceph_decode_need(p, end, sizeof(u64), bad); \ + v = ceph_decode_64(p); \ + } while (0) +#define ceph_decode_32_safe(p, end, v, bad) \ + do { \ + ceph_decode_need(p, end, sizeof(u32), bad); \ + v = ceph_decode_32(p); \ + } while (0) +#define ceph_decode_16_safe(p, end, v, bad) \ + do { \ + ceph_decode_need(p, end, sizeof(u16), bad); \ + v = ceph_decode_16(p); \ + } while (0) +#define ceph_decode_8_safe(p, end, v, bad) \ + do { \ + ceph_decode_need(p, end, sizeof(u8), bad); \ + v = ceph_decode_8(p); \ + } while (0) + +#define ceph_decode_copy_safe(p, end, pv, n, bad) \ + do { \ + ceph_decode_need(p, end, n, bad); \ + ceph_decode_copy(p, pv, n); \ + } while (0) + +/* + * struct ceph_timespec <-> struct timespec + */ +static inline void ceph_decode_timespec(struct timespec *ts, + const struct ceph_timespec *tv) +{ + ts->tv_sec = le32_to_cpu(tv->tv_sec); + ts->tv_nsec = le32_to_cpu(tv->tv_nsec); +} +static inline void ceph_encode_timespec(struct ceph_timespec *tv, + const struct timespec *ts) +{ + tv->tv_sec = cpu_to_le32(ts->tv_sec); + tv->tv_nsec = cpu_to_le32(ts->tv_nsec); +} + +/* + * sockaddr_storage <-> ceph_sockaddr + */ +static inline void ceph_encode_addr(struct ceph_entity_addr *a) +{ + __be16 ss_family = htons(a->in_addr.ss_family); + a->in_addr.ss_family = *(__u16 *)&ss_family; +} +static inline void ceph_decode_addr(struct ceph_entity_addr *a) +{ + __be16 ss_family = *(__be16 *)&a->in_addr.ss_family; + a->in_addr.ss_family = ntohs(ss_family); + WARN_ON(a->in_addr.ss_family == 512); +} + +/* + * encoders + */ +static inline void ceph_encode_64(void **p, u64 v) +{ + put_unaligned_le64(v, (__le64 *)*p); + *p += sizeof(u64); +} +static inline void ceph_encode_32(void **p, u32 v) +{ + put_unaligned_le32(v, (__le32 *)*p); + *p += sizeof(u32); +} +static inline void ceph_encode_16(void **p, u16 v) +{ + put_unaligned_le16(v, (__le16 *)*p); + *p += sizeof(u16); +} +static inline void ceph_encode_8(void **p, u8 v) +{ + *(u8 *)*p = v; + (*p)++; +} +static inline void ceph_encode_copy(void **p, const void *s, int len) +{ + memcpy(*p, s, len); + *p += len; +} + +/* + * filepath, string encoders + */ +static inline void ceph_encode_filepath(void **p, void *end, + u64 ino, const char *path) +{ + u32 len = path ? strlen(path) : 0; + BUG_ON(*p + sizeof(ino) + sizeof(len) + len > end); + ceph_encode_8(p, 1); + ceph_encode_64(p, ino); + ceph_encode_32(p, len); + if (len) + memcpy(*p, path, len); + *p += len; +} + +static inline void ceph_encode_string(void **p, void *end, + const char *s, u32 len) +{ + BUG_ON(*p + sizeof(len) + len > end); + ceph_encode_32(p, len); + if (len) + memcpy(*p, s, len); + *p += len; +} + +#define ceph_encode_need(p, end, n, bad) \ + do { \ + if (unlikely(*(p) + (n) > (end))) \ + goto bad; \ + } while (0) + +#define ceph_encode_64_safe(p, end, v, bad) \ + do { \ + ceph_encode_need(p, end, sizeof(u64), bad); \ + ceph_encode_64(p, v); \ + } while (0) +#define ceph_encode_32_safe(p, end, v, bad) \ + do { \ + ceph_encode_need(p, end, sizeof(u32), bad); \ + ceph_encode_32(p, v); \ + } while (0) +#define ceph_encode_16_safe(p, end, v, bad) \ + do { \ + ceph_encode_need(p, end, sizeof(u16), bad); \ + ceph_encode_16(p, v); \ + } while (0) + +#define ceph_encode_copy_safe(p, end, pv, n, bad) \ + do { \ + ceph_encode_need(p, end, n, bad); \ + ceph_encode_copy(p, pv, n); \ + } while (0) +#define ceph_encode_string_safe(p, end, s, n, bad) \ + do { \ + ceph_encode_need(p, end, n, bad); \ + ceph_encode_string(p, end, s, n); \ + } while (0) + + +#endif diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h new file mode 100644 index 000000000000..f22b2e941686 --- /dev/null +++ b/include/linux/ceph/libceph.h @@ -0,0 +1,249 @@ +#ifndef _FS_CEPH_LIBCEPH_H +#define _FS_CEPH_LIBCEPH_H + +#include "ceph_debug.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "types.h" +#include "messenger.h" +#include "msgpool.h" +#include "mon_client.h" +#include "osd_client.h" +#include "ceph_fs.h" + +/* + * Supported features + */ +#define CEPH_FEATURE_SUPPORTED_DEFAULT CEPH_FEATURE_NOSRCADDR +#define CEPH_FEATURE_REQUIRED_DEFAULT CEPH_FEATURE_NOSRCADDR + +/* + * mount options + */ +#define CEPH_OPT_FSID (1<<0) +#define CEPH_OPT_NOSHARE (1<<1) /* don't share client with other sbs */ +#define CEPH_OPT_MYIP (1<<2) /* specified my ip */ +#define CEPH_OPT_NOCRC (1<<3) /* no data crc on writes */ + +#define CEPH_OPT_DEFAULT (0); + +#define ceph_set_opt(client, opt) \ + (client)->options->flags |= CEPH_OPT_##opt; +#define ceph_test_opt(client, opt) \ + (!!((client)->options->flags & CEPH_OPT_##opt)) + +struct ceph_options { + int flags; + struct ceph_fsid fsid; + struct ceph_entity_addr my_addr; + int mount_timeout; + int osd_idle_ttl; + int osd_timeout; + int osd_keepalive_timeout; + + /* + * any type that can't be simply compared or doesn't need need + * to be compared should go beyond this point, + * ceph_compare_options() should be updated accordingly + */ + + struct ceph_entity_addr *mon_addr; /* should be the first + pointer type of args */ + int num_mon; + char *name; + char *secret; +}; + +/* + * defaults + */ +#define CEPH_MOUNT_TIMEOUT_DEFAULT 60 +#define CEPH_OSD_TIMEOUT_DEFAULT 60 /* seconds */ +#define CEPH_OSD_KEEPALIVE_DEFAULT 5 +#define CEPH_OSD_IDLE_TTL_DEFAULT 60 +#define CEPH_MOUNT_RSIZE_DEFAULT (512*1024) /* readahead */ + +#define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024) +#define CEPH_MSG_MAX_DATA_LEN (16*1024*1024) + +#define CEPH_AUTH_NAME_DEFAULT "guest" + +/* + * Delay telling the MDS we no longer want caps, in case we reopen + * the file. Delay a minimum amount of time, even if we send a cap + * message for some other reason. Otherwise, take the oppotunity to + * update the mds to avoid sending another message later. + */ +#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */ +#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */ + +#define CEPH_CAP_RELEASE_SAFETY_DEFAULT (CEPH_CAPS_PER_RELEASE * 4) + +/* mount state */ +enum { + CEPH_MOUNT_MOUNTING, + CEPH_MOUNT_MOUNTED, + CEPH_MOUNT_UNMOUNTING, + CEPH_MOUNT_UNMOUNTED, + CEPH_MOUNT_SHUTDOWN, +}; + +/* + * subtract jiffies + */ +static inline unsigned long time_sub(unsigned long a, unsigned long b) +{ + BUG_ON(time_after(b, a)); + return (long)a - (long)b; +} + +struct ceph_mds_client; + +/* + * per client state + * + * possibly shared by multiple mount points, if they are + * mounting the same ceph filesystem/cluster. + */ +struct ceph_client { + struct ceph_fsid fsid; + bool have_fsid; + + void *private; + + struct ceph_options *options; + + struct mutex mount_mutex; /* serialize mount attempts */ + wait_queue_head_t auth_wq; + int auth_err; + + int (*extra_mon_dispatch)(struct ceph_client *, struct ceph_msg *); + + u32 supported_features; + u32 required_features; + + struct ceph_messenger *msgr; /* messenger instance */ + struct ceph_mon_client monc; + struct ceph_osd_client osdc; + +#ifdef CONFIG_DEBUG_FS + struct dentry *debugfs_dir; + struct dentry *debugfs_monmap; + struct dentry *debugfs_osdmap; +#endif +}; + + + +/* + * snapshots + */ + +/* + * A "snap context" is the set of existing snapshots when we + * write data. It is used by the OSD to guide its COW behavior. + * + * The ceph_snap_context is refcounted, and attached to each dirty + * page, indicating which context the dirty data belonged when it was + * dirtied. + */ +struct ceph_snap_context { + atomic_t nref; + u64 seq; + int num_snaps; + u64 snaps[]; +}; + +static inline struct ceph_snap_context * +ceph_get_snap_context(struct ceph_snap_context *sc) +{ + /* + printk("get_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref), + atomic_read(&sc->nref)+1); + */ + if (sc) + atomic_inc(&sc->nref); + return sc; +} + +static inline void ceph_put_snap_context(struct ceph_snap_context *sc) +{ + if (!sc) + return; + /* + printk("put_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref), + atomic_read(&sc->nref)-1); + */ + if (atomic_dec_and_test(&sc->nref)) { + /*printk(" deleting snap_context %p\n", sc);*/ + kfree(sc); + } +} + +/* + * calculate the number of pages a given length and offset map onto, + * if we align the data. + */ +static inline int calc_pages_for(u64 off, u64 len) +{ + return ((off+len+PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT) - + (off >> PAGE_CACHE_SHIFT); +} + +/* ceph_common.c */ +extern const char *ceph_msg_type_name(int type); +extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid); +extern struct kmem_cache *ceph_inode_cachep; +extern struct kmem_cache *ceph_cap_cachep; +extern struct kmem_cache *ceph_dentry_cachep; +extern struct kmem_cache *ceph_file_cachep; + +extern int ceph_parse_options(struct ceph_options **popt, char *options, + const char *dev_name, const char *dev_name_end, + int (*parse_extra_token)(char *c, void *private), + void *private); +extern void ceph_destroy_options(struct ceph_options *opt); +extern int ceph_compare_options(struct ceph_options *new_opt, + struct ceph_client *client); +extern struct ceph_client *ceph_create_client(struct ceph_options *opt, + void *private); +extern u64 ceph_client_id(struct ceph_client *client); +extern void ceph_destroy_client(struct ceph_client *client); +extern int __ceph_open_session(struct ceph_client *client, + unsigned long started); +extern int ceph_open_session(struct ceph_client *client); + +/* pagevec.c */ +extern void ceph_release_page_vector(struct page **pages, int num_pages); + +extern struct page **ceph_get_direct_page_vector(const char __user *data, + int num_pages, + loff_t off, size_t len); +extern void ceph_put_page_vector(struct page **pages, int num_pages); +extern void ceph_release_page_vector(struct page **pages, int num_pages); +extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags); +extern int ceph_copy_user_to_page_vector(struct page **pages, + const char __user *data, + loff_t off, size_t len); +extern int ceph_copy_to_page_vector(struct page **pages, + const char *data, + loff_t off, size_t len); +extern int ceph_copy_from_page_vector(struct page **pages, + char *data, + loff_t off, size_t len); +extern int ceph_copy_page_vector_to_user(struct page **pages, char __user *data, + loff_t off, size_t len); +extern void ceph_zero_page_vector_range(int off, int len, struct page **pages); + + +#endif /* _FS_CEPH_SUPER_H */ diff --git a/include/linux/ceph/mdsmap.h b/include/linux/ceph/mdsmap.h new file mode 100644 index 000000000000..4c5cb0880bba --- /dev/null +++ b/include/linux/ceph/mdsmap.h @@ -0,0 +1,62 @@ +#ifndef _FS_CEPH_MDSMAP_H +#define _FS_CEPH_MDSMAP_H + +#include "types.h" + +/* + * mds map - describe servers in the mds cluster. + * + * we limit fields to those the client actually xcares about + */ +struct ceph_mds_info { + u64 global_id; + struct ceph_entity_addr addr; + s32 state; + int num_export_targets; + bool laggy; + u32 *export_targets; +}; + +struct ceph_mdsmap { + u32 m_epoch, m_client_epoch, m_last_failure; + u32 m_root; + u32 m_session_timeout; /* seconds */ + u32 m_session_autoclose; /* seconds */ + u64 m_max_file_size; + u32 m_max_mds; /* size of m_addr, m_state arrays */ + struct ceph_mds_info *m_info; + + /* which object pools file data can be stored in */ + int m_num_data_pg_pools; + u32 *m_data_pg_pools; + u32 m_cas_pg_pool; +}; + +static inline struct ceph_entity_addr * +ceph_mdsmap_get_addr(struct ceph_mdsmap *m, int w) +{ + if (w >= m->m_max_mds) + return NULL; + return &m->m_info[w].addr; +} + +static inline int ceph_mdsmap_get_state(struct ceph_mdsmap *m, int w) +{ + BUG_ON(w < 0); + if (w >= m->m_max_mds) + return CEPH_MDS_STATE_DNE; + return m->m_info[w].state; +} + +static inline bool ceph_mdsmap_is_laggy(struct ceph_mdsmap *m, int w) +{ + if (w >= 0 && w < m->m_max_mds) + return m->m_info[w].laggy; + return false; +} + +extern int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m); +extern struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end); +extern void ceph_mdsmap_destroy(struct ceph_mdsmap *m); + +#endif diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h new file mode 100644 index 000000000000..5956d62c3057 --- /dev/null +++ b/include/linux/ceph/messenger.h @@ -0,0 +1,261 @@ +#ifndef __FS_CEPH_MESSENGER_H +#define __FS_CEPH_MESSENGER_H + +#include +#include +#include +#include +#include +#include +#include + +#include "types.h" +#include "buffer.h" + +struct ceph_msg; +struct ceph_connection; + +extern struct workqueue_struct *ceph_msgr_wq; /* receive work queue */ + +/* + * Ceph defines these callbacks for handling connection events. + */ +struct ceph_connection_operations { + struct ceph_connection *(*get)(struct ceph_connection *); + void (*put)(struct ceph_connection *); + + /* handle an incoming message. */ + void (*dispatch) (struct ceph_connection *con, struct ceph_msg *m); + + /* authorize an outgoing connection */ + int (*get_authorizer) (struct ceph_connection *con, + void **buf, int *len, int *proto, + void **reply_buf, int *reply_len, int force_new); + int (*verify_authorizer_reply) (struct ceph_connection *con, int len); + int (*invalidate_authorizer)(struct ceph_connection *con); + + /* protocol version mismatch */ + void (*bad_proto) (struct ceph_connection *con); + + /* there was some error on the socket (disconnect, whatever) */ + void (*fault) (struct ceph_connection *con); + + /* a remote host as terminated a message exchange session, and messages + * we sent (or they tried to send us) may be lost. */ + void (*peer_reset) (struct ceph_connection *con); + + struct ceph_msg * (*alloc_msg) (struct ceph_connection *con, + struct ceph_msg_header *hdr, + int *skip); +}; + +/* use format string %s%d */ +#define ENTITY_NAME(n) ceph_entity_type_name((n).type), le64_to_cpu((n).num) + +struct ceph_messenger { + struct ceph_entity_inst inst; /* my name+address */ + struct ceph_entity_addr my_enc_addr; + struct page *zero_page; /* used in certain error cases */ + + bool nocrc; + + /* + * the global_seq counts connections i (attempt to) initiate + * in order to disambiguate certain connect race conditions. + */ + u32 global_seq; + spinlock_t global_seq_lock; + + u32 supported_features; + u32 required_features; +}; + +/* + * a single message. it contains a header (src, dest, message type, etc.), + * footer (crc values, mainly), a "front" message body, and possibly a + * data payload (stored in some number of pages). + */ +struct ceph_msg { + struct ceph_msg_header hdr; /* header */ + struct ceph_msg_footer footer; /* footer */ + struct kvec front; /* unaligned blobs of message */ + struct ceph_buffer *middle; + struct page **pages; /* data payload. NOT OWNER. */ + unsigned nr_pages; /* size of page array */ + struct ceph_pagelist *pagelist; /* instead of pages */ + struct list_head list_head; + struct kref kref; + struct bio *bio; /* instead of pages/pagelist */ + struct bio *bio_iter; /* bio iterator */ + int bio_seg; /* current bio segment */ + struct ceph_pagelist *trail; /* the trailing part of the data */ + bool front_is_vmalloc; + bool more_to_follow; + bool needs_out_seq; + int front_max; + + struct ceph_msgpool *pool; +}; + +struct ceph_msg_pos { + int page, page_pos; /* which page; offset in page */ + int data_pos; /* offset in data payload */ + int did_page_crc; /* true if we've calculated crc for current page */ +}; + +/* ceph connection fault delay defaults, for exponential backoff */ +#define BASE_DELAY_INTERVAL (HZ/2) +#define MAX_DELAY_INTERVAL (5 * 60 * HZ) + +/* + * ceph_connection state bit flags + * + * QUEUED and BUSY are used together to ensure that only a single + * thread is currently opening, reading or writing data to the socket. + */ +#define LOSSYTX 0 /* we can close channel or drop messages on errors */ +#define CONNECTING 1 +#define NEGOTIATING 2 +#define KEEPALIVE_PENDING 3 +#define WRITE_PENDING 4 /* we have data ready to send */ +#define QUEUED 5 /* there is work queued on this connection */ +#define BUSY 6 /* work is being done */ +#define STANDBY 8 /* no outgoing messages, socket closed. we keep + * the ceph_connection around to maintain shared + * state with the peer. */ +#define CLOSED 10 /* we've closed the connection */ +#define SOCK_CLOSED 11 /* socket state changed to closed */ +#define OPENING 13 /* open connection w/ (possibly new) peer */ +#define DEAD 14 /* dead, about to kfree */ + +/* + * A single connection with another host. + * + * We maintain a queue of outgoing messages, and some session state to + * ensure that we can preserve the lossless, ordered delivery of + * messages in the case of a TCP disconnect. + */ +struct ceph_connection { + void *private; + atomic_t nref; + + const struct ceph_connection_operations *ops; + + struct ceph_messenger *msgr; + struct socket *sock; + unsigned long state; /* connection state (see flags above) */ + const char *error_msg; /* error message, if any */ + + struct ceph_entity_addr peer_addr; /* peer address */ + struct ceph_entity_name peer_name; /* peer name */ + struct ceph_entity_addr peer_addr_for_me; + unsigned peer_features; + u32 connect_seq; /* identify the most recent connection + attempt for this connection, client */ + u32 peer_global_seq; /* peer's global seq for this connection */ + + int auth_retry; /* true if we need a newer authorizer */ + void *auth_reply_buf; /* where to put the authorizer reply */ + int auth_reply_buf_len; + + struct mutex mutex; + + /* out queue */ + struct list_head out_queue; + struct list_head out_sent; /* sending or sent but unacked */ + u64 out_seq; /* last message queued for send */ + bool out_keepalive_pending; + + u64 in_seq, in_seq_acked; /* last message received, acked */ + + /* connection negotiation temps */ + char in_banner[CEPH_BANNER_MAX_LEN]; + union { + struct { /* outgoing connection */ + struct ceph_msg_connect out_connect; + struct ceph_msg_connect_reply in_reply; + }; + struct { /* incoming */ + struct ceph_msg_connect in_connect; + struct ceph_msg_connect_reply out_reply; + }; + }; + struct ceph_entity_addr actual_peer_addr; + + /* message out temps */ + struct ceph_msg *out_msg; /* sending message (== tail of + out_sent) */ + bool out_msg_done; + struct ceph_msg_pos out_msg_pos; + + struct kvec out_kvec[8], /* sending header/footer data */ + *out_kvec_cur; + int out_kvec_left; /* kvec's left in out_kvec */ + int out_skip; /* skip this many bytes */ + int out_kvec_bytes; /* total bytes left */ + bool out_kvec_is_msg; /* kvec refers to out_msg */ + int out_more; /* there is more data after the kvecs */ + __le64 out_temp_ack; /* for writing an ack */ + + /* message in temps */ + struct ceph_msg_header in_hdr; + struct ceph_msg *in_msg; + struct ceph_msg_pos in_msg_pos; + u32 in_front_crc, in_middle_crc, in_data_crc; /* calculated crc */ + + char in_tag; /* protocol control byte */ + int in_base_pos; /* bytes read */ + __le64 in_temp_ack; /* for reading an ack */ + + struct delayed_work work; /* send|recv work */ + unsigned long delay; /* current delay interval */ +}; + + +extern const char *ceph_pr_addr(const struct sockaddr_storage *ss); +extern int ceph_parse_ips(const char *c, const char *end, + struct ceph_entity_addr *addr, + int max_count, int *count); + + +extern int ceph_msgr_init(void); +extern void ceph_msgr_exit(void); +extern void ceph_msgr_flush(void); + +extern struct ceph_messenger *ceph_messenger_create( + struct ceph_entity_addr *myaddr, + u32 features, u32 required); +extern void ceph_messenger_destroy(struct ceph_messenger *); + +extern void ceph_con_init(struct ceph_messenger *msgr, + struct ceph_connection *con); +extern void ceph_con_open(struct ceph_connection *con, + struct ceph_entity_addr *addr); +extern bool ceph_con_opened(struct ceph_connection *con); +extern void ceph_con_close(struct ceph_connection *con); +extern void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg); +extern void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg); +extern void ceph_con_revoke_message(struct ceph_connection *con, + struct ceph_msg *msg); +extern void ceph_con_keepalive(struct ceph_connection *con); +extern struct ceph_connection *ceph_con_get(struct ceph_connection *con); +extern void ceph_con_put(struct ceph_connection *con); + +extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags); +extern void ceph_msg_kfree(struct ceph_msg *m); + + +static inline struct ceph_msg *ceph_msg_get(struct ceph_msg *msg) +{ + kref_get(&msg->kref); + return msg; +} +extern void ceph_msg_last_put(struct kref *kref); +static inline void ceph_msg_put(struct ceph_msg *msg) +{ + kref_put(&msg->kref, ceph_msg_last_put); +} + +extern void ceph_msg_dump(struct ceph_msg *msg); + +#endif diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h new file mode 100644 index 000000000000..545f85917780 --- /dev/null +++ b/include/linux/ceph/mon_client.h @@ -0,0 +1,122 @@ +#ifndef _FS_CEPH_MON_CLIENT_H +#define _FS_CEPH_MON_CLIENT_H + +#include +#include +#include + +#include "messenger.h" + +struct ceph_client; +struct ceph_mount_args; +struct ceph_auth_client; + +/* + * The monitor map enumerates the set of all monitors. + */ +struct ceph_monmap { + struct ceph_fsid fsid; + u32 epoch; + u32 num_mon; + struct ceph_entity_inst mon_inst[0]; +}; + +struct ceph_mon_client; +struct ceph_mon_generic_request; + + +/* + * Generic mechanism for resending monitor requests. + */ +typedef void (*ceph_monc_request_func_t)(struct ceph_mon_client *monc, + int newmon); + +/* a pending monitor request */ +struct ceph_mon_request { + struct ceph_mon_client *monc; + struct delayed_work delayed_work; + unsigned long delay; + ceph_monc_request_func_t do_request; +}; + +/* + * ceph_mon_generic_request is being used for the statfs and poolop requests + * which are bening done a bit differently because we need to get data back + * to the caller + */ +struct ceph_mon_generic_request { + struct kref kref; + u64 tid; + struct rb_node node; + int result; + void *buf; + int buf_len; + struct completion completion; + struct ceph_msg *request; /* original request */ + struct ceph_msg *reply; /* and reply */ +}; + +struct ceph_mon_client { + struct ceph_client *client; + struct ceph_monmap *monmap; + + struct mutex mutex; + struct delayed_work delayed_work; + + struct ceph_auth_client *auth; + struct ceph_msg *m_auth, *m_auth_reply, *m_subscribe, *m_subscribe_ack; + int pending_auth; + + bool hunting; + int cur_mon; /* last monitor i contacted */ + unsigned long sub_sent, sub_renew_after; + struct ceph_connection *con; + bool have_fsid; + + /* pending generic requests */ + struct rb_root generic_request_tree; + int num_generic_requests; + u64 last_tid; + + /* mds/osd map */ + int want_mdsmap; + int want_next_osdmap; /* 1 = want, 2 = want+asked */ + u32 have_osdmap, have_mdsmap; + +#ifdef CONFIG_DEBUG_FS + struct dentry *debugfs_file; +#endif +}; + +extern struct ceph_monmap *ceph_monmap_decode(void *p, void *end); +extern int ceph_monmap_contains(struct ceph_monmap *m, + struct ceph_entity_addr *addr); + +extern int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl); +extern void ceph_monc_stop(struct ceph_mon_client *monc); + +/* + * The model here is to indicate that we need a new map of at least + * epoch @want, and also call in when we receive a map. We will + * periodically rerequest the map from the monitor cluster until we + * get what we want. + */ +extern int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 have); +extern int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 have); + +extern void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc); + +extern int ceph_monc_do_statfs(struct ceph_mon_client *monc, + struct ceph_statfs *buf); + +extern int ceph_monc_open_session(struct ceph_mon_client *monc); + +extern int ceph_monc_validate_auth(struct ceph_mon_client *monc); + +extern int ceph_monc_create_snapid(struct ceph_mon_client *monc, + u32 pool, u64 *snapid); + +extern int ceph_monc_delete_snapid(struct ceph_mon_client *monc, + u32 pool, u64 snapid); + +#endif diff --git a/include/linux/ceph/msgpool.h b/include/linux/ceph/msgpool.h new file mode 100644 index 000000000000..a362605f9368 --- /dev/null +++ b/include/linux/ceph/msgpool.h @@ -0,0 +1,25 @@ +#ifndef _FS_CEPH_MSGPOOL +#define _FS_CEPH_MSGPOOL + +#include +#include "messenger.h" + +/* + * we use memory pools for preallocating messages we may receive, to + * avoid unexpected OOM conditions. + */ +struct ceph_msgpool { + const char *name; + mempool_t *pool; + int front_len; /* preallocated payload size */ +}; + +extern int ceph_msgpool_init(struct ceph_msgpool *pool, + int front_len, int size, bool blocking, + const char *name); +extern void ceph_msgpool_destroy(struct ceph_msgpool *pool); +extern struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *, + int front_len); +extern void ceph_msgpool_put(struct ceph_msgpool *, struct ceph_msg *); + +#endif diff --git a/include/linux/ceph/msgr.h b/include/linux/ceph/msgr.h new file mode 100644 index 000000000000..680d3d648cac --- /dev/null +++ b/include/linux/ceph/msgr.h @@ -0,0 +1,175 @@ +#ifndef CEPH_MSGR_H +#define CEPH_MSGR_H + +/* + * Data types for message passing layer used by Ceph. + */ + +#define CEPH_MON_PORT 6789 /* default monitor port */ + +/* + * client-side processes will try to bind to ports in this + * range, simply for the benefit of tools like nmap or wireshark + * that would like to identify the protocol. + */ +#define CEPH_PORT_FIRST 6789 +#define CEPH_PORT_START 6800 /* non-monitors start here */ +#define CEPH_PORT_LAST 6900 + +/* + * tcp connection banner. include a protocol version. and adjust + * whenever the wire protocol changes. try to keep this string length + * constant. + */ +#define CEPH_BANNER "ceph v027" +#define CEPH_BANNER_MAX_LEN 30 + + +/* + * Rollover-safe type and comparator for 32-bit sequence numbers. + * Comparator returns -1, 0, or 1. + */ +typedef __u32 ceph_seq_t; + +static inline __s32 ceph_seq_cmp(__u32 a, __u32 b) +{ + return (__s32)a - (__s32)b; +} + + +/* + * entity_name -- logical name for a process participating in the + * network, e.g. 'mds0' or 'osd3'. + */ +struct ceph_entity_name { + __u8 type; /* CEPH_ENTITY_TYPE_* */ + __le64 num; +} __attribute__ ((packed)); + +#define CEPH_ENTITY_TYPE_MON 0x01 +#define CEPH_ENTITY_TYPE_MDS 0x02 +#define CEPH_ENTITY_TYPE_OSD 0x04 +#define CEPH_ENTITY_TYPE_CLIENT 0x08 +#define CEPH_ENTITY_TYPE_AUTH 0x20 + +#define CEPH_ENTITY_TYPE_ANY 0xFF + +extern const char *ceph_entity_type_name(int type); + +/* + * entity_addr -- network address + */ +struct ceph_entity_addr { + __le32 type; + __le32 nonce; /* unique id for process (e.g. pid) */ + struct sockaddr_storage in_addr; +} __attribute__ ((packed)); + +struct ceph_entity_inst { + struct ceph_entity_name name; + struct ceph_entity_addr addr; +} __attribute__ ((packed)); + + +/* used by message exchange protocol */ +#define CEPH_MSGR_TAG_READY 1 /* server->client: ready for messages */ +#define CEPH_MSGR_TAG_RESETSESSION 2 /* server->client: reset, try again */ +#define CEPH_MSGR_TAG_WAIT 3 /* server->client: wait for racing + incoming connection */ +#define CEPH_MSGR_TAG_RETRY_SESSION 4 /* server->client + cseq: try again + with higher cseq */ +#define CEPH_MSGR_TAG_RETRY_GLOBAL 5 /* server->client + gseq: try again + with higher gseq */ +#define CEPH_MSGR_TAG_CLOSE 6 /* closing pipe */ +#define CEPH_MSGR_TAG_MSG 7 /* message */ +#define CEPH_MSGR_TAG_ACK 8 /* message ack */ +#define CEPH_MSGR_TAG_KEEPALIVE 9 /* just a keepalive byte! */ +#define CEPH_MSGR_TAG_BADPROTOVER 10 /* bad protocol version */ +#define CEPH_MSGR_TAG_BADAUTHORIZER 11 /* bad authorizer */ +#define CEPH_MSGR_TAG_FEATURES 12 /* insufficient features */ + + +/* + * connection negotiation + */ +struct ceph_msg_connect { + __le64 features; /* supported feature bits */ + __le32 host_type; /* CEPH_ENTITY_TYPE_* */ + __le32 global_seq; /* count connections initiated by this host */ + __le32 connect_seq; /* count connections initiated in this session */ + __le32 protocol_version; + __le32 authorizer_protocol; + __le32 authorizer_len; + __u8 flags; /* CEPH_MSG_CONNECT_* */ +} __attribute__ ((packed)); + +struct ceph_msg_connect_reply { + __u8 tag; + __le64 features; /* feature bits for this session */ + __le32 global_seq; + __le32 connect_seq; + __le32 protocol_version; + __le32 authorizer_len; + __u8 flags; +} __attribute__ ((packed)); + +#define CEPH_MSG_CONNECT_LOSSY 1 /* messages i send may be safely dropped */ + + +/* + * message header + */ +struct ceph_msg_header_old { + __le64 seq; /* message seq# for this session */ + __le64 tid; /* transaction id */ + __le16 type; /* message type */ + __le16 priority; /* priority. higher value == higher priority */ + __le16 version; /* version of message encoding */ + + __le32 front_len; /* bytes in main payload */ + __le32 middle_len;/* bytes in middle payload */ + __le32 data_len; /* bytes of data payload */ + __le16 data_off; /* sender: include full offset; + receiver: mask against ~PAGE_MASK */ + + struct ceph_entity_inst src, orig_src; + __le32 reserved; + __le32 crc; /* header crc32c */ +} __attribute__ ((packed)); + +struct ceph_msg_header { + __le64 seq; /* message seq# for this session */ + __le64 tid; /* transaction id */ + __le16 type; /* message type */ + __le16 priority; /* priority. higher value == higher priority */ + __le16 version; /* version of message encoding */ + + __le32 front_len; /* bytes in main payload */ + __le32 middle_len;/* bytes in middle payload */ + __le32 data_len; /* bytes of data payload */ + __le16 data_off; /* sender: include full offset; + receiver: mask against ~PAGE_MASK */ + + struct ceph_entity_name src; + __le32 reserved; + __le32 crc; /* header crc32c */ +} __attribute__ ((packed)); + +#define CEPH_MSG_PRIO_LOW 64 +#define CEPH_MSG_PRIO_DEFAULT 127 +#define CEPH_MSG_PRIO_HIGH 196 +#define CEPH_MSG_PRIO_HIGHEST 255 + +/* + * follows data payload + */ +struct ceph_msg_footer { + __le32 front_crc, middle_crc, data_crc; + __u8 flags; +} __attribute__ ((packed)); + +#define CEPH_MSG_FOOTER_COMPLETE (1<<0) /* msg wasn't aborted */ +#define CEPH_MSG_FOOTER_NOCRC (1<<1) /* no data crc */ + + +#endif diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h new file mode 100644 index 000000000000..6c91fb032c39 --- /dev/null +++ b/include/linux/ceph/osd_client.h @@ -0,0 +1,234 @@ +#ifndef _FS_CEPH_OSD_CLIENT_H +#define _FS_CEPH_OSD_CLIENT_H + +#include +#include +#include +#include + +#include "types.h" +#include "osdmap.h" +#include "messenger.h" + +struct ceph_msg; +struct ceph_snap_context; +struct ceph_osd_request; +struct ceph_osd_client; +struct ceph_authorizer; +struct ceph_pagelist; + +/* + * completion callback for async writepages + */ +typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *, + struct ceph_msg *); + +/* a given osd we're communicating with */ +struct ceph_osd { + atomic_t o_ref; + struct ceph_osd_client *o_osdc; + int o_osd; + int o_incarnation; + struct rb_node o_node; + struct ceph_connection o_con; + struct list_head o_requests; + struct list_head o_osd_lru; + struct ceph_authorizer *o_authorizer; + void *o_authorizer_buf, *o_authorizer_reply_buf; + size_t o_authorizer_buf_len, o_authorizer_reply_buf_len; + unsigned long lru_ttl; + int o_marked_for_keepalive; + struct list_head o_keepalive_item; +}; + +/* an in-flight request */ +struct ceph_osd_request { + u64 r_tid; /* unique for this client */ + struct rb_node r_node; + struct list_head r_req_lru_item; + struct list_head r_osd_item; + struct ceph_osd *r_osd; + struct ceph_pg r_pgid; + int r_pg_osds[CEPH_PG_MAX_SIZE]; + int r_num_pg_osds; + + struct ceph_connection *r_con_filling_msg; + + struct ceph_msg *r_request, *r_reply; + int r_result; + int r_flags; /* any additional flags for the osd */ + u32 r_sent; /* >0 if r_request is sending/sent */ + int r_got_reply; + + struct ceph_osd_client *r_osdc; + struct kref r_kref; + bool r_mempool; + struct completion r_completion, r_safe_completion; + ceph_osdc_callback_t r_callback, r_safe_callback; + struct ceph_eversion r_reassert_version; + struct list_head r_unsafe_item; + + struct inode *r_inode; /* for use by callbacks */ + void *r_priv; /* ditto */ + + char r_oid[40]; /* object name */ + int r_oid_len; + unsigned long r_stamp; /* send OR check time */ + bool r_resend; /* msg send failed, needs retry */ + + struct ceph_file_layout r_file_layout; + struct ceph_snap_context *r_snapc; /* snap context for writes */ + unsigned r_num_pages; /* size of page array (follows) */ + struct page **r_pages; /* pages for data payload */ + int r_pages_from_pool; + int r_own_pages; /* if true, i own page list */ +#ifdef CONFIG_BLOCK + struct bio *r_bio; /* instead of pages */ +#endif + + struct ceph_pagelist *r_trail; /* trailing part of the data */ +}; + +struct ceph_osd_client { + struct ceph_client *client; + + struct ceph_osdmap *osdmap; /* current map */ + struct rw_semaphore map_sem; + struct completion map_waiters; + u64 last_requested_map; + + struct mutex request_mutex; + struct rb_root osds; /* osds */ + struct list_head osd_lru; /* idle osds */ + u64 timeout_tid; /* tid of timeout triggering rq */ + u64 last_tid; /* tid of last request */ + struct rb_root requests; /* pending requests */ + struct list_head req_lru; /* pending requests lru */ + int num_requests; + struct delayed_work timeout_work; + struct delayed_work osds_timeout_work; +#ifdef CONFIG_DEBUG_FS + struct dentry *debugfs_file; +#endif + + mempool_t *req_mempool; + + struct ceph_msgpool msgpool_op; + struct ceph_msgpool msgpool_op_reply; +}; + +struct ceph_osd_req_op { + u16 op; /* CEPH_OSD_OP_* */ + u32 flags; /* CEPH_OSD_FLAG_* */ + union { + struct { + u64 offset, length; + u64 truncate_size; + u32 truncate_seq; + } extent; + struct { + const char *name; + u32 name_len; + const char *val; + u32 value_len; + __u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */ + __u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */ + } xattr; + struct { + const char *class_name; + __u8 class_len; + const char *method_name; + __u8 method_len; + __u8 argc; + const char *indata; + u32 indata_len; + } cls; + struct { + u64 cookie, count; + } pgls; + struct { + u64 snapid; + } snap; + }; + u32 payload_len; +}; + +extern int ceph_osdc_init(struct ceph_osd_client *osdc, + struct ceph_client *client); +extern void ceph_osdc_stop(struct ceph_osd_client *osdc); + +extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc, + struct ceph_msg *msg); +extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc, + struct ceph_msg *msg); + +extern void ceph_calc_raw_layout(struct ceph_osd_client *osdc, + struct ceph_file_layout *layout, + u64 snapid, + u64 off, u64 *plen, u64 *bno, + struct ceph_osd_request *req, + struct ceph_osd_req_op *op); + +extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, + int flags, + struct ceph_snap_context *snapc, + struct ceph_osd_req_op *ops, + bool use_mempool, + gfp_t gfp_flags, + struct page **pages, + struct bio *bio); + +extern void ceph_osdc_build_request(struct ceph_osd_request *req, + u64 off, u64 *plen, + struct ceph_osd_req_op *src_ops, + struct ceph_snap_context *snapc, + struct timespec *mtime, + const char *oid, + int oid_len); + +extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *, + struct ceph_file_layout *layout, + struct ceph_vino vino, + u64 offset, u64 *len, int op, int flags, + struct ceph_snap_context *snapc, + int do_sync, u32 truncate_seq, + u64 truncate_size, + struct timespec *mtime, + bool use_mempool, int num_reply); + +static inline void ceph_osdc_get_request(struct ceph_osd_request *req) +{ + kref_get(&req->r_kref); +} +extern void ceph_osdc_release_request(struct kref *kref); +static inline void ceph_osdc_put_request(struct ceph_osd_request *req) +{ + kref_put(&req->r_kref, ceph_osdc_release_request); +} + +extern int ceph_osdc_start_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *req, + bool nofail); +extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *req); +extern void ceph_osdc_sync(struct ceph_osd_client *osdc); + +extern int ceph_osdc_readpages(struct ceph_osd_client *osdc, + struct ceph_vino vino, + struct ceph_file_layout *layout, + u64 off, u64 *plen, + u32 truncate_seq, u64 truncate_size, + struct page **pages, int nr_pages); + +extern int ceph_osdc_writepages(struct ceph_osd_client *osdc, + struct ceph_vino vino, + struct ceph_file_layout *layout, + struct ceph_snap_context *sc, + u64 off, u64 len, + u32 truncate_seq, u64 truncate_size, + struct timespec *mtime, + struct page **pages, int nr_pages, + int flags, int do_sync, bool nofail); + +#endif + diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h new file mode 100644 index 000000000000..ba4c205cbb01 --- /dev/null +++ b/include/linux/ceph/osdmap.h @@ -0,0 +1,130 @@ +#ifndef _FS_CEPH_OSDMAP_H +#define _FS_CEPH_OSDMAP_H + +#include +#include "types.h" +#include "ceph_fs.h" +#include + +/* + * The osd map describes the current membership of the osd cluster and + * specifies the mapping of objects to placement groups and placement + * groups to (sets of) osds. That is, it completely specifies the + * (desired) distribution of all data objects in the system at some + * point in time. + * + * Each map version is identified by an epoch, which increases monotonically. + * + * The map can be updated either via an incremental map (diff) describing + * the change between two successive epochs, or as a fully encoded map. + */ +struct ceph_pg_pool_info { + struct rb_node node; + int id; + struct ceph_pg_pool v; + int pg_num_mask, pgp_num_mask, lpg_num_mask, lpgp_num_mask; + char *name; +}; + +struct ceph_pg_mapping { + struct rb_node node; + struct ceph_pg pgid; + int len; + int osds[]; +}; + +struct ceph_osdmap { + struct ceph_fsid fsid; + u32 epoch; + u32 mkfs_epoch; + struct ceph_timespec created, modified; + + u32 flags; /* CEPH_OSDMAP_* */ + + u32 max_osd; /* size of osd_state, _offload, _addr arrays */ + u8 *osd_state; /* CEPH_OSD_* */ + u32 *osd_weight; /* 0 = failed, 0x10000 = 100% normal */ + struct ceph_entity_addr *osd_addr; + + struct rb_root pg_temp; + struct rb_root pg_pools; + u32 pool_max; + + /* the CRUSH map specifies the mapping of placement groups to + * the list of osds that store+replicate them. */ + struct crush_map *crush; +}; + +/* + * file layout helpers + */ +#define ceph_file_layout_su(l) ((__s32)le32_to_cpu((l).fl_stripe_unit)) +#define ceph_file_layout_stripe_count(l) \ + ((__s32)le32_to_cpu((l).fl_stripe_count)) +#define ceph_file_layout_object_size(l) ((__s32)le32_to_cpu((l).fl_object_size)) +#define ceph_file_layout_cas_hash(l) ((__s32)le32_to_cpu((l).fl_cas_hash)) +#define ceph_file_layout_object_su(l) \ + ((__s32)le32_to_cpu((l).fl_object_stripe_unit)) +#define ceph_file_layout_pg_preferred(l) \ + ((__s32)le32_to_cpu((l).fl_pg_preferred)) +#define ceph_file_layout_pg_pool(l) \ + ((__s32)le32_to_cpu((l).fl_pg_pool)) + +static inline unsigned ceph_file_layout_stripe_width(struct ceph_file_layout *l) +{ + return le32_to_cpu(l->fl_stripe_unit) * + le32_to_cpu(l->fl_stripe_count); +} + +/* "period" == bytes before i start on a new set of objects */ +static inline unsigned ceph_file_layout_period(struct ceph_file_layout *l) +{ + return le32_to_cpu(l->fl_object_size) * + le32_to_cpu(l->fl_stripe_count); +} + + +static inline int ceph_osd_is_up(struct ceph_osdmap *map, int osd) +{ + return (osd < map->max_osd) && (map->osd_state[osd] & CEPH_OSD_UP); +} + +static inline bool ceph_osdmap_flag(struct ceph_osdmap *map, int flag) +{ + return map && (map->flags & flag); +} + +extern char *ceph_osdmap_state_str(char *str, int len, int state); + +static inline struct ceph_entity_addr *ceph_osd_addr(struct ceph_osdmap *map, + int osd) +{ + if (osd >= map->max_osd) + return NULL; + return &map->osd_addr[osd]; +} + +extern struct ceph_osdmap *osdmap_decode(void **p, void *end); +extern struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, + struct ceph_osdmap *map, + struct ceph_messenger *msgr); +extern void ceph_osdmap_destroy(struct ceph_osdmap *map); + +/* calculate mapping of a file extent to an object */ +extern void ceph_calc_file_object_mapping(struct ceph_file_layout *layout, + u64 off, u64 *plen, + u64 *bno, u64 *oxoff, u64 *oxlen); + +/* calculate mapping of object to a placement group */ +extern int ceph_calc_object_layout(struct ceph_object_layout *ol, + const char *oid, + struct ceph_file_layout *fl, + struct ceph_osdmap *osdmap); +extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid, + int *acting); +extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, + struct ceph_pg pgid); + +extern int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name); + +#endif diff --git a/include/linux/ceph/pagelist.h b/include/linux/ceph/pagelist.h new file mode 100644 index 000000000000..cc9327aa1c98 --- /dev/null +++ b/include/linux/ceph/pagelist.h @@ -0,0 +1,54 @@ +#ifndef __FS_CEPH_PAGELIST_H +#define __FS_CEPH_PAGELIST_H + +#include + +struct ceph_pagelist { + struct list_head head; + void *mapped_tail; + size_t length; + size_t room; +}; + +static inline void ceph_pagelist_init(struct ceph_pagelist *pl) +{ + INIT_LIST_HEAD(&pl->head); + pl->mapped_tail = NULL; + pl->length = 0; + pl->room = 0; +} +extern int ceph_pagelist_release(struct ceph_pagelist *pl); + +extern int ceph_pagelist_append(struct ceph_pagelist *pl, const void *d, size_t l); + +static inline int ceph_pagelist_encode_64(struct ceph_pagelist *pl, u64 v) +{ + __le64 ev = cpu_to_le64(v); + return ceph_pagelist_append(pl, &ev, sizeof(ev)); +} +static inline int ceph_pagelist_encode_32(struct ceph_pagelist *pl, u32 v) +{ + __le32 ev = cpu_to_le32(v); + return ceph_pagelist_append(pl, &ev, sizeof(ev)); +} +static inline int ceph_pagelist_encode_16(struct ceph_pagelist *pl, u16 v) +{ + __le16 ev = cpu_to_le16(v); + return ceph_pagelist_append(pl, &ev, sizeof(ev)); +} +static inline int ceph_pagelist_encode_8(struct ceph_pagelist *pl, u8 v) +{ + return ceph_pagelist_append(pl, &v, 1); +} +static inline int ceph_pagelist_encode_string(struct ceph_pagelist *pl, + char *s, size_t len) +{ + int ret = ceph_pagelist_encode_32(pl, len); + if (ret) + return ret; + if (len) + return ceph_pagelist_append(pl, s, len); + return 0; +} + +#endif diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h new file mode 100644 index 000000000000..6d5247f2e81b --- /dev/null +++ b/include/linux/ceph/rados.h @@ -0,0 +1,405 @@ +#ifndef CEPH_RADOS_H +#define CEPH_RADOS_H + +/* + * Data types for the Ceph distributed object storage layer RADOS + * (Reliable Autonomic Distributed Object Store). + */ + +#include "msgr.h" + +/* + * osdmap encoding versions + */ +#define CEPH_OSDMAP_INC_VERSION 5 +#define CEPH_OSDMAP_INC_VERSION_EXT 5 +#define CEPH_OSDMAP_VERSION 5 +#define CEPH_OSDMAP_VERSION_EXT 5 + +/* + * fs id + */ +struct ceph_fsid { + unsigned char fsid[16]; +}; + +static inline int ceph_fsid_compare(const struct ceph_fsid *a, + const struct ceph_fsid *b) +{ + return memcmp(a, b, sizeof(*a)); +} + +/* + * ino, object, etc. + */ +typedef __le64 ceph_snapid_t; +#define CEPH_SNAPDIR ((__u64)(-1)) /* reserved for hidden .snap dir */ +#define CEPH_NOSNAP ((__u64)(-2)) /* "head", "live" revision */ +#define CEPH_MAXSNAP ((__u64)(-3)) /* largest valid snapid */ + +struct ceph_timespec { + __le32 tv_sec; + __le32 tv_nsec; +} __attribute__ ((packed)); + + +/* + * object layout - how objects are mapped into PGs + */ +#define CEPH_OBJECT_LAYOUT_HASH 1 +#define CEPH_OBJECT_LAYOUT_LINEAR 2 +#define CEPH_OBJECT_LAYOUT_HASHINO 3 + +/* + * pg layout -- how PGs are mapped onto (sets of) OSDs + */ +#define CEPH_PG_LAYOUT_CRUSH 0 +#define CEPH_PG_LAYOUT_HASH 1 +#define CEPH_PG_LAYOUT_LINEAR 2 +#define CEPH_PG_LAYOUT_HYBRID 3 + +#define CEPH_PG_MAX_SIZE 16 /* max # osds in a single pg */ + +/* + * placement group. + * we encode this into one __le64. + */ +struct ceph_pg { + __le16 preferred; /* preferred primary osd */ + __le16 ps; /* placement seed */ + __le32 pool; /* object pool */ +} __attribute__ ((packed)); + +/* + * pg_pool is a set of pgs storing a pool of objects + * + * pg_num -- base number of pseudorandomly placed pgs + * + * pgp_num -- effective number when calculating pg placement. this + * is used for pg_num increases. new pgs result in data being "split" + * into new pgs. for this to proceed smoothly, new pgs are intiially + * colocated with their parents; that is, pgp_num doesn't increase + * until the new pgs have successfully split. only _then_ are the new + * pgs placed independently. + * + * lpg_num -- localized pg count (per device). replicas are randomly + * selected. + * + * lpgp_num -- as above. + */ +#define CEPH_PG_TYPE_REP 1 +#define CEPH_PG_TYPE_RAID4 2 +#define CEPH_PG_POOL_VERSION 2 +struct ceph_pg_pool { + __u8 type; /* CEPH_PG_TYPE_* */ + __u8 size; /* number of osds in each pg */ + __u8 crush_ruleset; /* crush placement rule */ + __u8 object_hash; /* hash mapping object name to ps */ + __le32 pg_num, pgp_num; /* number of pg's */ + __le32 lpg_num, lpgp_num; /* number of localized pg's */ + __le32 last_change; /* most recent epoch changed */ + __le64 snap_seq; /* seq for per-pool snapshot */ + __le32 snap_epoch; /* epoch of last snap */ + __le32 num_snaps; + __le32 num_removed_snap_intervals; /* if non-empty, NO per-pool snaps */ + __le64 auid; /* who owns the pg */ +} __attribute__ ((packed)); + +/* + * stable_mod func is used to control number of placement groups. + * similar to straight-up modulo, but produces a stable mapping as b + * increases over time. b is the number of bins, and bmask is the + * containing power of 2 minus 1. + * + * b <= bmask and bmask=(2**n)-1 + * e.g., b=12 -> bmask=15, b=123 -> bmask=127 + */ +static inline int ceph_stable_mod(int x, int b, int bmask) +{ + if ((x & bmask) < b) + return x & bmask; + else + return x & (bmask >> 1); +} + +/* + * object layout - how a given object should be stored. + */ +struct ceph_object_layout { + struct ceph_pg ol_pgid; /* raw pg, with _full_ ps precision. */ + __le32 ol_stripe_unit; /* for per-object parity, if any */ +} __attribute__ ((packed)); + +/* + * compound epoch+version, used by storage layer to serialize mutations + */ +struct ceph_eversion { + __le32 epoch; + __le64 version; +} __attribute__ ((packed)); + +/* + * osd map bits + */ + +/* status bits */ +#define CEPH_OSD_EXISTS 1 +#define CEPH_OSD_UP 2 + +/* osd weights. fixed point value: 0x10000 == 1.0 ("in"), 0 == "out" */ +#define CEPH_OSD_IN 0x10000 +#define CEPH_OSD_OUT 0 + + +/* + * osd map flag bits + */ +#define CEPH_OSDMAP_NEARFULL (1<<0) /* sync writes (near ENOSPC) */ +#define CEPH_OSDMAP_FULL (1<<1) /* no data writes (ENOSPC) */ +#define CEPH_OSDMAP_PAUSERD (1<<2) /* pause all reads */ +#define CEPH_OSDMAP_PAUSEWR (1<<3) /* pause all writes */ +#define CEPH_OSDMAP_PAUSEREC (1<<4) /* pause recovery */ + +/* + * osd ops + */ +#define CEPH_OSD_OP_MODE 0xf000 +#define CEPH_OSD_OP_MODE_RD 0x1000 +#define CEPH_OSD_OP_MODE_WR 0x2000 +#define CEPH_OSD_OP_MODE_RMW 0x3000 +#define CEPH_OSD_OP_MODE_SUB 0x4000 + +#define CEPH_OSD_OP_TYPE 0x0f00 +#define CEPH_OSD_OP_TYPE_LOCK 0x0100 +#define CEPH_OSD_OP_TYPE_DATA 0x0200 +#define CEPH_OSD_OP_TYPE_ATTR 0x0300 +#define CEPH_OSD_OP_TYPE_EXEC 0x0400 +#define CEPH_OSD_OP_TYPE_PG 0x0500 + +enum { + /** data **/ + /* read */ + CEPH_OSD_OP_READ = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 1, + CEPH_OSD_OP_STAT = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 2, + + /* fancy read */ + CEPH_OSD_OP_MASKTRUNC = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 4, + + /* write */ + CEPH_OSD_OP_WRITE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 1, + CEPH_OSD_OP_WRITEFULL = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 2, + CEPH_OSD_OP_TRUNCATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 3, + CEPH_OSD_OP_ZERO = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 4, + CEPH_OSD_OP_DELETE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 5, + + /* fancy write */ + CEPH_OSD_OP_APPEND = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 6, + CEPH_OSD_OP_STARTSYNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 7, + CEPH_OSD_OP_SETTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 8, + CEPH_OSD_OP_TRIMTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 9, + + CEPH_OSD_OP_TMAPUP = CEPH_OSD_OP_MODE_RMW | CEPH_OSD_OP_TYPE_DATA | 10, + CEPH_OSD_OP_TMAPPUT = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 11, + CEPH_OSD_OP_TMAPGET = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 12, + + CEPH_OSD_OP_CREATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 13, + CEPH_OSD_OP_ROLLBACK= CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 14, + + /** attrs **/ + /* read */ + CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1, + CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2, + CEPH_OSD_OP_CMPXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 3, + + /* write */ + CEPH_OSD_OP_SETXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1, + CEPH_OSD_OP_SETXATTRS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 2, + CEPH_OSD_OP_RESETXATTRS = CEPH_OSD_OP_MODE_WR|CEPH_OSD_OP_TYPE_ATTR | 3, + CEPH_OSD_OP_RMXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 4, + + /** subop **/ + CEPH_OSD_OP_PULL = CEPH_OSD_OP_MODE_SUB | 1, + CEPH_OSD_OP_PUSH = CEPH_OSD_OP_MODE_SUB | 2, + CEPH_OSD_OP_BALANCEREADS = CEPH_OSD_OP_MODE_SUB | 3, + CEPH_OSD_OP_UNBALANCEREADS = CEPH_OSD_OP_MODE_SUB | 4, + CEPH_OSD_OP_SCRUB = CEPH_OSD_OP_MODE_SUB | 5, + + /** lock **/ + CEPH_OSD_OP_WRLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 1, + CEPH_OSD_OP_WRUNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 2, + CEPH_OSD_OP_RDLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 3, + CEPH_OSD_OP_RDUNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 4, + CEPH_OSD_OP_UPLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 5, + CEPH_OSD_OP_DNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 6, + + /** exec **/ + CEPH_OSD_OP_CALL = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_EXEC | 1, + + /** pg **/ + CEPH_OSD_OP_PGLS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_PG | 1, +}; + +static inline int ceph_osd_op_type_lock(int op) +{ + return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_LOCK; +} +static inline int ceph_osd_op_type_data(int op) +{ + return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_DATA; +} +static inline int ceph_osd_op_type_attr(int op) +{ + return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_ATTR; +} +static inline int ceph_osd_op_type_exec(int op) +{ + return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_EXEC; +} +static inline int ceph_osd_op_type_pg(int op) +{ + return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_PG; +} + +static inline int ceph_osd_op_mode_subop(int op) +{ + return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_SUB; +} +static inline int ceph_osd_op_mode_read(int op) +{ + return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_RD; +} +static inline int ceph_osd_op_mode_modify(int op) +{ + return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_WR; +} + +/* + * note that the following tmap stuff is also defined in the ceph librados.h + * any modification here needs to be updated there + */ +#define CEPH_OSD_TMAP_HDR 'h' +#define CEPH_OSD_TMAP_SET 's' +#define CEPH_OSD_TMAP_RM 'r' + +extern const char *ceph_osd_op_name(int op); + + +/* + * osd op flags + * + * An op may be READ, WRITE, or READ|WRITE. + */ +enum { + CEPH_OSD_FLAG_ACK = 1, /* want (or is) "ack" ack */ + CEPH_OSD_FLAG_ONNVRAM = 2, /* want (or is) "onnvram" ack */ + CEPH_OSD_FLAG_ONDISK = 4, /* want (or is) "ondisk" ack */ + CEPH_OSD_FLAG_RETRY = 8, /* resend attempt */ + CEPH_OSD_FLAG_READ = 16, /* op may read */ + CEPH_OSD_FLAG_WRITE = 32, /* op may write */ + CEPH_OSD_FLAG_ORDERSNAP = 64, /* EOLDSNAP if snapc is out of order */ + CEPH_OSD_FLAG_PEERSTAT = 128, /* msg includes osd_peer_stat */ + CEPH_OSD_FLAG_BALANCE_READS = 256, + CEPH_OSD_FLAG_PARALLELEXEC = 512, /* execute op in parallel */ + CEPH_OSD_FLAG_PGOP = 1024, /* pg op, no object */ + CEPH_OSD_FLAG_EXEC = 2048, /* op may exec */ + CEPH_OSD_FLAG_EXEC_PUBLIC = 4096, /* op may exec (public) */ +}; + +enum { + CEPH_OSD_OP_FLAG_EXCL = 1, /* EXCL object create */ +}; + +#define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc*/ +#define EBLACKLISTED ESHUTDOWN /* blacklisted */ + +/* xattr comparison */ +enum { + CEPH_OSD_CMPXATTR_OP_NOP = 0, + CEPH_OSD_CMPXATTR_OP_EQ = 1, + CEPH_OSD_CMPXATTR_OP_NE = 2, + CEPH_OSD_CMPXATTR_OP_GT = 3, + CEPH_OSD_CMPXATTR_OP_GTE = 4, + CEPH_OSD_CMPXATTR_OP_LT = 5, + CEPH_OSD_CMPXATTR_OP_LTE = 6 +}; + +enum { + CEPH_OSD_CMPXATTR_MODE_STRING = 1, + CEPH_OSD_CMPXATTR_MODE_U64 = 2 +}; + +/* + * an individual object operation. each may be accompanied by some data + * payload + */ +struct ceph_osd_op { + __le16 op; /* CEPH_OSD_OP_* */ + __le32 flags; /* CEPH_OSD_FLAG_* */ + union { + struct { + __le64 offset, length; + __le64 truncate_size; + __le32 truncate_seq; + } __attribute__ ((packed)) extent; + struct { + __le32 name_len; + __le32 value_len; + __u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */ + __u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */ + } __attribute__ ((packed)) xattr; + struct { + __u8 class_len; + __u8 method_len; + __u8 argc; + __le32 indata_len; + } __attribute__ ((packed)) cls; + struct { + __le64 cookie, count; + } __attribute__ ((packed)) pgls; + struct { + __le64 snapid; + } __attribute__ ((packed)) snap; + }; + __le32 payload_len; +} __attribute__ ((packed)); + +/* + * osd request message header. each request may include multiple + * ceph_osd_op object operations. + */ +struct ceph_osd_request_head { + __le32 client_inc; /* client incarnation */ + struct ceph_object_layout layout; /* pgid */ + __le32 osdmap_epoch; /* client's osdmap epoch */ + + __le32 flags; + + struct ceph_timespec mtime; /* for mutations only */ + struct ceph_eversion reassert_version; /* if we are replaying op */ + + __le32 object_len; /* length of object name */ + + __le64 snapid; /* snapid to read */ + __le64 snap_seq; /* writer's snap context */ + __le32 num_snaps; + + __le16 num_ops; + struct ceph_osd_op ops[]; /* followed by ops[], obj, ticket, snaps */ +} __attribute__ ((packed)); + +struct ceph_osd_reply_head { + __le32 client_inc; /* client incarnation */ + __le32 flags; + struct ceph_object_layout layout; + __le32 osdmap_epoch; + struct ceph_eversion reassert_version; /* for replaying uncommitted */ + + __le32 result; /* result code */ + + __le32 object_len; /* length of object name */ + __le32 num_ops; + struct ceph_osd_op ops[0]; /* ops[], object */ +} __attribute__ ((packed)); + + +#endif diff --git a/include/linux/ceph/types.h b/include/linux/ceph/types.h new file mode 100644 index 000000000000..28b35a005ec2 --- /dev/null +++ b/include/linux/ceph/types.h @@ -0,0 +1,29 @@ +#ifndef _FS_CEPH_TYPES_H +#define _FS_CEPH_TYPES_H + +/* needed before including ceph_fs.h */ +#include +#include +#include +#include + +#include "ceph_fs.h" +#include "ceph_frag.h" +#include "ceph_hash.h" + +/* + * Identify inodes by both their ino AND snapshot id (a u64). + */ +struct ceph_vino { + u64 ino; + u64 snap; +}; + + +/* context for the caps reservation mechanism */ +struct ceph_cap_reservation { + int count; +}; + + +#endif diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h new file mode 100644 index 000000000000..97e435b191f4 --- /dev/null +++ b/include/linux/crush/crush.h @@ -0,0 +1,180 @@ +#ifndef CEPH_CRUSH_CRUSH_H +#define CEPH_CRUSH_CRUSH_H + +#include + +/* + * CRUSH is a pseudo-random data distribution algorithm that + * efficiently distributes input values (typically, data objects) + * across a heterogeneous, structured storage cluster. + * + * The algorithm was originally described in detail in this paper + * (although the algorithm has evolved somewhat since then): + * + * http://www.ssrc.ucsc.edu/Papers/weil-sc06.pdf + * + * LGPL2 + */ + + +#define CRUSH_MAGIC 0x00010000ul /* for detecting algorithm revisions */ + + +#define CRUSH_MAX_DEPTH 10 /* max crush hierarchy depth */ +#define CRUSH_MAX_SET 10 /* max size of a mapping result */ + + +/* + * CRUSH uses user-defined "rules" to describe how inputs should be + * mapped to devices. A rule consists of sequence of steps to perform + * to generate the set of output devices. + */ +struct crush_rule_step { + __u32 op; + __s32 arg1; + __s32 arg2; +}; + +/* step op codes */ +enum { + CRUSH_RULE_NOOP = 0, + CRUSH_RULE_TAKE = 1, /* arg1 = value to start with */ + CRUSH_RULE_CHOOSE_FIRSTN = 2, /* arg1 = num items to pick */ + /* arg2 = type */ + CRUSH_RULE_CHOOSE_INDEP = 3, /* same */ + CRUSH_RULE_EMIT = 4, /* no args */ + CRUSH_RULE_CHOOSE_LEAF_FIRSTN = 6, + CRUSH_RULE_CHOOSE_LEAF_INDEP = 7, +}; + +/* + * for specifying choose num (arg1) relative to the max parameter + * passed to do_rule + */ +#define CRUSH_CHOOSE_N 0 +#define CRUSH_CHOOSE_N_MINUS(x) (-(x)) + +/* + * The rule mask is used to describe what the rule is intended for. + * Given a ruleset and size of output set, we search through the + * rule list for a matching rule_mask. + */ +struct crush_rule_mask { + __u8 ruleset; + __u8 type; + __u8 min_size; + __u8 max_size; +}; + +struct crush_rule { + __u32 len; + struct crush_rule_mask mask; + struct crush_rule_step steps[0]; +}; + +#define crush_rule_size(len) (sizeof(struct crush_rule) + \ + (len)*sizeof(struct crush_rule_step)) + + + +/* + * A bucket is a named container of other items (either devices or + * other buckets). Items within a bucket are chosen using one of a + * few different algorithms. The table summarizes how the speed of + * each option measures up against mapping stability when items are + * added or removed. + * + * Bucket Alg Speed Additions Removals + * ------------------------------------------------ + * uniform O(1) poor poor + * list O(n) optimal poor + * tree O(log n) good good + * straw O(n) optimal optimal + */ +enum { + CRUSH_BUCKET_UNIFORM = 1, + CRUSH_BUCKET_LIST = 2, + CRUSH_BUCKET_TREE = 3, + CRUSH_BUCKET_STRAW = 4 +}; +extern const char *crush_bucket_alg_name(int alg); + +struct crush_bucket { + __s32 id; /* this'll be negative */ + __u16 type; /* non-zero; type=0 is reserved for devices */ + __u8 alg; /* one of CRUSH_BUCKET_* */ + __u8 hash; /* which hash function to use, CRUSH_HASH_* */ + __u32 weight; /* 16-bit fixed point */ + __u32 size; /* num items */ + __s32 *items; + + /* + * cached random permutation: used for uniform bucket and for + * the linear search fallback for the other bucket types. + */ + __u32 perm_x; /* @x for which *perm is defined */ + __u32 perm_n; /* num elements of *perm that are permuted/defined */ + __u32 *perm; +}; + +struct crush_bucket_uniform { + struct crush_bucket h; + __u32 item_weight; /* 16-bit fixed point; all items equally weighted */ +}; + +struct crush_bucket_list { + struct crush_bucket h; + __u32 *item_weights; /* 16-bit fixed point */ + __u32 *sum_weights; /* 16-bit fixed point. element i is sum + of weights 0..i, inclusive */ +}; + +struct crush_bucket_tree { + struct crush_bucket h; /* note: h.size is _tree_ size, not number of + actual items */ + __u8 num_nodes; + __u32 *node_weights; +}; + +struct crush_bucket_straw { + struct crush_bucket h; + __u32 *item_weights; /* 16-bit fixed point */ + __u32 *straws; /* 16-bit fixed point */ +}; + + + +/* + * CRUSH map includes all buckets, rules, etc. + */ +struct crush_map { + struct crush_bucket **buckets; + struct crush_rule **rules; + + /* + * Parent pointers to identify the parent bucket a device or + * bucket in the hierarchy. If an item appears more than + * once, this is the _last_ time it appeared (where buckets + * are processed in bucket id order, from -1 on down to + * -max_buckets. + */ + __u32 *bucket_parents; + __u32 *device_parents; + + __s32 max_buckets; + __u32 max_rules; + __s32 max_devices; +}; + + +/* crush.c */ +extern int crush_get_bucket_item_weight(struct crush_bucket *b, int pos); +extern void crush_calc_parents(struct crush_map *map); +extern void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b); +extern void crush_destroy_bucket_list(struct crush_bucket_list *b); +extern void crush_destroy_bucket_tree(struct crush_bucket_tree *b); +extern void crush_destroy_bucket_straw(struct crush_bucket_straw *b); +extern void crush_destroy_bucket(struct crush_bucket *b); +extern void crush_destroy(struct crush_map *map); + +#endif diff --git a/include/linux/crush/hash.h b/include/linux/crush/hash.h new file mode 100644 index 000000000000..91e884230d5d --- /dev/null +++ b/include/linux/crush/hash.h @@ -0,0 +1,17 @@ +#ifndef CEPH_CRUSH_HASH_H +#define CEPH_CRUSH_HASH_H + +#define CRUSH_HASH_RJENKINS1 0 + +#define CRUSH_HASH_DEFAULT CRUSH_HASH_RJENKINS1 + +extern const char *crush_hash_name(int type); + +extern __u32 crush_hash32(int type, __u32 a); +extern __u32 crush_hash32_2(int type, __u32 a, __u32 b); +extern __u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c); +extern __u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d); +extern __u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d, + __u32 e); + +#endif diff --git a/include/linux/crush/mapper.h b/include/linux/crush/mapper.h new file mode 100644 index 000000000000..c46b99c18bb0 --- /dev/null +++ b/include/linux/crush/mapper.h @@ -0,0 +1,20 @@ +#ifndef CEPH_CRUSH_MAPPER_H +#define CEPH_CRUSH_MAPPER_H + +/* + * CRUSH functions for find rules and then mapping an input to an + * output set. + * + * LGPL2 + */ + +#include "crush.h" + +extern int crush_find_rule(struct crush_map *map, int pool, int type, int size); +extern int crush_do_rule(struct crush_map *map, + int ruleno, + int x, int *result, int result_max, + int forcefeed, /* -1 for none */ + __u32 *weights); + +#endif -- cgit v1.2.3 From ac0b74d8a1ced8ea86147467daf06b15b130dd94 Mon Sep 17 00:00:00 2001 From: Greg Farnum Date: Fri, 17 Sep 2010 10:10:55 -0700 Subject: ceph: add pagelist_reserve, pagelist_truncate, pagelist_set_cursor These facilitate preallocation of pages so that we can encode into the pagelist in an atomic context. Signed-off-by: Greg Farnum Signed-off-by: Sage Weil --- include/linux/ceph/pagelist.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ceph/pagelist.h b/include/linux/ceph/pagelist.h index cc9327aa1c98..9660d6b0a35d 100644 --- a/include/linux/ceph/pagelist.h +++ b/include/linux/ceph/pagelist.h @@ -8,6 +8,14 @@ struct ceph_pagelist { void *mapped_tail; size_t length; size_t room; + struct list_head free_list; + size_t num_pages_free; +}; + +struct ceph_pagelist_cursor { + struct ceph_pagelist *pl; /* pagelist, for error checking */ + struct list_head *page_lru; /* page in list */ + size_t room; /* room remaining to reset to */ }; static inline void ceph_pagelist_init(struct ceph_pagelist *pl) @@ -16,11 +24,24 @@ static inline void ceph_pagelist_init(struct ceph_pagelist *pl) pl->mapped_tail = NULL; pl->length = 0; pl->room = 0; + INIT_LIST_HEAD(&pl->free_list); + pl->num_pages_free = 0; } + extern int ceph_pagelist_release(struct ceph_pagelist *pl); extern int ceph_pagelist_append(struct ceph_pagelist *pl, const void *d, size_t l); +extern int ceph_pagelist_reserve(struct ceph_pagelist *pl, size_t space); + +extern int ceph_pagelist_free_reserve(struct ceph_pagelist *pl); + +extern void ceph_pagelist_set_cursor(struct ceph_pagelist *pl, + struct ceph_pagelist_cursor *c); + +extern int ceph_pagelist_truncate(struct ceph_pagelist *pl, + struct ceph_pagelist_cursor *c); + static inline int ceph_pagelist_encode_64(struct ceph_pagelist *pl, u64 v) { __le64 ev = cpu_to_le64(v); -- cgit v1.2.3 From 571dba52a34015a5a7aa5d480a86936878444a6f Mon Sep 17 00:00:00 2001 From: Greg Farnum Date: Fri, 24 Sep 2010 14:56:40 -0700 Subject: ceph: add CEPH_MDS_OP_SETDIRLAYOUT and associated ioctl. Signed-off-by: Sage Weil --- include/linux/ceph/ceph_fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index d5619ac86711..c3c74aef289d 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -299,6 +299,7 @@ enum { CEPH_MDS_OP_SETATTR = 0x01108, CEPH_MDS_OP_SETFILELOCK= 0x01109, CEPH_MDS_OP_GETFILELOCK= 0x00110, + CEPH_MDS_OP_SETDIRLAYOUT=0x0110a, CEPH_MDS_OP_MKNOD = 0x01201, CEPH_MDS_OP_LINK = 0x01202, -- cgit v1.2.3 From b0ae19811375031ae3b3fecc65b702a9c6e5cc28 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Fri, 15 Oct 2010 04:21:18 +0900 Subject: security: remove unused parameter from security_task_setscheduler() All security modules shouldn't change sched_param parameter of security_task_setscheduler(). This is not only meaningless, but also make a harmful result if caller pass a static variable. This patch remove policy and sched_param parameter from security_task_setscheduler() becuase none of security module is using it. Cc: James Morris Signed-off-by: KOSAKI Motohiro Signed-off-by: James Morris --- include/linux/security.h | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/security.h b/include/linux/security.h index a22219afff09..294a0b228123 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -74,7 +74,7 @@ extern int cap_file_mmap(struct file *file, unsigned long reqprot, extern int cap_task_fix_setuid(struct cred *new, const struct cred *old, int flags); extern int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5); -extern int cap_task_setscheduler(struct task_struct *p, int policy, struct sched_param *lp); +extern int cap_task_setscheduler(struct task_struct *p); extern int cap_task_setioprio(struct task_struct *p, int ioprio); extern int cap_task_setnice(struct task_struct *p, int nice); extern int cap_syslog(int type, bool from_file); @@ -1501,8 +1501,7 @@ struct security_operations { int (*task_getioprio) (struct task_struct *p); int (*task_setrlimit) (struct task_struct *p, unsigned int resource, struct rlimit *new_rlim); - int (*task_setscheduler) (struct task_struct *p, int policy, - struct sched_param *lp); + int (*task_setscheduler) (struct task_struct *p); int (*task_getscheduler) (struct task_struct *p); int (*task_movememory) (struct task_struct *p); int (*task_kill) (struct task_struct *p, @@ -1752,8 +1751,7 @@ int security_task_setioprio(struct task_struct *p, int ioprio); int security_task_getioprio(struct task_struct *p); int security_task_setrlimit(struct task_struct *p, unsigned int resource, struct rlimit *new_rlim); -int security_task_setscheduler(struct task_struct *p, - int policy, struct sched_param *lp); +int security_task_setscheduler(struct task_struct *p); int security_task_getscheduler(struct task_struct *p); int security_task_movememory(struct task_struct *p); int security_task_kill(struct task_struct *p, struct siginfo *info, @@ -2320,11 +2318,9 @@ static inline int security_task_setrlimit(struct task_struct *p, return 0; } -static inline int security_task_setscheduler(struct task_struct *p, - int policy, - struct sched_param *lp) +static inline int security_task_setscheduler(struct task_struct *p) { - return cap_task_setscheduler(p, policy, lp); + return cap_task_setscheduler(p); } static inline int security_task_getscheduler(struct task_struct *p) -- cgit v1.2.3 From 2606fd1fa5710205b23ee859563502aa18362447 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Wed, 13 Oct 2010 16:24:41 -0400 Subject: secmark: make secmark object handling generic Right now secmark has lots of direct selinux calls. Use all LSM calls and remove all SELinux specific knowledge. The only SELinux specific knowledge we leave is the mode. The only point is to make sure that other LSMs at least test this generic code before they assume it works. (They may also have to make changes if they do not represent labels as strings) Signed-off-by: Eric Paris Acked-by: Paul Moore Acked-by: Patrick McHardy Signed-off-by: James Morris --- include/linux/netfilter/xt_SECMARK.h | 12 ++----- include/linux/security.h | 25 ++++++++++++++ include/linux/selinux.h | 63 ------------------------------------ 3 files changed, 28 insertions(+), 72 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/xt_SECMARK.h b/include/linux/netfilter/xt_SECMARK.h index 6fcd3448b186..989092bd6274 100644 --- a/include/linux/netfilter/xt_SECMARK.h +++ b/include/linux/netfilter/xt_SECMARK.h @@ -11,18 +11,12 @@ * packets are being marked for. */ #define SECMARK_MODE_SEL 0x01 /* SELinux */ -#define SECMARK_SELCTX_MAX 256 - -struct xt_secmark_target_selinux_info { - __u32 selsid; - char selctx[SECMARK_SELCTX_MAX]; -}; +#define SECMARK_SECCTX_MAX 256 struct xt_secmark_target_info { __u8 mode; - union { - struct xt_secmark_target_selinux_info sel; - } u; + __u32 secid; + char secctx[SECMARK_SECCTX_MAX]; }; #endif /*_XT_SECMARK_H_target */ diff --git a/include/linux/security.h b/include/linux/security.h index 294a0b228123..d70adc394f62 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -959,6 +959,12 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * Sets the new child socket's sid to the openreq sid. * @inet_conn_established: * Sets the connection's peersid to the secmark on skb. + * @secmark_relabel_packet: + * check if the process should be allowed to relabel packets to the given secid + * @security_secmark_refcount_inc + * tells the LSM to increment the number of secmark labeling rules loaded + * @security_secmark_refcount_dec + * tells the LSM to decrement the number of secmark labeling rules loaded * @req_classify_flow: * Sets the flow's sid to the openreq sid. * @tun_dev_create: @@ -1593,6 +1599,9 @@ struct security_operations { struct request_sock *req); void (*inet_csk_clone) (struct sock *newsk, const struct request_sock *req); void (*inet_conn_established) (struct sock *sk, struct sk_buff *skb); + int (*secmark_relabel_packet) (u32 secid); + void (*secmark_refcount_inc) (void); + void (*secmark_refcount_dec) (void); void (*req_classify_flow) (const struct request_sock *req, struct flowi *fl); int (*tun_dev_create)(void); void (*tun_dev_post_create)(struct sock *sk); @@ -2547,6 +2556,9 @@ void security_inet_csk_clone(struct sock *newsk, const struct request_sock *req); void security_inet_conn_established(struct sock *sk, struct sk_buff *skb); +int security_secmark_relabel_packet(u32 secid); +void security_secmark_refcount_inc(void); +void security_secmark_refcount_dec(void); int security_tun_dev_create(void); void security_tun_dev_post_create(struct sock *sk); int security_tun_dev_attach(struct sock *sk); @@ -2701,6 +2713,19 @@ static inline void security_inet_conn_established(struct sock *sk, { } +static inline int security_secmark_relabel_packet(u32 secid) +{ + return 0; +} + +static inline void security_secmark_refcount_inc(void) +{ +} + +static inline void security_secmark_refcount_dec(void) +{ +} + static inline int security_tun_dev_create(void) { return 0; diff --git a/include/linux/selinux.h b/include/linux/selinux.h index 82e0f26a1299..44f459612690 100644 --- a/include/linux/selinux.h +++ b/include/linux/selinux.h @@ -20,75 +20,12 @@ struct kern_ipc_perm; #ifdef CONFIG_SECURITY_SELINUX -/** - * selinux_string_to_sid - map a security context string to a security ID - * @str: the security context string to be mapped - * @sid: ID value returned via this. - * - * Returns 0 if successful, with the SID stored in sid. A value - * of zero for sid indicates no SID could be determined (but no error - * occurred). - */ -int selinux_string_to_sid(char *str, u32 *sid); - -/** - * selinux_secmark_relabel_packet_permission - secmark permission check - * @sid: SECMARK ID value to be applied to network packet - * - * Returns 0 if the current task is allowed to set the SECMARK label of - * packets with the supplied security ID. Note that it is implicit that - * the packet is always being relabeled from the default unlabeled value, - * and that the access control decision is made in the AVC. - */ -int selinux_secmark_relabel_packet_permission(u32 sid); - -/** - * selinux_secmark_refcount_inc - increments the secmark use counter - * - * SELinux keeps track of the current SECMARK targets in use so it knows - * when to apply SECMARK label access checks to network packets. This - * function incements this reference count to indicate that a new SECMARK - * target has been configured. - */ -void selinux_secmark_refcount_inc(void); - -/** - * selinux_secmark_refcount_dec - decrements the secmark use counter - * - * SELinux keeps track of the current SECMARK targets in use so it knows - * when to apply SECMARK label access checks to network packets. This - * function decements this reference count to indicate that one of the - * existing SECMARK targets has been removed/flushed. - */ -void selinux_secmark_refcount_dec(void); - /** * selinux_is_enabled - is SELinux enabled? */ bool selinux_is_enabled(void); #else -static inline int selinux_string_to_sid(const char *str, u32 *sid) -{ - *sid = 0; - return 0; -} - -static inline int selinux_secmark_relabel_packet_permission(u32 sid) -{ - return 0; -} - -static inline void selinux_secmark_refcount_inc(void) -{ - return; -} - -static inline void selinux_secmark_refcount_dec(void) -{ - return; -} - static inline bool selinux_is_enabled(void) { return false; -- cgit v1.2.3 From d5630b9d276bd389299ffea620b7c340ab19bcf5 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Wed, 13 Oct 2010 16:24:48 -0400 Subject: security: secid_to_secctx returns len when data is NULL With the (long ago) interface change to have the secid_to_secctx functions do the string allocation instead of having the caller do the allocation we lost the ability to query the security server for the length of the upcoming string. The SECMARK code would like to allocate a netlink skb with enough length to hold the string but it is just too unclean to do the string allocation twice or to do the allocation the first time and hold onto the string and slen. This patch adds the ability to call security_secid_to_secctx() with a NULL data pointer and it will just set the slen pointer. Signed-off-by: Eric Paris Reviewed-by: Paul Moore Signed-off-by: James Morris --- include/linux/security.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/security.h b/include/linux/security.h index d70adc394f62..b8246a8df7d2 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -1285,9 +1285,13 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * Return 0 if permission is granted. * * @secid_to_secctx: - * Convert secid to security context. + * Convert secid to security context. If secdata is NULL the length of + * the result will be returned in seclen, but no secdata will be returned. + * This does mean that the length could change between calls to check the + * length and the next call which actually allocates and returns the secdata. * @secid contains the security ID. * @secdata contains the pointer that stores the converted security context. + * @seclen pointer which contains the length of the data * @secctx_to_secid: * Convert security context to secid. * @secid contains the pointer to the generated security ID. -- cgit v1.2.3 From 1cc63249adfa957b34ca51effdee90ff8261d63f Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Wed, 13 Oct 2010 16:24:54 -0400 Subject: conntrack: export lsm context rather than internal secid via netlink The conntrack code can export the internal secid to userspace. These are dynamic, can change on lsm changes, and have no meaning in userspace. We should instead be sending lsm contexts to userspace instead. This patch sends the secctx (rather than secid) to userspace over the netlink socket. We use a new field CTA_SECCTX and stop using the the old CTA_SECMARK field since it did not send particularly useful information. Signed-off-by: Eric Paris Reviewed-by: Paul Moore Acked-by: Patrick McHardy Signed-off-by: James Morris --- include/linux/netfilter/nfnetlink_conntrack.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/nfnetlink_conntrack.h b/include/linux/netfilter/nfnetlink_conntrack.h index 9ed534c991b9..70cd0603911c 100644 --- a/include/linux/netfilter/nfnetlink_conntrack.h +++ b/include/linux/netfilter/nfnetlink_conntrack.h @@ -39,8 +39,9 @@ enum ctattr_type { CTA_TUPLE_MASTER, CTA_NAT_SEQ_ADJ_ORIG, CTA_NAT_SEQ_ADJ_REPLY, - CTA_SECMARK, + CTA_SECMARK, /* obsolete */ CTA_ZONE, + CTA_SECCTX, __CTA_MAX }; #define CTA_MAX (__CTA_MAX - 1) @@ -172,4 +173,11 @@ enum ctattr_help { }; #define CTA_HELP_MAX (__CTA_HELP_MAX - 1) +enum ctattr_secctx { + CTA_SECCTX_UNSPEC, + CTA_SECCTX_NAME, + __CTA_SECCTX_MAX +}; +#define CTA_SECCTX_MAX (__CTA_SECCTX_MAX - 1) + #endif /* _IPCONNTRACK_NETLINK_H */ -- cgit v1.2.3 From 686a0f3d71203bbfcc186900bbb8ac2cfc3d803c Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Wed, 13 Oct 2010 17:50:02 -0400 Subject: kernel: rounddown helper function The roundup() helper function will round a given value up to a multiple of another given value. aka roundup(11, 7) would give 14 = 7 * 2. This new function does the opposite. It will round a given number down to the nearest multiple of the second number: rounddown(11, 7) would give 7. I need this in some future SELinux code and can carry the macro myself, but figured I would put it in the core kernel so others might find and use it if need be. Signed-off-by: Eric Paris Signed-off-by: James Morris --- include/linux/kernel.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 2b0a35e6bc69..6d6eea7f7b1e 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -59,6 +59,12 @@ extern const char linux_proc_banner[]; #define FIELD_SIZEOF(t, f) (sizeof(((t*)0)->f)) #define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d)) #define roundup(x, y) ((((x) + ((y) - 1)) / (y)) * (y)) +#define rounddown(x, y) ( \ +{ \ + typeof(x) __x = (x); \ + __x - (__x % (y)); \ +} \ +) #define DIV_ROUND_CLOSEST(x, divisor)( \ { \ typeof(divisor) __divisor = divisor; \ -- cgit v1.2.3 From b28efd54d9d5c8005a29cd8782335beb9daaa32d Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Wed, 13 Oct 2010 17:50:08 -0400 Subject: kernel: roundup should only reference arguments once Currently the roundup macro references it's arguments more than one time. This patch changes it so it will only use its arguments once. Suggested-by: Andrew Morton Signed-off-by: Eric Paris Signed-off-by: James Morris --- include/linux/kernel.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 6d6eea7f7b1e..1759ba5adce8 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -58,7 +58,12 @@ extern const char linux_proc_banner[]; #define FIELD_SIZEOF(t, f) (sizeof(((t*)0)->f)) #define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d)) -#define roundup(x, y) ((((x) + ((y) - 1)) / (y)) * (y)) +#define roundup(x, y) ( \ +{ \ + typeof(y) __y = y; \ + (((x) + (__y - 1)) / __y) * __y; \ +} \ +) #define rounddown(x, y) ( \ { \ typeof(x) __x = (x); \ -- cgit v1.2.3 From 30e1f7a8122145f44f45c95366e27b6bb0b08428 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 2 Sep 2010 17:26:48 +0200 Subject: EDAC: Export edac sysfs class to users. Move toplevel sysfs class to the stub and make it available to non-modularized code too. Add proper refcounting of its users and move the registration functionality into the reference counting routines. Signed-off-by: Borislav Petkov --- include/linux/edac.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/edac.h b/include/linux/edac.h index 7cf92e8a4196..36c66443bdfd 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -13,6 +13,7 @@ #define _LINUX_EDAC_H_ #include +#include #define EDAC_OPSTATE_INVAL -1 #define EDAC_OPSTATE_POLL 0 @@ -22,9 +23,12 @@ extern int edac_op_state; extern int edac_err_assert; extern atomic_t edac_handlers; +extern struct sysdev_class edac_class; extern int edac_handler_set(void); extern void edac_atomic_assert_error(void); +extern struct sysdev_class *edac_get_sysfs_class(void); +extern void edac_put_sysfs_class(void); static inline void opstate_init(void) { -- cgit v1.2.3