From 6b74ab97bc12ce74acec900f1d89a4aee2e4d70d Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 23 Jul 2008 21:26:49 -0700 Subject: mm: add a basic debugging framework for memory initialisation Boot initialisation is very complex, with significant numbers of architecture-specific routines, hooks and code ordering. While significant amounts of the initialisation is architecture-independent, it trusts the data received from the architecture layer. This is a mistake, and has resulted in a number of difficult-to-diagnose bugs. This patchset adds some validation and tracing to memory initialisation. It also introduces a few basic defensive measures. The validation code can be explicitly disabled for embedded systems. This patch: Add additional debugging and verification code for memory initialisation. Once enabled, the verification checks are always run and when required additional debugging information may be outputted via a mminit_loglevel= command-line parameter. The verification code is placed in a new file mm/mm_init.c. Ideally other mm initialisation code will be moved here over time. Signed-off-by: Mel Gorman Cc: Christoph Lameter Cc: Andy Whitcroft Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig.debug | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'lib') diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 882c51048993..e1d4764435ed 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -505,6 +505,18 @@ config DEBUG_WRITECOUNT If unsure, say N. +config DEBUG_MEMORY_INIT + bool "Debug memory initialisation" if EMBEDDED + default !EMBEDDED + help + Enable this for additional checks during memory initialisation. + The sanity checks verify aspects of the VM such as the memory model + and other information provided by the architecture. Verbose + information will be printed at KERN_DEBUG loglevel depending + on the mminit_loglevel= command-line option. + + If unsure, say Y + config DEBUG_LIST bool "Debug linked list manipulation" depends on DEBUG_KERNEL -- cgit v1.2.3 From 8b05c7e6e159d2f33c9275281b8b909a89eb7c5d Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Wed, 23 Jul 2008 21:26:53 -0700 Subject: add a helper function to test if an object is on the stack lib/debugobjects.c has a function to test if an object is on the stack. The block layer and ide needs it (they need to avoid DMA from/to stack buffers). This patch moves the function to include/linux/sched.h so that everyone can use it. lib/debugobjects.c uses current->stack but this patch uses a task_stack_page() accessor, which is a preferable way to access the stack. Signed-off-by: FUJITA Tomonori Cc: Christoph Lameter Cc: Andy Whitcroft Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/debugobjects.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'lib') diff --git a/lib/debugobjects.c b/lib/debugobjects.c index 85b18d79be89..f86196390cfd 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -226,15 +226,13 @@ debug_object_fixup(int (*fixup)(void *addr, enum debug_obj_state state), static void debug_object_is_on_stack(void *addr, int onstack) { - void *stack = current->stack; int is_on_stack; static int limit; if (limit > 4) return; - is_on_stack = (addr >= stack && addr < (stack + THREAD_SIZE)); - + is_on_stack = object_is_on_stack(addr); if (is_on_stack == onstack) return; -- cgit v1.2.3 From d3de851a445123f24ad8ece18662014b5e8a8b4e Mon Sep 17 00:00:00 2001 From: David Brownell Date: Wed, 23 Jul 2008 21:30:37 -0700 Subject: rtc: BCD codeshrink This updates to define the key routines as constant functions, which the macros will then call. Newer code can now call bcd2bin() instead of SCREAMING BCD2BIN() TO THE FOUR WINDS. This lets each driver shrink their codespace by using N function calls to a single (global) copy of those routines, instead of N inlined copies of these functions per driver. These routines aren't used in speed-critical code. Almost all callers are in the RTC framework. Typical per-driver savings is near 300 bytes. Signed-off-by: David Brownell Acked-by: Adrian Bunk Cc: Alessandro Zummo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Makefile | 2 +- lib/bcd.c | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) create mode 100644 lib/bcd.c (limited to 'lib') diff --git a/lib/Makefile b/lib/Makefile index 818c4d455518..9085ad6fa53d 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -18,7 +18,7 @@ lib-$(CONFIG_SMP) += cpumask.o lib-y += kobject.o kref.o klist.o -obj-y += div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \ +obj-y += bcd.o div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \ bust_spinlocks.o hexdump.o kasprintf.o bitmap.o scatterlist.o ifeq ($(CONFIG_DEBUG_KOBJECT),y) diff --git a/lib/bcd.c b/lib/bcd.c new file mode 100644 index 000000000000..d74257fd0fe7 --- /dev/null +++ b/lib/bcd.c @@ -0,0 +1,14 @@ +#include +#include + +unsigned bcd2bin(unsigned char val) +{ + return (val & 0x0f) + (val >> 4) * 10; +} +EXPORT_SYMBOL(bcd2bin); + +unsigned char bin2bcd(unsigned val) +{ + return ((val / 10) << 4) + val % 10; +} +EXPORT_SYMBOL(bin2bcd); -- cgit v1.2.3 From 545e400619b24b6b17b7f1f1e838e9ff6d036949 Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Fri, 25 Jul 2008 01:45:27 -0700 Subject: lzo: use get/put_unaligned_* helpers Signed-off-by: Harvey Harrison Acked-by: Richard Purdie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/lzo/lzo1x_decompress.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'lib') diff --git a/lib/lzo/lzo1x_decompress.c b/lib/lzo/lzo1x_decompress.c index 77f0f9b775a9..5dc6b29c1575 100644 --- a/lib/lzo/lzo1x_decompress.c +++ b/lib/lzo/lzo1x_decompress.c @@ -138,8 +138,7 @@ match: t += 31 + *ip++; } m_pos = op - 1; - m_pos -= le16_to_cpu(get_unaligned( - (const unsigned short *)ip)) >> 2; + m_pos -= get_unaligned_le16(ip) >> 2; ip += 2; } else if (t >= 16) { m_pos = op; @@ -157,8 +156,7 @@ match: } t += 7 + *ip++; } - m_pos -= le16_to_cpu(get_unaligned( - (const unsigned short *)ip)) >> 2; + m_pos -= get_unaligned_le16(ip) >> 2; ip += 2; if (m_pos == op) goto eof_found; -- cgit v1.2.3 From fd193829744bc77392395cf8f47889235c97f0a3 Mon Sep 17 00:00:00 2001 From: "Robert P. J. Day" Date: Fri, 25 Jul 2008 01:45:31 -0700 Subject: lib: allow memparse() to accept a NULL and ignorable second parm Extend memparse() to allow the caller to use a NULL second parameter, which would represent no interest in returning the address of the end of the parsed string. In numerous cases, callers invoke memparse() to parse a possibly-suffixed string (such as "64K" or "2G" or whatever) and define a character pointer to accept the end pointer being returned by memparse() even though they have no interest in it and promptly throw it away. This (backward-compatible) enhancement allows callers to use NULL in the cases where they just don't care about getting back that end pointer. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Robert P. J. Day Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/cmdline.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'lib') diff --git a/lib/cmdline.c b/lib/cmdline.c index f596c08d213a..5ba8a942a478 100644 --- a/lib/cmdline.c +++ b/lib/cmdline.c @@ -116,7 +116,7 @@ char *get_options(const char *str, int nints, int *ints) /** * memparse - parse a string with mem suffixes into a number * @ptr: Where parse begins - * @retptr: (output) Pointer to next char after parse completes + * @retptr: (output) Optional pointer to next char after parse completes * * Parses a string into a number. The number stored at @ptr is * potentially suffixed with %K (for kilobytes, or 1024 bytes), @@ -126,11 +126,13 @@ char *get_options(const char *str, int nints, int *ints) * megabyte, or one gigabyte, respectively. */ -unsigned long long memparse (char *ptr, char **retptr) +unsigned long long memparse(char *ptr, char **retptr) { - unsigned long long ret = simple_strtoull (ptr, retptr, 0); + char *endptr; /* local pointer to end of parsed string */ - switch (**retptr) { + unsigned long long ret = simple_strtoull(ptr, &endptr, 0); + + switch (*endptr) { case 'G': case 'g': ret <<= 10; @@ -140,10 +142,14 @@ unsigned long long memparse (char *ptr, char **retptr) case 'K': case 'k': ret <<= 10; - (*retptr)++; + endptr++; default: break; } + + if (retptr) + *retptr = endptr; + return ret; } -- cgit v1.2.3 From e0ce0da9fefcc723dc006c35a7f91a32750abd40 Mon Sep 17 00:00:00 2001 From: "Robert P. J. Day" Date: Fri, 25 Jul 2008 01:45:32 -0700 Subject: lists: remove a redundant conditional definition of list_add() Remove the conditional surrounding the definition of list_add() from list.h since, if you define CONFIG_DEBUG_LIST, the definition you will subsequently pick up from lib/list_debug.c will be absolutely identical, at which point you can remove that redundant definition from list_debug.c as well. Signed-off-by: Robert P. J. Day Cc: Dave Jones Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/list_debug.c | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'lib') diff --git a/lib/list_debug.c b/lib/list_debug.c index 4350ba9655bd..45c03fd608dd 100644 --- a/lib/list_debug.c +++ b/lib/list_debug.c @@ -39,20 +39,6 @@ void __list_add(struct list_head *new, } EXPORT_SYMBOL(__list_add); -/** - * list_add - add a new entry - * @new: new entry to be added - * @head: list head to add it after - * - * Insert a new entry after the specified head. - * This is good for implementing stacks. - */ -void list_add(struct list_head *new, struct list_head *head) -{ - __list_add(new, head, head->next); -} -EXPORT_SYMBOL(list_add); - /** * list_del - deletes entry from list. * @entry: the element to delete from the list. -- cgit v1.2.3 From 2d6ffcca623a9a16df6cdfbe8250b7a5904a5f5e Mon Sep 17 00:00:00 2001 From: Thomas Petazzoni Date: Fri, 25 Jul 2008 01:45:44 -0700 Subject: inflate: refactor inflate malloc code Inflate requires some dynamic memory allocation very early in the boot process and this is provided with a set of four functions: malloc/free/gzip_mark/gzip_release. The old inflate code used a mark/release strategy rather than implement free. This new version instead keeps a count on the number of outstanding allocations and when it hits zero, it resets the malloc arena. This allows removing all the mark and release implementations and unifying all the malloc/free implementations. The architecture-dependent code must define two addresses: - free_mem_ptr, the address of the beginning of the area in which allocations should be made - free_mem_end_ptr, the address of the end of the area in which allocations should be made. If set to 0, then no check is made on the number of allocations, it just grows as much as needed The architecture-dependent code can also provide an arch_decomp_wdog() function call. This function will be called several times during the decompression process, and allow to notify the watchdog that the system is still running. If an architecture provides such a call, then it must define ARCH_HAS_DECOMP_WDOG so that the generic inflate code calls arch_decomp_wdog(). Work initially done by Matt Mackall, updated to a recent version of the kernel and improved by me. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Thomas Petazzoni Cc: Matt Mackall Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Mikael Starvik Cc: Jesper Nilsson Cc: Haavard Skinnemoen Cc: David Howells Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Andi Kleen Cc: "H. Peter Anvin" Acked-by: Paul Mundt Acked-by: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/inflate.c | 52 +++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 45 insertions(+), 7 deletions(-) (limited to 'lib') diff --git a/lib/inflate.c b/lib/inflate.c index 9762294be062..1a8e8a978128 100644 --- a/lib/inflate.c +++ b/lib/inflate.c @@ -230,6 +230,45 @@ STATIC const ush mask_bits[] = { #define NEEDBITS(n) {while(k<(n)){b|=((ulg)NEXTBYTE())<>=(n);k-=(n);} +#ifndef NO_INFLATE_MALLOC +/* A trivial malloc implementation, adapted from + * malloc by Hannu Savolainen 1993 and Matthias Urlichs 1994 + */ + +static unsigned long malloc_ptr; +static int malloc_count; + +static void *malloc(int size) +{ + void *p; + + if (size < 0) + error("Malloc error"); + if (!malloc_ptr) + malloc_ptr = free_mem_ptr; + + malloc_ptr = (malloc_ptr + 3) & ~3; /* Align */ + + p = (void *)malloc_ptr; + malloc_ptr += size; + + if (free_mem_end_ptr && malloc_ptr >= free_mem_end_ptr) + error("Out of memory"); + + malloc_count++; + return p; +} + +static void free(void *where) +{ + malloc_count--; + if (!malloc_count) + malloc_ptr = free_mem_ptr; +} +#else +#define malloc(a) kmalloc(a, GFP_KERNEL) +#define free(a) kfree(a) +#endif /* Huffman code decoding is performed using a multi-level table lookup. @@ -1045,7 +1084,6 @@ STATIC int INIT inflate(void) int e; /* last block flag */ int r; /* result code */ unsigned h; /* maximum struct huft's malloc'ed */ - void *ptr; /* initialize window, bit buffer */ wp = 0; @@ -1057,12 +1095,12 @@ STATIC int INIT inflate(void) h = 0; do { hufts = 0; - gzip_mark(&ptr); - if ((r = inflate_block(&e)) != 0) { - gzip_release(&ptr); - return r; - } - gzip_release(&ptr); +#ifdef ARCH_HAS_DECOMP_WDOG + arch_decomp_wdog(); +#endif + r = inflate_block(&e); + if (r) + return r; if (hufts > h) h = hufts; } while (!e); -- cgit v1.2.3 From d955c78ac4699ac9c3fe07be62982cda13d13267 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Fri, 25 Jul 2008 01:45:55 -0700 Subject: Example use of WARN() Now that WARN() exists, we can fold some of the printk's into it. Signed-off-by: Arjan van de Ven Cc: Greg KH Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/kobject.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'lib') diff --git a/lib/kobject.c b/lib/kobject.c index 744401571ed7..bd732ffebc85 100644 --- a/lib/kobject.c +++ b/lib/kobject.c @@ -164,9 +164,8 @@ static int kobject_add_internal(struct kobject *kobj) return -ENOENT; if (!kobj->name || !kobj->name[0]) { - pr_debug("kobject: (%p): attempted to be registered with empty " + WARN(1, "kobject: (%p): attempted to be registered with empty " "name!\n", kobj); - WARN_ON(1); return -EINVAL; } @@ -583,12 +582,10 @@ static void kobject_release(struct kref *kref) void kobject_put(struct kobject *kobj) { if (kobj) { - if (!kobj->state_initialized) { - printk(KERN_WARNING "kobject: '%s' (%p): is not " + if (!kobj->state_initialized) + WARN(1, KERN_WARNING "kobject: '%s' (%p): is not " "initialized, yet kobject_put() is being " "called.\n", kobject_name(kobj), kobj); - WARN_ON(1); - } kref_put(&kobj->kref, kobject_release); } } -- cgit v1.2.3 From 924d9addb9b1474fc81a78a5c6706755efea7aaa Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Fri, 25 Jul 2008 01:45:55 -0700 Subject: list debugging: use WARN() instead of BUG() Arjan noted that the list_head debugging is BUG'ing when it detects corruption. By causing the box to panic immediately, we're possibly losing some bug reports. Changing this to a WARN() should mean we at the least start seeing reports collected at kerneloops.org Signed-off-by: Dave Jones Cc: Matthew Wilcox Cc: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/list_debug.c | 36 ++++++++++++++---------------------- 1 file changed, 14 insertions(+), 22 deletions(-) (limited to 'lib') diff --git a/lib/list_debug.c b/lib/list_debug.c index 45c03fd608dd..1a39f4e3ae1f 100644 --- a/lib/list_debug.c +++ b/lib/list_debug.c @@ -20,18 +20,14 @@ void __list_add(struct list_head *new, struct list_head *prev, struct list_head *next) { - if (unlikely(next->prev != prev)) { - printk(KERN_ERR "list_add corruption. next->prev should be " - "prev (%p), but was %p. (next=%p).\n", - prev, next->prev, next); - BUG(); - } - if (unlikely(prev->next != next)) { - printk(KERN_ERR "list_add corruption. prev->next should be " - "next (%p), but was %p. (prev=%p).\n", - next, prev->next, prev); - BUG(); - } + WARN(next->prev != prev, + "list_add corruption. next->prev should be " + "prev (%p), but was %p. (next=%p).\n", + prev, next->prev, next); + WARN(prev->next != next, + "list_add corruption. prev->next should be " + "next (%p), but was %p. (prev=%p).\n", + next, prev->next, prev); next->prev = new; new->next = next; new->prev = prev; @@ -47,16 +43,12 @@ EXPORT_SYMBOL(__list_add); */ void list_del(struct list_head *entry) { - if (unlikely(entry->prev->next != entry)) { - printk(KERN_ERR "list_del corruption. prev->next should be %p, " - "but was %p\n", entry, entry->prev->next); - BUG(); - } - if (unlikely(entry->next->prev != entry)) { - printk(KERN_ERR "list_del corruption. next->prev should be %p, " - "but was %p\n", entry, entry->next->prev); - BUG(); - } + WARN(entry->prev->next != entry, + "list_del corruption. prev->next should be %p, " + "but was %p\n", entry, entry->prev->next); + WARN(entry->next->prev != entry, + "list_del corruption. next->prev should be %p, " + "but was %p\n", entry, entry->next->prev); __list_del(entry->prev, entry->next); entry->next = LIST_POISON1; entry->prev = LIST_POISON2; -- cgit v1.2.3 From 717115e1a5856b57af0f71e1df7149108294fc10 Mon Sep 17 00:00:00 2001 From: Dave Young Date: Fri, 25 Jul 2008 01:45:58 -0700 Subject: printk ratelimiting rewrite All ratelimit user use same jiffies and burst params, so some messages (callbacks) will be lost. For example: a call printk_ratelimit(5 * HZ, 1) b call printk_ratelimit(5 * HZ, 1) before the 5*HZ timeout of a, then b will will be supressed. - rewrite __ratelimit, and use a ratelimit_state as parameter. Thanks for hints from andrew. - Add WARN_ON_RATELIMIT, update rcupreempt.h - remove __printk_ratelimit - use __ratelimit in net_ratelimit Signed-off-by: Dave Young Cc: "David S. Miller" Cc: "Paul E. McKenney" Cc: Dave Young Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/ratelimit.c | 55 ++++++++++++++++++++++++++++++------------------------- 1 file changed, 30 insertions(+), 25 deletions(-) (limited to 'lib') diff --git a/lib/ratelimit.c b/lib/ratelimit.c index 485e3040dcd4..35136671b215 100644 --- a/lib/ratelimit.c +++ b/lib/ratelimit.c @@ -3,6 +3,9 @@ * * Isolated from kernel/printk.c by Dave Young * + * 2008-05-01 rewrite the function and use a ratelimit_state data struct as + * parameter. Now every user can use their own standalone ratelimit_state. + * * This file is released under the GPLv2. * */ @@ -11,41 +14,43 @@ #include #include +static DEFINE_SPINLOCK(ratelimit_lock); +static unsigned long flags; + /* * __ratelimit - rate limiting - * @ratelimit_jiffies: minimum time in jiffies between two callbacks - * @ratelimit_burst: number of callbacks we do before ratelimiting + * @rs: ratelimit_state data * - * This enforces a rate limit: not more than @ratelimit_burst callbacks - * in every ratelimit_jiffies + * This enforces a rate limit: not more than @rs->ratelimit_burst callbacks + * in every @rs->ratelimit_jiffies */ -int __ratelimit(int ratelimit_jiffies, int ratelimit_burst) +int __ratelimit(struct ratelimit_state *rs) { - static DEFINE_SPINLOCK(ratelimit_lock); - static unsigned toks = 10 * 5 * HZ; - static unsigned long last_msg; - static int missed; - unsigned long flags; - unsigned long now = jiffies; + if (!rs->interval) + return 1; spin_lock_irqsave(&ratelimit_lock, flags); - toks += now - last_msg; - last_msg = now; - if (toks > (ratelimit_burst * ratelimit_jiffies)) - toks = ratelimit_burst * ratelimit_jiffies; - if (toks >= ratelimit_jiffies) { - int lost = missed; + if (!rs->begin) + rs->begin = jiffies; - missed = 0; - toks -= ratelimit_jiffies; - spin_unlock_irqrestore(&ratelimit_lock, flags); - if (lost) - printk(KERN_WARNING "%s: %d messages suppressed\n", - __func__, lost); - return 1; + if (time_is_before_jiffies(rs->begin + rs->interval)) { + if (rs->missed) + printk(KERN_WARNING "%s: %d callbacks suppressed\n", + __func__, rs->missed); + rs->begin = 0; + rs->printed = 0; + rs->missed = 0; } - missed++; + if (rs->burst && rs->burst > rs->printed) + goto print; + + rs->missed++; spin_unlock_irqrestore(&ratelimit_lock, flags); return 0; + +print: + rs->printed++; + spin_unlock_irqrestore(&ratelimit_lock, flags); + return 1; } EXPORT_SYMBOL(__ratelimit); -- cgit v1.2.3 From 4ae537892ab9858f71c78701f4651ad1ca531a1b Mon Sep 17 00:00:00 2001 From: Nadia Derbey Date: Fri, 25 Jul 2008 01:47:58 -0700 Subject: idr: rename some of the idr APIs internal routines This is a trivial patch that renames: . alloc_layer to get_from_free_list since it idr_pre_get that actually allocates memory. . free_layer to move_to_free_list since memory is not actually freed there. This makes things more clear for the next patches. Signed-off-by: Nadia Derbey Reviewed-by: "Paul E. McKenney" Cc: Manfred Spraul Cc: Jim Houston Cc: Pierre Peiffer Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/idr.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) (limited to 'lib') diff --git a/lib/idr.c b/lib/idr.c index 7a02e173f027..8170ace154fb 100644 --- a/lib/idr.c +++ b/lib/idr.c @@ -35,7 +35,7 @@ static struct kmem_cache *idr_layer_cache; -static struct idr_layer *alloc_layer(struct idr *idp) +static struct idr_layer *get_from_free_list(struct idr *idp) { struct idr_layer *p; unsigned long flags; @@ -51,14 +51,14 @@ static struct idr_layer *alloc_layer(struct idr *idp) } /* only called when idp->lock is held */ -static void __free_layer(struct idr *idp, struct idr_layer *p) +static void __move_to_free_list(struct idr *idp, struct idr_layer *p) { p->ary[0] = idp->id_free; idp->id_free = p; idp->id_free_cnt++; } -static void free_layer(struct idr *idp, struct idr_layer *p) +static void move_to_free_list(struct idr *idp, struct idr_layer *p) { unsigned long flags; @@ -66,7 +66,7 @@ static void free_layer(struct idr *idp, struct idr_layer *p) * Depends on the return element being zeroed. */ spin_lock_irqsave(&idp->lock, flags); - __free_layer(idp, p); + __move_to_free_list(idp, p); spin_unlock_irqrestore(&idp->lock, flags); } @@ -109,7 +109,7 @@ int idr_pre_get(struct idr *idp, gfp_t gfp_mask) new = kmem_cache_alloc(idr_layer_cache, gfp_mask); if (new == NULL) return (0); - free_layer(idp, new); + move_to_free_list(idp, new); } return 1; } @@ -167,7 +167,8 @@ static int sub_alloc(struct idr *idp, int *starting_id, struct idr_layer **pa) * Create the layer below if it is missing. */ if (!p->ary[m]) { - if (!(new = alloc_layer(idp))) + new = get_from_free_list(idp); + if (!new) return -1; p->ary[m] = new; p->count++; @@ -192,7 +193,7 @@ build_up: p = idp->top; layers = idp->layers; if (unlikely(!p)) { - if (!(p = alloc_layer(idp))) + if (!(p = get_from_free_list(idp))) return -1; layers = 1; } @@ -204,7 +205,7 @@ build_up: layers++; if (!p->count) continue; - if (!(new = alloc_layer(idp))) { + if (!(new = get_from_free_list(idp))) { /* * The allocation failed. If we built part of * the structure tear it down. @@ -214,7 +215,7 @@ build_up: p = p->ary[0]; new->ary[0] = NULL; new->bitmap = new->count = 0; - __free_layer(idp, new); + __move_to_free_list(idp, new); } spin_unlock_irqrestore(&idp->lock, flags); return -1; @@ -351,7 +352,7 @@ static void sub_remove(struct idr *idp, int shift, int id) __clear_bit(n, &p->bitmap); p->ary[n] = NULL; while(*paa && ! --((**paa)->count)){ - free_layer(idp, **paa); + move_to_free_list(idp, **paa); **paa-- = NULL; } if (!*paa) @@ -378,12 +379,12 @@ void idr_remove(struct idr *idp, int id) p = idp->top->ary[0]; idp->top->bitmap = idp->top->count = 0; - free_layer(idp, idp->top); + move_to_free_list(idp, idp->top); idp->top = p; --idp->layers; } while (idp->id_free_cnt >= IDR_FREE_MAX) { - p = alloc_layer(idp); + p = get_from_free_list(idp); kmem_cache_free(idr_layer_cache, p); } return; @@ -426,7 +427,7 @@ void idr_remove_all(struct idr *idp) while (n < fls(id)) { if (p) { memset(p, 0, sizeof *p); - free_layer(idp, p); + move_to_free_list(idp, p); } n += IDR_BITS; p = *--paa; @@ -444,7 +445,7 @@ EXPORT_SYMBOL(idr_remove_all); void idr_destroy(struct idr *idp) { while (idp->id_free_cnt) { - struct idr_layer *p = alloc_layer(idp); + struct idr_layer *p = get_from_free_list(idp); kmem_cache_free(idr_layer_cache, p); } } @@ -749,7 +750,7 @@ int ida_get_new_above(struct ida *ida, int starting_id, int *p_id) * allocation. */ if (ida->idr.id_free_cnt || ida->free_bitmap) { - struct idr_layer *p = alloc_layer(&ida->idr); + struct idr_layer *p = get_from_free_list(&ida->idr); if (p) kmem_cache_free(idr_layer_cache, p); } -- cgit v1.2.3 From f098ad655f4dd8e3da98ffbeda9cedcc4459c01a Mon Sep 17 00:00:00 2001 From: Nadia Derbey Date: Fri, 25 Jul 2008 01:47:59 -0700 Subject: idr: fix a printk call Fix the incomplete printk call. Signed-off-by: Nadia Derbey Reviewed-by: "Paul E. McKenney" Cc: Manfred Spraul Cc: Jim Houston Cc: Pierre Peiffer Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/idr.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/idr.c b/lib/idr.c index 8170ace154fb..9d905b131ecb 100644 --- a/lib/idr.c +++ b/lib/idr.c @@ -326,7 +326,8 @@ EXPORT_SYMBOL(idr_get_new); static void idr_remove_warning(int id) { - printk("idr_remove called for id=%d which is not allocated.\n", id); + printk(KERN_WARNING + "idr_remove called for id=%d which is not allocated.\n", id); dump_stack(); } -- cgit v1.2.3 From 944ca05c7b4972f2ebf37262e0f4933d178ad6db Mon Sep 17 00:00:00 2001 From: Nadia Derbey Date: Fri, 25 Jul 2008 01:47:59 -0700 Subject: idr: error checking factorization Do some code factorization in the return code analysis. Signed-off-by: Nadia Derbey Cc: "Paul E. McKenney" Cc: Manfred Spraul Cc: Jim Houston Cc: Pierre Peiffer Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/idr.c | 30 +++++++++--------------------- 1 file changed, 9 insertions(+), 21 deletions(-) (limited to 'lib') diff --git a/lib/idr.c b/lib/idr.c index 9d905b131ecb..80ba06f29d36 100644 --- a/lib/idr.c +++ b/lib/idr.c @@ -143,7 +143,7 @@ static int sub_alloc(struct idr *idp, int *starting_id, struct idr_layer **pa) /* if already at the top layer, we need to grow */ if (!(p = pa[l])) { *starting_id = id; - return -2; + return IDR_NEED_TO_GROW; } /* If we need to go up one layer, continue the @@ -160,7 +160,7 @@ static int sub_alloc(struct idr *idp, int *starting_id, struct idr_layer **pa) id = ((id >> sh) ^ n ^ m) << sh; } if ((id >= MAX_ID_BIT) || (id < 0)) - return -3; + return IDR_NOMORE_SPACE; if (l == 0) break; /* @@ -229,7 +229,7 @@ build_up: idp->top = p; idp->layers = layers; v = sub_alloc(idp, &id, pa); - if (v == -2) + if (v == IDR_NEED_TO_GROW) goto build_up; return(v); } @@ -278,12 +278,8 @@ int idr_get_new_above(struct idr *idp, void *ptr, int starting_id, int *id) * This is a cheap hack until the IDR code can be fixed to * return proper error values. */ - if (rv < 0) { - if (rv == -1) - return -EAGAIN; - else /* Will be -3 */ - return -ENOSPC; - } + if (rv < 0) + return _idr_rc_to_errno(rv); *id = rv; return 0; } @@ -313,12 +309,8 @@ int idr_get_new(struct idr *idp, void *ptr, int *id) * This is a cheap hack until the IDR code can be fixed to * return proper error values. */ - if (rv < 0) { - if (rv == -1) - return -EAGAIN; - else /* Will be -3 */ - return -ENOSPC; - } + if (rv < 0) + return _idr_rc_to_errno(rv); *id = rv; return 0; } @@ -696,12 +688,8 @@ int ida_get_new_above(struct ida *ida, int starting_id, int *p_id) restart: /* get vacant slot */ t = idr_get_empty_slot(&ida->idr, idr_id, pa); - if (t < 0) { - if (t == -1) - return -EAGAIN; - else /* will be -3 */ - return -ENOSPC; - } + if (t < 0) + return _idr_rc_to_errno(t); if (t * IDA_BITMAP_BITS >= MAX_ID_BIT) return -ENOSPC; -- cgit v1.2.3 From 3219b3b7456d5cf15ba7b1fe7b1bcf15ce8840e2 Mon Sep 17 00:00:00 2001 From: Nadia Derbey Date: Fri, 25 Jul 2008 01:48:00 -0700 Subject: idr: make idr_get_new* rcu-safe Make the idr_get_new* routines rcu-safe. Signed-off-by: Nadia Derbey Reviewed-by: "Paul E. McKenney" Cc: Manfred Spraul Cc: Jim Houston Cc: Pierre Peiffer Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/idr.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'lib') diff --git a/lib/idr.c b/lib/idr.c index 80ba06f29d36..44ab3b2a4eba 100644 --- a/lib/idr.c +++ b/lib/idr.c @@ -6,6 +6,8 @@ * Modified by George Anzinger to reuse immediately and to use * find bit instructions. Also removed _irq on spinlocks. * + * Modified by Nadia Derbey to make it RCU safe. + * * Small id to pointer translation service. * * It uses a radix tree like structure as a sparse array indexed @@ -96,7 +98,7 @@ static void idr_mark_full(struct idr_layer **pa, int id) * @gfp_mask: memory allocation flags * * This function should be called prior to locking and calling the - * following function. It preallocates enough memory to satisfy + * idr_get_new* functions. It preallocates enough memory to satisfy * the worst possible allocation. * * If the system is REALLY out of memory this function returns 0, @@ -170,7 +172,7 @@ static int sub_alloc(struct idr *idp, int *starting_id, struct idr_layer **pa) new = get_from_free_list(idp); if (!new) return -1; - p->ary[m] = new; + rcu_assign_pointer(p->ary[m], new); p->count++; } pa[l--] = p; @@ -226,7 +228,7 @@ build_up: __set_bit(0, &new->bitmap); p = new; } - idp->top = p; + rcu_assign_pointer(idp->top, p); idp->layers = layers; v = sub_alloc(idp, &id, pa); if (v == IDR_NEED_TO_GROW) @@ -245,7 +247,8 @@ static int idr_get_new_above_int(struct idr *idp, void *ptr, int starting_id) * Successfully found an empty slot. Install the user * pointer and mark the slot full. */ - pa[0]->ary[id & IDR_MASK] = (struct idr_layer *)ptr; + rcu_assign_pointer(pa[0]->ary[id & IDR_MASK], + (struct idr_layer *)ptr); pa[0]->count++; idr_mark_full(pa, id); } @@ -710,7 +713,8 @@ int ida_get_new_above(struct ida *ida, int starting_id, int *p_id) return -EAGAIN; memset(bitmap, 0, sizeof(struct ida_bitmap)); - pa[0]->ary[idr_id & IDR_MASK] = (void *)bitmap; + rcu_assign_pointer(pa[0]->ary[idr_id & IDR_MASK], + (void *)bitmap); pa[0]->count++; } -- cgit v1.2.3 From f9c46d6ea5ce138a886c3a0f10a46130afab75f5 Mon Sep 17 00:00:00 2001 From: Nadia Derbey Date: Fri, 25 Jul 2008 01:48:01 -0700 Subject: idr: make idr_find rcu-safe Make idr_find rcu-safe: it can now be called inside an rcu_read critical section. Signed-off-by: Nadia Derbey Reviewed-by: "Paul E. McKenney" Cc: Manfred Spraul Cc: Jim Houston Cc: Pierre Peiffer Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/idr.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'lib') diff --git a/lib/idr.c b/lib/idr.c index 44ab3b2a4eba..21e12af1f231 100644 --- a/lib/idr.c +++ b/lib/idr.c @@ -456,7 +456,8 @@ EXPORT_SYMBOL(idr_destroy); * return indicates that @id is not valid or you passed %NULL in * idr_get_new(). * - * The caller must serialize idr_find() vs idr_get_new() and idr_remove(). + * This function can be called under rcu_read_lock(), given that the leaf + * pointers lifetimes are correctly managed. */ void *idr_find(struct idr *idp, int id) { @@ -464,7 +465,7 @@ void *idr_find(struct idr *idp, int id) struct idr_layer *p; n = idp->layers * IDR_BITS; - p = idp->top; + p = rcu_dereference(idp->top); /* Mask off upper bits we don't use for the search. */ id &= MAX_ID_MASK; @@ -474,7 +475,7 @@ void *idr_find(struct idr *idp, int id) while (n > 0 && p) { n -= IDR_BITS; - p = p->ary[(id >> n) & IDR_MASK]; + p = rcu_dereference(p->ary[(id >> n) & IDR_MASK]); } return((void *)p); } @@ -507,7 +508,7 @@ int idr_for_each(struct idr *idp, struct idr_layer **paa = &pa[0]; n = idp->layers * IDR_BITS; - p = idp->top; + p = rcu_dereference(idp->top); max = 1 << n; id = 0; @@ -515,7 +516,7 @@ int idr_for_each(struct idr *idp, while (n > 0 && p) { n -= IDR_BITS; *paa++ = p; - p = p->ary[(id >> n) & IDR_MASK]; + p = rcu_dereference(p->ary[(id >> n) & IDR_MASK]); } if (p) { -- cgit v1.2.3 From cf481c20c476ad2c0febdace9ce23f5a4db19582 Mon Sep 17 00:00:00 2001 From: Nadia Derbey Date: Fri, 25 Jul 2008 01:48:02 -0700 Subject: idr: make idr_remove rcu-safe Introduce the free_layer() routine: it is the one that actually frees memory after a grace period has elapsed. Signed-off-by: Nadia Derbey Reviewed-by: "Paul E. McKenney" Cc: Manfred Spraul Cc: Jim Houston Cc: Pierre Peiffer Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/idr.c | 57 +++++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 43 insertions(+), 14 deletions(-) (limited to 'lib') diff --git a/lib/idr.c b/lib/idr.c index 21e12af1f231..3476f8203e97 100644 --- a/lib/idr.c +++ b/lib/idr.c @@ -52,6 +52,19 @@ static struct idr_layer *get_from_free_list(struct idr *idp) return(p); } +static void idr_layer_rcu_free(struct rcu_head *head) +{ + struct idr_layer *layer; + + layer = container_of(head, struct idr_layer, rcu_head); + kmem_cache_free(idr_layer_cache, layer); +} + +static inline void free_layer(struct idr_layer *p) +{ + call_rcu(&p->rcu_head, idr_layer_rcu_free); +} + /* only called when idp->lock is held */ static void __move_to_free_list(struct idr *idp, struct idr_layer *p) { @@ -331,6 +344,7 @@ static void sub_remove(struct idr *idp, int shift, int id) struct idr_layer *p = idp->top; struct idr_layer **pa[MAX_LEVEL]; struct idr_layer ***paa = &pa[0]; + struct idr_layer *to_free; int n; *paa = NULL; @@ -346,13 +360,18 @@ static void sub_remove(struct idr *idp, int shift, int id) n = id & IDR_MASK; if (likely(p != NULL && test_bit(n, &p->bitmap))){ __clear_bit(n, &p->bitmap); - p->ary[n] = NULL; + rcu_assign_pointer(p->ary[n], NULL); + to_free = NULL; while(*paa && ! --((**paa)->count)){ - move_to_free_list(idp, **paa); + if (to_free) + free_layer(to_free); + to_free = **paa; **paa-- = NULL; } if (!*paa) idp->layers = 0; + if (to_free) + free_layer(to_free); } else idr_remove_warning(id); } @@ -365,22 +384,34 @@ static void sub_remove(struct idr *idp, int shift, int id) void idr_remove(struct idr *idp, int id) { struct idr_layer *p; + struct idr_layer *to_free; /* Mask off upper bits we don't use for the search. */ id &= MAX_ID_MASK; sub_remove(idp, (idp->layers - 1) * IDR_BITS, id); if (idp->top && idp->top->count == 1 && (idp->layers > 1) && - idp->top->ary[0]) { // We can drop a layer - + idp->top->ary[0]) { + /* + * Single child at leftmost slot: we can shrink the tree. + * This level is not needed anymore since when layers are + * inserted, they are inserted at the top of the existing + * tree. + */ + to_free = idp->top; p = idp->top->ary[0]; - idp->top->bitmap = idp->top->count = 0; - move_to_free_list(idp, idp->top); - idp->top = p; + rcu_assign_pointer(idp->top, p); --idp->layers; + to_free->bitmap = to_free->count = 0; + free_layer(to_free); } while (idp->id_free_cnt >= IDR_FREE_MAX) { p = get_from_free_list(idp); + /* + * Note: we don't call the rcu callback here, since the only + * layers that fall into the freelist are those that have been + * preallocated. + */ kmem_cache_free(idr_layer_cache, p); } return; @@ -421,15 +452,13 @@ void idr_remove_all(struct idr *idp) id += 1 << n; while (n < fls(id)) { - if (p) { - memset(p, 0, sizeof *p); - move_to_free_list(idp, p); - } + if (p) + free_layer(p); n += IDR_BITS; p = *--paa; } } - idp->top = NULL; + rcu_assign_pointer(idp->top, NULL); idp->layers = 0; } EXPORT_SYMBOL(idr_remove_all); @@ -546,7 +575,7 @@ EXPORT_SYMBOL(idr_for_each); * A -ENOENT return indicates that @id was not found. * A -EINVAL return indicates that @id was not within valid constraints. * - * The caller must serialize vs idr_find(), idr_get_new(), and idr_remove(). + * The caller must serialize with writers. */ void *idr_replace(struct idr *idp, void *ptr, int id) { @@ -572,7 +601,7 @@ void *idr_replace(struct idr *idp, void *ptr, int id) return ERR_PTR(-ENOENT); old_p = p->ary[n]; - p->ary[n] = ptr; + rcu_assign_pointer(p->ary[n], ptr); return old_p; } -- cgit v1.2.3 From 8d8bb39b9eba32dd70e87fd5ad5c5dd4ba118e06 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Fri, 25 Jul 2008 19:44:49 -0700 Subject: dma-mapping: add the device argument to dma_mapping_error() Add per-device dma_mapping_ops support for CONFIG_X86_64 as POWER architecture does: This enables us to cleanly fix the Calgary IOMMU issue that some devices are not behind the IOMMU (http://lkml.org/lkml/2008/5/8/423). I think that per-device dma_mapping_ops support would be also helpful for KVM people to support PCI passthrough but Andi thinks that this makes it difficult to support the PCI passthrough (see the above thread). So I CC'ed this to KVM camp. Comments are appreciated. A pointer to dma_mapping_ops to struct dev_archdata is added. If the pointer is non NULL, DMA operations in asm/dma-mapping.h use it. If it's NULL, the system-wide dma_ops pointer is used as before. If it's useful for KVM people, I plan to implement a mechanism to register a hook called when a new pci (or dma capable) device is created (it works with hot plugging). It enables IOMMUs to set up an appropriate dma_mapping_ops per device. The major obstacle is that dma_mapping_error doesn't take a pointer to the device unlike other DMA operations. So x86 can't have dma_mapping_ops per device. Note all the POWER IOMMUs use the same dma_mapping_error function so this is not a problem for POWER but x86 IOMMUs use different dma_mapping_error functions. The first patch adds the device argument to dma_mapping_error. The patch is trivial but large since it touches lots of drivers and dma-mapping.h in all the architecture. This patch: dma_mapping_error() doesn't take a pointer to the device unlike other DMA operations. So we can't have dma_mapping_ops per device. Note that POWER already has dma_mapping_ops per device but all the POWER IOMMUs use the same dma_mapping_error function. x86 IOMMUs use device argument. [akpm@linux-foundation.org: fix sge] [akpm@linux-foundation.org: fix svc_rdma] [akpm@linux-foundation.org: build fix] [akpm@linux-foundation.org: fix bnx2x] [akpm@linux-foundation.org: fix s2io] [akpm@linux-foundation.org: fix pasemi_mac] [akpm@linux-foundation.org: fix sdhci] [akpm@linux-foundation.org: build fix] [akpm@linux-foundation.org: fix sparc] [akpm@linux-foundation.org: fix ibmvscsi] Signed-off-by: FUJITA Tomonori Cc: Muli Ben-Yehuda Cc: Andi Kleen Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Avi Kivity Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/swiotlb.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/swiotlb.c b/lib/swiotlb.c index d568894df8cc..977edbdbc1de 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -492,7 +492,7 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size, */ dma_addr_t handle; handle = swiotlb_map_single(NULL, NULL, size, DMA_FROM_DEVICE); - if (swiotlb_dma_mapping_error(handle)) + if (swiotlb_dma_mapping_error(hwdev, handle)) return NULL; ret = bus_to_virt(handle); @@ -824,7 +824,7 @@ swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg, } int -swiotlb_dma_mapping_error(dma_addr_t dma_addr) +swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr) { return (dma_addr == virt_to_bus(io_tlb_overflow_buffer)); } -- cgit v1.2.3 From 47feff2c8eefe85099f87c43d3096855f0085ca0 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 25 Jul 2008 19:45:29 -0700 Subject: radix-tree: add gang_lookup_slot, gang_lookup_slot_tag Introduce gang_lookup_slot() and gang_lookup_slot_tag() functions, which are used by lockless pagecache. Signed-off-by: Nick Piggin Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Hugh Dickins Cc: "Paul E. McKenney" Reviewed-by: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/radix-tree.c | 178 ++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 155 insertions(+), 23 deletions(-) (limited to 'lib') diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 56ec21a7f73d..9c4f1ffa2864 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -359,18 +359,17 @@ EXPORT_SYMBOL(radix_tree_insert); * Returns: the slot corresponding to the position @index in the * radix tree @root. This is useful for update-if-exists operations. * - * This function cannot be called under rcu_read_lock, it must be - * excluded from writers, as must the returned slot for subsequent - * use by radix_tree_deref_slot() and radix_tree_replace slot. - * Caller must hold tree write locked across slot lookup and - * replace. + * This function can be called under rcu_read_lock iff the slot is not + * modified by radix_tree_replace_slot, otherwise it must be called + * exclusive from other writers. Any dereference of the slot must be done + * using radix_tree_deref_slot. */ void **radix_tree_lookup_slot(struct radix_tree_root *root, unsigned long index) { unsigned int height, shift; struct radix_tree_node *node, **slot; - node = root->rnode; + node = rcu_dereference(root->rnode); if (node == NULL) return NULL; @@ -390,7 +389,7 @@ void **radix_tree_lookup_slot(struct radix_tree_root *root, unsigned long index) do { slot = (struct radix_tree_node **) (node->slots + ((index>>shift) & RADIX_TREE_MAP_MASK)); - node = *slot; + node = rcu_dereference(*slot); if (node == NULL) return NULL; @@ -667,7 +666,7 @@ unsigned long radix_tree_next_hole(struct radix_tree_root *root, EXPORT_SYMBOL(radix_tree_next_hole); static unsigned int -__lookup(struct radix_tree_node *slot, void **results, unsigned long index, +__lookup(struct radix_tree_node *slot, void ***results, unsigned long index, unsigned int max_items, unsigned long *next_index) { unsigned int nr_found = 0; @@ -701,11 +700,9 @@ __lookup(struct radix_tree_node *slot, void **results, unsigned long index, /* Bottom level: grab some items */ for (i = index & RADIX_TREE_MAP_MASK; i < RADIX_TREE_MAP_SIZE; i++) { - struct radix_tree_node *node; index++; - node = slot->slots[i]; - if (node) { - results[nr_found++] = rcu_dereference(node); + if (slot->slots[i]) { + results[nr_found++] = &(slot->slots[i]); if (nr_found == max_items) goto out; } @@ -759,13 +756,22 @@ radix_tree_gang_lookup(struct radix_tree_root *root, void **results, ret = 0; while (ret < max_items) { - unsigned int nr_found; + unsigned int nr_found, slots_found, i; unsigned long next_index; /* Index of next search */ if (cur_index > max_index) break; - nr_found = __lookup(node, results + ret, cur_index, + slots_found = __lookup(node, (void ***)results + ret, cur_index, max_items - ret, &next_index); + nr_found = 0; + for (i = 0; i < slots_found; i++) { + struct radix_tree_node *slot; + slot = *(((void ***)results)[ret + i]); + if (!slot) + continue; + results[ret + nr_found] = rcu_dereference(slot); + nr_found++; + } ret += nr_found; if (next_index == 0) break; @@ -776,12 +782,71 @@ radix_tree_gang_lookup(struct radix_tree_root *root, void **results, } EXPORT_SYMBOL(radix_tree_gang_lookup); +/** + * radix_tree_gang_lookup_slot - perform multiple slot lookup on radix tree + * @root: radix tree root + * @results: where the results of the lookup are placed + * @first_index: start the lookup from this key + * @max_items: place up to this many items at *results + * + * Performs an index-ascending scan of the tree for present items. Places + * their slots at *@results and returns the number of items which were + * placed at *@results. + * + * The implementation is naive. + * + * Like radix_tree_gang_lookup as far as RCU and locking goes. Slots must + * be dereferenced with radix_tree_deref_slot, and if using only RCU + * protection, radix_tree_deref_slot may fail requiring a retry. + */ +unsigned int +radix_tree_gang_lookup_slot(struct radix_tree_root *root, void ***results, + unsigned long first_index, unsigned int max_items) +{ + unsigned long max_index; + struct radix_tree_node *node; + unsigned long cur_index = first_index; + unsigned int ret; + + node = rcu_dereference(root->rnode); + if (!node) + return 0; + + if (!radix_tree_is_indirect_ptr(node)) { + if (first_index > 0) + return 0; + results[0] = (void **)&root->rnode; + return 1; + } + node = radix_tree_indirect_to_ptr(node); + + max_index = radix_tree_maxindex(node->height); + + ret = 0; + while (ret < max_items) { + unsigned int slots_found; + unsigned long next_index; /* Index of next search */ + + if (cur_index > max_index) + break; + slots_found = __lookup(node, results + ret, cur_index, + max_items - ret, &next_index); + ret += slots_found; + if (next_index == 0) + break; + cur_index = next_index; + } + + return ret; +} +EXPORT_SYMBOL(radix_tree_gang_lookup_slot); + /* * FIXME: the two tag_get()s here should use find_next_bit() instead of * open-coding the search. */ static unsigned int -__lookup_tag(struct radix_tree_node *slot, void **results, unsigned long index, +__lookup_tag(struct radix_tree_node *slot, void ***results, unsigned long index, unsigned int max_items, unsigned long *next_index, unsigned int tag) { unsigned int nr_found = 0; @@ -811,11 +876,9 @@ __lookup_tag(struct radix_tree_node *slot, void **results, unsigned long index, unsigned long j = index & RADIX_TREE_MAP_MASK; for ( ; j < RADIX_TREE_MAP_SIZE; j++) { - struct radix_tree_node *node; index++; if (!tag_get(slot, tag, j)) continue; - node = slot->slots[j]; /* * Even though the tag was found set, we need to * recheck that we have a non-NULL node, because @@ -826,9 +889,8 @@ __lookup_tag(struct radix_tree_node *slot, void **results, unsigned long index, * lookup ->slots[x] without a lock (ie. can't * rely on its value remaining the same). */ - if (node) { - node = rcu_dereference(node); - results[nr_found++] = node; + if (slot->slots[j]) { + results[nr_found++] = &(slot->slots[j]); if (nr_found == max_items) goto out; } @@ -887,13 +949,22 @@ radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results, ret = 0; while (ret < max_items) { - unsigned int nr_found; + unsigned int nr_found, slots_found, i; unsigned long next_index; /* Index of next search */ if (cur_index > max_index) break; - nr_found = __lookup_tag(node, results + ret, cur_index, - max_items - ret, &next_index, tag); + slots_found = __lookup_tag(node, (void ***)results + ret, + cur_index, max_items - ret, &next_index, tag); + nr_found = 0; + for (i = 0; i < slots_found; i++) { + struct radix_tree_node *slot; + slot = *(((void ***)results)[ret + i]); + if (!slot) + continue; + results[ret + nr_found] = rcu_dereference(slot); + nr_found++; + } ret += nr_found; if (next_index == 0) break; @@ -904,6 +975,67 @@ radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results, } EXPORT_SYMBOL(radix_tree_gang_lookup_tag); +/** + * radix_tree_gang_lookup_tag_slot - perform multiple slot lookup on a + * radix tree based on a tag + * @root: radix tree root + * @results: where the results of the lookup are placed + * @first_index: start the lookup from this key + * @max_items: place up to this many items at *results + * @tag: the tag index (< RADIX_TREE_MAX_TAGS) + * + * Performs an index-ascending scan of the tree for present items which + * have the tag indexed by @tag set. Places the slots at *@results and + * returns the number of slots which were placed at *@results. + */ +unsigned int +radix_tree_gang_lookup_tag_slot(struct radix_tree_root *root, void ***results, + unsigned long first_index, unsigned int max_items, + unsigned int tag) +{ + struct radix_tree_node *node; + unsigned long max_index; + unsigned long cur_index = first_index; + unsigned int ret; + + /* check the root's tag bit */ + if (!root_tag_get(root, tag)) + return 0; + + node = rcu_dereference(root->rnode); + if (!node) + return 0; + + if (!radix_tree_is_indirect_ptr(node)) { + if (first_index > 0) + return 0; + results[0] = (void **)&root->rnode; + return 1; + } + node = radix_tree_indirect_to_ptr(node); + + max_index = radix_tree_maxindex(node->height); + + ret = 0; + while (ret < max_items) { + unsigned int slots_found; + unsigned long next_index; /* Index of next search */ + + if (cur_index > max_index) + break; + slots_found = __lookup_tag(node, results + ret, + cur_index, max_items - ret, &next_index, tag); + ret += slots_found; + if (next_index == 0) + break; + cur_index = next_index; + } + + return ret; +} +EXPORT_SYMBOL(radix_tree_gang_lookup_tag_slot); + + /** * radix_tree_shrink - shrink height of a radix tree to minimal * @root radix tree root -- cgit v1.2.3 From 51cc50685a4275c6a02653670af9f108a64e01cf Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 25 Jul 2008 19:45:34 -0700 Subject: SL*B: drop kmem cache argument from constructor Kmem cache passed to constructor is only needed for constructors that are themselves multiplexeres. Nobody uses this "feature", nor does anybody uses passed kmem cache in non-trivial way, so pass only pointer to object. Non-trivial places are: arch/powerpc/mm/init_64.c arch/powerpc/mm/hugetlbpage.c This is flag day, yes. Signed-off-by: Alexey Dobriyan Acked-by: Pekka Enberg Acked-by: Christoph Lameter Cc: Jon Tollefson Cc: Nick Piggin Cc: Matt Mackall [akpm@linux-foundation.org: fix arch/powerpc/mm/hugetlbpage.c] [akpm@linux-foundation.org: fix mm/slab.c] [akpm@linux-foundation.org: fix ubifs] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/idr.c | 2 +- lib/radix-tree.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/idr.c b/lib/idr.c index 3476f8203e97..e728c7fccc4d 100644 --- a/lib/idr.c +++ b/lib/idr.c @@ -607,7 +607,7 @@ void *idr_replace(struct idr *idp, void *ptr, int id) } EXPORT_SYMBOL(idr_replace); -static void idr_cache_ctor(struct kmem_cache *idr_layer_cache, void *idr_layer) +static void idr_cache_ctor(void *idr_layer) { memset(idr_layer, 0, sizeof(struct idr_layer)); } diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 9c4f1ffa2864..be86b32bc874 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -1183,7 +1183,7 @@ int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag) EXPORT_SYMBOL(radix_tree_tagged); static void -radix_tree_node_ctor(struct kmem_cache *cachep, void *node) +radix_tree_node_ctor(void *node) { memset(node, 0, sizeof(struct radix_tree_node)); } -- cgit v1.2.3 From 5cd2b459d326a424671dcd95f038649f7bf7cb96 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Fri, 25 Jul 2008 19:45:39 -0700 Subject: Use WARN() in lib/ Use WARN() instead of a printk+WARN_ON() pair; this way the message becomes part of the warning section for better reporting/collection. In addition, one of the if() clauses collapes into the WARN() entirely now. Signed-off-by: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/debugobjects.c | 15 +++++---------- lib/iomap.c | 3 +-- lib/kobject_uevent.c | 6 ++---- lib/plist.c | 13 +++++++------ 4 files changed, 15 insertions(+), 22 deletions(-) (limited to 'lib') diff --git a/lib/debugobjects.c b/lib/debugobjects.c index f86196390cfd..45a6bde762d1 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -205,9 +205,8 @@ static void debug_print_object(struct debug_obj *obj, char *msg) if (limit < 5 && obj->descr != descr_test) { limit++; - printk(KERN_ERR "ODEBUG: %s %s object type: %s\n", msg, + WARN(1, KERN_ERR "ODEBUG: %s %s object type: %s\n", msg, obj_states[obj->state], obj->descr->name); - WARN_ON(1); } debug_objects_warnings++; } @@ -733,26 +732,22 @@ check_results(void *addr, enum debug_obj_state state, int fixups, int warnings) obj = lookup_object(addr, db); if (!obj && state != ODEBUG_STATE_NONE) { - printk(KERN_ERR "ODEBUG: selftest object not found\n"); - WARN_ON(1); + WARN(1, KERN_ERR "ODEBUG: selftest object not found\n"); goto out; } if (obj && obj->state != state) { - printk(KERN_ERR "ODEBUG: selftest wrong state: %d != %d\n", + WARN(1, KERN_ERR "ODEBUG: selftest wrong state: %d != %d\n", obj->state, state); - WARN_ON(1); goto out; } if (fixups != debug_objects_fixups) { - printk(KERN_ERR "ODEBUG: selftest fixups failed %d != %d\n", + WARN(1, KERN_ERR "ODEBUG: selftest fixups failed %d != %d\n", fixups, debug_objects_fixups); - WARN_ON(1); goto out; } if (warnings != debug_objects_warnings) { - printk(KERN_ERR "ODEBUG: selftest warnings failed %d != %d\n", + WARN(1, KERN_ERR "ODEBUG: selftest warnings failed %d != %d\n", warnings, debug_objects_warnings); - WARN_ON(1); goto out; } res = 0; diff --git a/lib/iomap.c b/lib/iomap.c index 37a3ea4cac9f..d32229385151 100644 --- a/lib/iomap.c +++ b/lib/iomap.c @@ -40,8 +40,7 @@ static void bad_io_access(unsigned long port, const char *access) static int count = 10; if (count) { count--; - printk(KERN_ERR "Bad IO access at port %#lx (%s)\n", port, access); - WARN_ON(1); + WARN(1, KERN_ERR "Bad IO access at port %#lx (%s)\n", port, access); } } diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c index 9f8d599459d1..3f914725bda8 100644 --- a/lib/kobject_uevent.c +++ b/lib/kobject_uevent.c @@ -285,8 +285,7 @@ int add_uevent_var(struct kobj_uevent_env *env, const char *format, ...) int len; if (env->envp_idx >= ARRAY_SIZE(env->envp)) { - printk(KERN_ERR "add_uevent_var: too many keys\n"); - WARN_ON(1); + WARN(1, KERN_ERR "add_uevent_var: too many keys\n"); return -ENOMEM; } @@ -297,8 +296,7 @@ int add_uevent_var(struct kobj_uevent_env *env, const char *format, ...) va_end(args); if (len >= (sizeof(env->buf) - env->buflen)) { - printk(KERN_ERR "add_uevent_var: buffer size too small\n"); - WARN_ON(1); + WARN(1, KERN_ERR "add_uevent_var: buffer size too small\n"); return -ENOMEM; } diff --git a/lib/plist.c b/lib/plist.c index 3074a02272f3..d6c64a824e1d 100644 --- a/lib/plist.c +++ b/lib/plist.c @@ -31,12 +31,13 @@ static void plist_check_prev_next(struct list_head *t, struct list_head *p, struct list_head *n) { - if (n->prev != p || p->next != n) { - printk("top: %p, n: %p, p: %p\n", t, t->next, t->prev); - printk("prev: %p, n: %p, p: %p\n", p, p->next, p->prev); - printk("next: %p, n: %p, p: %p\n", n, n->next, n->prev); - WARN_ON(1); - } + WARN(n->prev != p || p->next != n, + "top: %p, n: %p, p: %p\n" + "prev: %p, n: %p, p: %p\n" + "next: %p, n: %p, p: %p\n", + t, t->next, t->prev, + p, p->next, p->prev, + n, n->next, n->prev); } static void plist_check_list(struct list_head *top) -- cgit v1.2.3 From bbc698636ed48b6fcd323964e0f847a6a796325d Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Fri, 25 Jul 2008 19:45:59 -0700 Subject: task_current_syscall This adds the new function task_current_syscall() on machines where the asm/syscall.h interface is supported (CONFIG_HAVE_ARCH_TRACEHOOK). It's exported for modules to use in the future. This function safely samples the state of a blocked thread to collect what system call it is blocked in, and the six system call argument registers. Signed-off-by: Roland McGrath Cc: Oleg Nesterov Reviewed-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Makefile | 2 ++ lib/syscall.c | 75 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+) create mode 100644 lib/syscall.c (limited to 'lib') diff --git a/lib/Makefile b/lib/Makefile index 9085ad6fa53d..942c7250f603 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -78,6 +78,8 @@ lib-$(CONFIG_GENERIC_BUG) += bug.o obj-$(CONFIG_HAVE_LMB) += lmb.o +obj-$(CONFIG_HAVE_ARCH_TRACEHOOK) += syscall.o + hostprogs-y := gen_crc32table clean-files := crc32table.h diff --git a/lib/syscall.c b/lib/syscall.c new file mode 100644 index 000000000000..a4f7067f72fa --- /dev/null +++ b/lib/syscall.c @@ -0,0 +1,75 @@ +#include +#include +#include +#include + +static int collect_syscall(struct task_struct *target, long *callno, + unsigned long args[6], unsigned int maxargs, + unsigned long *sp, unsigned long *pc) +{ + struct pt_regs *regs = task_pt_regs(target); + if (unlikely(!regs)) + return -EAGAIN; + + *sp = user_stack_pointer(regs); + *pc = instruction_pointer(regs); + + *callno = syscall_get_nr(target, regs); + if (*callno != -1L && maxargs > 0) + syscall_get_arguments(target, regs, 0, maxargs, args); + + return 0; +} + +/** + * task_current_syscall - Discover what a blocked task is doing. + * @target: thread to examine + * @callno: filled with system call number or -1 + * @args: filled with @maxargs system call arguments + * @maxargs: number of elements in @args to fill + * @sp: filled with user stack pointer + * @pc: filled with user PC + * + * If @target is blocked in a system call, returns zero with *@callno + * set to the the call's number and @args filled in with its arguments. + * Registers not used for system call arguments may not be available and + * it is not kosher to use &struct user_regset calls while the system + * call is still in progress. Note we may get this result if @target + * has finished its system call but not yet returned to user mode, such + * as when it's stopped for signal handling or syscall exit tracing. + * + * If @target is blocked in the kernel during a fault or exception, + * returns zero with *@callno set to -1 and does not fill in @args. + * If so, it's now safe to examine @target using &struct user_regset + * get() calls as long as we're sure @target won't return to user mode. + * + * Returns -%EAGAIN if @target does not remain blocked. + * + * Returns -%EINVAL if @maxargs is too large (maximum is six). + */ +int task_current_syscall(struct task_struct *target, long *callno, + unsigned long args[6], unsigned int maxargs, + unsigned long *sp, unsigned long *pc) +{ + long state; + unsigned long ncsw; + + if (unlikely(maxargs > 6)) + return -EINVAL; + + if (target == current) + return collect_syscall(target, callno, args, maxargs, sp, pc); + + state = target->state; + if (unlikely(!state)) + return -EAGAIN; + + ncsw = wait_task_inactive(target, state); + if (unlikely(!ncsw) || + unlikely(collect_syscall(target, callno, args, maxargs, sp, pc)) || + unlikely(wait_task_inactive(target, state) != ncsw)) + return -EAGAIN; + + return 0; +} +EXPORT_SYMBOL_GPL(task_current_syscall); -- cgit v1.2.3 From 454c63b02e530f10b4345343f63596dd705888d0 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 25 Jul 2008 19:46:07 -0700 Subject: lib: generic show_mem() This implements a platform-independent version of show_mem(). Signed-off-by: Johannes Weiner Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Haavard Skinnemoen Cc: Bryan Wu Cc: Chris Zankel Cc: Ingo Molnar Cc: Jeff Dike Cc: David S. Miller Cc: Paul Mundt Cc: Heiko Carstens Cc: Martin Schwidefsky Cc: David Howells Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Yoshinori Sato Cc: Ralf Baechle Cc: Greg Ungerer Cc: Geert Uytterhoeven Cc: Roman Zippel Cc: Hirokazu Takata Cc: Mikael Starvik Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Makefile | 2 +- lib/show_mem.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+), 1 deletion(-) create mode 100644 lib/show_mem.c (limited to 'lib') diff --git a/lib/Makefile b/lib/Makefile index 942c7250f603..3b1f94bbe9de 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -11,7 +11,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \ rbtree.o radix-tree.o dump_stack.o \ idr.o int_sqrt.o extable.o prio_tree.o \ sha1.o irq_regs.o reciprocal_div.o argv_split.o \ - proportions.o prio_heap.o ratelimit.o + proportions.o prio_heap.o ratelimit.o show_mem.o lib-$(CONFIG_MMU) += ioremap.o lib-$(CONFIG_SMP) += cpumask.o diff --git a/lib/show_mem.c b/lib/show_mem.c new file mode 100644 index 000000000000..238e72a18ce1 --- /dev/null +++ b/lib/show_mem.c @@ -0,0 +1,63 @@ +/* + * Generic show_mem() implementation + * + * Copyright (C) 2008 Johannes Weiner + * All code subject to the GPL version 2. + */ + +#include +#include +#include + +void show_mem(void) +{ + pg_data_t *pgdat; + unsigned long total = 0, reserved = 0, shared = 0, + nonshared = 0, highmem = 0; + + printk(KERN_INFO "Mem-Info:\n"); + show_free_areas(); + + for_each_online_pgdat(pgdat) { + unsigned long i, flags; + + pgdat_resize_lock(pgdat, &flags); + for (i = 0; i < pgdat->node_spanned_pages; i++) { + struct page *page; + unsigned long pfn = pgdat->node_start_pfn + i; + + if (unlikely(!(i % MAX_ORDER_NR_PAGES))) + touch_nmi_watchdog(); + + if (!pfn_valid(pfn)) + continue; + + page = pfn_to_page(pfn); + + if (PageHighMem(page)) + highmem++; + + if (PageReserved(page)) + reserved++; + else if (page_count(page) == 1) + nonshared++; + else if (page_count(page) > 1) + shared += page_count(page) - 1; + + total++; + } + pgdat_resize_unlock(pgdat, &flags); + } + + printk(KERN_INFO "%lu pages RAM\n", total); +#ifdef CONFIG_HIGHMEM + printk(KERN_INFO "%lu pages HighMem\n", highmem); +#endif + printk(KERN_INFO "%lu pages reserved\n", reserved); + printk(KERN_INFO "%lu pages shared\n", shared); + printk(KERN_INFO "%lu pages non-shared\n", nonshared); +#ifdef CONFIG_QUICKLIST + printk(KERN_INFO "%lu pages in pagetable cache\n", + quicklist_total_size()); +#endif +} -- cgit v1.2.3