diff options
Diffstat (limited to 'kernel')
41 files changed, 1318 insertions, 456 deletions
diff --git a/kernel/Kconfig.freezer b/kernel/Kconfig.freezer new file mode 100644 index 000000000000..a3bb4cb52539 --- /dev/null +++ b/kernel/Kconfig.freezer @@ -0,0 +1,2 @@ +config FREEZER + def_bool PM_SLEEP || CGROUP_FREEZER diff --git a/kernel/Makefile b/kernel/Makefile index 4e1d7df7c3e2..066550aa61c5 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -24,6 +24,7 @@ CFLAGS_REMOVE_sched_clock.o = -pg CFLAGS_REMOVE_sched.o = -mno-spe -pg endif +obj-$(CONFIG_FREEZER) += freezer.o obj-$(CONFIG_PROFILING) += profile.o obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o obj-$(CONFIG_STACKTRACE) += stacktrace.o @@ -55,6 +56,7 @@ obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_CGROUPS) += cgroup.o obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o +obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o obj-$(CONFIG_CPUSETS) += cpuset.o obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o obj-$(CONFIG_UTS_NS) += utsname.o diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a0123d75ec9a..046c1609606b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -241,7 +241,6 @@ static void unlink_css_set(struct css_set *cg) struct cg_cgroup_link *link; struct cg_cgroup_link *saved_link; - write_lock(&css_set_lock); hlist_del(&cg->hlist); css_set_count--; @@ -251,16 +250,25 @@ static void unlink_css_set(struct css_set *cg) list_del(&link->cgrp_link_list); kfree(link); } - - write_unlock(&css_set_lock); } -static void __release_css_set(struct kref *k, int taskexit) +static void __put_css_set(struct css_set *cg, int taskexit) { int i; - struct css_set *cg = container_of(k, struct css_set, ref); - + /* + * Ensure that the refcount doesn't hit zero while any readers + * can see it. Similar to atomic_dec_and_lock(), but for an + * rwlock + */ + if (atomic_add_unless(&cg->refcount, -1, 1)) + return; + write_lock(&css_set_lock); + if (!atomic_dec_and_test(&cg->refcount)) { + write_unlock(&css_set_lock); + return; + } unlink_css_set(cg); + write_unlock(&css_set_lock); rcu_read_lock(); for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { @@ -276,32 +284,22 @@ static void __release_css_set(struct kref *k, int taskexit) kfree(cg); } -static void release_css_set(struct kref *k) -{ - __release_css_set(k, 0); -} - -static void release_css_set_taskexit(struct kref *k) -{ - __release_css_set(k, 1); -} - /* * refcounted get/put for css_set objects */ static inline void get_css_set(struct css_set *cg) { - kref_get(&cg->ref); + atomic_inc(&cg->refcount); } static inline void put_css_set(struct css_set *cg) { - kref_put(&cg->ref, release_css_set); + __put_css_set(cg, 0); } static inline void put_css_set_taskexit(struct css_set *cg) { - kref_put(&cg->ref, release_css_set_taskexit); + __put_css_set(cg, 1); } /* @@ -427,7 +425,7 @@ static struct css_set *find_css_set( return NULL; } - kref_init(&res->ref); + atomic_set(&res->refcount, 1); INIT_LIST_HEAD(&res->cg_links); INIT_LIST_HEAD(&res->tasks); INIT_HLIST_NODE(&res->hlist); @@ -870,6 +868,14 @@ static struct super_operations cgroup_ops = { .remount_fs = cgroup_remount, }; +static void init_cgroup_housekeeping(struct cgroup *cgrp) +{ + INIT_LIST_HEAD(&cgrp->sibling); + INIT_LIST_HEAD(&cgrp->children); + INIT_LIST_HEAD(&cgrp->css_sets); + INIT_LIST_HEAD(&cgrp->release_list); + init_rwsem(&cgrp->pids_mutex); +} static void init_cgroup_root(struct cgroupfs_root *root) { struct cgroup *cgrp = &root->top_cgroup; @@ -878,10 +884,7 @@ static void init_cgroup_root(struct cgroupfs_root *root) root->number_of_cgroups = 1; cgrp->root = root; cgrp->top_cgroup = cgrp; - INIT_LIST_HEAD(&cgrp->sibling); - INIT_LIST_HEAD(&cgrp->children); - INIT_LIST_HEAD(&cgrp->css_sets); - INIT_LIST_HEAD(&cgrp->release_list); + init_cgroup_housekeeping(cgrp); } static int cgroup_test_super(struct super_block *sb, void *data) @@ -1728,7 +1731,7 @@ int cgroup_task_count(const struct cgroup *cgrp) read_lock(&css_set_lock); list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) { - count += atomic_read(&link->cg->ref.refcount); + count += atomic_read(&link->cg->refcount); } read_unlock(&css_set_lock); return count; @@ -1997,16 +2000,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) * but we cannot guarantee that the information we produce is correct * unless we produce it entirely atomically. * - * Upon tasks file open(), a struct ctr_struct is allocated, that - * will have a pointer to an array (also allocated here). The struct - * ctr_struct * is stored in file->private_data. Its resources will - * be freed by release() when the file is closed. The array is used - * to sprintf the PIDs and then used by read(). */ -struct ctr_struct { - char *buf; - int bufsz; -}; /* * Load into 'pidarray' up to 'npids' of the tasks using cgroup @@ -2088,42 +2082,132 @@ static int cmppid(const void *a, const void *b) return *(pid_t *)a - *(pid_t *)b; } + /* - * Convert array 'a' of 'npids' pid_t's to a string of newline separated - * decimal pids in 'buf'. Don't write more than 'sz' chars, but return - * count 'cnt' of how many chars would be written if buf were large enough. + * seq_file methods for the "tasks" file. The seq_file position is the + * next pid to display; the seq_file iterator is a pointer to the pid + * in the cgroup->tasks_pids array. */ -static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids) + +static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos) { - int cnt = 0; - int i; + /* + * Initially we receive a position value that corresponds to + * one more than the last pid shown (or 0 on the first call or + * after a seek to the start). Use a binary-search to find the + * next pid to display, if any + */ + struct cgroup *cgrp = s->private; + int index = 0, pid = *pos; + int *iter; + + down_read(&cgrp->pids_mutex); + if (pid) { + int end = cgrp->pids_length; + int i; + while (index < end) { + int mid = (index + end) / 2; + if (cgrp->tasks_pids[mid] == pid) { + index = mid; + break; + } else if (cgrp->tasks_pids[mid] <= pid) + index = mid + 1; + else + end = mid; + } + } + /* If we're off the end of the array, we're done */ + if (index >= cgrp->pids_length) + return NULL; + /* Update the abstract position to be the actual pid that we found */ + iter = cgrp->tasks_pids + index; + *pos = *iter; + return iter; +} + +static void cgroup_tasks_stop(struct seq_file *s, void *v) +{ + struct cgroup *cgrp = s->private; + up_read(&cgrp->pids_mutex); +} + +static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct cgroup *cgrp = s->private; + int *p = v; + int *end = cgrp->tasks_pids + cgrp->pids_length; + + /* + * Advance to the next pid in the array. If this goes off the + * end, we're done + */ + p++; + if (p >= end) { + return NULL; + } else { + *pos = *p; + return p; + } +} + +static int cgroup_tasks_show(struct seq_file *s, void *v) +{ + return seq_printf(s, "%d\n", *(int *)v); +} + +static struct seq_operations cgroup_tasks_seq_operations = { + .start = cgroup_tasks_start, + .stop = cgroup_tasks_stop, + .next = cgroup_tasks_next, + .show = cgroup_tasks_show, +}; + +static void release_cgroup_pid_array(struct cgroup *cgrp) +{ + down_write(&cgrp->pids_mutex); + BUG_ON(!cgrp->pids_use_count); + if (!--cgrp->pids_use_count) { + kfree(cgrp->tasks_pids); + cgrp->tasks_pids = NULL; + cgrp->pids_length = 0; + } + up_write(&cgrp->pids_mutex); +} + +static int cgroup_tasks_release(struct inode *inode, struct file *file) +{ + struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); - for (i = 0; i < npids; i++) - cnt += snprintf(buf + cnt, max(sz - cnt, 0), "%d\n", a[i]); - return cnt; + if (!(file->f_mode & FMODE_READ)) + return 0; + + release_cgroup_pid_array(cgrp); + return seq_release(inode, file); } +static struct file_operations cgroup_tasks_operations = { + .read = seq_read, + .llseek = seq_lseek, + .write = cgroup_file_write, + .release = cgroup_tasks_release, +}; + /* - * Handle an open on 'tasks' file. Prepare a buffer listing the + * Handle an open on 'tasks' file. Prepare an array containing the * process id's of tasks currently attached to the cgroup being opened. - * - * Does not require any specific cgroup mutexes, and does not take any. */ + static int cgroup_tasks_open(struct inode *unused, struct file *file) { struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); - struct ctr_struct *ctr; pid_t *pidarray; int npids; - char c; + int retval; + /* Nothing to do for write-only files */ if (!(file->f_mode & FMODE_READ)) return 0; - ctr = kmalloc(sizeof(*ctr), GFP_KERNEL); - if (!ctr) - goto err0; - /* * If cgroup gets more users after we read count, we won't have * enough space - tough. This race is indistinguishable to the @@ -2131,57 +2215,31 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file) * show up until sometime later on. */ npids = cgroup_task_count(cgrp); - if (npids) { - pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL); - if (!pidarray) - goto err1; - - npids = pid_array_load(pidarray, npids, cgrp); - sort(pidarray, npids, sizeof(pid_t), cmppid, NULL); - - /* Call pid_array_to_buf() twice, first just to get bufsz */ - ctr->bufsz = pid_array_to_buf(&c, sizeof(c), pidarray, npids) + 1; - ctr->buf = kmalloc(ctr->bufsz, GFP_KERNEL); - if (!ctr->buf) - goto err2; - ctr->bufsz = pid_array_to_buf(ctr->buf, ctr->bufsz, pidarray, npids); - - kfree(pidarray); - } else { - ctr->buf = NULL; - ctr->bufsz = 0; - } - file->private_data = ctr; - return 0; - -err2: - kfree(pidarray); -err1: - kfree(ctr); -err0: - return -ENOMEM; -} - -static ssize_t cgroup_tasks_read(struct cgroup *cgrp, - struct cftype *cft, - struct file *file, char __user *buf, - size_t nbytes, loff_t *ppos) -{ - struct ctr_struct *ctr = file->private_data; + pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL); + if (!pidarray) + return -ENOMEM; + npids = pid_array_load(pidarray, npids, cgrp); + sort(pidarray, npids, sizeof(pid_t), cmppid, NULL); - return simple_read_from_buffer(buf, nbytes, ppos, ctr->buf, ctr->bufsz); -} + /* + * Store the array in the cgroup, freeing the old + * array if necessary + */ + down_write(&cgrp->pids_mutex); + kfree(cgrp->tasks_pids); + cgrp->tasks_pids = pidarray; + cgrp->pids_length = npids; + cgrp->pids_use_count++; + up_write(&cgrp->pids_mutex); -static int cgroup_tasks_release(struct inode *unused_inode, - struct file *file) -{ - struct ctr_struct *ctr; + file->f_op = &cgroup_tasks_operations; - if (file->f_mode & FMODE_READ) { - ctr = file->private_data; - kfree(ctr->buf); - kfree(ctr); + retval = seq_open(file, &cgroup_tasks_seq_operations); + if (retval) { + release_cgroup_pid_array(cgrp); + return retval; } + ((struct seq_file *)file->private_data)->private = cgrp; return 0; } @@ -2210,7 +2268,6 @@ static struct cftype files[] = { { .name = "tasks", .open = cgroup_tasks_open, - .read = cgroup_tasks_read, .write_u64 = cgroup_tasks_write, .release = cgroup_tasks_release, .private = FILE_TASKLIST, @@ -2300,10 +2357,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, mutex_lock(&cgroup_mutex); - INIT_LIST_HEAD(&cgrp->sibling); - INIT_LIST_HEAD(&cgrp->children); - INIT_LIST_HEAD(&cgrp->css_sets); - INIT_LIST_HEAD(&cgrp->release_list); + init_cgroup_housekeeping(cgrp); cgrp->parent = parent; cgrp->root = parent->root; @@ -2495,8 +2549,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) int __init cgroup_init_early(void) { int i; - kref_init(&init_css_set.ref); - kref_get(&init_css_set.ref); + atomic_set(&init_css_set.refcount, 1); INIT_LIST_HEAD(&init_css_set.cg_links); INIT_LIST_HEAD(&init_css_set.tasks); INIT_HLIST_NODE(&init_css_set.hlist); @@ -2735,6 +2788,8 @@ void cgroup_fork_callbacks(struct task_struct *child) * Called on every change to mm->owner. mm_init_owner() does not * invoke this routine, since it assigns the mm->owner the first time * and does not change it. + * + * The callbacks are invoked with mmap_sem held in read mode. */ void cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new) { @@ -2750,7 +2805,7 @@ void cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new) if (oldcgrp == newcgrp) continue; if (ss->mm_owner_changed) - ss->mm_owner_changed(ss, oldcgrp, newcgrp); + ss->mm_owner_changed(ss, oldcgrp, newcgrp, new); } } } diff --git a/kernel/cgroup_debug.c b/kernel/cgroup_debug.c index c3dc3aba4c02..daca6209202d 100644 --- a/kernel/cgroup_debug.c +++ b/kernel/cgroup_debug.c @@ -57,7 +57,7 @@ static u64 current_css_set_refcount_read(struct cgroup *cont, u64 count; rcu_read_lock(); - count = atomic_read(¤t->cgroups->ref.refcount); + count = atomic_read(¤t->cgroups->refcount); rcu_read_unlock(); return count; } @@ -90,7 +90,7 @@ static struct cftype files[] = { { .name = "releasable", .read_u64 = releasable_read, - } + }, }; static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont) diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c new file mode 100644 index 000000000000..e95056954498 --- /dev/null +++ b/kernel/cgroup_freezer.c @@ -0,0 +1,379 @@ +/* + * cgroup_freezer.c - control group freezer subsystem + * + * Copyright IBM Corporation, 2007 + * + * Author : Cedric Le Goater <clg@fr.ibm.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ + +#include <linux/module.h> +#include <linux/cgroup.h> +#include <linux/fs.h> +#include <linux/uaccess.h> +#include <linux/freezer.h> +#include <linux/seq_file.h> + +enum freezer_state { + CGROUP_THAWED = 0, + CGROUP_FREEZING, + CGROUP_FROZEN, +}; + +struct freezer { + struct cgroup_subsys_state css; + enum freezer_state state; + spinlock_t lock; /* protects _writes_ to state */ +}; + +static inline struct freezer *cgroup_freezer( + struct cgroup *cgroup) +{ + return container_of( + cgroup_subsys_state(cgroup, freezer_subsys_id), + struct freezer, css); +} + +static inline struct freezer *task_freezer(struct task_struct *task) +{ + return container_of(task_subsys_state(task, freezer_subsys_id), + struct freezer, css); +} + +int cgroup_frozen(struct task_struct *task) +{ + struct freezer *freezer; + enum freezer_state state; + + task_lock(task); + freezer = task_freezer(task); + state = freezer->state; + task_unlock(task); + + return state == CGROUP_FROZEN; +} + +/* + * cgroups_write_string() limits the size of freezer state strings to + * CGROUP_LOCAL_BUFFER_SIZE + */ +static const char *freezer_state_strs[] = { + "THAWED", + "FREEZING", + "FROZEN", +}; + +/* + * State diagram + * Transitions are caused by userspace writes to the freezer.state file. + * The values in parenthesis are state labels. The rest are edge labels. + * + * (THAWED) --FROZEN--> (FREEZING) --FROZEN--> (FROZEN) + * ^ ^ | | + * | \_______THAWED_______/ | + * \__________________________THAWED____________/ + */ + +struct cgroup_subsys freezer_subsys; + +/* Locks taken and their ordering + * ------------------------------ + * css_set_lock + * cgroup_mutex (AKA cgroup_lock) + * task->alloc_lock (AKA task_lock) + * freezer->lock + * task->sighand->siglock + * + * cgroup code forces css_set_lock to be taken before task->alloc_lock + * + * freezer_create(), freezer_destroy(): + * cgroup_mutex [ by cgroup core ] + * + * can_attach(): + * cgroup_mutex + * + * cgroup_frozen(): + * task->alloc_lock (to get task's cgroup) + * + * freezer_fork() (preserving fork() performance means can't take cgroup_mutex): + * task->alloc_lock (to get task's cgroup) + * freezer->lock + * sighand->siglock (if the cgroup is freezing) + * + * freezer_read(): + * cgroup_mutex + * freezer->lock + * read_lock css_set_lock (cgroup iterator start) + * + * freezer_write() (freeze): + * cgroup_mutex + * freezer->lock + * read_lock css_set_lock (cgroup iterator start) + * sighand->siglock + * + * freezer_write() (unfreeze): + * cgroup_mutex + * freezer->lock + * read_lock css_set_lock (cgroup iterator start) + * task->alloc_lock (to prevent races with freeze_task()) + * sighand->siglock + */ +static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss, + struct cgroup *cgroup) +{ + struct freezer *freezer; + + freezer = kzalloc(sizeof(struct freezer), GFP_KERNEL); + if (!freezer) + return ERR_PTR(-ENOMEM); + + spin_lock_init(&freezer->lock); + freezer->state = CGROUP_THAWED; + return &freezer->css; +} + +static void freezer_destroy(struct cgroup_subsys *ss, + struct cgroup *cgroup) +{ + kfree(cgroup_freezer(cgroup)); +} + +/* Task is frozen or will freeze immediately when next it gets woken */ +static bool is_task_frozen_enough(struct task_struct *task) +{ + return frozen(task) || + (task_is_stopped_or_traced(task) && freezing(task)); +} + +/* + * The call to cgroup_lock() in the freezer.state write method prevents + * a write to that file racing against an attach, and hence the + * can_attach() result will remain valid until the attach completes. + */ +static int freezer_can_attach(struct cgroup_subsys *ss, + struct cgroup *new_cgroup, + struct task_struct *task) +{ + struct freezer *freezer; + int retval; + + /* Anything frozen can't move or be moved to/from */ + + if (is_task_frozen_enough(task)) + return -EBUSY; + + freezer = cgroup_freezer(new_cgroup); + if (freezer->state == CGROUP_FROZEN) + return -EBUSY; + + retval = 0; + task_lock(task); + freezer = task_freezer(task); + if (freezer->state == CGROUP_FROZEN) + retval = -EBUSY; + task_unlock(task); + return retval; +} + +static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task) +{ + struct freezer *freezer; + + task_lock(task); + freezer = task_freezer(task); + task_unlock(task); + + BUG_ON(freezer->state == CGROUP_FROZEN); + spin_lock_irq(&freezer->lock); + /* Locking avoids race with FREEZING -> THAWED transitions. */ + if (freezer->state == CGROUP_FREEZING) + freeze_task(task, true); + spin_unlock_irq(&freezer->lock); +} + +/* + * caller must hold freezer->lock + */ +static void update_freezer_state(struct cgroup *cgroup, + struct freezer *freezer) +{ + struct cgroup_iter it; + struct task_struct *task; + unsigned int nfrozen = 0, ntotal = 0; + + cgroup_iter_start(cgroup, &it); + while ((task = cgroup_iter_next(cgroup, &it))) { + ntotal++; + if (is_task_frozen_enough(task)) + nfrozen++; + } + + /* + * Transition to FROZEN when no new tasks can be added ensures + * that we never exist in the FROZEN state while there are unfrozen + * tasks. + */ + if (nfrozen == ntotal) + freezer->state = CGROUP_FROZEN; + else if (nfrozen > 0) + freezer->state = CGROUP_FREEZING; + else + freezer->state = CGROUP_THAWED; + cgroup_iter_end(cgroup, &it); +} + +static int freezer_read(struct cgroup *cgroup, struct cftype *cft, + struct seq_file *m) +{ + struct freezer *freezer; + enum freezer_state state; + + if (!cgroup_lock_live_group(cgroup)) + return -ENODEV; + + freezer = cgroup_freezer(cgroup); + spin_lock_irq(&freezer->lock); + state = freezer->state; + if (state == CGROUP_FREEZING) { + /* We change from FREEZING to FROZEN lazily if the cgroup was + * only partially frozen when we exitted write. */ + update_freezer_state(cgroup, freezer); + state = freezer->state; + } + spin_unlock_irq(&freezer->lock); + cgroup_unlock(); + + seq_puts(m, freezer_state_strs[state]); + seq_putc(m, '\n'); + return 0; +} + +static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) +{ + struct cgroup_iter it; + struct task_struct *task; + unsigned int num_cant_freeze_now = 0; + + freezer->state = CGROUP_FREEZING; + cgroup_iter_start(cgroup, &it); + while ((task = cgroup_iter_next(cgroup, &it))) { + if (!freeze_task(task, true)) + continue; + if (is_task_frozen_enough(task)) + continue; + if (!freezing(task) && !freezer_should_skip(task)) + num_cant_freeze_now++; + } + cgroup_iter_end(cgroup, &it); + + return num_cant_freeze_now ? -EBUSY : 0; +} + +static int unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) +{ + struct cgroup_iter it; + struct task_struct *task; + + cgroup_iter_start(cgroup, &it); + while ((task = cgroup_iter_next(cgroup, &it))) { + int do_wake; + + task_lock(task); + do_wake = __thaw_process(task); + task_unlock(task); + if (do_wake) + wake_up_process(task); + } + cgroup_iter_end(cgroup, &it); + freezer->state = CGROUP_THAWED; + + return 0; +} + +static int freezer_change_state(struct cgroup *cgroup, + enum freezer_state goal_state) +{ + struct freezer *freezer; + int retval = 0; + + freezer = cgroup_freezer(cgroup); + spin_lock_irq(&freezer->lock); + update_freezer_state(cgroup, freezer); + if (goal_state == freezer->state) + goto out; + switch (freezer->state) { + case CGROUP_THAWED: + retval = try_to_freeze_cgroup(cgroup, freezer); + break; + case CGROUP_FREEZING: + if (goal_state == CGROUP_FROZEN) { + /* Userspace is retrying after + * "/bin/echo FROZEN > freezer.state" returned -EBUSY */ + retval = try_to_freeze_cgroup(cgroup, freezer); + break; + } + /* state == FREEZING and goal_state == THAWED, so unfreeze */ + case CGROUP_FROZEN: + retval = unfreeze_cgroup(cgroup, freezer); + break; + default: + break; + } +out: + spin_unlock_irq(&freezer->lock); + + return retval; +} + +static int freezer_write(struct cgroup *cgroup, + struct cftype *cft, + const char *buffer) +{ + int retval; + enum freezer_state goal_state; + + if (strcmp(buffer, freezer_state_strs[CGROUP_THAWED]) == 0) + goal_state = CGROUP_THAWED; + else if (strcmp(buffer, freezer_state_strs[CGROUP_FROZEN]) == 0) + goal_state = CGROUP_FROZEN; + else + return -EIO; + + if (!cgroup_lock_live_group(cgroup)) + return -ENODEV; + retval = freezer_change_state(cgroup, goal_state); + cgroup_unlock(); + return retval; +} + +static struct cftype files[] = { + { + .name = "state", + .read_seq_string = freezer_read, + .write_string = freezer_write, + }, +}; + +static int freezer_populate(struct cgroup_subsys *ss, struct cgroup *cgroup) +{ + return cgroup_add_files(cgroup, ss, files, ARRAY_SIZE(files)); +} + +struct cgroup_subsys freezer_subsys = { + .name = "freezer", + .create = freezer_create, + .destroy = freezer_destroy, + .populate = freezer_populate, + .subsys_id = freezer_subsys_id, + .can_attach = freezer_can_attach, + .attach = NULL, + .fork = freezer_fork, + .exit = NULL, +}; diff --git a/kernel/compat.c b/kernel/compat.c index 32c254a8ab9a..143990e48cb9 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -26,6 +26,64 @@ #include <asm/uaccess.h> +/* + * Note that the native side is already converted to a timespec, because + * that's what we want anyway. + */ +static int compat_get_timeval(struct timespec *o, + struct compat_timeval __user *i) +{ + long usec; + + if (get_user(o->tv_sec, &i->tv_sec) || + get_user(usec, &i->tv_usec)) + return -EFAULT; + o->tv_nsec = usec * 1000; + return 0; +} + +static int compat_put_timeval(struct compat_timeval __user *o, + struct timeval *i) +{ + return (put_user(i->tv_sec, &o->tv_sec) || + put_user(i->tv_usec, &o->tv_usec)) ? -EFAULT : 0; +} + +asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv, + struct timezone __user *tz) +{ + if (tv) { + struct timeval ktv; + do_gettimeofday(&ktv); + if (compat_put_timeval(tv, &ktv)) + return -EFAULT; + } + if (tz) { + if (copy_to_user(tz, &sys_tz, sizeof(sys_tz))) + return -EFAULT; + } + + return 0; +} + +asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv, + struct timezone __user *tz) +{ + struct timespec kts; + struct timezone ktz; + + if (tv) { + if (compat_get_timeval(&kts, tv)) + return -EFAULT; + } + if (tz) { + if (copy_from_user(&ktz, tz, sizeof(ktz))) + return -EFAULT; + } + + return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL); +} + int get_compat_timespec(struct timespec *ts, const struct compat_timespec __user *cts) { return (!access_ok(VERIFY_READ, cts, sizeof(*cts)) || diff --git a/kernel/configs.c b/kernel/configs.c index 4c345210ed8c..abaee684ecbf 100644 --- a/kernel/configs.c +++ b/kernel/configs.c @@ -54,9 +54,6 @@ #ifdef CONFIG_IKCONFIG_PROC -/**************************************************/ -/* globals and useful constants */ - static ssize_t ikconfig_read_current(struct file *file, char __user *buf, size_t len, loff_t * offset) @@ -71,9 +68,6 @@ static const struct file_operations ikconfig_file_ops = { .read = ikconfig_read_current, }; -/***************************************************/ -/* ikconfig_init: start up everything we need to */ - static int __init ikconfig_init(void) { struct proc_dir_entry *entry; @@ -89,9 +83,6 @@ static int __init ikconfig_init(void) return 0; } -/***************************************************/ -/* ikconfig_cleanup: clean up our mess */ - static void __exit ikconfig_cleanup(void) { remove_proc_entry("config.gz", NULL); diff --git a/kernel/cpuset.c b/kernel/cpuset.c index eab7bd6628e0..3e00526f52ec 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1172,7 +1172,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, { struct cpuset trialcs; int err; - int cpus_nonempty, balance_flag_changed; + int balance_flag_changed; trialcs = *cs; if (turning_on) @@ -1184,7 +1184,6 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, if (err < 0) return err; - cpus_nonempty = !cpus_empty(trialcs.cpus_allowed); balance_flag_changed = (is_sched_load_balance(cs) != is_sched_load_balance(&trialcs)); @@ -1192,7 +1191,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, cs->flags = trialcs.flags; mutex_unlock(&callback_mutex); - if (cpus_nonempty && balance_flag_changed) + if (!cpus_empty(trialcs.cpus_allowed) && balance_flag_changed) async_rebuild_sched_domains(); return 0; @@ -2437,19 +2436,15 @@ const struct file_operations proc_cpuset_operations = { void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) { seq_printf(m, "Cpus_allowed:\t"); - m->count += cpumask_scnprintf(m->buf + m->count, m->size - m->count, - task->cpus_allowed); + seq_cpumask(m, &task->cpus_allowed); seq_printf(m, "\n"); seq_printf(m, "Cpus_allowed_list:\t"); - m->count += cpulist_scnprintf(m->buf + m->count, m->size - m->count, - task->cpus_allowed); + seq_cpumask_list(m, &task->cpus_allowed); seq_printf(m, "\n"); seq_printf(m, "Mems_allowed:\t"); - m->count += nodemask_scnprintf(m->buf + m->count, m->size - m->count, - task->mems_allowed); + seq_nodemask(m, &task->mems_allowed); seq_printf(m, "\n"); seq_printf(m, "Mems_allowed_list:\t"); - m->count += nodelist_scnprintf(m->buf + m->count, m->size - m->count, - task->mems_allowed); + seq_nodemask_list(m, &task->mems_allowed); seq_printf(m, "\n"); } diff --git a/kernel/dma.c b/kernel/dma.c index d2c60a822790..f903189c5304 100644 --- a/kernel/dma.c +++ b/kernel/dma.c @@ -1,4 +1,4 @@ -/* $Id: dma.c,v 1.7 1994/12/28 03:35:33 root Exp root $ +/* * linux/kernel/dma.c: A DMA channel allocator. Inspired by linux/kernel/irq.c. * * Written by Hennus Bergman, 1992. diff --git a/kernel/exit.c b/kernel/exit.c index 85a83c831856..0ef4673e351b 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -640,24 +640,23 @@ retry: assign_new_owner: BUG_ON(c == p); get_task_struct(c); + read_unlock(&tasklist_lock); + down_write(&mm->mmap_sem); /* * The task_lock protects c->mm from changing. * We always want mm->owner->mm == mm */ task_lock(c); - /* - * Delay read_unlock() till we have the task_lock() - * to ensure that c does not slip away underneath us - */ - read_unlock(&tasklist_lock); if (c->mm != mm) { task_unlock(c); + up_write(&mm->mmap_sem); put_task_struct(c); goto retry; } cgroup_mm_owner_callbacks(mm->owner, c); mm->owner = c; task_unlock(c); + up_write(&mm->mmap_sem); put_task_struct(c); } #endif /* CONFIG_MM_OWNER */ diff --git a/kernel/freezer.c b/kernel/freezer.c new file mode 100644 index 000000000000..ba6248b323ef --- /dev/null +++ b/kernel/freezer.c @@ -0,0 +1,154 @@ +/* + * kernel/freezer.c - Function to freeze a process + * + * Originally from kernel/power/process.c + */ + +#include <linux/interrupt.h> +#include <linux/suspend.h> +#include <linux/module.h> +#include <linux/syscalls.h> +#include <linux/freezer.h> + +/* + * freezing is complete, mark current process as frozen + */ +static inline void frozen_process(void) +{ + if (!unlikely(current->flags & PF_NOFREEZE)) { + current->flags |= PF_FROZEN; + wmb(); + } + clear_freeze_flag(current); +} + +/* Refrigerator is place where frozen processes are stored :-). */ +void refrigerator(void) +{ + /* Hmm, should we be allowed to suspend when there are realtime + processes around? */ + long save; + + task_lock(current); + if (freezing(current)) { + frozen_process(); + task_unlock(current); + } else { + task_unlock(current); + return; + } + save = current->state; + pr_debug("%s entered refrigerator\n", current->comm); + + spin_lock_irq(¤t->sighand->siglock); + recalc_sigpending(); /* We sent fake signal, clean it up */ + spin_unlock_irq(¤t->sighand->siglock); + + for (;;) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (!frozen(current)) + break; + schedule(); + } + pr_debug("%s left refrigerator\n", current->comm); + __set_current_state(save); +} +EXPORT_SYMBOL(refrigerator); + +static void fake_signal_wake_up(struct task_struct *p) +{ + unsigned long flags; + + spin_lock_irqsave(&p->sighand->siglock, flags); + signal_wake_up(p, 0); + spin_unlock_irqrestore(&p->sighand->siglock, flags); +} + +/** + * freeze_task - send a freeze request to given task + * @p: task to send the request to + * @sig_only: if set, the request will only be sent if the task has the + * PF_FREEZER_NOSIG flag unset + * Return value: 'false', if @sig_only is set and the task has + * PF_FREEZER_NOSIG set or the task is frozen, 'true', otherwise + * + * The freeze request is sent by setting the tasks's TIF_FREEZE flag and + * either sending a fake signal to it or waking it up, depending on whether + * or not it has PF_FREEZER_NOSIG set. If @sig_only is set and the task + * has PF_FREEZER_NOSIG set (ie. it is a typical kernel thread), its + * TIF_FREEZE flag will not be set. + */ +bool freeze_task(struct task_struct *p, bool sig_only) +{ + /* + * We first check if the task is freezing and next if it has already + * been frozen to avoid the race with frozen_process() which first marks + * the task as frozen and next clears its TIF_FREEZE. + */ + if (!freezing(p)) { + rmb(); + if (frozen(p)) + return false; + + if (!sig_only || should_send_signal(p)) + set_freeze_flag(p); + else + return false; + } + + if (should_send_signal(p)) { + if (!signal_pending(p)) + fake_signal_wake_up(p); + } else if (sig_only) { + return false; + } else { + wake_up_state(p, TASK_INTERRUPTIBLE); + } + + return true; +} + +void cancel_freezing(struct task_struct *p) +{ + unsigned long flags; + + if (freezing(p)) { + pr_debug(" clean up: %s\n", p->comm); + clear_freeze_flag(p); + spin_lock_irqsave(&p->sighand->siglock, flags); + recalc_sigpending_and_wake(p); + spin_unlock_irqrestore(&p->sighand->siglock, flags); + } +} + +/* + * Wake up a frozen process + * + * task_lock() is needed to prevent the race with refrigerator() which may + * occur if the freezing of tasks fails. Namely, without the lock, if the + * freezing of tasks failed, thaw_tasks() might have run before a task in + * refrigerator() could call frozen_process(), in which case the task would be + * frozen and no one would thaw it. + */ +int __thaw_process(struct task_struct *p) +{ + if (frozen(p)) { + p->flags &= ~PF_FROZEN; + return 1; + } + clear_freeze_flag(p); + return 0; +} + +int thaw_process(struct task_struct *p) +{ + task_lock(p); + if (__thaw_process(p) == 1) { + task_unlock(p); + wake_up_process(p); + return 1; + } + task_unlock(p); + return 0; +} +EXPORT_SYMBOL(thaw_process); diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 38fc10ac7541..5072cf1685a2 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -260,7 +260,6 @@ const char *kallsyms_lookup(unsigned long addr, /* see if it's in a module */ return module_address_lookup(addr, symbolsize, offset, modname, namebuf); - return NULL; } int lookup_symbol_name(unsigned long addr, char *symname) diff --git a/kernel/kexec.c b/kernel/kexec.c index aef265325cd3..777ac458ac99 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1371,6 +1371,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_SYMBOL(node_online_map); VMCOREINFO_SYMBOL(swapper_pg_dir); VMCOREINFO_SYMBOL(_stext); + VMCOREINFO_SYMBOL(vmlist); #ifndef CONFIG_NEED_MULTIPLE_NODES VMCOREINFO_SYMBOL(mem_map); @@ -1406,6 +1407,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_OFFSET(free_area, free_list); VMCOREINFO_OFFSET(list_head, next); VMCOREINFO_OFFSET(list_head, prev); + VMCOREINFO_OFFSET(vm_struct, addr); VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); VMCOREINFO_NUMBER(NR_FREE_PAGES); diff --git a/kernel/kmod.c b/kernel/kmod.c index 2456d1a0befb..3d3c3ea3a023 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -113,7 +113,7 @@ int request_module(const char *fmt, ...) return ret; } EXPORT_SYMBOL(request_module); -#endif /* CONFIG_KMOD */ +#endif /* CONFIG_MODULES */ struct subprocess_info { struct work_struct work; @@ -265,7 +265,7 @@ static void __call_usermodehelper(struct work_struct *work) } } -#ifdef CONFIG_PM +#ifdef CONFIG_PM_SLEEP /* * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY * (used for preventing user land processes from being created after the user @@ -288,39 +288,37 @@ static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq); */ #define RUNNING_HELPERS_TIMEOUT (5 * HZ) -static int usermodehelper_pm_callback(struct notifier_block *nfb, - unsigned long action, - void *ignored) +/** + * usermodehelper_disable - prevent new helpers from being started + */ +int usermodehelper_disable(void) { long retval; - switch (action) { - case PM_HIBERNATION_PREPARE: - case PM_SUSPEND_PREPARE: - usermodehelper_disabled = 1; - smp_mb(); - /* - * From now on call_usermodehelper_exec() won't start any new - * helpers, so it is sufficient if running_helpers turns out to - * be zero at one point (it may be increased later, but that - * doesn't matter). - */ - retval = wait_event_timeout(running_helpers_waitq, + usermodehelper_disabled = 1; + smp_mb(); + /* + * From now on call_usermodehelper_exec() won't start any new + * helpers, so it is sufficient if running_helpers turns out to + * be zero at one point (it may be increased later, but that + * doesn't matter). + */ + retval = wait_event_timeout(running_helpers_waitq, atomic_read(&running_helpers) == 0, RUNNING_HELPERS_TIMEOUT); - if (retval) { - return NOTIFY_OK; - } else { - usermodehelper_disabled = 0; - return NOTIFY_BAD; - } - case PM_POST_HIBERNATION: - case PM_POST_SUSPEND: - usermodehelper_disabled = 0; - return NOTIFY_OK; - } + if (retval) + return 0; - return NOTIFY_DONE; + usermodehelper_disabled = 0; + return -EAGAIN; +} + +/** + * usermodehelper_enable - allow new helpers to be started again + */ +void usermodehelper_enable(void) +{ + usermodehelper_disabled = 0; } static void helper_lock(void) @@ -334,18 +332,12 @@ static void helper_unlock(void) if (atomic_dec_and_test(&running_helpers)) wake_up(&running_helpers_waitq); } - -static void register_pm_notifier_callback(void) -{ - pm_notifier(usermodehelper_pm_callback, 0); -} -#else /* CONFIG_PM */ +#else /* CONFIG_PM_SLEEP */ #define usermodehelper_disabled 0 static inline void helper_lock(void) {} static inline void helper_unlock(void) {} -static inline void register_pm_notifier_callback(void) {} -#endif /* CONFIG_PM */ +#endif /* CONFIG_PM_SLEEP */ /** * call_usermodehelper_setup - prepare to call a usermode helper @@ -515,5 +507,4 @@ void __init usermodehelper_init(void) { khelper_wq = create_singlethread_workqueue("khelper"); BUG_ON(!khelper_wq); - register_pm_notifier_callback(); } diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 75bc2cd9ebc6..8b57a2597f21 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -404,7 +404,7 @@ void kretprobe_hash_lock(struct task_struct *tsk, spin_lock_irqsave(hlist_lock, *flags); } -void kretprobe_table_lock(unsigned long hash, unsigned long *flags) +static void kretprobe_table_lock(unsigned long hash, unsigned long *flags) { spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); spin_lock_irqsave(hlist_lock, *flags); diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index e53bc30e9ba5..08dd8ed86c77 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -14,6 +14,7 @@ #include <linux/module.h> #include <linux/init.h> #include <linux/kexec.h> +#include <linux/profile.h> #include <linux/sched.h> #define KERNEL_ATTR_RO(_name) \ @@ -53,6 +54,37 @@ static ssize_t uevent_helper_store(struct kobject *kobj, KERNEL_ATTR_RW(uevent_helper); #endif +#ifdef CONFIG_PROFILING +static ssize_t profiling_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", prof_on); +} +static ssize_t profiling_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int ret; + + if (prof_on) + return -EEXIST; + /* + * This eventually calls into get_option() which + * has a ton of callers and is not const. It is + * easiest to cast it away here. + */ + profile_setup((char *)buf); + ret = profile_init(); + if (ret) + return ret; + ret = create_proc_profile(); + if (ret) + return ret; + return count; +} +KERNEL_ATTR_RW(profiling); +#endif + #ifdef CONFIG_KEXEC static ssize_t kexec_loaded_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @@ -109,6 +141,9 @@ static struct attribute * kernel_attrs[] = { &uevent_seqnum_attr.attr, &uevent_helper_attr.attr, #endif +#ifdef CONFIG_PROFILING + &profiling_attr.attr, +#endif #ifdef CONFIG_KEXEC &kexec_loaded_attr.attr, &kexec_crash_loaded_attr.attr, diff --git a/kernel/kthread.c b/kernel/kthread.c index 96cff2f8710b..14ec64fe175a 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -171,12 +171,11 @@ EXPORT_SYMBOL(kthread_create); */ void kthread_bind(struct task_struct *k, unsigned int cpu) { - if (k->state != TASK_UNINTERRUPTIBLE) { + /* Must have done schedule() in kthread() before we set_task_cpu */ + if (!wait_task_inactive(k, TASK_UNINTERRUPTIBLE)) { WARN_ON(1); return; } - /* Must have done schedule() in kthread() before we set_task_cpu */ - wait_task_inactive(k, 0); set_task_cpu(k, cpu); k->cpus_allowed = cpumask_of_cpu(cpu); k->rt.nr_cpus_allowed = 1; diff --git a/kernel/module.c b/kernel/module.c index 9db11911e04b..25bc9ac9e226 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -100,7 +100,7 @@ static inline int strong_try_module_get(struct module *mod) static inline void add_taint_module(struct module *mod, unsigned flag) { add_taint(flag); - mod->taints |= flag; + mod->taints |= (1U << flag); } /* @@ -784,6 +784,7 @@ sys_delete_module(const char __user *name_user, unsigned int flags) mutex_lock(&module_mutex); /* Store the name of the last unloaded module for diagnostic purposes */ strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module)); + unregister_dynamic_debug_module(mod->name); free_module(mod); out: @@ -923,7 +924,7 @@ static const char vermagic[] = VERMAGIC_STRING; static int try_to_force_load(struct module *mod, const char *symname) { #ifdef CONFIG_MODULE_FORCE_LOAD - if (!(tainted & TAINT_FORCED_MODULE)) + if (!test_taint(TAINT_FORCED_MODULE)) printk("%s: no version for \"%s\" found: kernel tainted.\n", mod->name, symname); add_taint_module(mod, TAINT_FORCED_MODULE); @@ -1033,7 +1034,7 @@ static unsigned long resolve_symbol(Elf_Shdr *sechdrs, const unsigned long *crc; ret = find_symbol(name, &owner, &crc, - !(mod->taints & TAINT_PROPRIETARY_MODULE), true); + !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true); if (!IS_ERR_VALUE(ret)) { /* use_module can fail due to OOM, or module initialization or unloading */ @@ -1173,7 +1174,7 @@ static void free_notes_attrs(struct module_notes_attrs *notes_attrs, while (i-- > 0) sysfs_remove_bin_file(notes_attrs->dir, ¬es_attrs->attrs[i]); - kobject_del(notes_attrs->dir); + kobject_put(notes_attrs->dir); } kfree(notes_attrs); } @@ -1634,7 +1635,7 @@ static void set_license(struct module *mod, const char *license) license = "unspecified"; if (!license_is_gpl_compatible(license)) { - if (!(tainted & TAINT_PROPRIETARY_MODULE)) + if (!test_taint(TAINT_PROPRIETARY_MODULE)) printk(KERN_WARNING "%s: module license '%s' taints " "kernel.\n", mod->name, license); add_taint_module(mod, TAINT_PROPRIETARY_MODULE); @@ -1783,6 +1784,33 @@ static inline void add_kallsyms(struct module *mod, } #endif /* CONFIG_KALLSYMS */ +#ifdef CONFIG_DYNAMIC_PRINTK_DEBUG +static void dynamic_printk_setup(Elf_Shdr *sechdrs, unsigned int verboseindex) +{ + struct mod_debug *debug_info; + unsigned long pos, end; + unsigned int num_verbose; + + pos = sechdrs[verboseindex].sh_addr; + num_verbose = sechdrs[verboseindex].sh_size / + sizeof(struct mod_debug); + end = pos + (num_verbose * sizeof(struct mod_debug)); + + for (; pos < end; pos += sizeof(struct mod_debug)) { + debug_info = (struct mod_debug *)pos; + register_dynamic_debug_module(debug_info->modname, + debug_info->type, debug_info->logical_modname, + debug_info->flag_names, debug_info->hash, + debug_info->hash2); + } +} +#else +static inline void dynamic_printk_setup(Elf_Shdr *sechdrs, + unsigned int verboseindex) +{ +} +#endif /* CONFIG_DYNAMIC_PRINTK_DEBUG */ + static void *module_alloc_update_bounds(unsigned long size) { void *ret = module_alloc(size); @@ -1806,6 +1834,7 @@ static noinline struct module *load_module(void __user *umod, Elf_Ehdr *hdr; Elf_Shdr *sechdrs; char *secstrings, *args, *modmagic, *strtab = NULL; + char *staging; unsigned int i; unsigned int symindex = 0; unsigned int strindex = 0; @@ -1831,6 +1860,7 @@ static noinline struct module *load_module(void __user *umod, #endif unsigned int markersindex; unsigned int markersstringsindex; + unsigned int verboseindex; struct module *mod; long err = 0; void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ @@ -1960,6 +1990,14 @@ static noinline struct module *load_module(void __user *umod, goto free_hdr; } + staging = get_modinfo(sechdrs, infoindex, "staging"); + if (staging) { + add_taint_module(mod, TAINT_CRAP); + printk(KERN_WARNING "%s: module is from the staging directory," + " the quality is unknown, you have been warned.\n", + mod->name); + } + /* Now copy in args */ args = strndup_user(uargs, ~0UL >> 1); if (IS_ERR(args)) { @@ -2117,6 +2155,7 @@ static noinline struct module *load_module(void __user *umod, markersindex = find_sec(hdr, sechdrs, secstrings, "__markers"); markersstringsindex = find_sec(hdr, sechdrs, secstrings, "__markers_strings"); + verboseindex = find_sec(hdr, sechdrs, secstrings, "__verbose"); /* Now do relocations. */ for (i = 1; i < hdr->e_shnum; i++) { @@ -2167,6 +2206,7 @@ static noinline struct module *load_module(void __user *umod, marker_update_probe_range(mod->markers, mod->markers + mod->num_markers); #endif + dynamic_printk_setup(sechdrs, verboseindex); err = module_finalize(hdr, sechdrs, mod); if (err < 0) goto cleanup; @@ -2552,10 +2592,12 @@ static char *module_flags(struct module *mod, char *buf) mod->state == MODULE_STATE_GOING || mod->state == MODULE_STATE_COMING) { buf[bx++] = '('; - if (mod->taints & TAINT_PROPRIETARY_MODULE) + if (mod->taints & (1 << TAINT_PROPRIETARY_MODULE)) buf[bx++] = 'P'; - if (mod->taints & TAINT_FORCED_MODULE) + if (mod->taints & (1 << TAINT_FORCED_MODULE)) buf[bx++] = 'F'; + if (mod->taints & (1 << TAINT_CRAP)) + buf[bx++] = 'C'; /* * TAINT_FORCED_RMMOD: could be added. * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't diff --git a/kernel/panic.c b/kernel/panic.c index 12c5a0a6c89b..bda561ef3cdf 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -23,7 +23,7 @@ #include <linux/kallsyms.h> int panic_on_oops; -int tainted; +static unsigned long tainted_mask; static int pause_on_oops; static int pause_on_oops_flag; static DEFINE_SPINLOCK(pause_on_oops_lock); @@ -143,6 +143,27 @@ NORET_TYPE void panic(const char * fmt, ...) EXPORT_SYMBOL(panic); + +struct tnt { + u8 bit; + char true; + char false; +}; + +static const struct tnt tnts[] = { + { TAINT_PROPRIETARY_MODULE, 'P', 'G' }, + { TAINT_FORCED_MODULE, 'F', ' ' }, + { TAINT_UNSAFE_SMP, 'S', ' ' }, + { TAINT_FORCED_RMMOD, 'R', ' ' }, + { TAINT_MACHINE_CHECK, 'M', ' ' }, + { TAINT_BAD_PAGE, 'B', ' ' }, + { TAINT_USER, 'U', ' ' }, + { TAINT_DIE, 'D', ' ' }, + { TAINT_OVERRIDDEN_ACPI_TABLE, 'A', ' ' }, + { TAINT_WARN, 'W', ' ' }, + { TAINT_CRAP, 'C', ' ' }, +}; + /** * print_tainted - return a string to represent the kernel taint state. * @@ -155,35 +176,45 @@ EXPORT_SYMBOL(panic); * 'U' - Userspace-defined naughtiness. * 'A' - ACPI table overridden. * 'W' - Taint on warning. + * 'C' - modules from drivers/staging are loaded. * * The string is overwritten by the next call to print_taint(). */ - const char *print_tainted(void) { - static char buf[20]; - if (tainted) { - snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c%c%c%c%c", - tainted & TAINT_PROPRIETARY_MODULE ? 'P' : 'G', - tainted & TAINT_FORCED_MODULE ? 'F' : ' ', - tainted & TAINT_UNSAFE_SMP ? 'S' : ' ', - tainted & TAINT_FORCED_RMMOD ? 'R' : ' ', - tainted & TAINT_MACHINE_CHECK ? 'M' : ' ', - tainted & TAINT_BAD_PAGE ? 'B' : ' ', - tainted & TAINT_USER ? 'U' : ' ', - tainted & TAINT_DIE ? 'D' : ' ', - tainted & TAINT_OVERRIDDEN_ACPI_TABLE ? 'A' : ' ', - tainted & TAINT_WARN ? 'W' : ' '); - } - else + static char buf[ARRAY_SIZE(tnts) + sizeof("Tainted: ") + 1]; + + if (tainted_mask) { + char *s; + int i; + + s = buf + sprintf(buf, "Tainted: "); + for (i = 0; i < ARRAY_SIZE(tnts); i++) { + const struct tnt *t = &tnts[i]; + *s++ = test_bit(t->bit, &tainted_mask) ? + t->true : t->false; + } + *s = 0; + } else snprintf(buf, sizeof(buf), "Not tainted"); return(buf); } +int test_taint(unsigned flag) +{ + return test_bit(flag, &tainted_mask); +} +EXPORT_SYMBOL(test_taint); + +unsigned long get_taint(void) +{ + return tainted_mask; +} + void add_taint(unsigned flag) { debug_locks = 0; /* can't trust the integrity of the kernel anymore */ - tainted |= flag; + set_bit(flag, &tainted_mask); } EXPORT_SYMBOL(add_taint); diff --git a/kernel/power/disk.c b/kernel/power/disk.c index bbd85c60f741..331f9836383f 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -14,6 +14,7 @@ #include <linux/reboot.h> #include <linux/string.h> #include <linux/device.h> +#include <linux/kmod.h> #include <linux/delay.h> #include <linux/fs.h> #include <linux/mount.h> @@ -520,6 +521,10 @@ int hibernate(void) if (error) goto Exit; + error = usermodehelper_disable(); + if (error) + goto Exit; + /* Allocate memory management structures */ error = create_basic_memory_bitmaps(); if (error) @@ -558,6 +563,7 @@ int hibernate(void) thaw_processes(); Finish: free_basic_memory_bitmaps(); + usermodehelper_enable(); Exit: pm_notifier_call_chain(PM_POST_HIBERNATION); pm_restore_console(); @@ -634,6 +640,10 @@ static int software_resume(void) if (error) goto Finish; + error = usermodehelper_disable(); + if (error) + goto Finish; + error = create_basic_memory_bitmaps(); if (error) goto Finish; @@ -656,6 +666,7 @@ static int software_resume(void) thaw_processes(); Done: free_basic_memory_bitmaps(); + usermodehelper_enable(); Finish: pm_notifier_call_chain(PM_POST_RESTORE); pm_restore_console(); diff --git a/kernel/power/main.c b/kernel/power/main.c index 540b16b68565..19122cf6d827 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -14,6 +14,7 @@ #include <linux/string.h> #include <linux/delay.h> #include <linux/errno.h> +#include <linux/kmod.h> #include <linux/init.h> #include <linux/console.h> #include <linux/cpu.h> @@ -237,6 +238,10 @@ static int suspend_prepare(void) if (error) goto Finish; + error = usermodehelper_disable(); + if (error) + goto Finish; + if (suspend_freeze_processes()) { error = -EAGAIN; goto Thaw; @@ -256,6 +261,7 @@ static int suspend_prepare(void) Thaw: suspend_thaw_processes(); + usermodehelper_enable(); Finish: pm_notifier_call_chain(PM_POST_SUSPEND); pm_restore_console(); @@ -376,6 +382,7 @@ int suspend_devices_and_enter(suspend_state_t state) static void suspend_finish(void) { suspend_thaw_processes(); + usermodehelper_enable(); pm_notifier_call_chain(PM_POST_SUSPEND); pm_restore_console(); } diff --git a/kernel/power/process.c b/kernel/power/process.c index 278946aecaf0..ca634019497a 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -28,121 +28,6 @@ static inline int freezeable(struct task_struct * p) return 1; } -/* - * freezing is complete, mark current process as frozen - */ -static inline void frozen_process(void) -{ - if (!unlikely(current->flags & PF_NOFREEZE)) { - current->flags |= PF_FROZEN; - wmb(); - } - clear_freeze_flag(current); -} - -/* Refrigerator is place where frozen processes are stored :-). */ -void refrigerator(void) -{ - /* Hmm, should we be allowed to suspend when there are realtime - processes around? */ - long save; - - task_lock(current); - if (freezing(current)) { - frozen_process(); - task_unlock(current); - } else { - task_unlock(current); - return; - } - save = current->state; - pr_debug("%s entered refrigerator\n", current->comm); - - spin_lock_irq(¤t->sighand->siglock); - recalc_sigpending(); /* We sent fake signal, clean it up */ - spin_unlock_irq(¤t->sighand->siglock); - - for (;;) { - set_current_state(TASK_UNINTERRUPTIBLE); - if (!frozen(current)) - break; - schedule(); - } - pr_debug("%s left refrigerator\n", current->comm); - __set_current_state(save); -} - -static void fake_signal_wake_up(struct task_struct *p) -{ - unsigned long flags; - - spin_lock_irqsave(&p->sighand->siglock, flags); - signal_wake_up(p, 0); - spin_unlock_irqrestore(&p->sighand->siglock, flags); -} - -static inline bool should_send_signal(struct task_struct *p) -{ - return !(p->flags & PF_FREEZER_NOSIG); -} - -/** - * freeze_task - send a freeze request to given task - * @p: task to send the request to - * @sig_only: if set, the request will only be sent if the task has the - * PF_FREEZER_NOSIG flag unset - * Return value: 'false', if @sig_only is set and the task has - * PF_FREEZER_NOSIG set or the task is frozen, 'true', otherwise - * - * The freeze request is sent by setting the tasks's TIF_FREEZE flag and - * either sending a fake signal to it or waking it up, depending on whether - * or not it has PF_FREEZER_NOSIG set. If @sig_only is set and the task - * has PF_FREEZER_NOSIG set (ie. it is a typical kernel thread), its - * TIF_FREEZE flag will not be set. - */ -static bool freeze_task(struct task_struct *p, bool sig_only) -{ - /* - * We first check if the task is freezing and next if it has already - * been frozen to avoid the race with frozen_process() which first marks - * the task as frozen and next clears its TIF_FREEZE. - */ - if (!freezing(p)) { - rmb(); - if (frozen(p)) - return false; - - if (!sig_only || should_send_signal(p)) - set_freeze_flag(p); - else - return false; - } - - if (should_send_signal(p)) { - if (!signal_pending(p)) - fake_signal_wake_up(p); - } else if (sig_only) { - return false; - } else { - wake_up_state(p, TASK_INTERRUPTIBLE); - } - - return true; -} - -static void cancel_freezing(struct task_struct *p) -{ - unsigned long flags; - - if (freezing(p)) { - pr_debug(" clean up: %s\n", p->comm); - clear_freeze_flag(p); - spin_lock_irqsave(&p->sighand->siglock, flags); - recalc_sigpending_and_wake(p); - spin_unlock_irqrestore(&p->sighand->siglock, flags); - } -} - static int try_to_freeze_tasks(bool sig_only) { struct task_struct *g, *p; @@ -250,6 +135,9 @@ static void thaw_tasks(bool nosig_only) if (nosig_only && should_send_signal(p)) continue; + if (cgroup_frozen(p)) + continue; + thaw_process(p); } while_each_thread(g, p); read_unlock(&tasklist_lock); @@ -264,4 +152,3 @@ void thaw_processes(void) printk("done.\n"); } -EXPORT_SYMBOL(refrigerator); diff --git a/kernel/power/user.c b/kernel/power/user.c index a6332a313262..005b93d839ba 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -212,13 +212,20 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, case SNAPSHOT_FREEZE: if (data->frozen) break; + printk("Syncing filesystems ... "); sys_sync(); printk("done.\n"); - error = freeze_processes(); + error = usermodehelper_disable(); if (error) + break; + + error = freeze_processes(); + if (error) { thaw_processes(); + usermodehelper_enable(); + } if (!error) data->frozen = 1; break; @@ -227,6 +234,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, if (!data->frozen || data->ready) break; thaw_processes(); + usermodehelper_enable(); data->frozen = 0; break; diff --git a/kernel/printk.c b/kernel/printk.c index a430fd04008b..6341af77eb65 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -13,7 +13,7 @@ * Fixed SMP synchronization, 08/08/99, Manfred Spraul * manfred@colorfullife.com * Rewrote bits to get rid of console_lock - * 01Mar01 Andrew Morton <andrewm@uow.edu.au> + * 01Mar01 Andrew Morton */ #include <linux/kernel.h> @@ -577,9 +577,6 @@ static int have_callable_console(void) * @fmt: format string * * This is printk(). It can be called from any context. We want it to work. - * Be aware of the fact that if oops_in_progress is not set, we might try to - * wake klogd up which could deadlock on runqueue lock if printk() is called - * from scheduler code. * * We try to grab the console_sem. If we succeed, it's easy - we log the output and * call the console drivers. If we fail to get the semaphore we place the output @@ -593,6 +590,8 @@ static int have_callable_console(void) * * See also: * printf(3) + * + * See the vsnprintf() documentation for format string extensions over C99. */ asmlinkage int printk(const char *fmt, ...) @@ -982,10 +981,25 @@ int is_console_locked(void) return console_locked; } -void wake_up_klogd(void) +static DEFINE_PER_CPU(int, printk_pending); + +void printk_tick(void) { - if (!oops_in_progress && waitqueue_active(&log_wait)) + if (__get_cpu_var(printk_pending)) { + __get_cpu_var(printk_pending) = 0; wake_up_interruptible(&log_wait); + } +} + +int printk_needs_cpu(int cpu) +{ + return per_cpu(printk_pending, cpu); +} + +void wake_up_klogd(void) +{ + if (waitqueue_active(&log_wait)) + __raw_get_cpu_var(printk_pending) = 1; } /** diff --git a/kernel/profile.c b/kernel/profile.c index cd26bed4cc26..a9e422df6bf6 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -22,6 +22,8 @@ #include <linux/cpu.h> #include <linux/highmem.h> #include <linux/mutex.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> #include <asm/sections.h> #include <asm/irq_regs.h> #include <asm/ptrace.h> @@ -50,11 +52,11 @@ static DEFINE_PER_CPU(int, cpu_profile_flip); static DEFINE_MUTEX(profile_flip_mutex); #endif /* CONFIG_SMP */ -static int __init profile_setup(char *str) +int profile_setup(char *str) { - static char __initdata schedstr[] = "schedule"; - static char __initdata sleepstr[] = "sleep"; - static char __initdata kvmstr[] = "kvm"; + static char schedstr[] = "schedule"; + static char sleepstr[] = "sleep"; + static char kvmstr[] = "kvm"; int par; if (!strncmp(str, sleepstr, strlen(sleepstr))) { @@ -100,14 +102,33 @@ static int __init profile_setup(char *str) __setup("profile=", profile_setup); -void __init profile_init(void) +int profile_init(void) { + int buffer_bytes; if (!prof_on) - return; + return 0; /* only text is profiled */ prof_len = (_etext - _stext) >> prof_shift; - prof_buffer = alloc_bootmem(prof_len*sizeof(atomic_t)); + buffer_bytes = prof_len*sizeof(atomic_t); + if (!slab_is_available()) { + prof_buffer = alloc_bootmem(buffer_bytes); + return 0; + } + + prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL); + if (prof_buffer) + return 0; + + prof_buffer = alloc_pages_exact(buffer_bytes, GFP_KERNEL|__GFP_ZERO); + if (prof_buffer) + return 0; + + prof_buffer = vmalloc(buffer_bytes); + if (prof_buffer) + return 0; + + return -ENOMEM; } /* Profile event notifications */ @@ -527,7 +548,7 @@ static void __init profile_nop(void *unused) { } -static int __init create_hash_tables(void) +static int create_hash_tables(void) { int cpu; @@ -575,14 +596,14 @@ out_cleanup: #define create_hash_tables() ({ 0; }) #endif -static int __init create_proc_profile(void) +int create_proc_profile(void) { struct proc_dir_entry *entry; if (!prof_on) return 0; if (create_hash_tables()) - return -1; + return -ENOMEM; entry = proc_create("profile", S_IWUSR | S_IRUGO, NULL, &proc_profile_operations); if (!entry) diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 356699a96d56..1e68e4c39e2c 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -45,7 +45,7 @@ void __ptrace_link(struct task_struct *child, struct task_struct *new_parent) * TASK_TRACED, resume it now. * Requires that irqs be disabled. */ -void ptrace_untrace(struct task_struct *child) +static void ptrace_untrace(struct task_struct *child) { spin_lock(&child->sighand->siglock); if (task_is_traced(child)) { diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index ca4bbbe04aa4..59236e8b9daa 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -54,9 +54,9 @@ #include <linux/cpu.h> #include <linux/random.h> #include <linux/delay.h> -#include <linux/byteorder/swabb.h> #include <linux/cpumask.h> #include <linux/rcupreempt_trace.h> +#include <asm/byteorder.h> /* * PREEMPT_RCU data structures. diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 90b5b123f7a1..85cb90588a55 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -42,10 +42,10 @@ #include <linux/freezer.h> #include <linux/cpu.h> #include <linux/delay.h> -#include <linux/byteorder/swabb.h> #include <linux/stat.h> #include <linux/srcu.h> #include <linux/slab.h> +#include <asm/byteorder.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and " diff --git a/kernel/resource.c b/kernel/resource.c index 414d6fc9131e..4089d12af6e0 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -38,10 +38,6 @@ EXPORT_SYMBOL(iomem_resource); static DEFINE_RWLOCK(resource_lock); -#ifdef CONFIG_PROC_FS - -enum { MAX_IORES_LEVEL = 5 }; - static void *r_next(struct seq_file *m, void *v, loff_t *pos) { struct resource *p = v; @@ -53,6 +49,10 @@ static void *r_next(struct seq_file *m, void *v, loff_t *pos) return p->sibling; } +#ifdef CONFIG_PROC_FS + +enum { MAX_IORES_LEVEL = 5 }; + static void *r_start(struct seq_file *m, loff_t *pos) __acquires(resource_lock) { @@ -549,13 +549,9 @@ static void __init __reserve_region_with_split(struct resource *root, } if (!res) { - printk(KERN_DEBUG " __reserve_region_with_split: (%s) [%llx, %llx], res: (%s) [%llx, %llx]\n", - conflict->name, conflict->start, conflict->end, - name, start, end); - /* failed, split and try again */ - /* conflict coverred whole area */ + /* conflict covered whole area */ if (conflict->start <= start && conflict->end >= end) return; @@ -630,33 +626,34 @@ struct resource * __request_region(struct resource *parent, { struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); - if (res) { - res->name = name; - res->start = start; - res->end = start + n - 1; - res->flags = IORESOURCE_BUSY; + if (!res) + return NULL; - write_lock(&resource_lock); + res->name = name; + res->start = start; + res->end = start + n - 1; + res->flags = IORESOURCE_BUSY; - for (;;) { - struct resource *conflict; + write_lock(&resource_lock); - conflict = __request_resource(parent, res); - if (!conflict) - break; - if (conflict != parent) { - parent = conflict; - if (!(conflict->flags & IORESOURCE_BUSY)) - continue; - } + for (;;) { + struct resource *conflict; - /* Uhhuh, that didn't work out.. */ - kfree(res); - res = NULL; + conflict = __request_resource(parent, res); + if (!conflict) break; + if (conflict != parent) { + parent = conflict; + if (!(conflict->flags & IORESOURCE_BUSY)) + continue; } - write_unlock(&resource_lock); + + /* Uhhuh, that didn't work out.. */ + kfree(res); + res = NULL; + break; } + write_unlock(&resource_lock); return res; } EXPORT_SYMBOL(__request_region); @@ -831,3 +828,40 @@ static int __init reserve_setup(char *str) } __setup("reserve=", reserve_setup); + +/* + * Check if the requested addr and size spans more than any slot in the + * iomem resource tree. + */ +int iomem_map_sanity_check(resource_size_t addr, unsigned long size) +{ + struct resource *p = &iomem_resource; + int err = 0; + loff_t l; + + read_lock(&resource_lock); + for (p = p->child; p ; p = r_next(NULL, p, &l)) { + /* + * We can probably skip the resources without + * IORESOURCE_IO attribute? + */ + if (p->start >= addr + size) + continue; + if (p->end < addr) + continue; + if (p->start <= addr && (p->end >= addr + size - 1)) + continue; + printk(KERN_WARNING "resource map sanity check conflict: " + "0x%llx 0x%llx 0x%llx 0x%llx %s\n", + (unsigned long long)addr, + (unsigned long long)(addr + size - 1), + (unsigned long long)p->start, + (unsigned long long)p->end, + p->name); + err = -1; + break; + } + read_unlock(&resource_lock); + + return err; +} diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index e8ab096ddfe3..81787248b60f 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -118,13 +118,13 @@ static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now) /* * scd->clock = clamp(scd->tick_gtod + delta, - * max(scd->tick_gtod, scd->clock), - * scd->tick_gtod + TICK_NSEC); + * max(scd->tick_gtod, scd->clock), + * max(scd->clock, scd->tick_gtod + TICK_NSEC)); */ clock = scd->tick_gtod + delta; min_clock = wrap_max(scd->tick_gtod, scd->clock); - max_clock = scd->tick_gtod + TICK_NSEC; + max_clock = wrap_max(scd->clock, scd->tick_gtod + TICK_NSEC); clock = wrap_max(clock, min_clock); clock = wrap_min(clock, max_clock); diff --git a/kernel/softirq.c b/kernel/softirq.c index c506f266a6b9..83ba21a13bd4 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -6,6 +6,8 @@ * Distribute under GPLv2. * * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903) + * + * Remote softirq infrastructure is by Jens Axboe. */ #include <linux/module.h> @@ -46,7 +48,7 @@ irq_cpustat_t irq_stat[NR_CPUS] ____cacheline_aligned; EXPORT_SYMBOL(irq_stat); #endif -static struct softirq_action softirq_vec[32] __cacheline_aligned_in_smp; +static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp; static DEFINE_PER_CPU(struct task_struct *, ksoftirqd); @@ -205,7 +207,18 @@ restart: do { if (pending & 1) { + int prev_count = preempt_count(); + h->action(h); + + if (unlikely(prev_count != preempt_count())) { + printk(KERN_ERR "huh, entered softirq %td %p" + "with preempt_count %08x," + " exited with %08x?\n", h - softirq_vec, + h->action, prev_count, preempt_count()); + preempt_count() = prev_count; + } + rcu_bh_qsctr_inc(cpu); } h++; @@ -463,17 +476,144 @@ void tasklet_kill(struct tasklet_struct *t) EXPORT_SYMBOL(tasklet_kill); +DEFINE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list); +EXPORT_PER_CPU_SYMBOL(softirq_work_list); + +static void __local_trigger(struct call_single_data *cp, int softirq) +{ + struct list_head *head = &__get_cpu_var(softirq_work_list[softirq]); + + list_add_tail(&cp->list, head); + + /* Trigger the softirq only if the list was previously empty. */ + if (head->next == &cp->list) + raise_softirq_irqoff(softirq); +} + +#ifdef CONFIG_USE_GENERIC_SMP_HELPERS +static void remote_softirq_receive(void *data) +{ + struct call_single_data *cp = data; + unsigned long flags; + int softirq; + + softirq = cp->priv; + + local_irq_save(flags); + __local_trigger(cp, softirq); + local_irq_restore(flags); +} + +static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq) +{ + if (cpu_online(cpu)) { + cp->func = remote_softirq_receive; + cp->info = cp; + cp->flags = 0; + cp->priv = softirq; + + __smp_call_function_single(cpu, cp); + return 0; + } + return 1; +} +#else /* CONFIG_USE_GENERIC_SMP_HELPERS */ +static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq) +{ + return 1; +} +#endif + +/** + * __send_remote_softirq - try to schedule softirq work on a remote cpu + * @cp: private SMP call function data area + * @cpu: the remote cpu + * @this_cpu: the currently executing cpu + * @softirq: the softirq for the work + * + * Attempt to schedule softirq work on a remote cpu. If this cannot be + * done, the work is instead queued up on the local cpu. + * + * Interrupts must be disabled. + */ +void __send_remote_softirq(struct call_single_data *cp, int cpu, int this_cpu, int softirq) +{ + if (cpu == this_cpu || __try_remote_softirq(cp, cpu, softirq)) + __local_trigger(cp, softirq); +} +EXPORT_SYMBOL(__send_remote_softirq); + +/** + * send_remote_softirq - try to schedule softirq work on a remote cpu + * @cp: private SMP call function data area + * @cpu: the remote cpu + * @softirq: the softirq for the work + * + * Like __send_remote_softirq except that disabling interrupts and + * computing the current cpu is done for the caller. + */ +void send_remote_softirq(struct call_single_data *cp, int cpu, int softirq) +{ + unsigned long flags; + int this_cpu; + + local_irq_save(flags); + this_cpu = smp_processor_id(); + __send_remote_softirq(cp, cpu, this_cpu, softirq); + local_irq_restore(flags); +} +EXPORT_SYMBOL(send_remote_softirq); + +static int __cpuinit remote_softirq_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + /* + * If a CPU goes away, splice its entries to the current CPU + * and trigger a run of the softirq + */ + if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { + int cpu = (unsigned long) hcpu; + int i; + + local_irq_disable(); + for (i = 0; i < NR_SOFTIRQS; i++) { + struct list_head *head = &per_cpu(softirq_work_list[i], cpu); + struct list_head *local_head; + + if (list_empty(head)) + continue; + + local_head = &__get_cpu_var(softirq_work_list[i]); + list_splice_init(head, local_head); + raise_softirq_irqoff(i); + } + local_irq_enable(); + } + + return NOTIFY_OK; +} + +static struct notifier_block __cpuinitdata remote_softirq_cpu_notifier = { + .notifier_call = remote_softirq_cpu_notify, +}; + void __init softirq_init(void) { int cpu; for_each_possible_cpu(cpu) { + int i; + per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head; per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head; + for (i = 0; i < NR_SOFTIRQS; i++) + INIT_LIST_HEAD(&per_cpu(softirq_work_list[i], cpu)); } + register_hotcpu_notifier(&remote_softirq_cpu_notifier); + open_softirq(TASKLET_SOFTIRQ, tasklet_action); open_softirq(HI_SOFTIRQ, tasklet_hi_action); } diff --git a/kernel/softlockup.c b/kernel/softlockup.c index cb838ee93a82..3953e4aed733 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -226,7 +226,7 @@ static void check_hung_uninterruptible_tasks(int this_cpu) * If the system crashed already then all bets are off, * do not report extra hung tasks: */ - if ((tainted & TAINT_DIE) || did_panic) + if (test_taint(TAINT_DIE) || did_panic) return; read_lock(&tasklist_lock); diff --git a/kernel/sys.c b/kernel/sys.c index 234d9454294e..0bc8fa3c2288 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1349,8 +1349,10 @@ asmlinkage long sys_sethostname(char __user *name, int len) down_write(&uts_sem); errno = -EFAULT; if (!copy_from_user(tmp, name, len)) { - memcpy(utsname()->nodename, tmp, len); - utsname()->nodename[len] = 0; + struct new_utsname *u = utsname(); + + memcpy(u->nodename, tmp, len); + memset(u->nodename + len, 0, sizeof(u->nodename) - len); errno = 0; } up_write(&uts_sem); @@ -1362,15 +1364,17 @@ asmlinkage long sys_sethostname(char __user *name, int len) asmlinkage long sys_gethostname(char __user *name, int len) { int i, errno; + struct new_utsname *u; if (len < 0) return -EINVAL; down_read(&uts_sem); - i = 1 + strlen(utsname()->nodename); + u = utsname(); + i = 1 + strlen(u->nodename); if (i > len) i = len; errno = 0; - if (copy_to_user(name, utsname()->nodename, i)) + if (copy_to_user(name, u->nodename, i)) errno = -EFAULT; up_read(&uts_sem); return errno; @@ -1395,8 +1399,10 @@ asmlinkage long sys_setdomainname(char __user *name, int len) down_write(&uts_sem); errno = -EFAULT; if (!copy_from_user(tmp, name, len)) { - memcpy(utsname()->domainname, tmp, len); - utsname()->domainname[len] = 0; + struct new_utsname *u = utsname(); + + memcpy(u->domainname, tmp, len); + memset(u->domainname + len, 0, sizeof(u->domainname) - len); errno = 0; } up_write(&uts_sem); @@ -1450,14 +1456,22 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim) return -EINVAL; if (copy_from_user(&new_rlim, rlim, sizeof(*rlim))) return -EFAULT; - if (new_rlim.rlim_cur > new_rlim.rlim_max) - return -EINVAL; old_rlim = current->signal->rlim + resource; if ((new_rlim.rlim_max > old_rlim->rlim_max) && !capable(CAP_SYS_RESOURCE)) return -EPERM; - if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > sysctl_nr_open) - return -EPERM; + + if (resource == RLIMIT_NOFILE) { + if (new_rlim.rlim_max == RLIM_INFINITY) + new_rlim.rlim_max = sysctl_nr_open; + if (new_rlim.rlim_cur == RLIM_INFINITY) + new_rlim.rlim_cur = sysctl_nr_open; + if (new_rlim.rlim_max > sysctl_nr_open) + return -EPERM; + } + + if (new_rlim.rlim_cur > new_rlim.rlim_max) + return -EINVAL; retval = security_task_setrlimit(resource, &new_rlim); if (retval) diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 503d8d4eb80a..a77b27b11b04 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -126,6 +126,11 @@ cond_syscall(sys_vm86); cond_syscall(compat_sys_ipc); cond_syscall(compat_sys_sysctl); cond_syscall(sys_flock); +cond_syscall(sys_io_setup); +cond_syscall(sys_io_destroy); +cond_syscall(sys_io_submit); +cond_syscall(sys_io_cancel); +cond_syscall(sys_io_getevents); /* arch-specific weak syscall entries */ cond_syscall(sys_pciconfig_read); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index cfc5295f1e82..b3cc73931d1f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -149,7 +149,7 @@ extern int max_lock_depth; #ifdef CONFIG_PROC_SYSCTL static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); -static int proc_dointvec_taint(struct ctl_table *table, int write, struct file *filp, +static int proc_taint(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); #endif @@ -379,10 +379,9 @@ static struct ctl_table kern_table[] = { #ifdef CONFIG_PROC_SYSCTL { .procname = "tainted", - .data = &tainted, - .maxlen = sizeof(int), + .maxlen = sizeof(long), .mode = 0644, - .proc_handler = &proc_dointvec_taint, + .proc_handler = &proc_taint, }, #endif #ifdef CONFIG_LATENCYTOP @@ -834,6 +833,16 @@ static struct ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif +#ifdef CONFIG_UNEVICTABLE_LRU + { + .ctl_name = CTL_UNNUMBERED, + .procname = "scan_unevictable_pages", + .data = &scan_unevictable_pages, + .maxlen = sizeof(scan_unevictable_pages), + .mode = 0644, + .proc_handler = &scan_unevictable_handler, + }, +#endif /* * NOTE: do not add new entries to this table unless you have read * Documentation/sysctl/ctl_unnumbered.txt @@ -1282,6 +1291,7 @@ static struct ctl_table fs_table[] = { .extra2 = &two, }, #endif +#ifdef CONFIG_AIO { .procname = "aio-nr", .data = &aio_nr, @@ -1296,6 +1306,7 @@ static struct ctl_table fs_table[] = { .mode = 0644, .proc_handler = &proc_doulongvec_minmax, }, +#endif /* CONFIG_AIO */ #ifdef CONFIG_INOTIFY_USER { .ctl_name = FS_INOTIFY, @@ -1501,7 +1512,6 @@ void register_sysctl_root(struct ctl_table_root *root) /* Perform the actual read/write of a sysctl table entry. */ static int do_sysctl_strategy(struct ctl_table_root *root, struct ctl_table *table, - int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { @@ -1515,8 +1525,7 @@ static int do_sysctl_strategy(struct ctl_table_root *root, return -EPERM; if (table->strategy) { - rc = table->strategy(table, name, nlen, oldval, oldlenp, - newval, newlen); + rc = table->strategy(table, oldval, oldlenp, newval, newlen); if (rc < 0) return rc; if (rc > 0) @@ -1526,8 +1535,7 @@ static int do_sysctl_strategy(struct ctl_table_root *root, /* If there is no strategy routine, or if the strategy returns * zero, proceed with automatic r/w */ if (table->data && table->maxlen) { - rc = sysctl_data(table, name, nlen, oldval, oldlenp, - newval, newlen); + rc = sysctl_data(table, oldval, oldlenp, newval, newlen); if (rc < 0) return rc; } @@ -1559,7 +1567,7 @@ repeat: table = table->child; goto repeat; } - error = do_sysctl_strategy(root, table, name, nlen, + error = do_sysctl_strategy(root, table, oldval, oldlenp, newval, newlen); return error; @@ -2228,49 +2236,39 @@ int proc_dointvec(struct ctl_table *table, int write, struct file *filp, NULL,NULL); } -#define OP_SET 0 -#define OP_AND 1 -#define OP_OR 2 - -static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp, - int *valp, - int write, void *data) -{ - int op = *(int *)data; - if (write) { - int val = *negp ? -*lvalp : *lvalp; - switch(op) { - case OP_SET: *valp = val; break; - case OP_AND: *valp &= val; break; - case OP_OR: *valp |= val; break; - } - } else { - int val = *valp; - if (val < 0) { - *negp = -1; - *lvalp = (unsigned long)-val; - } else { - *negp = 0; - *lvalp = (unsigned long)val; - } - } - return 0; -} - /* - * Taint values can only be increased + * Taint values can only be increased + * This means we can safely use a temporary. */ -static int proc_dointvec_taint(struct ctl_table *table, int write, struct file *filp, +static int proc_taint(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { - int op; + struct ctl_table t; + unsigned long tmptaint = get_taint(); + int err; if (write && !capable(CAP_SYS_ADMIN)) return -EPERM; - op = OP_OR; - return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, - do_proc_dointvec_bset_conv,&op); + t = *table; + t.data = &tmptaint; + err = proc_doulongvec_minmax(&t, write, filp, buffer, lenp, ppos); + if (err < 0) + return err; + + if (write) { + /* + * Poor man's atomic or. Not worth adding a primitive + * to everyone's atomic.h for this + */ + int i; + for (i = 0; i < BITS_PER_LONG && tmptaint >> i; i++) { + if ((tmptaint >> i) & 1) + add_taint(i); + } + } + + return err; } struct do_proc_dointvec_minmax_conv_param { @@ -2718,7 +2716,7 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, */ /* The generic sysctl data routine (used if no strategy routine supplied) */ -int sysctl_data(struct ctl_table *table, int __user *name, int nlen, +int sysctl_data(struct ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { @@ -2752,7 +2750,7 @@ int sysctl_data(struct ctl_table *table, int __user *name, int nlen, } /* The generic string strategy routine: */ -int sysctl_string(struct ctl_table *table, int __user *name, int nlen, +int sysctl_string(struct ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { @@ -2798,7 +2796,7 @@ int sysctl_string(struct ctl_table *table, int __user *name, int nlen, * are between the minimum and maximum values given in the arrays * table->extra1 and table->extra2, respectively. */ -int sysctl_intvec(struct ctl_table *table, int __user *name, int nlen, +int sysctl_intvec(struct ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { @@ -2834,7 +2832,7 @@ int sysctl_intvec(struct ctl_table *table, int __user *name, int nlen, } /* Strategy function to convert jiffies to seconds */ -int sysctl_jiffies(struct ctl_table *table, int __user *name, int nlen, +int sysctl_jiffies(struct ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { @@ -2868,7 +2866,7 @@ int sysctl_jiffies(struct ctl_table *table, int __user *name, int nlen, } /* Strategy function to convert jiffies to seconds */ -int sysctl_ms_jiffies(struct ctl_table *table, int __user *name, int nlen, +int sysctl_ms_jiffies(struct ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { @@ -2923,35 +2921,35 @@ asmlinkage long sys_sysctl(struct __sysctl_args __user *args) return error; } -int sysctl_data(struct ctl_table *table, int __user *name, int nlen, +int sysctl_data(struct ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { return -ENOSYS; } -int sysctl_string(struct ctl_table *table, int __user *name, int nlen, +int sysctl_string(struct ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { return -ENOSYS; } -int sysctl_intvec(struct ctl_table *table, int __user *name, int nlen, +int sysctl_intvec(struct ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { return -ENOSYS; } -int sysctl_jiffies(struct ctl_table *table, int __user *name, int nlen, +int sysctl_jiffies(struct ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { return -ENOSYS; } -int sysctl_ms_jiffies(struct ctl_table *table, int __user *name, int nlen, +int sysctl_ms_jiffies(struct ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 8d53106a0a92..95ed42951e0a 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -3,7 +3,6 @@ # config TICK_ONESHOT bool - default n config NO_HZ bool "Tickless System (Dynamic Ticks)" diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index a4d219398167..b711ffcb106c 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -270,7 +270,7 @@ void tick_nohz_stop_sched_tick(int inidle) next_jiffies = get_next_timer_interrupt(last_jiffies); delta_jiffies = next_jiffies - last_jiffies; - if (rcu_needs_cpu(cpu)) + if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu)) delta_jiffies = 1; /* * Do not stop the tick, if we are only one off diff --git a/kernel/timer.c b/kernel/timer.c index 03bc7f1f1593..510fe69351ca 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -978,6 +978,7 @@ void update_process_times(int user_tick) run_local_timers(); if (rcu_pending(cpu)) rcu_check_callbacks(cpu, user_tick); + printk_tick(); scheduler_tick(); run_posix_cpu_timers(p); } diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index 4ab9659d269e..3b34b3545936 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -60,7 +60,7 @@ static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, #ifdef CONFIG_SYSCTL_SYSCALL /* The generic string strategy routine: */ -static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen, +static int sysctl_uts_string(ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { @@ -69,8 +69,7 @@ static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen, write = newval && newlen; memcpy(&uts_table, table, sizeof(uts_table)); uts_table.data = get_uts(table, write); - r = sysctl_string(&uts_table, name, nlen, - oldval, oldlenp, newval, newlen); + r = sysctl_string(&uts_table, oldval, oldlenp, newval, newlen); put_uts(table, write, uts_table.data); return r; } diff --git a/kernel/wait.c b/kernel/wait.c index c275c56cf2d3..cd87131f2fc2 100644 --- a/kernel/wait.c +++ b/kernel/wait.c @@ -72,12 +72,7 @@ prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state) spin_lock_irqsave(&q->lock, flags); if (list_empty(&wait->task_list)) __add_wait_queue(q, wait); - /* - * don't alter the task state if this is just going to - * queue an async wait queue callback - */ - if (is_sync_wait(wait)) - set_current_state(state); + set_current_state(state); spin_unlock_irqrestore(&q->lock, flags); } EXPORT_SYMBOL(prepare_to_wait); @@ -91,12 +86,7 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state) spin_lock_irqsave(&q->lock, flags); if (list_empty(&wait->task_list)) __add_wait_queue_tail(q, wait); - /* - * don't alter the task state if this is just going to - * queue an async wait queue callback - */ - if (is_sync_wait(wait)) - set_current_state(state); + set_current_state(state); spin_unlock_irqrestore(&q->lock, flags); } EXPORT_SYMBOL(prepare_to_wait_exclusive); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 4048e92aa04f..714afad46539 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -9,7 +9,7 @@ * Derived from the taskqueue/keventd code by: * * David Woodhouse <dwmw2@infradead.org> - * Andrew Morton <andrewm@uow.edu.au> + * Andrew Morton * Kai Petzke <wpp@marie.physik.tu-berlin.de> * Theodore Ts'o <tytso@mit.edu> * |