Diffstat (limited to 'kernel')
-rw-r--r--  kernel/acct.c                 26
-rw-r--r--  kernel/auditsc.c               6
-rw-r--r--  kernel/cpuset.c               22
-rw-r--r--  kernel/exit.c                 75
-rw-r--r--  kernel/fork.c                 81
-rw-r--r--  kernel/futex.c                10
-rw-r--r--  kernel/irq/proc.c              3
-rw-r--r--  kernel/kallsyms.c             16
-rw-r--r--  kernel/kmod.c                  2
-rw-r--r--  kernel/mutex.c                 9
-rw-r--r--  kernel/nsproxy.c              42
-rw-r--r--  kernel/pid.c                  75
-rw-r--r--  kernel/relay.c                 4
-rw-r--r--  kernel/sched.c               511
-rw-r--r--  kernel/signal.c               13
-rw-r--r--  kernel/sys.c                  23
-rw-r--r--  kernel/sysctl.c              387
-rw-r--r--  kernel/time/clocksource.c      8
-rw-r--r--  kernel/timer.c               148
-rw-r--r--  kernel/tsacct.c                9
-rw-r--r--  kernel/workqueue.c            21
21 files changed, 883 insertions(+), 608 deletions(-)
diff --git a/kernel/acct.c b/kernel/acct.c
index dc12db8600e7..70d0d88e5554 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -118,7 +118,7 @@ static int check_free_space(struct file *file)
spin_unlock(&acct_globals.lock);
/* May block */
- if (vfs_statfs(file->f_dentry, &sbuf))
+ if (vfs_statfs(file->f_path.dentry, &sbuf))
return res;
suspend = sbuf.f_blocks * SUSPEND;
resume = sbuf.f_blocks * RESUME;
@@ -194,7 +194,7 @@ static void acct_file_reopen(struct file *file)
add_timer(&acct_globals.timer);
}
if (old_acct) {
- mnt_unpin(old_acct->f_vfsmnt);
+ mnt_unpin(old_acct->f_path.mnt);
spin_unlock(&acct_globals.lock);
do_acct_process(old_acct);
filp_close(old_acct, NULL);
@@ -212,7 +212,7 @@ static int acct_on(char *name)
if (IS_ERR(file))
return PTR_ERR(file);
- if (!S_ISREG(file->f_dentry->d_inode->i_mode)) {
+ if (!S_ISREG(file->f_path.dentry->d_inode->i_mode)) {
filp_close(file, NULL);
return -EACCES;
}
@@ -229,11 +229,11 @@ static int acct_on(char *name)
}
spin_lock(&acct_globals.lock);
- mnt_pin(file->f_vfsmnt);
+ mnt_pin(file->f_path.mnt);
acct_file_reopen(file);
spin_unlock(&acct_globals.lock);
- mntput(file->f_vfsmnt); /* it's pinned, now give up active reference */
+ mntput(file->f_path.mnt); /* it's pinned, now give up active reference */
return 0;
}
@@ -283,7 +283,7 @@ asmlinkage long sys_acct(const char __user *name)
void acct_auto_close_mnt(struct vfsmount *m)
{
spin_lock(&acct_globals.lock);
- if (acct_globals.file && acct_globals.file->f_vfsmnt == m)
+ if (acct_globals.file && acct_globals.file->f_path.mnt == m)
acct_file_reopen(NULL);
spin_unlock(&acct_globals.lock);
}
@@ -299,7 +299,7 @@ void acct_auto_close(struct super_block *sb)
{
spin_lock(&acct_globals.lock);
if (acct_globals.file &&
- acct_globals.file->f_vfsmnt->mnt_sb == sb) {
+ acct_globals.file->f_path.mnt->mnt_sb == sb) {
acct_file_reopen(NULL);
}
spin_unlock(&acct_globals.lock);
@@ -428,6 +428,7 @@ static void do_acct_process(struct file *file)
u64 elapsed;
u64 run_time;
struct timespec uptime;
+ struct tty_struct *tty;
/*
* First check to see if there is enough free_space to continue
@@ -484,16 +485,9 @@ static void do_acct_process(struct file *file)
ac.ac_ppid = current->parent->tgid;
#endif
- mutex_lock(&tty_mutex);
- /* FIXME: Whoever is responsible for current->signal locking needs
- to use the same locking all over the kernel and document it */
- read_lock(&tasklist_lock);
- ac.ac_tty = current->signal->tty ?
- old_encode_dev(tty_devnum(current->signal->tty)) : 0;
- read_unlock(&tasklist_lock);
- mutex_unlock(&tty_mutex);
-
spin_lock_irq(&current->sighand->siglock);
+ tty = current->signal->tty;
+ ac.ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0;
ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime)));
ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime)));
ac.ac_flag = pacct->ac_flag;
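
The f_dentry/f_vfsmnt accesses above all collapse into the embedded struct path. A minimal sketch of that embedding, with simplified layouts rather than the full kernel definitions:

    struct path {
            struct vfsmount *mnt;       /* was file->f_vfsmnt */
            struct dentry   *dentry;    /* was file->f_dentry */
    };

    struct file {
            struct path f_path;
            /* ... */
    };

During a transition like this the old names can survive as aliases (e.g. #define f_dentry f_path.dentry) so both spellings reach the same field; whether this tree keeps such aliases is not visible in these hunks.
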
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 40722e26de98..298897559ca4 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -781,8 +781,8 @@ static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk
if ((vma->vm_flags & VM_EXECUTABLE) &&
vma->vm_file) {
audit_log_d_path(ab, "exe=",
- vma->vm_file->f_dentry,
- vma->vm_file->f_vfsmnt);
+ vma->vm_file->f_path.dentry,
+ vma->vm_file->f_path.mnt);
break;
}
vma = vma->vm_next;
@@ -826,10 +826,12 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
context->return_code);
mutex_lock(&tty_mutex);
+ read_lock(&tasklist_lock);
if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name)
tty = tsk->signal->tty->name;
else
tty = "(none)";
+ read_unlock(&tasklist_lock);
audit_log_format(ab,
" a0=%lx a1=%lx a2=%lx a3=%lx items=%d"
" ppid=%d pid=%d auid=%u uid=%u gid=%u"
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 0a6b4d89f9a0..2c3b4431472b 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -413,8 +413,8 @@ static struct file_system_type cpuset_fs_type = {
*
*
* When reading/writing to a file:
- * - the cpuset to use in file->f_dentry->d_parent->d_fsdata
- * - the 'cftype' of the file is file->f_dentry->d_fsdata
+ * - the cpuset to use in file->f_path.dentry->d_parent->d_fsdata
+ * - the 'cftype' of the file is file->f_path.dentry->d_fsdata
*/
struct cftype {
@@ -1284,8 +1284,8 @@ static ssize_t cpuset_common_file_write(struct file *file,
const char __user *userbuf,
size_t nbytes, loff_t *unused_ppos)
{
- struct cpuset *cs = __d_cs(file->f_dentry->d_parent);
- struct cftype *cft = __d_cft(file->f_dentry);
+ struct cpuset *cs = __d_cs(file->f_path.dentry->d_parent);
+ struct cftype *cft = __d_cft(file->f_path.dentry);
cpuset_filetype_t type = cft->private;
char *buffer;
char *pathbuf = NULL;
@@ -1367,7 +1367,7 @@ static ssize_t cpuset_file_write(struct file *file, const char __user *buf,
size_t nbytes, loff_t *ppos)
{
ssize_t retval = 0;
- struct cftype *cft = __d_cft(file->f_dentry);
+ struct cftype *cft = __d_cft(file->f_path.dentry);
if (!cft)
return -ENODEV;
@@ -1417,8 +1417,8 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
static ssize_t cpuset_common_file_read(struct file *file, char __user *buf,
size_t nbytes, loff_t *ppos)
{
- struct cftype *cft = __d_cft(file->f_dentry);
- struct cpuset *cs = __d_cs(file->f_dentry->d_parent);
+ struct cftype *cft = __d_cft(file->f_path.dentry);
+ struct cpuset *cs = __d_cs(file->f_path.dentry->d_parent);
cpuset_filetype_t type = cft->private;
char *page;
ssize_t retval = 0;
@@ -1476,7 +1476,7 @@ static ssize_t cpuset_file_read(struct file *file, char __user *buf, size_t nbyt
loff_t *ppos)
{
ssize_t retval = 0;
- struct cftype *cft = __d_cft(file->f_dentry);
+ struct cftype *cft = __d_cft(file->f_path.dentry);
if (!cft)
return -ENODEV;
@@ -1498,7 +1498,7 @@ static int cpuset_file_open(struct inode *inode, struct file *file)
if (err)
return err;
- cft = __d_cft(file->f_dentry);
+ cft = __d_cft(file->f_path.dentry);
if (!cft)
return -ENODEV;
if (cft->open)
@@ -1511,7 +1511,7 @@ static int cpuset_file_open(struct inode *inode, struct file *file)
static int cpuset_file_release(struct inode *inode, struct file *file)
{
- struct cftype *cft = __d_cft(file->f_dentry);
+ struct cftype *cft = __d_cft(file->f_path.dentry);
if (cft->release)
return cft->release(inode, file);
return 0;
@@ -1700,7 +1700,7 @@ static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids)
*/
static int cpuset_tasks_open(struct inode *unused, struct file *file)
{
- struct cpuset *cs = __d_cs(file->f_dentry->d_parent);
+ struct cpuset *cs = __d_cs(file->f_path.dentry->d_parent);
struct ctr_struct *ctr;
pid_t *pidarray;
int npids;
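
The __d_cs()/__d_cft() helpers used throughout these hunks implement the convention spelled out in the comment above. A sketch of what they plausibly reduce to, assuming the pointers are stashed in d_fsdata exactly as described:

    static inline struct cpuset *__d_cs(struct dentry *dentry)
    {
            /* called on f_path.dentry->d_parent: the containing dir */
            return (struct cpuset *)dentry->d_fsdata;
    }

    static inline struct cftype *__d_cft(struct dentry *dentry)
    {
            /* called on f_path.dentry: the control file itself */
            return (struct cftype *)dentry->d_fsdata;
    }
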
diff --git a/kernel/exit.c b/kernel/exit.c
index 4e3f919edc48..122fadb972fc 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -13,7 +13,7 @@
#include <linux/completion.h>
#include <linux/personality.h>
#include <linux/tty.h>
-#include <linux/namespace.h>
+#include <linux/mnt_namespace.h>
#include <linux/key.h>
#include <linux/security.h>
#include <linux/cpu.h>
@@ -22,6 +22,7 @@
#include <linux/file.h>
#include <linux/binfmts.h>
#include <linux/nsproxy.h>
+#include <linux/pid_namespace.h>
#include <linux/ptrace.h>
#include <linux/profile.h>
#include <linux/mount.h>
@@ -48,7 +49,6 @@
#include <asm/mmu_context.h>
extern void sem_exit (void);
-extern struct task_struct *child_reaper;
static void exit_mm(struct task_struct * tsk);
@@ -189,21 +189,18 @@ repeat:
int session_of_pgrp(int pgrp)
{
struct task_struct *p;
- int sid = -1;
+ int sid = 0;
read_lock(&tasklist_lock);
- do_each_task_pid(pgrp, PIDTYPE_PGID, p) {
- if (p->signal->session > 0) {
- sid = p->signal->session;
- goto out;
- }
- } while_each_task_pid(pgrp, PIDTYPE_PGID, p);
- p = find_task_by_pid(pgrp);
- if (p)
- sid = p->signal->session;
-out:
+
+ p = find_task_by_pid_type(PIDTYPE_PGID, pgrp);
+ if (p == NULL)
+ p = find_task_by_pid(pgrp);
+ if (p != NULL)
+ sid = process_session(p);
+
read_unlock(&tasklist_lock);
-
+
return sid;
}
@@ -225,8 +222,8 @@ static int will_become_orphaned_pgrp(int pgrp, struct task_struct *ignored_task)
|| p->exit_state
|| is_init(p->real_parent))
continue;
- if (process_group(p->real_parent) != pgrp
- && p->real_parent->signal->session == p->signal->session) {
+ if (process_group(p->real_parent) != pgrp &&
+ process_session(p->real_parent) == process_session(p)) {
ret = 0;
break;
}
@@ -260,7 +257,8 @@ static int has_stopped_jobs(int pgrp)
}
/**
- * reparent_to_init - Reparent the calling kernel thread to the init task.
+ * reparent_to_init - Reparent the calling kernel thread to the init task
+ * of the pid space that the thread belongs to.
*
* If a kernel thread is launched as a result of a system call, or if
* it ever exits, it should generally reparent itself to init so that
@@ -278,8 +276,8 @@ static void reparent_to_init(void)
ptrace_unlink(current);
/* Reparent to init */
remove_parent(current);
- current->parent = child_reaper;
- current->real_parent = child_reaper;
+ current->parent = child_reaper(current);
+ current->real_parent = child_reaper(current);
add_parent(current);
/* Set the exit signal to SIGCHLD so we signal init on exit */
@@ -302,9 +300,9 @@ void __set_special_pids(pid_t session, pid_t pgrp)
{
struct task_struct *curr = current->group_leader;
- if (curr->signal->session != session) {
+ if (process_session(curr) != session) {
detach_pid(curr, PIDTYPE_SID);
- curr->signal->session = session;
+ set_signal_session(curr->signal, session);
attach_pid(curr, PIDTYPE_SID, session);
}
if (process_group(curr) != pgrp) {
@@ -314,7 +312,7 @@ void __set_special_pids(pid_t session, pid_t pgrp)
}
}
-void set_special_pids(pid_t session, pid_t pgrp)
+static void set_special_pids(pid_t session, pid_t pgrp)
{
write_lock_irq(&tasklist_lock);
__set_special_pids(session, pgrp);
@@ -384,9 +382,7 @@ void daemonize(const char *name, ...)
exit_mm(current);
set_special_pids(1, 1);
- mutex_lock(&tty_mutex);
- current->signal->tty = NULL;
- mutex_unlock(&tty_mutex);
+ proc_clear_tty(current);
/* Block and flush all signals */
sigfillset(&blocked);
@@ -429,7 +425,7 @@ static void close_files(struct files_struct * files)
for (;;) {
unsigned long set;
i = j * __NFDBITS;
- if (i >= fdt->max_fdset || i >= fdt->max_fds)
+ if (i >= fdt->max_fds)
break;
set = fdt->open_fds->fds_bits[j++];
while (set) {
@@ -470,11 +466,9 @@ void fastcall put_files_struct(struct files_struct *files)
* you can free files immediately.
*/
fdt = files_fdtable(files);
- if (fdt == &files->fdtab)
- fdt->free_files = files;
- else
+ if (fdt != &files->fdtab)
kmem_cache_free(files_cachep, files);
- free_fdtable(fdt);
+ call_rcu(&fdt->rcu, free_fdtable_rcu);
}
}
@@ -649,10 +643,11 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
* outside, so the child pgrp is now orphaned.
*/
if ((process_group(p) != process_group(father)) &&
- (p->signal->session == father->signal->session)) {
+ (process_session(p) == process_session(father))) {
int pgrp = process_group(p);
- if (will_become_orphaned_pgrp(pgrp, NULL) && has_stopped_jobs(pgrp)) {
+ if (will_become_orphaned_pgrp(pgrp, NULL) &&
+ has_stopped_jobs(pgrp)) {
__kill_pg_info(SIGHUP, SEND_SIG_PRIV, pgrp);
__kill_pg_info(SIGCONT, SEND_SIG_PRIV, pgrp);
}
@@ -663,7 +658,8 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
* When we die, we re-parent all our children.
* Try to give them to another thread in our thread
* group, and if no such member exists, give it to
- * the global child reaper process (ie "init")
+ * the child reaper process (ie "init") in our pid
+ * space.
*/
static void
forget_original_parent(struct task_struct *father, struct list_head *to_release)
@@ -674,7 +670,7 @@ forget_original_parent(struct task_struct *father, struct list_head *to_release)
do {
reaper = next_thread(reaper);
if (reaper == father) {
- reaper = child_reaper;
+ reaper = child_reaper(father);
break;
}
} while (reaper->exit_state);
@@ -786,7 +782,7 @@ static void exit_notify(struct task_struct *tsk)
t = tsk->real_parent;
if ((process_group(t) != process_group(tsk)) &&
- (t->signal->session == tsk->signal->session) &&
+ (process_session(t) == process_session(tsk)) &&
will_become_orphaned_pgrp(process_group(tsk), tsk) &&
has_stopped_jobs(process_group(tsk))) {
__kill_pg_info(SIGHUP, SEND_SIG_PRIV, process_group(tsk));
@@ -860,8 +856,13 @@ fastcall NORET_TYPE void do_exit(long code)
panic("Aiee, killing interrupt handler!");
if (unlikely(!tsk->pid))
panic("Attempted to kill the idle task!");
- if (unlikely(tsk == child_reaper))
- panic("Attempted to kill init!");
+ if (unlikely(tsk == child_reaper(tsk))) {
+ if (tsk->nsproxy->pid_ns != &init_pid_ns)
+ tsk->nsproxy->pid_ns->child_reaper = init_pid_ns.child_reaper;
+ else
+ panic("Attempted to kill init!");
+ }
+
if (unlikely(current->ptrace & PT_TRACE_EXIT)) {
current->ptrace_message = code;
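
The global child_reaper variable gives way to per-task helpers; the do_exit() hunk shows the underlying field, tsk->nsproxy->pid_ns->child_reaper. A hedged sketch of the accessors this file now leans on (the real definitions live in the headers and may differ in detail):

    static inline struct task_struct *child_reaper(struct task_struct *tsk)
    {
            /* init of the task's own pid namespace, per the do_exit() hunk */
            return tsk->nsproxy->pid_ns->child_reaper;
    }

    static inline int process_session(struct task_struct *tsk)
    {
            return tsk->signal->__session;  /* field name assumed here */
    }

    static inline void set_signal_session(struct signal_struct *sig, int session)
    {
            sig->__session = session;       /* ditto */
    }
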
diff --git a/kernel/fork.c b/kernel/fork.c
index 7f2e31ba33af..d16c566eb645 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -18,7 +18,7 @@
#include <linux/module.h>
#include <linux/vmalloc.h>
#include <linux/completion.h>
-#include <linux/namespace.h>
+#include <linux/mnt_namespace.h>
#include <linux/personality.h>
#include <linux/mempolicy.h>
#include <linux/sem.h>
@@ -36,6 +36,7 @@
#include <linux/syscalls.h>
#include <linux/jiffies.h>
#include <linux/futex.h>
+#include <linux/task_io_accounting_ops.h>
#include <linux/rcupdate.h>
#include <linux/ptrace.h>
#include <linux/mount.h>
@@ -252,7 +253,7 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
anon_vma_link(tmp);
file = tmp->vm_file;
if (file) {
- struct inode *inode = file->f_dentry->d_inode;
+ struct inode *inode = file->f_path.dentry->d_inode;
get_file(file);
if (tmp->vm_flags & VM_DENYWRITE)
atomic_dec(&inode->i_writecount);
@@ -613,7 +614,7 @@ static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk)
static int count_open_files(struct fdtable *fdt)
{
- int size = fdt->max_fdset;
+ int size = fdt->max_fds;
int i;
/* Find the last open fd */
@@ -640,12 +641,10 @@ static struct files_struct *alloc_files(void)
newf->next_fd = 0;
fdt = &newf->fdtab;
fdt->max_fds = NR_OPEN_DEFAULT;
- fdt->max_fdset = EMBEDDED_FD_SET_SIZE;
fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init;
fdt->open_fds = (fd_set *)&newf->open_fds_init;
fdt->fd = &newf->fd_array[0];
INIT_RCU_HEAD(&fdt->rcu);
- fdt->free_files = NULL;
fdt->next = NULL;
rcu_assign_pointer(newf->fdt, fdt);
out:
@@ -661,7 +660,7 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
{
struct files_struct *newf;
struct file **old_fds, **new_fds;
- int open_files, size, i, expand;
+ int open_files, size, i;
struct fdtable *old_fdt, *new_fdt;
*errorp = -ENOMEM;
@@ -672,25 +671,14 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
spin_lock(&oldf->file_lock);
old_fdt = files_fdtable(oldf);
new_fdt = files_fdtable(newf);
- size = old_fdt->max_fdset;
open_files = count_open_files(old_fdt);
- expand = 0;
/*
- * Check whether we need to allocate a larger fd array or fd set.
- * Note: we're not a clone task, so the open count won't change.
+ * Check whether we need to allocate a larger fd array and fd set.
+ * Note: we're not a clone task, so the open count won't change.
*/
- if (open_files > new_fdt->max_fdset) {
- new_fdt->max_fdset = 0;
- expand = 1;
- }
if (open_files > new_fdt->max_fds) {
new_fdt->max_fds = 0;
- expand = 1;
- }
-
- /* if the old fdset gets grown now, we'll only copy up to "size" fds */
- if (expand) {
spin_unlock(&oldf->file_lock);
spin_lock(&newf->file_lock);
*errorp = expand_files(newf, open_files-1);
@@ -710,8 +698,10 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
old_fds = old_fdt->fd;
new_fds = new_fdt->fd;
- memcpy(new_fdt->open_fds->fds_bits, old_fdt->open_fds->fds_bits, open_files/8);
- memcpy(new_fdt->close_on_exec->fds_bits, old_fdt->close_on_exec->fds_bits, open_files/8);
+ memcpy(new_fdt->open_fds->fds_bits,
+ old_fdt->open_fds->fds_bits, open_files/8);
+ memcpy(new_fdt->close_on_exec->fds_bits,
+ old_fdt->close_on_exec->fds_bits, open_files/8);
for (i = open_files; i != 0; i--) {
struct file *f = *old_fds++;
@@ -736,22 +726,19 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
/* This is long word aligned thus could use a optimized version */
memset(new_fds, 0, size);
- if (new_fdt->max_fdset > open_files) {
- int left = (new_fdt->max_fdset-open_files)/8;
+ if (new_fdt->max_fds > open_files) {
+ int left = (new_fdt->max_fds-open_files)/8;
int start = open_files / (8 * sizeof(unsigned long));
memset(&new_fdt->open_fds->fds_bits[start], 0, left);
memset(&new_fdt->close_on_exec->fds_bits[start], 0, left);
}
-out:
return newf;
out_release:
- free_fdset (new_fdt->close_on_exec, new_fdt->max_fdset);
- free_fdset (new_fdt->open_fds, new_fdt->max_fdset);
- free_fd_array(new_fdt->fd, new_fdt->max_fds);
kmem_cache_free(files_cachep, newf);
+out:
return NULL;
}
@@ -1055,6 +1042,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->wchar = 0; /* I/O counter: bytes written */
p->syscr = 0; /* I/O counter: read syscalls */
p->syscw = 0; /* I/O counter: write syscalls */
+ task_io_accounting_init(p);
acct_clear_integrals(p);
p->it_virt_expires = cputime_zero;
@@ -1259,9 +1247,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
if (thread_group_leader(p)) {
p->signal->tty = current->signal->tty;
p->signal->pgrp = process_group(current);
- p->signal->session = current->signal->session;
+ set_signal_session(p->signal, process_session(current));
attach_pid(p, PIDTYPE_PGID, process_group(p));
- attach_pid(p, PIDTYPE_SID, p->signal->session);
+ attach_pid(p, PIDTYPE_SID, process_session(p));
list_add_tail_rcu(&p->tasks, &init_task.tasks);
__get_cpu_var(process_counts)++;
@@ -1525,17 +1513,18 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
}
/*
- * Unshare the namespace structure if it is being shared
+ * Unshare the mnt_namespace structure if it is being shared
*/
-static int unshare_namespace(unsigned long unshare_flags, struct namespace **new_nsp, struct fs_struct *new_fs)
+static int unshare_mnt_namespace(unsigned long unshare_flags,
+ struct mnt_namespace **new_nsp, struct fs_struct *new_fs)
{
- struct namespace *ns = current->nsproxy->namespace;
+ struct mnt_namespace *ns = current->nsproxy->mnt_ns;
if ((unshare_flags & CLONE_NEWNS) && ns) {
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- *new_nsp = dup_namespace(current, new_fs ? new_fs : current->fs);
+ *new_nsp = dup_mnt_ns(current, new_fs ? new_fs : current->fs);
if (!*new_nsp)
return -ENOMEM;
}
@@ -1544,15 +1533,13 @@ static int unshare_namespace(unsigned long unshare_flags, struct namespace **new
}
/*
- * Unsharing of sighand for tasks created with CLONE_SIGHAND is not
- * supported yet
+ * Unsharing of sighand is not supported yet
*/
static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp)
{
struct sighand_struct *sigh = current->sighand;
- if ((unshare_flags & CLONE_SIGHAND) &&
- (sigh && atomic_read(&sigh->count) > 1))
+ if ((unshare_flags & CLONE_SIGHAND) && atomic_read(&sigh->count) > 1)
return -EINVAL;
else
return 0;
@@ -1625,8 +1612,8 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
{
int err = 0;
struct fs_struct *fs, *new_fs = NULL;
- struct namespace *ns, *new_ns = NULL;
- struct sighand_struct *sigh, *new_sigh = NULL;
+ struct mnt_namespace *ns, *new_ns = NULL;
+ struct sighand_struct *new_sigh = NULL;
struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL;
struct files_struct *fd, *new_fd = NULL;
struct sem_undo_list *new_ulist = NULL;
@@ -1647,7 +1634,7 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
goto bad_unshare_out;
if ((err = unshare_fs(unshare_flags, &new_fs)))
goto bad_unshare_cleanup_thread;
- if ((err = unshare_namespace(unshare_flags, &new_ns, new_fs)))
+ if ((err = unshare_mnt_namespace(unshare_flags, &new_ns, new_fs)))
goto bad_unshare_cleanup_fs;
if ((err = unshare_sighand(unshare_flags, &new_sigh)))
goto bad_unshare_cleanup_ns;
@@ -1671,7 +1658,7 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
}
}
- if (new_fs || new_ns || new_sigh || new_mm || new_fd || new_ulist ||
+ if (new_fs || new_ns || new_mm || new_fd || new_ulist ||
new_uts || new_ipc) {
task_lock(current);
@@ -1688,17 +1675,11 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
}
if (new_ns) {
- ns = current->nsproxy->namespace;
- current->nsproxy->namespace = new_ns;
+ ns = current->nsproxy->mnt_ns;
+ current->nsproxy->mnt_ns = new_ns;
new_ns = ns;
}
- if (new_sigh) {
- sigh = current->sighand;
- rcu_assign_pointer(current->sighand, new_sigh);
- new_sigh = sigh;
- }
-
if (new_mm) {
mm = current->mm;
active_mm = current->active_mm;
@@ -1756,7 +1737,7 @@ bad_unshare_cleanup_sigh:
bad_unshare_cleanup_ns:
if (new_ns)
- put_namespace(new_ns);
+ put_mnt_ns(new_ns);
bad_unshare_cleanup_fs:
if (new_fs)
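
With max_fdset folded into max_fds, the bitmap copy in dup_fd() sizes everything from a single limit. A user-space model of that copy-then-zero arithmetic (hypothetical helper name; open_files is already rounded up to a word multiple by count_open_files()):

    #include <string.h>

    static void copy_fd_bits(unsigned long *dst, const unsigned long *src,
                             int open_files, int max_fds)
    {
            /* whole bytes covering the live descriptors */
            memcpy(dst, src, open_files / 8);
            if (max_fds > open_files) {
                    int left  = (max_fds - open_files) / 8;
                    int start = open_files / (8 * sizeof(unsigned long));
                    /* clear the remainder of the new, larger table */
                    memset(&dst[start], 0, left);
            }
    }
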
diff --git a/kernel/futex.c b/kernel/futex.c
index 95989a3b4168..5a737de857d3 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -166,7 +166,7 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2)
/*
* Get parameters which are the keys for a futex.
*
- * For shared mappings, it's (page->index, vma->vm_file->f_dentry->d_inode,
+ * For shared mappings, it's (page->index, vma->vm_file->f_path.dentry->d_inode,
* offset_within_page). For private mappings, it's (uaddr, current->mm).
* We can usually work out the index without swapping in the page.
*
@@ -223,7 +223,7 @@ static int get_futex_key(u32 __user *uaddr, union futex_key *key)
/*
* Linear file mappings are also simple.
*/
- key->shared.inode = vma->vm_file->f_dentry->d_inode;
+ key->shared.inode = vma->vm_file->f_path.dentry->d_inode;
key->both.offset++; /* Bit 0 of offset indicates inode-based key. */
if (likely(!(vma->vm_flags & VM_NONLINEAR))) {
key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT)
@@ -1528,9 +1528,9 @@ static int futex_fd(u32 __user *uaddr, int signal)
goto out;
}
filp->f_op = &futex_fops;
- filp->f_vfsmnt = mntget(futex_mnt);
- filp->f_dentry = dget(futex_mnt->mnt_root);
- filp->f_mapping = filp->f_dentry->d_inode->i_mapping;
+ filp->f_path.mnt = mntget(futex_mnt);
+ filp->f_path.dentry = dget(futex_mnt->mnt_root);
+ filp->f_mapping = filp->f_path.dentry->d_inode->i_mapping;
if (signal) {
err = __f_setown(filp, task_pid(current), PIDTYPE_PID, 1);
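
The comment in get_futex_key() describes two key flavours; a simplified model of that discriminated union (layout illustrative, not the exact kernel one — bit 0 of offset tells the two apart, as the hunk notes):

    union futex_key {
            struct {
                    unsigned long pgoff;    /* page->index */
                    void *inode;            /* f_path.dentry->d_inode */
                    int offset;             /* bit 0 set: inode-based */
            } shared;
            struct {
                    unsigned long address;  /* uaddr */
                    void *mm;               /* current->mm */
                    int offset;             /* bit 0 clear: mm-based */
            } private;
            struct {
                    unsigned long word;
                    void *ptr;
                    int offset;             /* common view for match_futex() */
            } both;
    };
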
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 9a352667007c..61f5c717a8f5 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -54,7 +54,8 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer,
unsigned int irq = (int)(long)data, full_count = count, err;
cpumask_t new_value, tmp;
- if (!irq_desc[irq].chip->set_affinity || no_irq_affinity)
+ if (!irq_desc[irq].chip->set_affinity || no_irq_affinity ||
+ CHECK_IRQ_PER_CPU(irq_desc[irq].status))
return -EIO;
err = cpumask_parse_user(buffer, count, new_value);
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index ab63cfc42992..6f294ff4f9ee 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -31,14 +31,14 @@
#endif
/* These will be re-linked against their real values during the second link stage */
-extern unsigned long kallsyms_addresses[] __attribute__((weak));
-extern unsigned long kallsyms_num_syms __attribute__((weak,section("data")));
-extern u8 kallsyms_names[] __attribute__((weak));
+extern const unsigned long kallsyms_addresses[] __attribute__((weak));
+extern const unsigned long kallsyms_num_syms __attribute__((weak));
+extern const u8 kallsyms_names[] __attribute__((weak));
-extern u8 kallsyms_token_table[] __attribute__((weak));
-extern u16 kallsyms_token_index[] __attribute__((weak));
+extern const u8 kallsyms_token_table[] __attribute__((weak));
+extern const u16 kallsyms_token_index[] __attribute__((weak));
-extern unsigned long kallsyms_markers[] __attribute__((weak));
+extern const unsigned long kallsyms_markers[] __attribute__((weak));
static inline int is_kernel_inittext(unsigned long addr)
{
@@ -84,7 +84,7 @@ static int is_ksym_addr(unsigned long addr)
static unsigned int kallsyms_expand_symbol(unsigned int off, char *result)
{
int len, skipped_first = 0;
- u8 *tptr, *data;
+ const u8 *tptr, *data;
/* get the compressed symbol length from the first symbol byte */
data = &kallsyms_names[off];
@@ -132,7 +132,7 @@ static char kallsyms_get_symbol_type(unsigned int off)
* kallsyms array */
static unsigned int get_symbol_offset(unsigned long pos)
{
- u8 *name;
+ const u8 *name;
int i;
/* use the closest marker we have. We have markers every 256 positions,
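
Const-qualifying the tables lets the linker place them in read-only data. For context, kallsyms_expand_symbol() walks kallsyms_names and expands each byte through the token table; a toy user-space model of that expansion, with made-up tokens:

    #include <stdio.h>
    #include <string.h>

    static const char *const token[256] = {
            /* real kernels generate all 256 entries at build time */
            ['r'] = "register_", ['i'] = "irq_", ['h'] = "handler",
    };

    static void expand(const unsigned char *stream, int len, char *out)
    {
            char one[2] = { 0, 0 };
            out[0] = '\0';
            for (int i = 0; i < len; i++) {
                    if (token[stream[i]])
                            strcat(out, token[stream[i]]);  /* token hit */
                    else {
                            one[0] = (char)stream[i];       /* literal byte */
                            strcat(out, one);
                    }
            }
    }

    int main(void)
    {
            char buf[64];
            expand((const unsigned char *)"rih", 3, buf);
            printf("%s\n", buf);    /* register_irq_handler */
            return 0;
    }
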
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 8d2bea09a4ec..3a7379aa31ca 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -25,7 +25,7 @@
#include <linux/kmod.h>
#include <linux/smp_lock.h>
#include <linux/slab.h>
-#include <linux/namespace.h>
+#include <linux/mnt_namespace.h>
#include <linux/completion.h>
#include <linux/file.h>
#include <linux/workqueue.h>
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 8c71cf72a497..e7cbbb82765b 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -206,6 +206,15 @@ mutex_lock_nested(struct mutex *lock, unsigned int subclass)
}
EXPORT_SYMBOL_GPL(mutex_lock_nested);
+
+int __sched
+mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass)
+{
+ might_sleep();
+ return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, subclass);
+}
+
+EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested);
#endif
/*
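
The new mutex_lock_interruptible_nested() pairs the interruptible slow path with a lockdep subclass annotation. A hedged usage sketch, taking two locks of the same class (SINGLE_DEPTH_NESTING is the usual subclass constant):

    static int lock_pair(struct mutex *a, struct mutex *b)
    {
            int err = mutex_lock_interruptible(a);
            if (err)
                    return err;
            /* same lock class again: tell lockdep this nesting is intended */
            err = mutex_lock_interruptible_nested(b, SINGLE_DEPTH_NESTING);
            if (err) {
                    mutex_unlock(a);
                    return err;
            }
            /* ... critical section over both objects ... */
            mutex_unlock(b);
            mutex_unlock(a);
            return 0;
    }
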
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 674aceb7335a..e2ce748e96af 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -17,8 +17,9 @@
#include <linux/version.h>
#include <linux/nsproxy.h>
#include <linux/init_task.h>
-#include <linux/namespace.h>
+#include <linux/mnt_namespace.h>
#include <linux/utsname.h>
+#include <linux/pid_namespace.h>
struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy);
@@ -45,8 +46,10 @@ static inline struct nsproxy *clone_namespaces(struct nsproxy *orig)
struct nsproxy *ns;
ns = kmemdup(orig, sizeof(struct nsproxy), GFP_KERNEL);
- if (ns)
+ if (ns) {
atomic_set(&ns->count, 1);
+ ns->id = -1;
+ }
return ns;
}
@@ -60,12 +63,14 @@ struct nsproxy *dup_namespaces(struct nsproxy *orig)
struct nsproxy *ns = clone_namespaces(orig);
if (ns) {
- if (ns->namespace)
- get_namespace(ns->namespace);
+ if (ns->mnt_ns)
+ get_mnt_ns(ns->mnt_ns);
if (ns->uts_ns)
get_uts_ns(ns->uts_ns);
if (ns->ipc_ns)
get_ipc_ns(ns->ipc_ns);
+ if (ns->pid_ns)
+ get_pid_ns(ns->pid_ns);
}
return ns;
@@ -97,7 +102,7 @@ int copy_namespaces(int flags, struct task_struct *tsk)
tsk->nsproxy = new_ns;
- err = copy_namespace(flags, tsk);
+ err = copy_mnt_ns(flags, tsk);
if (err)
goto out_ns;
@@ -109,16 +114,23 @@ int copy_namespaces(int flags, struct task_struct *tsk)
if (err)
goto out_ipc;
+ err = copy_pid_ns(flags, tsk);
+ if (err)
+ goto out_pid;
+
out:
put_nsproxy(old_ns);
return err;
+out_pid:
+ if (new_ns->ipc_ns)
+ put_ipc_ns(new_ns->ipc_ns);
out_ipc:
if (new_ns->uts_ns)
put_uts_ns(new_ns->uts_ns);
out_uts:
- if (new_ns->namespace)
- put_namespace(new_ns->namespace);
+ if (new_ns->mnt_ns)
+ put_mnt_ns(new_ns->mnt_ns);
out_ns:
tsk->nsproxy = old_ns;
kfree(new_ns);
@@ -127,11 +139,13 @@ out_ns:
void free_nsproxy(struct nsproxy *ns)
{
- if (ns->namespace)
- put_namespace(ns->namespace);
- if (ns->uts_ns)
- put_uts_ns(ns->uts_ns);
- if (ns->ipc_ns)
- put_ipc_ns(ns->ipc_ns);
- kfree(ns);
+ if (ns->mnt_ns)
+ put_mnt_ns(ns->mnt_ns);
+ if (ns->uts_ns)
+ put_uts_ns(ns->uts_ns);
+ if (ns->ipc_ns)
+ put_ipc_ns(ns->ipc_ns);
+ if (ns->pid_ns)
+ put_pid_ns(ns->pid_ns);
+ kfree(ns);
}
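
copy_namespaces() takes one reference per constituent namespace on the way in, and free_nsproxy() drops one per member on the way out; the error unwinding above (out_pid/out_ipc/out_uts/out_ns) releases exactly the references taken so far. A toy model of the per-member counting (names simplified; the kernel uses atomic counters and per-type get/put helpers):

    #include <stdlib.h>

    struct ns { int count; };

    static struct ns *ns_get(struct ns *n)
    {
            if (n)
                    n->count++;             /* mirror of get_*_ns() */
            return n;
    }

    static void ns_put(struct ns *n)
    {
            if (n && --n->count == 0)
                    free(n);                /* mirror of put_*_ns() */
    }

    struct proxy { struct ns *mnt, *uts, *ipc, *pid; };

    static void proxy_free(struct proxy *p)
    {
            ns_put(p->mnt);
            ns_put(p->uts);
            ns_put(p->ipc);
            ns_put(p->pid);
            free(p);
    }
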
diff --git a/kernel/pid.c b/kernel/pid.c
index a48879b0b921..2efe9d8d367b 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -26,7 +26,7 @@
#include <linux/init.h>
#include <linux/bootmem.h>
#include <linux/hash.h>
-#include <linux/pspace.h>
+#include <linux/pid_namespace.h>
#define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift)
static struct hlist_head *pid_hash;
@@ -43,9 +43,10 @@ int pid_max_max = PID_MAX_LIMIT;
#define BITS_PER_PAGE (PAGE_SIZE*8)
#define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1)
-static inline int mk_pid(struct pspace *pspace, struct pidmap *map, int off)
+static inline int mk_pid(struct pid_namespace *pid_ns,
+ struct pidmap *map, int off)
{
- return (map - pspace->pidmap)*BITS_PER_PAGE + off;
+ return (map - pid_ns->pidmap)*BITS_PER_PAGE + off;
}
#define find_next_offset(map, off) \
@@ -57,11 +58,15 @@ static inline int mk_pid(struct pspace *pspace, struct pidmap *map, int off)
* value does not cause lots of bitmaps to be allocated, but
* the scheme scales to up to 4 million PIDs, runtime.
*/
-struct pspace init_pspace = {
+struct pid_namespace init_pid_ns = {
+ .kref = {
+ .refcount = ATOMIC_INIT(2),
+ },
.pidmap = {
[ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL }
},
- .last_pid = 0
+ .last_pid = 0,
+ .child_reaper = &init_task
};
/*
@@ -80,25 +85,25 @@ struct pspace init_pspace = {
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
-static fastcall void free_pidmap(struct pspace *pspace, int pid)
+static fastcall void free_pidmap(struct pid_namespace *pid_ns, int pid)
{
- struct pidmap *map = pspace->pidmap + pid / BITS_PER_PAGE;
+ struct pidmap *map = pid_ns->pidmap + pid / BITS_PER_PAGE;
int offset = pid & BITS_PER_PAGE_MASK;
clear_bit(offset, map->page);
atomic_inc(&map->nr_free);
}
-static int alloc_pidmap(struct pspace *pspace)
+static int alloc_pidmap(struct pid_namespace *pid_ns)
{
- int i, offset, max_scan, pid, last = pspace->last_pid;
+ int i, offset, max_scan, pid, last = pid_ns->last_pid;
struct pidmap *map;
pid = last + 1;
if (pid >= pid_max)
pid = RESERVED_PIDS;
offset = pid & BITS_PER_PAGE_MASK;
- map = &pspace->pidmap[pid/BITS_PER_PAGE];
+ map = &pid_ns->pidmap[pid/BITS_PER_PAGE];
max_scan = (pid_max + BITS_PER_PAGE - 1)/BITS_PER_PAGE - !offset;
for (i = 0; i <= max_scan; ++i) {
if (unlikely(!map->page)) {
@@ -120,11 +125,11 @@ static int alloc_pidmap(struct pspace *pspace)
do {
if (!test_and_set_bit(offset, map->page)) {
atomic_dec(&map->nr_free);
- pspace->last_pid = pid;
+ pid_ns->last_pid = pid;
return pid;
}
offset = find_next_offset(map, offset);
- pid = mk_pid(pspace, map, offset);
+ pid = mk_pid(pid_ns, map, offset);
/*
* find_next_offset() found a bit, the pid from it
* is in-bounds, and if we fell back to the last
@@ -135,34 +140,34 @@ static int alloc_pidmap(struct pspace *pspace)
(i != max_scan || pid < last ||
!((last+1) & BITS_PER_PAGE_MASK)));
}
- if (map < &pspace->pidmap[(pid_max-1)/BITS_PER_PAGE]) {
+ if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) {
++map;
offset = 0;
} else {
- map = &pspace->pidmap[0];
+ map = &pid_ns->pidmap[0];
offset = RESERVED_PIDS;
if (unlikely(last == offset))
break;
}
- pid = mk_pid(pspace, map, offset);
+ pid = mk_pid(pid_ns, map, offset);
}
return -1;
}
-static int next_pidmap(struct pspace *pspace, int last)
+static int next_pidmap(struct pid_namespace *pid_ns, int last)
{
int offset;
struct pidmap *map, *end;
offset = (last + 1) & BITS_PER_PAGE_MASK;
- map = &pspace->pidmap[(last + 1)/BITS_PER_PAGE];
- end = &pspace->pidmap[PIDMAP_ENTRIES];
+ map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE];
+ end = &pid_ns->pidmap[PIDMAP_ENTRIES];
for (; map < end; map++, offset = 0) {
if (unlikely(!map->page))
continue;
offset = find_next_bit((map)->page, BITS_PER_PAGE, offset);
if (offset < BITS_PER_PAGE)
- return mk_pid(pspace, map, offset);
+ return mk_pid(pid_ns, map, offset);
}
return -1;
}
@@ -192,7 +197,7 @@ fastcall void free_pid(struct pid *pid)
hlist_del_rcu(&pid->pid_chain);
spin_unlock_irqrestore(&pidmap_lock, flags);
- free_pidmap(&init_pspace, pid->nr);
+ free_pidmap(current->nsproxy->pid_ns, pid->nr);
call_rcu(&pid->rcu, delayed_put_pid);
}
@@ -206,7 +211,7 @@ struct pid *alloc_pid(void)
if (!pid)
goto out;
- nr = alloc_pidmap(&init_pspace);
+ nr = alloc_pidmap(current->nsproxy->pid_ns);
if (nr < 0)
goto out_free;
@@ -348,13 +353,33 @@ struct pid *find_ge_pid(int nr)
pid = find_pid(nr);
if (pid)
break;
- nr = next_pidmap(&init_pspace, nr);
+ nr = next_pidmap(current->nsproxy->pid_ns, nr);
} while (nr > 0);
return pid;
}
EXPORT_SYMBOL_GPL(find_get_pid);
+int copy_pid_ns(int flags, struct task_struct *tsk)
+{
+ struct pid_namespace *old_ns = tsk->nsproxy->pid_ns;
+ int err = 0;
+
+ if (!old_ns)
+ return 0;
+
+ get_pid_ns(old_ns);
+ return err;
+}
+
+void free_pid_ns(struct kref *kref)
+{
+ struct pid_namespace *ns;
+
+ ns = container_of(kref, struct pid_namespace, kref);
+ kfree(ns);
+}
+
/*
* The pid hash table is scaled according to the amount of memory in the
* machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or
@@ -382,10 +407,10 @@ void __init pidhash_init(void)
void __init pidmap_init(void)
{
- init_pspace.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
+ init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
/* Reserve PID 0. We never call free_pidmap(0) */
- set_bit(0, init_pspace.pidmap[0].page);
- atomic_dec(&init_pspace.pidmap[0].nr_free);
+ set_bit(0, init_pid_ns.pidmap[0].page);
+ atomic_dec(&init_pid_ns.pidmap[0].nr_free);
pid_cachep = kmem_cache_create("pid", sizeof(struct pid),
__alignof__(struct pid),
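
The pidmap arithmetic above maps a pid to a (bitmap page, bit offset) pair and back. A small user-space check of that round trip (page size picked arbitrarily here):

    #include <stdio.h>

    #define BITS_PER_PAGE       (4096 * 8)
    #define BITS_PER_PAGE_MASK  (BITS_PER_PAGE - 1)

    struct pidmap { unsigned long *page; };

    static int mk_pid(const struct pidmap *base,
                      const struct pidmap *map, int off)
    {
            return (int)(map - base) * BITS_PER_PAGE + off;
    }

    int main(void)
    {
            struct pidmap maps[4] = {{ 0 }};
            int pid = 40000;
            int idx = pid / BITS_PER_PAGE;          /* which bitmap page */
            int off = pid & BITS_PER_PAGE_MASK;     /* bit within the page */

            printf("pid %d -> map %d bit %d -> %d\n",
                   pid, idx, off, mk_pid(maps, &maps[idx], off));
            return 0;
    }
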
diff --git a/kernel/relay.c b/kernel/relay.c
index 75a3a9a7efc2..818e514729cf 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -959,7 +959,7 @@ static inline ssize_t relay_file_read_subbufs(struct file *filp,
if (!desc->count)
return 0;
- mutex_lock(&filp->f_dentry->d_inode->i_mutex);
+ mutex_lock(&filp->f_path.dentry->d_inode->i_mutex);
do {
if (!relay_file_read_avail(buf, *ppos))
break;
@@ -979,7 +979,7 @@ static inline ssize_t relay_file_read_subbufs(struct file *filp,
*ppos = relay_file_read_end_pos(buf, read_start, ret);
}
} while (desc->count && ret);
- mutex_unlock(&filp->f_dentry->d_inode->i_mutex);
+ mutex_unlock(&filp->f_path.dentry->d_inode->i_mutex);
return desc->written;
}
diff --git a/kernel/sched.c b/kernel/sched.c
index f385eff4682d..8a0afb97af71 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -225,8 +225,10 @@ struct rq {
unsigned long nr_uninterruptible;
unsigned long expired_timestamp;
- unsigned long long timestamp_last_tick;
+ /* Cached timestamp set by update_cpu_clock() */
+ unsigned long long most_recent_timestamp;
struct task_struct *curr, *idle;
+ unsigned long next_balance;
struct mm_struct *prev_mm;
struct prio_array *active, *expired, arrays[2];
int best_expired_prio;
@@ -426,7 +428,7 @@ static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
* bump this up when changing the output format or the meaning of an existing
* format, so that tools can adapt (or abort)
*/
-#define SCHEDSTAT_VERSION 12
+#define SCHEDSTAT_VERSION 14
static int show_schedstat(struct seq_file *seq, void *v)
{
@@ -464,7 +466,8 @@ static int show_schedstat(struct seq_file *seq, void *v)
seq_printf(seq, "domain%d %s", dcnt++, mask_str);
for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES;
itype++) {
- seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu",
+ seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu "
+ "%lu",
sd->lb_cnt[itype],
sd->lb_balanced[itype],
sd->lb_failed[itype],
@@ -474,11 +477,13 @@ static int show_schedstat(struct seq_file *seq, void *v)
sd->lb_nobusyq[itype],
sd->lb_nobusyg[itype]);
}
- seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
+ seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu"
+ " %lu %lu %lu\n",
sd->alb_cnt, sd->alb_failed, sd->alb_pushed,
sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed,
sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed,
- sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance);
+ sd->ttwu_wake_remote, sd->ttwu_move_affine,
+ sd->ttwu_move_balance);
}
preempt_enable();
#endif
@@ -547,7 +552,7 @@ rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies)
#endif
/*
- * rq_lock - lock a given runqueue and disable interrupts.
+ * this_rq_lock - lock this runqueue and disable interrupts.
*/
static inline struct rq *this_rq_lock(void)
__acquires(rq->lock)
@@ -938,13 +943,16 @@ static void activate_task(struct task_struct *p, struct rq *rq, int local)
{
unsigned long long now;
+ if (rt_task(p))
+ goto out;
+
now = sched_clock();
#ifdef CONFIG_SMP
if (!local) {
/* Compensate for drifting sched_clock */
struct rq *this_rq = this_rq();
- now = (now - this_rq->timestamp_last_tick)
- + rq->timestamp_last_tick;
+ now = (now - this_rq->most_recent_timestamp)
+ + rq->most_recent_timestamp;
}
#endif
@@ -959,8 +967,7 @@ static void activate_task(struct task_struct *p, struct rq *rq, int local)
(now - p->timestamp) >> 20);
}
- if (!rt_task(p))
- p->prio = recalc_task_prio(p, now);
+ p->prio = recalc_task_prio(p, now);
/*
* This checks to make sure it's not an uninterruptible task
@@ -985,7 +992,7 @@ static void activate_task(struct task_struct *p, struct rq *rq, int local)
}
}
p->timestamp = now;
-
+out:
__activate_task(p, rq);
}
@@ -1450,7 +1457,9 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
if (this_sd->flags & SD_WAKE_AFFINE) {
unsigned long tl = this_load;
- unsigned long tl_per_task = cpu_avg_load_per_task(this_cpu);
+ unsigned long tl_per_task;
+
+ tl_per_task = cpu_avg_load_per_task(this_cpu);
/*
* If sync wakeup then subtract the (maximum possible)
@@ -1688,8 +1697,8 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
* Not the local CPU - must adjust timestamp. This should
* get optimised away in the !CONFIG_SMP case.
*/
- p->timestamp = (p->timestamp - this_rq->timestamp_last_tick)
- + rq->timestamp_last_tick;
+ p->timestamp = (p->timestamp - this_rq->most_recent_timestamp)
+ + rq->most_recent_timestamp;
__activate_task(p, rq);
if (TASK_PREEMPTS_CURR(p, rq))
resched_task(rq->curr);
@@ -1952,6 +1961,7 @@ static void double_rq_lock(struct rq *rq1, struct rq *rq2)
__acquires(rq1->lock)
__acquires(rq2->lock)
{
+ BUG_ON(!irqs_disabled());
if (rq1 == rq2) {
spin_lock(&rq1->lock);
__acquire(rq2->lock); /* Fake it out ;) */
@@ -1991,6 +2001,11 @@ static void double_lock_balance(struct rq *this_rq, struct rq *busiest)
__acquires(busiest->lock)
__acquires(this_rq->lock)
{
+ if (unlikely(!irqs_disabled())) {
+ /* printk() doesn't work well under rq->lock */
+ spin_unlock(&this_rq->lock);
+ BUG_ON(1);
+ }
if (unlikely(!spin_trylock(&busiest->lock))) {
if (busiest < this_rq) {
spin_unlock(&this_rq->lock);
@@ -2061,8 +2076,8 @@ static void pull_task(struct rq *src_rq, struct prio_array *src_array,
set_task_cpu(p, this_cpu);
inc_nr_running(p, this_rq);
enqueue_task(p, this_array);
- p->timestamp = (p->timestamp - src_rq->timestamp_last_tick)
- + this_rq->timestamp_last_tick;
+ p->timestamp = (p->timestamp - src_rq->most_recent_timestamp)
+ + this_rq->most_recent_timestamp;
/*
* Note that idle threads have a prio of MAX_PRIO, for this test
* to be always true for them.
@@ -2098,10 +2113,15 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
* 2) too many balance attempts have failed.
*/
- if (sd->nr_balance_failed > sd->cache_nice_tries)
+ if (sd->nr_balance_failed > sd->cache_nice_tries) {
+#ifdef CONFIG_SCHEDSTATS
+ if (task_hot(p, rq->most_recent_timestamp, sd))
+ schedstat_inc(sd, lb_hot_gained[idle]);
+#endif
return 1;
+ }
- if (task_hot(p, rq->timestamp_last_tick, sd))
+ if (task_hot(p, rq->most_recent_timestamp, sd))
return 0;
return 1;
}
@@ -2199,11 +2219,6 @@ skip_queue:
goto skip_bitmap;
}
-#ifdef CONFIG_SCHEDSTATS
- if (task_hot(tmp, busiest->timestamp_last_tick, sd))
- schedstat_inc(sd, lb_hot_gained[idle]);
-#endif
-
pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
pulled++;
rem_load_move -= tmp->load_weight;
@@ -2241,7 +2256,7 @@ out:
static struct sched_group *
find_busiest_group(struct sched_domain *sd, int this_cpu,
unsigned long *imbalance, enum idle_type idle, int *sd_idle,
- cpumask_t *cpus)
+ cpumask_t *cpus, int *balance)
{
struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
unsigned long max_load, avg_load, total_load, this_load, total_pwr;
@@ -2270,10 +2285,14 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
unsigned long load, group_capacity;
int local_group;
int i;
+ unsigned int balance_cpu = -1, first_idle_cpu = 0;
unsigned long sum_nr_running, sum_weighted_load;
local_group = cpu_isset(this_cpu, group->cpumask);
+ if (local_group)
+ balance_cpu = first_cpu(group->cpumask);
+
/* Tally up the load of all CPUs in the group */
sum_weighted_load = sum_nr_running = avg_load = 0;
@@ -2289,9 +2308,14 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
*sd_idle = 0;
/* Bias balancing toward cpus of our domain */
- if (local_group)
+ if (local_group) {
+ if (idle_cpu(i) && !first_idle_cpu) {
+ first_idle_cpu = 1;
+ balance_cpu = i;
+ }
+
load = target_load(i, load_idx);
- else
+ } else
load = source_load(i, load_idx);
avg_load += load;
@@ -2299,6 +2323,16 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
sum_weighted_load += rq->raw_weighted_load;
}
+ /*
+ * First idle cpu or the first cpu(busiest) in this sched group
+ * is eligible for doing load balancing at this and above
+ * domains.
+ */
+ if (local_group && balance_cpu != this_cpu && balance) {
+ *balance = 0;
+ goto ret;
+ }
+
total_load += avg_load;
total_pwr += group->cpu_power;
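
The balance_cpu election above restricts which CPU of the local group may run the balancing pass: the first idle CPU if there is one, otherwise the group's first CPU. A toy model of that choice:

    /* cpus[]: the group's members in order; is_idle[]: indexed by cpu id */
    static int elect_balance_cpu(const int *cpus, int n, const int *is_idle)
    {
            for (int i = 0; i < n; i++)
                    if (is_idle[cpus[i]])
                            return cpus[i];     /* first idle CPU wins */
            return cpus[0];                     /* else the group's first */
    }
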
@@ -2458,18 +2492,21 @@ small_imbalance:
pwr_now /= SCHED_LOAD_SCALE;
/* Amount of load we'd subtract */
- tmp = busiest_load_per_task*SCHED_LOAD_SCALE/busiest->cpu_power;
+ tmp = busiest_load_per_task * SCHED_LOAD_SCALE /
+ busiest->cpu_power;
if (max_load > tmp)
pwr_move += busiest->cpu_power *
min(busiest_load_per_task, max_load - tmp);
/* Amount of load we'd add */
- if (max_load*busiest->cpu_power <
- busiest_load_per_task*SCHED_LOAD_SCALE)
- tmp = max_load*busiest->cpu_power/this->cpu_power;
+ if (max_load * busiest->cpu_power <
+ busiest_load_per_task * SCHED_LOAD_SCALE)
+ tmp = max_load * busiest->cpu_power / this->cpu_power;
else
- tmp = busiest_load_per_task*SCHED_LOAD_SCALE/this->cpu_power;
- pwr_move += this->cpu_power*min(this_load_per_task, this_load + tmp);
+ tmp = busiest_load_per_task * SCHED_LOAD_SCALE /
+ this->cpu_power;
+ pwr_move += this->cpu_power *
+ min(this_load_per_task, this_load + tmp);
pwr_move /= SCHED_LOAD_SCALE;
/* Move if we gain throughput */
@@ -2490,8 +2527,8 @@ out_balanced:
*imbalance = min_load_per_task;
return group_min;
}
-ret:
#endif
+ret:
*imbalance = 0;
return NULL;
}
@@ -2540,17 +2577,17 @@ static inline unsigned long minus_1_or_zero(unsigned long n)
/*
* Check this_cpu to ensure it is balanced within domain. Attempt to move
* tasks if there is an imbalance.
- *
- * Called with this_rq unlocked.
*/
static int load_balance(int this_cpu, struct rq *this_rq,
- struct sched_domain *sd, enum idle_type idle)
+ struct sched_domain *sd, enum idle_type idle,
+ int *balance)
{
int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
struct sched_group *group;
unsigned long imbalance;
struct rq *busiest;
cpumask_t cpus = CPU_MASK_ALL;
+ unsigned long flags;
/*
* When power savings policy is enabled for the parent domain, idle
@@ -2566,7 +2603,11 @@ static int load_balance(int this_cpu, struct rq *this_rq,
redo:
group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
- &cpus);
+ &cpus, balance);
+
+ if (*balance == 0)
+ goto out_balanced;
+
if (!group) {
schedstat_inc(sd, lb_nobusyg[idle]);
goto out_balanced;
@@ -2590,11 +2631,13 @@ redo:
* still unbalanced. nr_moved simply stays zero, so it is
* correctly treated as an imbalance.
*/
+ local_irq_save(flags);
double_rq_lock(this_rq, busiest);
nr_moved = move_tasks(this_rq, this_cpu, busiest,
minus_1_or_zero(busiest->nr_running),
imbalance, sd, idle, &all_pinned);
double_rq_unlock(this_rq, busiest);
+ local_irq_restore(flags);
/* All tasks on this runqueue were pinned by CPU affinity */
if (unlikely(all_pinned)) {
@@ -2611,13 +2654,13 @@ redo:
if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
- spin_lock(&busiest->lock);
+ spin_lock_irqsave(&busiest->lock, flags);
/* don't kick the migration_thread, if the curr
* task on busiest cpu can't be moved to this_cpu
*/
if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
- spin_unlock(&busiest->lock);
+ spin_unlock_irqrestore(&busiest->lock, flags);
all_pinned = 1;
goto out_one_pinned;
}
@@ -2627,7 +2670,7 @@ redo:
busiest->push_cpu = this_cpu;
active_balance = 1;
}
- spin_unlock(&busiest->lock);
+ spin_unlock_irqrestore(&busiest->lock, flags);
if (active_balance)
wake_up_process(busiest->migration_thread);
@@ -2706,7 +2749,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
redo:
group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE,
- &sd_idle, &cpus);
+ &sd_idle, &cpus, NULL);
if (!group) {
schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
goto out_balanced;
@@ -2766,14 +2809,28 @@ out_balanced:
static void idle_balance(int this_cpu, struct rq *this_rq)
{
struct sched_domain *sd;
+ int pulled_task = 0;
+ unsigned long next_balance = jiffies + 60 * HZ;
for_each_domain(this_cpu, sd) {
if (sd->flags & SD_BALANCE_NEWIDLE) {
/* If we've pulled tasks over stop searching: */
- if (load_balance_newidle(this_cpu, this_rq, sd))
+ pulled_task = load_balance_newidle(this_cpu,
+ this_rq, sd);
+ if (time_after(next_balance,
+ sd->last_balance + sd->balance_interval))
+ next_balance = sd->last_balance
+ + sd->balance_interval;
+ if (pulled_task)
break;
}
}
+ if (!pulled_task)
+ /*
+ * We are going idle. next_balance may be set based on
+ * a busy processor. So reset next_balance.
+ */
+ this_rq->next_balance = next_balance;
}
/*
@@ -2826,26 +2883,9 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
spin_unlock(&target_rq->lock);
}
-/*
- * rebalance_tick will get called every timer tick, on every CPU.
- *
- * It checks each scheduling domain to see if it is due to be balanced,
- * and initiates a balancing operation if so.
- *
- * Balancing parameters are set up in arch_init_sched_domains.
- */
-
-/* Don't have all balancing operations going off at once: */
-static inline unsigned long cpu_offset(int cpu)
+static void update_load(struct rq *this_rq)
{
- return jiffies + cpu * HZ / NR_CPUS;
-}
-
-static void
-rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle)
-{
- unsigned long this_load, interval, j = cpu_offset(this_cpu);
- struct sched_domain *sd;
+ unsigned long this_load;
int i, scale;
this_load = this_rq->raw_weighted_load;
@@ -2865,6 +2905,32 @@ rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle)
new_load += scale-1;
this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale;
}
+}
+
+/*
+ * run_rebalance_domains is triggered when needed from the scheduler tick.
+ *
+ * It checks each scheduling domain to see if it is due to be balanced,
+ * and initiates a balancing operation if so.
+ *
+ * Balancing parameters are set up in arch_init_sched_domains.
+ */
+static DEFINE_SPINLOCK(balancing);
+
+static void run_rebalance_domains(struct softirq_action *h)
+{
+ int this_cpu = smp_processor_id(), balance = 1;
+ struct rq *this_rq = cpu_rq(this_cpu);
+ unsigned long interval;
+ struct sched_domain *sd;
+ /*
+ * We are idle if there are no processes running. This
+ * is valid even if we are the idle process (SMT).
+ */
+ enum idle_type idle = !this_rq->nr_running ?
+ SCHED_IDLE : NOT_IDLE;
+ /* Earliest time when we have to call run_rebalance_domains again */
+ unsigned long next_balance = jiffies + 60*HZ;
for_each_domain(this_cpu, sd) {
if (!(sd->flags & SD_LOAD_BALANCE))
@@ -2879,8 +2945,13 @@ rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle)
if (unlikely(!interval))
interval = 1;
- if (j - sd->last_balance >= interval) {
- if (load_balance(this_cpu, this_rq, sd, idle)) {
+ if (sd->flags & SD_SERIALIZE) {
+ if (!spin_trylock(&balancing))
+ goto out;
+ }
+
+ if (time_after_eq(jiffies, sd->last_balance + interval)) {
+ if (load_balance(this_cpu, this_rq, sd, idle, &balance)) {
/*
* We've pulled tasks over so either we're no
* longer idle, or one of our SMT siblings is
@@ -2888,39 +2959,48 @@ rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle)
*/
idle = NOT_IDLE;
}
- sd->last_balance += interval;
+ sd->last_balance = jiffies;
}
+ if (sd->flags & SD_SERIALIZE)
+ spin_unlock(&balancing);
+out:
+ if (time_after(next_balance, sd->last_balance + interval))
+ next_balance = sd->last_balance + interval;
+
+ /*
+ * Stop the load balance at this level. There is another
+ * CPU in our sched group which is doing load balancing more
+ * actively.
+ */
+ if (!balance)
+ break;
}
+ this_rq->next_balance = next_balance;
}
#else
/*
* on UP we do not need to balance between CPUs:
*/
-static inline void rebalance_tick(int cpu, struct rq *rq, enum idle_type idle)
-{
-}
static inline void idle_balance(int cpu, struct rq *rq)
{
}
#endif
-static inline int wake_priority_sleeper(struct rq *rq)
+static inline void wake_priority_sleeper(struct rq *rq)
{
- int ret = 0;
-
#ifdef CONFIG_SCHED_SMT
+ if (!rq->nr_running)
+ return;
+
spin_lock(&rq->lock);
/*
* If an SMT sibling task has been put to sleep for priority
* reasons reschedule the idle task to see if it can now run.
*/
- if (rq->nr_running) {
+ if (rq->nr_running)
resched_task(rq->idle);
- ret = 1;
- }
spin_unlock(&rq->lock);
#endif
- return ret;
}
DEFINE_PER_CPU(struct kernel_stat, kstat);
@@ -2934,7 +3014,8 @@ EXPORT_PER_CPU_SYMBOL(kstat);
static inline void
update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now)
{
- p->sched_time += now - max(p->timestamp, rq->timestamp_last_tick);
+ p->sched_time += now - p->last_ran;
+ p->last_ran = rq->most_recent_timestamp = now;
}
/*
@@ -2947,8 +3028,7 @@ unsigned long long current_sched_time(const struct task_struct *p)
unsigned long flags;
local_irq_save(flags);
- ns = max(p->timestamp, task_rq(p)->timestamp_last_tick);
- ns = p->sched_time + sched_clock() - ns;
+ ns = p->sched_time + sched_clock() - p->last_ran;
local_irq_restore(flags);
return ns;
@@ -3048,35 +3128,12 @@ void account_steal_time(struct task_struct *p, cputime_t steal)
cpustat->steal = cputime64_add(cpustat->steal, tmp);
}
-/*
- * This function gets called by the timer code, with HZ frequency.
- * We call it with interrupts disabled.
- *
- * It also gets called by the fork code, when changing the parent's
- * timeslices.
- */
-void scheduler_tick(void)
+static void task_running_tick(struct rq *rq, struct task_struct *p)
{
- unsigned long long now = sched_clock();
- struct task_struct *p = current;
- int cpu = smp_processor_id();
- struct rq *rq = cpu_rq(cpu);
-
- update_cpu_clock(p, rq, now);
-
- rq->timestamp_last_tick = now;
-
- if (p == rq->idle) {
- if (wake_priority_sleeper(rq))
- goto out;
- rebalance_tick(cpu, rq, SCHED_IDLE);
- return;
- }
-
- /* Task might have expired already, but not scheduled off yet */
if (p->array != rq->active) {
+ /* Task has expired but was not scheduled yet */
set_tsk_need_resched(p);
- goto out;
+ return;
}
spin_lock(&rq->lock);
/*
@@ -3144,8 +3201,34 @@ void scheduler_tick(void)
}
out_unlock:
spin_unlock(&rq->lock);
-out:
- rebalance_tick(cpu, rq, NOT_IDLE);
+}
+
+/*
+ * This function gets called by the timer code, with HZ frequency.
+ * We call it with interrupts disabled.
+ *
+ * It also gets called by the fork code, when changing the parent's
+ * timeslices.
+ */
+void scheduler_tick(void)
+{
+ unsigned long long now = sched_clock();
+ struct task_struct *p = current;
+ int cpu = smp_processor_id();
+ struct rq *rq = cpu_rq(cpu);
+
+ update_cpu_clock(p, rq, now);
+
+ if (p == rq->idle)
+ /* Task on the idle queue */
+ wake_priority_sleeper(rq);
+ else
+ task_running_tick(rq, p);
+#ifdef CONFIG_SMP
+ update_load(rq);
+ if (time_after_eq(jiffies, rq->next_balance))
+ raise_softirq(SCHED_SOFTIRQ);
+#endif
}
#ifdef CONFIG_SCHED_SMT
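
After this hunk, scheduler_tick() only refreshes the load estimate and raises SCHED_SOFTIRQ when next_balance is due; run_rebalance_domains() then does the heavy domain walk in softirq context. The open_softirq() hookup is not visible in this diff — a hedged sketch of what it presumably looks like in sched_init():

    /* assumed registration; not shown in these hunks */
    open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL);
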
@@ -3291,7 +3374,8 @@ void fastcall add_preempt_count(int val)
/*
* Spinlock count overflowing soon?
*/
- DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10);
+ DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
+ PREEMPT_MASK - 10);
}
EXPORT_SYMBOL(add_preempt_count);
@@ -4990,8 +5074,8 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
* afterwards, and pretending it was a local activate.
* This way is cleaner and logically correct.
*/
- p->timestamp = p->timestamp - rq_src->timestamp_last_tick
- + rq_dest->timestamp_last_tick;
+ p->timestamp = p->timestamp - rq_src->most_recent_timestamp
+ + rq_dest->most_recent_timestamp;
deactivate_task(p, rq_src);
__activate_task(p, rq_dest);
if (TASK_PREEMPTS_CURR(p, rq_dest))
@@ -5067,7 +5151,10 @@ wait_to_die:
}
#ifdef CONFIG_HOTPLUG_CPU
-/* Figure out where task on dead CPU should go, use force if neccessary. */
+/*
+ * Figure out where task on dead CPU should go, use force if necessary.
+ * NOTE: interrupts should be disabled by the caller
+ */
static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
{
unsigned long flags;
@@ -5187,6 +5274,7 @@ void idle_task_exit(void)
mmdrop(mm);
}
+/* called under rq->lock with disabled interrupts */
static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
{
struct rq *rq = cpu_rq(dead_cpu);
@@ -5203,10 +5291,11 @@ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
* Drop lock around migration; if someone else moves it,
* that's OK. No task can be added to this CPU, so iteration is
* fine.
+ * NOTE: interrupts should be left disabled --dev@
*/
- spin_unlock_irq(&rq->lock);
+ spin_unlock(&rq->lock);
move_task_off_dead_cpu(dead_cpu, p);
- spin_lock_irq(&rq->lock);
+ spin_lock(&rq->lock);
put_task_struct(p);
}
@@ -5359,16 +5448,19 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
if (!(sd->flags & SD_LOAD_BALANCE)) {
printk("does not load-balance\n");
if (sd->parent)
- printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent");
+ printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
+ " has parent");
break;
}
printk("span %s\n", str);
if (!cpu_isset(cpu, sd->span))
- printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu);
+ printk(KERN_ERR "ERROR: domain->span does not contain "
+ "CPU%d\n", cpu);
if (!cpu_isset(cpu, group->cpumask))
- printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu);
+ printk(KERN_ERR "ERROR: domain->groups does not contain"
+ " CPU%d\n", cpu);
printk(KERN_DEBUG);
for (i = 0; i < level + 2; i++)
@@ -5383,7 +5475,8 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
if (!group->cpu_power) {
printk("\n");
- printk(KERN_ERR "ERROR: domain->cpu_power not set\n");
+ printk(KERN_ERR "ERROR: domain->cpu_power not "
+ "set\n");
}
if (!cpus_weight(group->cpumask)) {
@@ -5406,15 +5499,17 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
printk("\n");
if (!cpus_equal(sd->span, groupmask))
- printk(KERN_ERR "ERROR: groups don't span domain->span\n");
+ printk(KERN_ERR "ERROR: groups don't span "
+ "domain->span\n");
level++;
sd = sd->parent;
+ if (!sd)
+ continue;
- if (sd) {
- if (!cpus_subset(groupmask, sd->span))
- printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n");
- }
+ if (!cpus_subset(groupmask, sd->span))
+ printk(KERN_ERR "ERROR: parent span is not a superset "
+ "of domain->span\n");
} while (sd);
}
@@ -5528,28 +5623,27 @@ static int __init isolated_cpu_setup(char *str)
__setup ("isolcpus=", isolated_cpu_setup);
/*
- * init_sched_build_groups takes an array of groups, the cpumask we wish
- * to span, and a pointer to a function which identifies what group a CPU
- * belongs to. The return value of group_fn must be a valid index into the
- * groups[] array, and must be >= 0 and < NR_CPUS (due to the fact that we
- * keep track of groups covered with a cpumask_t).
+ * init_sched_build_groups takes the cpumask we wish to span, and a pointer
+ * to a function which identifies what group (along with its sched group) a
+ * CPU belongs to. The return value of group_fn must be >= 0 and < NR_CPUS
+ * (due to the fact that we keep track of groups covered with a cpumask_t).
*
* init_sched_build_groups will build a circular linked list of the groups
* covered by the given span, and will set each group's ->cpumask correctly,
* and ->cpu_power to 0.
*/
static void
-init_sched_build_groups(struct sched_group groups[], cpumask_t span,
- const cpumask_t *cpu_map,
- int (*group_fn)(int cpu, const cpumask_t *cpu_map))
+init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map,
+ int (*group_fn)(int cpu, const cpumask_t *cpu_map,
+ struct sched_group **sg))
{
struct sched_group *first = NULL, *last = NULL;
cpumask_t covered = CPU_MASK_NONE;
int i;
for_each_cpu_mask(i, span) {
- int group = group_fn(i, cpu_map);
- struct sched_group *sg = &groups[group];
+ struct sched_group *sg;
+ int group = group_fn(i, cpu_map, &sg);
int j;
if (cpu_isset(i, covered))
@@ -5559,7 +5653,7 @@ init_sched_build_groups(struct sched_group groups[], cpumask_t span,
sg->cpu_power = 0;
for_each_cpu_mask(j, span) {
- if (group_fn(j, cpu_map) != group)
+ if (group_fn(j, cpu_map, NULL) != group)
continue;
cpu_set(j, covered);
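
The new group_fn convention can be read off the loop above: the function returns the group index and, when the sg pointer is non-NULL, also hands back the sched_group backing that index. A sketch with hypothetical names, mirroring the cpu_to_*_group() helpers changed below:

static DEFINE_PER_CPU(struct sched_group, example_groups);

/* One group per CPU, backed by per-cpu storage. */
static int cpu_to_example_group(int cpu, const cpumask_t *cpu_map,
				struct sched_group **sg)
{
	if (sg)
		*sg = &per_cpu(example_groups, cpu);
	return cpu;
}

Callers that only need the index pass NULL for sg, exactly as the inner loop of init_sched_build_groups() does.
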
@@ -5733,8 +5827,9 @@ __setup("max_cache_size=", setup_max_cache_size);
*/
static void touch_cache(void *__cache, unsigned long __size)
{
- unsigned long size = __size/sizeof(long), chunk1 = size/3,
- chunk2 = 2*size/3;
+ unsigned long size = __size / sizeof(long);
+ unsigned long chunk1 = size / 3;
+ unsigned long chunk2 = 2 * size / 3;
unsigned long *cache = __cache;
int i;
@@ -5843,11 +5938,11 @@ measure_cost(int cpu1, int cpu2, void *cache, unsigned int size)
*/
measure_one(cache, size, cpu1, cpu2);
for (i = 0; i < ITERATIONS; i++)
- cost1 += measure_one(cache, size - i*1024, cpu1, cpu2);
+ cost1 += measure_one(cache, size - i * 1024, cpu1, cpu2);
measure_one(cache, size, cpu2, cpu1);
for (i = 0; i < ITERATIONS; i++)
- cost1 += measure_one(cache, size - i*1024, cpu2, cpu1);
+ cost1 += measure_one(cache, size - i * 1024, cpu2, cpu1);
/*
* (We measure the non-migrating [cached] cost on both
@@ -5857,17 +5952,17 @@ measure_cost(int cpu1, int cpu2, void *cache, unsigned int size)
measure_one(cache, size, cpu1, cpu1);
for (i = 0; i < ITERATIONS; i++)
- cost2 += measure_one(cache, size - i*1024, cpu1, cpu1);
+ cost2 += measure_one(cache, size - i * 1024, cpu1, cpu1);
measure_one(cache, size, cpu2, cpu2);
for (i = 0; i < ITERATIONS; i++)
- cost2 += measure_one(cache, size - i*1024, cpu2, cpu2);
+ cost2 += measure_one(cache, size - i * 1024, cpu2, cpu2);
/*
* Get the per-iteration migration cost:
*/
- do_div(cost1, 2*ITERATIONS);
- do_div(cost2, 2*ITERATIONS);
+ do_div(cost1, 2 * ITERATIONS);
+ do_div(cost2, 2 * ITERATIONS);
return cost1 - cost2;
}
@@ -5905,7 +6000,7 @@ static unsigned long long measure_migration_cost(int cpu1, int cpu2)
*/
cache = vmalloc(max_size);
if (!cache) {
- printk("could not vmalloc %d bytes for cache!\n", 2*max_size);
+ printk("could not vmalloc %d bytes for cache!\n", 2 * max_size);
return 1000000; /* return 1 msec on very small boxen */
}
@@ -5930,7 +6025,8 @@ static unsigned long long measure_migration_cost(int cpu1, int cpu2)
avg_fluct = (avg_fluct + fluct)/2;
if (migration_debug)
- printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): (%8Ld %8Ld)\n",
+ printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): "
+ "(%8Ld %8Ld)\n",
cpu1, cpu2, size,
(long)cost / 1000000,
((long)cost / 100000) % 10,
@@ -6025,20 +6121,18 @@ static void calibrate_migration_costs(const cpumask_t *cpu_map)
-1
#endif
);
- if (system_state == SYSTEM_BOOTING) {
- if (num_online_cpus() > 1) {
- printk("migration_cost=");
- for (distance = 0; distance <= max_distance; distance++) {
- if (distance)
- printk(",");
- printk("%ld", (long)migration_cost[distance] / 1000);
- }
- printk("\n");
+ if (system_state == SYSTEM_BOOTING && num_online_cpus() > 1) {
+ printk("migration_cost=");
+ for (distance = 0; distance <= max_distance; distance++) {
+ if (distance)
+ printk(",");
+ printk("%ld", (long)migration_cost[distance] / 1000);
}
+ printk("\n");
}
j1 = jiffies;
if (migration_debug)
- printk("migration: %ld seconds\n", (j1-j0)/HZ);
+		printk("migration: %ld seconds\n", (j1 - j0) / HZ);
/*
* Move back to the original CPU. NUMA-Q gets confused
@@ -6135,10 +6229,13 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
*/
#ifdef CONFIG_SCHED_SMT
static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
-static struct sched_group sched_group_cpus[NR_CPUS];
+static DEFINE_PER_CPU(struct sched_group, sched_group_cpus);
-static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map)
+static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map,
+ struct sched_group **sg)
{
+ if (sg)
+ *sg = &per_cpu(sched_group_cpus, cpu);
return cpu;
}
#endif
@@ -6148,39 +6245,52 @@ static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map)
*/
#ifdef CONFIG_SCHED_MC
static DEFINE_PER_CPU(struct sched_domain, core_domains);
-static struct sched_group sched_group_core[NR_CPUS];
+static DEFINE_PER_CPU(struct sched_group, sched_group_core);
#endif
#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
-static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map)
+static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
+ struct sched_group **sg)
{
+ int group;
cpumask_t mask = cpu_sibling_map[cpu];
cpus_and(mask, mask, *cpu_map);
- return first_cpu(mask);
+ group = first_cpu(mask);
+ if (sg)
+ *sg = &per_cpu(sched_group_core, group);
+ return group;
}
#elif defined(CONFIG_SCHED_MC)
-static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map)
+static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
+ struct sched_group **sg)
{
+ if (sg)
+ *sg = &per_cpu(sched_group_core, cpu);
return cpu;
}
#endif
static DEFINE_PER_CPU(struct sched_domain, phys_domains);
-static struct sched_group sched_group_phys[NR_CPUS];
+static DEFINE_PER_CPU(struct sched_group, sched_group_phys);
-static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map)
+static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map,
+ struct sched_group **sg)
{
+ int group;
#ifdef CONFIG_SCHED_MC
cpumask_t mask = cpu_coregroup_map(cpu);
cpus_and(mask, mask, *cpu_map);
- return first_cpu(mask);
+ group = first_cpu(mask);
#elif defined(CONFIG_SCHED_SMT)
cpumask_t mask = cpu_sibling_map[cpu];
cpus_and(mask, mask, *cpu_map);
- return first_cpu(mask);
+ group = first_cpu(mask);
#else
- return cpu;
+ group = cpu;
#endif
+ if (sg)
+ *sg = &per_cpu(sched_group_phys, group);
+ return group;
}
#ifdef CONFIG_NUMA
@@ -6193,12 +6303,22 @@ static DEFINE_PER_CPU(struct sched_domain, node_domains);
static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
-static struct sched_group *sched_group_allnodes_bycpu[NR_CPUS];
+static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes);
-static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map)
+static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map,
+ struct sched_group **sg)
{
- return cpu_to_node(cpu);
+ cpumask_t nodemask = node_to_cpumask(cpu_to_node(cpu));
+ int group;
+
+ cpus_and(nodemask, nodemask, *cpu_map);
+ group = first_cpu(nodemask);
+
+ if (sg)
+ *sg = &per_cpu(sched_group_allnodes, group);
+ return group;
}
+
static void init_numa_sched_groups_power(struct sched_group *group_head)
{
struct sched_group *sg = group_head;
@@ -6234,16 +6354,9 @@ static void free_sched_groups(const cpumask_t *cpu_map)
int cpu, i;
for_each_cpu_mask(cpu, *cpu_map) {
- struct sched_group *sched_group_allnodes
- = sched_group_allnodes_bycpu[cpu];
struct sched_group **sched_group_nodes
= sched_group_nodes_bycpu[cpu];
- if (sched_group_allnodes) {
- kfree(sched_group_allnodes);
- sched_group_allnodes_bycpu[cpu] = NULL;
- }
-
if (!sched_group_nodes)
continue;
@@ -6337,7 +6450,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
struct sched_domain *sd;
#ifdef CONFIG_NUMA
struct sched_group **sched_group_nodes = NULL;
- struct sched_group *sched_group_allnodes = NULL;
+ int sd_allnodes = 0;
/*
* Allocate the per-node list of sched groups
@@ -6355,7 +6468,6 @@ static int build_sched_domains(const cpumask_t *cpu_map)
* Set up domains for cpus specified by the cpu_map.
*/
for_each_cpu_mask(i, *cpu_map) {
- int group;
struct sched_domain *sd = NULL, *p;
cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
@@ -6364,26 +6476,12 @@ static int build_sched_domains(const cpumask_t *cpu_map)
#ifdef CONFIG_NUMA
if (cpus_weight(*cpu_map)
> SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
- if (!sched_group_allnodes) {
- sched_group_allnodes
- = kmalloc_node(sizeof(struct sched_group)
- * MAX_NUMNODES,
- GFP_KERNEL,
- cpu_to_node(i));
- if (!sched_group_allnodes) {
- printk(KERN_WARNING
- "Can not alloc allnodes sched group\n");
- goto error;
- }
- sched_group_allnodes_bycpu[i]
- = sched_group_allnodes;
- }
sd = &per_cpu(allnodes_domains, i);
*sd = SD_ALLNODES_INIT;
sd->span = *cpu_map;
- group = cpu_to_allnodes_group(i, cpu_map);
- sd->groups = &sched_group_allnodes[group];
+ cpu_to_allnodes_group(i, cpu_map, &sd->groups);
p = sd;
+ sd_allnodes = 1;
} else
p = NULL;
@@ -6398,36 +6496,33 @@ static int build_sched_domains(const cpumask_t *cpu_map)
p = sd;
sd = &per_cpu(phys_domains, i);
- group = cpu_to_phys_group(i, cpu_map);
*sd = SD_CPU_INIT;
sd->span = nodemask;
sd->parent = p;
if (p)
p->child = sd;
- sd->groups = &sched_group_phys[group];
+ cpu_to_phys_group(i, cpu_map, &sd->groups);
#ifdef CONFIG_SCHED_MC
p = sd;
sd = &per_cpu(core_domains, i);
- group = cpu_to_core_group(i, cpu_map);
*sd = SD_MC_INIT;
sd->span = cpu_coregroup_map(i);
cpus_and(sd->span, sd->span, *cpu_map);
sd->parent = p;
p->child = sd;
- sd->groups = &sched_group_core[group];
+ cpu_to_core_group(i, cpu_map, &sd->groups);
#endif
#ifdef CONFIG_SCHED_SMT
p = sd;
sd = &per_cpu(cpu_domains, i);
- group = cpu_to_cpu_group(i, cpu_map);
*sd = SD_SIBLING_INIT;
sd->span = cpu_sibling_map[i];
cpus_and(sd->span, sd->span, *cpu_map);
sd->parent = p;
p->child = sd;
- sd->groups = &sched_group_cpus[group];
+ cpu_to_cpu_group(i, cpu_map, &sd->groups);
#endif
}
@@ -6439,8 +6534,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
if (i != first_cpu(this_sibling_map))
continue;
- init_sched_build_groups(sched_group_cpus, this_sibling_map,
- cpu_map, &cpu_to_cpu_group);
+ init_sched_build_groups(this_sibling_map, cpu_map, &cpu_to_cpu_group);
}
#endif
@@ -6451,8 +6545,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
cpus_and(this_core_map, this_core_map, *cpu_map);
if (i != first_cpu(this_core_map))
continue;
- init_sched_build_groups(sched_group_core, this_core_map,
- cpu_map, &cpu_to_core_group);
+ init_sched_build_groups(this_core_map, cpu_map, &cpu_to_core_group);
}
#endif
@@ -6465,15 +6558,13 @@ static int build_sched_domains(const cpumask_t *cpu_map)
if (cpus_empty(nodemask))
continue;
- init_sched_build_groups(sched_group_phys, nodemask,
- cpu_map, &cpu_to_phys_group);
+ init_sched_build_groups(nodemask, cpu_map, &cpu_to_phys_group);
}
#ifdef CONFIG_NUMA
/* Set up node groups */
- if (sched_group_allnodes)
- init_sched_build_groups(sched_group_allnodes, *cpu_map,
- cpu_map, &cpu_to_allnodes_group);
+ if (sd_allnodes)
+ init_sched_build_groups(*cpu_map, cpu_map, &cpu_to_allnodes_group);
for (i = 0; i < MAX_NUMNODES; i++) {
/* Set up node groups */
@@ -6565,10 +6656,10 @@ static int build_sched_domains(const cpumask_t *cpu_map)
for (i = 0; i < MAX_NUMNODES; i++)
init_numa_sched_groups_power(sched_group_nodes[i]);
- if (sched_group_allnodes) {
- int group = cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map);
- struct sched_group *sg = &sched_group_allnodes[group];
+ if (sd_allnodes) {
+ struct sched_group *sg;
+ cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg);
init_numa_sched_groups_power(sg);
}
#endif
@@ -6847,6 +6938,10 @@ void __init sched_init(void)
set_load_weight(&init_task);
+#ifdef CONFIG_SMP
+ open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL);
+#endif
+
#ifdef CONFIG_RT_MUTEXES
plist_head_init(&init_task.pi_waiters, &init_task.pi_lock);
#endif
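
The open_softirq() call above moves periodic load balancing into softirq context. A sketch of the handler shape this registration implies, assuming the softirq handler signature of this kernel generation (run_rebalance_domains itself is presumably added elsewhere in this patch):

static void run_rebalance_domains(struct softirq_action *h)
{
	/* rebalance the sched domains of this CPU here, in softirq context */
}
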
diff --git a/kernel/signal.c b/kernel/signal.c
index ec81defde339..1921ffdc5e77 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -24,6 +24,9 @@
#include <linux/signal.h>
#include <linux/capability.h>
#include <linux/freezer.h>
+#include <linux/pid_namespace.h>
+#include <linux/nsproxy.h>
+
#include <asm/param.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
@@ -583,7 +586,7 @@ static int check_kill_permission(int sig, struct siginfo *info,
error = -EPERM;
if ((info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info)))
&& ((sig != SIGCONT) ||
- (current->signal->session != t->signal->session))
+ (process_session(current) != process_session(t)))
&& (current->euid ^ t->suid) && (current->euid ^ t->uid)
&& (current->uid ^ t->suid) && (current->uid ^ t->uid)
&& !capable(CAP_KILL))
@@ -1877,8 +1880,12 @@ relock:
if (sig_kernel_ignore(signr)) /* Default is nothing. */
continue;
- /* Init gets no signals it doesn't want. */
- if (current == child_reaper)
+ /*
+ * Init of a pid space gets no signals it doesn't want from
+ * within that pid space. It can of course get signals from
+ * its parent pid space.
+ */
+ if (current == child_reaper(current))
continue;
if (sig_kernel_stop(signr)) {
diff --git a/kernel/sys.c b/kernel/sys.c
index a0c1a29a507f..c7675c1bfdf2 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1381,7 +1381,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
if (p->real_parent == group_leader) {
err = -EPERM;
- if (p->signal->session != group_leader->signal->session)
+ if (process_session(p) != process_session(group_leader))
goto out;
err = -EACCES;
if (p->did_exec)
@@ -1397,16 +1397,13 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
goto out;
if (pgid != pid) {
- struct task_struct *p;
+ struct task_struct *g =
+ find_task_by_pid_type(PIDTYPE_PGID, pgid);
- do_each_task_pid(pgid, PIDTYPE_PGID, p) {
- if (p->signal->session == group_leader->signal->session)
- goto ok_pgid;
- } while_each_task_pid(pgid, PIDTYPE_PGID, p);
- goto out;
+ if (!g || process_session(g) != process_session(group_leader))
+ goto out;
}
-ok_pgid:
err = security_task_setpgid(p, pgid);
if (err)
goto out;
@@ -1459,7 +1456,7 @@ asmlinkage long sys_getpgrp(void)
asmlinkage long sys_getsid(pid_t pid)
{
if (!pid)
- return current->signal->session;
+ return process_session(current);
else {
int retval;
struct task_struct *p;
@@ -1471,7 +1468,7 @@ asmlinkage long sys_getsid(pid_t pid)
if (p) {
retval = security_task_getsid(p);
if (!retval)
- retval = p->signal->session;
+ retval = process_session(p);
}
read_unlock(&tasklist_lock);
return retval;
@@ -1484,7 +1481,6 @@ asmlinkage long sys_setsid(void)
pid_t session;
int err = -EPERM;
- mutex_lock(&tty_mutex);
write_lock_irq(&tasklist_lock);
/* Fail if I am already a session leader */
@@ -1504,12 +1500,15 @@ asmlinkage long sys_setsid(void)
group_leader->signal->leader = 1;
__set_special_pids(session, session);
+
+ spin_lock(&group_leader->sighand->siglock);
group_leader->signal->tty = NULL;
group_leader->signal->tty_old_pgrp = 0;
+ spin_unlock(&group_leader->sighand->siglock);
+
err = process_group(group_leader);
out:
write_unlock_irq(&tasklist_lock);
- mutex_unlock(&tty_mutex);
return err;
}
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 8e9f00fd6d18..130c5ec9ee0b 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -92,7 +92,9 @@ extern char modprobe_path[];
extern int sg_big_buff;
#endif
#ifdef CONFIG_SYSVIPC
-static int proc_do_ipc_string(ctl_table *table, int write, struct file *filp,
+static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp,
+ void __user *buffer, size_t *lenp, loff_t *ppos);
+static int proc_ipc_doulongvec_minmax(ctl_table *table, int write, struct file *filp,
void __user *buffer, size_t *lenp, loff_t *ppos);
#endif
@@ -131,12 +133,22 @@ extern int max_lock_depth;
#ifdef CONFIG_SYSCTL_SYSCALL
static int parse_table(int __user *, int, void __user *, size_t __user *,
- void __user *, size_t, ctl_table *, void **);
+ void __user *, size_t, ctl_table *);
#endif
static int proc_do_uts_string(ctl_table *table, int write, struct file *filp,
void __user *buffer, size_t *lenp, loff_t *ppos);
+static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen,
+ void __user *oldval, size_t __user *oldlenp,
+ void __user *newval, size_t newlen);
+
+#ifdef CONFIG_SYSVIPC
+static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen,
+ void __user *oldval, size_t __user *oldlenp,
+ void __user *newval, size_t newlen);
+#endif
+
#ifdef CONFIG_PROC_SYSCTL
static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp,
void __user *buffer, size_t *lenp, loff_t *ppos);
@@ -163,6 +175,40 @@ extern ctl_table inotify_table[];
int sysctl_legacy_va_layout;
#endif
+static void *get_uts(ctl_table *table, int write)
+{
+ char *which = table->data;
+#ifdef CONFIG_UTS_NS
+ struct uts_namespace *uts_ns = current->nsproxy->uts_ns;
+ which = (which - (char *)&init_uts_ns) + (char *)uts_ns;
+#endif
+ if (!write)
+ down_read(&uts_sem);
+ else
+ down_write(&uts_sem);
+ return which;
+}
+
+static void put_uts(ctl_table *table, int write, void *which)
+{
+ if (!write)
+ up_read(&uts_sem);
+ else
+ up_write(&uts_sem);
+}
+
+#ifdef CONFIG_SYSVIPC
+static void *get_ipc(ctl_table *table, int write)
+{
+ char *which = table->data;
+ struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns;
+ which = (which - (char *)&init_ipc_ns) + (char *)ipc_ns;
+ return which;
+}
+#else
+#define get_ipc(T,W) ((T)->data)
+#endif
+
/* /proc declarations: */
#ifdef CONFIG_PROC_SYSCTL
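
get_uts() and get_ipc() rebase a pointer computed against init_uts_ns/init_ipc_ns onto the calling task's namespace instance: the field's offset within the initial namespace is reapplied to the active one. A self-contained sketch of the arithmetic, with illustrative names only:

#include <stddef.h>

struct ns_example {
	int fielda;
	int fieldb;
};

static struct ns_example init_ns;	/* compile-time instance */

/* Rebase a field pointer from init_ns onto another instance. */
static void *rebase(void *field_in_init, struct ns_example *active)
{
	ptrdiff_t off = (char *)field_in_init - (char *)&init_ns;

	return (char *)active + off;
}
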
@@ -229,7 +275,6 @@ static ctl_table root_table[] = {
};
static ctl_table kern_table[] = {
-#ifndef CONFIG_UTS_NS
{
.ctl_name = KERN_OSTYPE,
.procname = "ostype",
@@ -237,7 +282,7 @@ static ctl_table kern_table[] = {
.maxlen = sizeof(init_uts_ns.name.sysname),
.mode = 0444,
.proc_handler = &proc_do_uts_string,
- .strategy = &sysctl_string,
+ .strategy = &sysctl_uts_string,
},
{
.ctl_name = KERN_OSRELEASE,
@@ -246,7 +291,7 @@ static ctl_table kern_table[] = {
.maxlen = sizeof(init_uts_ns.name.release),
.mode = 0444,
.proc_handler = &proc_do_uts_string,
- .strategy = &sysctl_string,
+ .strategy = &sysctl_uts_string,
},
{
.ctl_name = KERN_VERSION,
@@ -255,7 +300,7 @@ static ctl_table kern_table[] = {
.maxlen = sizeof(init_uts_ns.name.version),
.mode = 0444,
.proc_handler = &proc_do_uts_string,
- .strategy = &sysctl_string,
+ .strategy = &sysctl_uts_string,
},
{
.ctl_name = KERN_NODENAME,
@@ -264,7 +309,7 @@ static ctl_table kern_table[] = {
.maxlen = sizeof(init_uts_ns.name.nodename),
.mode = 0644,
.proc_handler = &proc_do_uts_string,
- .strategy = &sysctl_string,
+ .strategy = &sysctl_uts_string,
},
{
.ctl_name = KERN_DOMAINNAME,
@@ -273,56 +318,8 @@ static ctl_table kern_table[] = {
.maxlen = sizeof(init_uts_ns.name.domainname),
.mode = 0644,
.proc_handler = &proc_do_uts_string,
- .strategy = &sysctl_string,
- },
-#else /* !CONFIG_UTS_NS */
- {
- .ctl_name = KERN_OSTYPE,
- .procname = "ostype",
- .data = NULL,
- /* could maybe use __NEW_UTS_LEN here? */
- .maxlen = FIELD_SIZEOF(struct new_utsname, sysname),
- .mode = 0444,
- .proc_handler = &proc_do_uts_string,
- .strategy = &sysctl_string,
- },
- {
- .ctl_name = KERN_OSRELEASE,
- .procname = "osrelease",
- .data = NULL,
- .maxlen = FIELD_SIZEOF(struct new_utsname, release),
- .mode = 0444,
- .proc_handler = &proc_do_uts_string,
- .strategy = &sysctl_string,
- },
- {
- .ctl_name = KERN_VERSION,
- .procname = "version",
- .data = NULL,
- .maxlen = FIELD_SIZEOF(struct new_utsname, version),
- .mode = 0444,
- .proc_handler = &proc_do_uts_string,
- .strategy = &sysctl_string,
- },
- {
- .ctl_name = KERN_NODENAME,
- .procname = "hostname",
- .data = NULL,
- .maxlen = FIELD_SIZEOF(struct new_utsname, nodename),
- .mode = 0644,
- .proc_handler = &proc_do_uts_string,
- .strategy = &sysctl_string,
- },
- {
- .ctl_name = KERN_DOMAINNAME,
- .procname = "domainname",
- .data = NULL,
- .maxlen = FIELD_SIZEOF(struct new_utsname, domainname),
- .mode = 0644,
- .proc_handler = &proc_do_uts_string,
- .strategy = &sysctl_string,
+ .strategy = &sysctl_uts_string,
},
-#endif /* !CONFIG_UTS_NS */
{
.ctl_name = KERN_PANIC,
.procname = "panic",
@@ -481,58 +478,65 @@ static ctl_table kern_table[] = {
{
.ctl_name = KERN_SHMMAX,
.procname = "shmmax",
- .data = NULL,
- .maxlen = sizeof (size_t),
+ .data = &init_ipc_ns.shm_ctlmax,
+ .maxlen = sizeof (init_ipc_ns.shm_ctlmax),
.mode = 0644,
- .proc_handler = &proc_do_ipc_string,
+ .proc_handler = &proc_ipc_doulongvec_minmax,
+ .strategy = sysctl_ipc_data,
},
{
.ctl_name = KERN_SHMALL,
.procname = "shmall",
- .data = NULL,
- .maxlen = sizeof (size_t),
+ .data = &init_ipc_ns.shm_ctlall,
+ .maxlen = sizeof (init_ipc_ns.shm_ctlall),
.mode = 0644,
- .proc_handler = &proc_do_ipc_string,
+ .proc_handler = &proc_ipc_doulongvec_minmax,
+ .strategy = sysctl_ipc_data,
},
{
.ctl_name = KERN_SHMMNI,
.procname = "shmmni",
- .data = NULL,
- .maxlen = sizeof (int),
+ .data = &init_ipc_ns.shm_ctlmni,
+ .maxlen = sizeof (init_ipc_ns.shm_ctlmni),
.mode = 0644,
- .proc_handler = &proc_do_ipc_string,
+ .proc_handler = &proc_ipc_dointvec,
+ .strategy = sysctl_ipc_data,
},
{
.ctl_name = KERN_MSGMAX,
.procname = "msgmax",
- .data = NULL,
- .maxlen = sizeof (int),
+ .data = &init_ipc_ns.msg_ctlmax,
+ .maxlen = sizeof (init_ipc_ns.msg_ctlmax),
.mode = 0644,
- .proc_handler = &proc_do_ipc_string,
+ .proc_handler = &proc_ipc_dointvec,
+ .strategy = sysctl_ipc_data,
},
{
.ctl_name = KERN_MSGMNI,
.procname = "msgmni",
- .data = NULL,
- .maxlen = sizeof (int),
+ .data = &init_ipc_ns.msg_ctlmni,
+ .maxlen = sizeof (init_ipc_ns.msg_ctlmni),
.mode = 0644,
- .proc_handler = &proc_do_ipc_string,
+ .proc_handler = &proc_ipc_dointvec,
+ .strategy = sysctl_ipc_data,
},
{
.ctl_name = KERN_MSGMNB,
.procname = "msgmnb",
- .data = NULL,
- .maxlen = sizeof (int),
+ .data = &init_ipc_ns.msg_ctlmnb,
+ .maxlen = sizeof (init_ipc_ns.msg_ctlmnb),
.mode = 0644,
- .proc_handler = &proc_do_ipc_string,
+ .proc_handler = &proc_ipc_dointvec,
+ .strategy = sysctl_ipc_data,
},
{
.ctl_name = KERN_SEM,
.procname = "sem",
- .data = NULL,
+ .data = &init_ipc_ns.sem_ctls,
.maxlen = 4*sizeof (int),
.mode = 0644,
- .proc_handler = &proc_do_ipc_string,
+ .proc_handler = &proc_ipc_dointvec,
+ .strategy = sysctl_ipc_data,
},
#endif
#ifdef CONFIG_MAGIC_SYSRQ
@@ -1239,7 +1243,6 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol
do {
struct ctl_table_header *head =
list_entry(tmp, struct ctl_table_header, ctl_entry);
- void *context = NULL;
if (!use_table(head))
continue;
@@ -1247,9 +1250,7 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol
spin_unlock(&sysctl_lock);
error = parse_table(name, nlen, oldval, oldlenp,
- newval, newlen, head->ctl_table,
- &context);
- kfree(context);
+ newval, newlen, head->ctl_table);
spin_lock(&sysctl_lock);
unuse_table(head);
@@ -1305,7 +1306,7 @@ static inline int ctl_perm(ctl_table *table, int op)
static int parse_table(int __user *name, int nlen,
void __user *oldval, size_t __user *oldlenp,
void __user *newval, size_t newlen,
- ctl_table *table, void **context)
+ ctl_table *table)
{
int n;
repeat:
@@ -1325,7 +1326,7 @@ repeat:
error = table->strategy(
table, name, nlen,
oldval, oldlenp,
- newval, newlen, context);
+ newval, newlen);
if (error)
return error;
}
@@ -1336,7 +1337,7 @@ repeat:
}
error = do_sysctl_strategy(table, name, nlen,
oldval, oldlenp,
- newval, newlen, context);
+ newval, newlen);
return error;
}
}
@@ -1347,7 +1348,7 @@ repeat:
int do_sysctl_strategy (ctl_table *table,
int __user *name, int nlen,
void __user *oldval, size_t __user *oldlenp,
- void __user *newval, size_t newlen, void **context)
+ void __user *newval, size_t newlen)
{
int op = 0, rc;
size_t len;
@@ -1361,7 +1362,7 @@ int do_sysctl_strategy (ctl_table *table,
if (table->strategy) {
rc = table->strategy(table, name, nlen, oldval, oldlenp,
- newval, newlen, context);
+ newval, newlen);
if (rc < 0)
return rc;
if (rc > 0)
@@ -1614,7 +1615,7 @@ static ssize_t do_rw_proc(int write, struct file * file, char __user * buf,
size_t count, loff_t *ppos)
{
int op;
- struct proc_dir_entry *de = PDE(file->f_dentry->d_inode);
+ struct proc_dir_entry *de = PDE(file->f_path.dentry->d_inode);
struct ctl_table *table;
size_t res;
ssize_t error = -ENOTDIR;
@@ -1753,66 +1754,17 @@ int proc_dostring(ctl_table *table, int write, struct file *filp,
* Special case of dostring for the UTS structure. This has locks
* to observe. Should this be in kernel/sys.c ????
*/
-
-#ifndef CONFIG_UTS_NS
-static int proc_do_uts_string(ctl_table *table, int write, struct file *filp,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- int r;
- if (!write) {
- down_read(&uts_sem);
- r=proc_dostring(table,0,filp,buffer,lenp, ppos);
- up_read(&uts_sem);
- } else {
- down_write(&uts_sem);
- r=proc_dostring(table,1,filp,buffer,lenp, ppos);
- up_write(&uts_sem);
- }
- return r;
-}
-#else /* !CONFIG_UTS_NS */
static int proc_do_uts_string(ctl_table *table, int write, struct file *filp,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
int r;
- struct uts_namespace* uts_ns = current->nsproxy->uts_ns;
- char* which;
-
- switch (table->ctl_name) {
- case KERN_OSTYPE:
- which = uts_ns->name.sysname;
- break;
- case KERN_NODENAME:
- which = uts_ns->name.nodename;
- break;
- case KERN_OSRELEASE:
- which = uts_ns->name.release;
- break;
- case KERN_VERSION:
- which = uts_ns->name.version;
- break;
- case KERN_DOMAINNAME:
- which = uts_ns->name.domainname;
- break;
- default:
- r = -EINVAL;
- goto out;
- }
-
- if (!write) {
- down_read(&uts_sem);
- r=_proc_do_string(which,table->maxlen,0,filp,buffer,lenp, ppos);
- up_read(&uts_sem);
- } else {
- down_write(&uts_sem);
- r=_proc_do_string(which,table->maxlen,1,filp,buffer,lenp, ppos);
- up_write(&uts_sem);
- }
- out:
+ void *which;
+ which = get_uts(table, write);
+	r = _proc_do_string(which, table->maxlen, write, filp, buffer, lenp, ppos);
+ put_uts(table, write, which);
return r;
}
-#endif /* !CONFIG_UTS_NS */
static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp,
int *valp,
@@ -1976,9 +1928,6 @@ int proc_dointvec(ctl_table *table, int write, struct file *filp,
#define OP_SET 0
#define OP_AND 1
-#define OP_OR 2
-#define OP_MAX 3
-#define OP_MIN 4
static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp,
int *valp,
@@ -1990,13 +1939,6 @@ static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp,
switch(op) {
case OP_SET: *valp = val; break;
case OP_AND: *valp &= val; break;
- case OP_OR: *valp |= val; break;
- case OP_MAX: if(*valp < val)
- *valp = val;
- break;
- case OP_MIN: if(*valp > val)
- *valp = val;
- break;
}
} else {
int val = *valp;
@@ -2391,46 +2333,24 @@ int proc_dointvec_ms_jiffies(ctl_table *table, int write, struct file *filp,
}
#ifdef CONFIG_SYSVIPC
-static int proc_do_ipc_string(ctl_table *table, int write, struct file *filp,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
{
- void *data;
- struct ipc_namespace *ns;
-
- ns = current->nsproxy->ipc_ns;
-
- switch (table->ctl_name) {
- case KERN_SHMMAX:
- data = &ns->shm_ctlmax;
- goto proc_minmax;
- case KERN_SHMALL:
- data = &ns->shm_ctlall;
- goto proc_minmax;
- case KERN_SHMMNI:
- data = &ns->shm_ctlmni;
- break;
- case KERN_MSGMAX:
- data = &ns->msg_ctlmax;
- break;
- case KERN_MSGMNI:
- data = &ns->msg_ctlmni;
- break;
- case KERN_MSGMNB:
- data = &ns->msg_ctlmnb;
- break;
- case KERN_SEM:
- data = &ns->sem_ctls;
- break;
- default:
- return -EINVAL;
- }
-
- return __do_proc_dointvec(data, table, write, filp, buffer,
+ void *which;
+ which = get_ipc(table, write);
+ return __do_proc_dointvec(which, table, write, filp, buffer,
lenp, ppos, NULL, NULL);
-proc_minmax:
- return __do_proc_doulongvec_minmax(data, table, write, filp, buffer,
+}
+
+static int proc_ipc_doulongvec_minmax(ctl_table *table, int write,
+ struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ void *which;
+ which = get_ipc(table, write);
+ return __do_proc_doulongvec_minmax(which, table, write, filp, buffer,
lenp, ppos, 1l, 1l);
}
+
#endif
static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp,
@@ -2475,6 +2395,17 @@ static int proc_do_ipc_string(ctl_table *table, int write, struct file *filp,
{
return -ENOSYS;
}
+static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+static int proc_ipc_doulongvec_minmax(ctl_table *table, int write,
+ struct file *filp, void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
#endif
int proc_dointvec(ctl_table *table, int write, struct file *filp,
@@ -2539,7 +2470,7 @@ int proc_doulongvec_ms_jiffies_minmax(ctl_table *table, int write,
/* The generic string strategy routine: */
int sysctl_string(ctl_table *table, int __user *name, int nlen,
void __user *oldval, size_t __user *oldlenp,
- void __user *newval, size_t newlen, void **context)
+ void __user *newval, size_t newlen)
{
if (!table->data || !table->maxlen)
return -ENOTDIR;
@@ -2585,7 +2516,7 @@ int sysctl_string(ctl_table *table, int __user *name, int nlen,
*/
int sysctl_intvec(ctl_table *table, int __user *name, int nlen,
void __user *oldval, size_t __user *oldlenp,
- void __user *newval, size_t newlen, void **context)
+ void __user *newval, size_t newlen)
{
if (newval && newlen) {
@@ -2621,7 +2552,7 @@ int sysctl_intvec(ctl_table *table, int __user *name, int nlen,
/* Strategy function to convert jiffies to seconds */
int sysctl_jiffies(ctl_table *table, int __user *name, int nlen,
void __user *oldval, size_t __user *oldlenp,
- void __user *newval, size_t newlen, void **context)
+ void __user *newval, size_t newlen)
{
if (oldval) {
size_t olen;
@@ -2649,7 +2580,7 @@ int sysctl_jiffies(ctl_table *table, int __user *name, int nlen,
/* Strategy function to convert jiffies to seconds */
int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen,
void __user *oldval, size_t __user *oldlenp,
- void __user *newval, size_t newlen, void **context)
+ void __user *newval, size_t newlen)
{
if (oldval) {
size_t olen;
@@ -2674,6 +2605,64 @@ int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen,
return 1;
}
+
+/* The UTS namespace string strategy routine: */
+static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen,
+ void __user *oldval, size_t __user *oldlenp,
+ void __user *newval, size_t newlen)
+{
+ struct ctl_table uts_table;
+ int r, write;
+ write = newval && newlen;
+ memcpy(&uts_table, table, sizeof(uts_table));
+ uts_table.data = get_uts(table, write);
+ r = sysctl_string(&uts_table, name, nlen,
+ oldval, oldlenp, newval, newlen);
+ put_uts(table, write, uts_table.data);
+ return r;
+}
+
+#ifdef CONFIG_SYSVIPC
+/* The generic sysctl ipc data routine. */
+static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen,
+ void __user *oldval, size_t __user *oldlenp,
+ void __user *newval, size_t newlen)
+{
+ size_t len;
+ void *data;
+
+	/* Bail out if there is no variable to read or write */
+ if (!table->data || !table->maxlen)
+ return -ENOTDIR;
+
+ data = get_ipc(table, 1);
+ if (!data)
+ return -ENOTDIR;
+
+ if (oldval && oldlenp) {
+ if (get_user(len, oldlenp))
+ return -EFAULT;
+ if (len) {
+ if (len > table->maxlen)
+ len = table->maxlen;
+ if (copy_to_user(oldval, data, len))
+ return -EFAULT;
+ if (put_user(len, oldlenp))
+ return -EFAULT;
+ }
+ }
+
+ if (newval && newlen) {
+ if (newlen > table->maxlen)
+ newlen = table->maxlen;
+
+ if (copy_from_user(data, newval, newlen))
+ return -EFAULT;
+ }
+ return 1;
+}
+#endif
+
#else /* CONFIG_SYSCTL_SYSCALL */
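
sysctl_ipc_data() above services the binary sysctl(2) path for these entries. A hedged userspace sketch of reading shmmax that way, assuming the old glibc sysctl() wrapper and the CTL_KERN/KERN_SHMMAX name pair from <linux/sysctl.h>:

#include <stdio.h>
#include <sys/sysctl.h>

int main(void)
{
	int name[] = { CTL_KERN, KERN_SHMMAX };
	unsigned long shmmax;
	size_t len = sizeof(shmmax);

	if (sysctl(name, 2, &shmmax, &len, NULL, 0) == 0)
		printf("shmmax = %lu\n", shmmax);
	return 0;
}
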
@@ -2712,32 +2701,44 @@ out:
int sysctl_string(ctl_table *table, int __user *name, int nlen,
void __user *oldval, size_t __user *oldlenp,
- void __user *newval, size_t newlen, void **context)
+ void __user *newval, size_t newlen)
{
return -ENOSYS;
}
int sysctl_intvec(ctl_table *table, int __user *name, int nlen,
void __user *oldval, size_t __user *oldlenp,
- void __user *newval, size_t newlen, void **context)
+ void __user *newval, size_t newlen)
{
return -ENOSYS;
}
int sysctl_jiffies(ctl_table *table, int __user *name, int nlen,
void __user *oldval, size_t __user *oldlenp,
- void __user *newval, size_t newlen, void **context)
+ void __user *newval, size_t newlen)
{
return -ENOSYS;
}
int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen,
void __user *oldval, size_t __user *oldlenp,
- void __user *newval, size_t newlen, void **context)
+ void __user *newval, size_t newlen)
{
return -ENOSYS;
}
+static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen,
+ void __user *oldval, size_t __user *oldlenp,
+ void __user *newval, size_t newlen)
+{
+ return -ENOSYS;
+}
+static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen,
+ void __user *oldval, size_t __user *oldlenp,
+ void __user *newval, size_t newlen)
+{
+ return -ENOSYS;
+}
#endif /* CONFIG_SYSCTL_SYSCALL */
/*
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 74eca5939bd9..22504afc0d34 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -156,7 +156,7 @@ int clocksource_register(struct clocksource *c)
/* check if clocksource is already registered */
if (is_registered_source(c)) {
printk("register_clocksource: Cannot register %s. "
- "Already registered!", c->name);
+ "Already registered!", c->name);
ret = -EBUSY;
} else {
/* register it */
@@ -186,6 +186,7 @@ void clocksource_reselect(void)
}
EXPORT_SYMBOL(clocksource_reselect);
+#ifdef CONFIG_SYSFS
/**
* sysfs_show_current_clocksources - sysfs interface for current clocksource
* @dev: unused
@@ -275,10 +276,10 @@ sysfs_show_available_clocksources(struct sys_device *dev, char *buf)
* Sysfs setup bits:
*/
static SYSDEV_ATTR(current_clocksource, 0600, sysfs_show_current_clocksources,
- sysfs_override_clocksource);
+ sysfs_override_clocksource);
static SYSDEV_ATTR(available_clocksource, 0600,
- sysfs_show_available_clocksources, NULL);
+ sysfs_show_available_clocksources, NULL);
static struct sysdev_class clocksource_sysclass = {
set_kset_name("clocksource"),
@@ -307,6 +308,7 @@ static int __init init_clocksource_sysfs(void)
}
device_initcall(init_clocksource_sysfs);
+#endif /* CONFIG_SYSFS */
/**
* boot_override_clocksource - boot clock override
diff --git a/kernel/timer.c b/kernel/timer.c
index c1c7fbcffec1..0256ab443d8a 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -80,6 +80,138 @@ tvec_base_t boot_tvec_bases;
EXPORT_SYMBOL(boot_tvec_bases);
static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases;
+/**
+ * __round_jiffies - function to round jiffies to a full second
+ * @j: the time in (absolute) jiffies that should be rounded
+ * @cpu: the processor number on which the timeout will happen
+ *
+ * __round_jiffies rounds an absolute time in the future (in jiffies)
+ * up or down to (approximately) full seconds. This is useful for timers
+ * for which the exact time they fire does not matter too much, as long as
+ * they fire approximately every X seconds.
+ *
+ * By rounding these timers to whole seconds, all such timers will fire
+ * at the same time, rather than at various times spread out. The goal
+ * of this is to have the CPU wake up less, which saves power.
+ *
+ * The exact rounding is skewed for each processor to avoid all
+ * processors firing at the exact same time, which could lead
+ * to lock contention or spurious cache line bouncing.
+ *
+ * The return value is the rounded version of the "j" parameter.
+ */
+unsigned long __round_jiffies(unsigned long j, int cpu)
+{
+ int rem;
+ unsigned long original = j;
+
+ /*
+	 * We don't want all cpus firing their timers at once, hitting the
+	 * same lock or cachelines, so we skew each extra cpu by an extra
+	 * 3 jiffies (a value originally taken from the mm/ code, which
+	 * already did this).
+	 * The skew is done by adding 3*cpunr, rounding, and then subtracting
+	 * this extra offset again.
+ */
+ j += cpu * 3;
+
+ rem = j % HZ;
+
+ /*
+	 * If the target jiffy is just after a whole second (which can happen
+	 * due to delays of the timer irq, long irq-off times, etc.) then
+	 * we should round down to the whole second, not up. Use a quarter
+	 * second as the cutoff, an extreme upper bound for such delays.
+ */
+ if (rem < HZ/4) /* round down */
+ j = j - rem;
+ else /* round up */
+ j = j - rem + HZ;
+
+ /* now that we have rounded, subtract the extra skew again */
+ j -= cpu * 3;
+
+	if (j <= jiffies) /* rounding ate our timeout entirely */
+ return original;
+ return j;
+}
+EXPORT_SYMBOL_GPL(__round_jiffies);
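+/*
+ * Worked example, assuming HZ=1000: on cpu 1 a request of j=5230 is
+ * skewed to 5233; rem = 233 < HZ/4, so it rounds down to 5000, and
+ * removing the skew gives 4997.  cpu 0 gets 5000 for the same request,
+ * so the two CPUs fire 3 jiffies apart instead of at the same time.
+ */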
+
+/**
+ * __round_jiffies_relative - function to round jiffies to a full second
+ * @j: the time in (relative) jiffies that should be rounded
+ * @cpu: the processor number on which the timeout will happen
+ *
+ * __round_jiffies_relative rounds a time delta in the future (in jiffies)
+ * up or down to (approximately) full seconds. This is useful for timers
+ * for which the exact time they fire does not matter too much, as long as
+ * they fire approximately every X seconds.
+ *
+ * By rounding these timers to whole seconds, all such timers will fire
+ * at the same time, rather than at various times spread out. The goal
+ * of this is to have the CPU wake up less, which saves power.
+ *
+ * The exact rounding is skewed for each processor to avoid all
+ * processors firing at the exact same time, which could lead
+ * to lock contention or spurious cache line bouncing.
+ *
+ * The return value is the rounded version of the "j" parameter.
+ */
+unsigned long __round_jiffies_relative(unsigned long j, int cpu)
+{
+ /*
+ * In theory the following code can skip a jiffy in case jiffies
+ * increments right between the addition and the later subtraction.
+	 * However, since the entire point of this function is to use
+	 * approximate timeouts, it is entirely OK not to handle that.
+ */
+ return __round_jiffies(j + jiffies, cpu) - jiffies;
+}
+EXPORT_SYMBOL_GPL(__round_jiffies_relative);
+
+/**
+ * round_jiffies - function to round jiffies to a full second
+ * @j: the time in (absolute) jiffies that should be rounded
+ *
+ * round_jiffies rounds an absolute time in the future (in jiffies)
+ * up or down to (approximately) full seconds. This is useful for timers
+ * for which the exact time they fire does not matter too much, as long as
+ * they fire approximately every X seconds.
+ *
+ * By rounding these timers to whole seconds, all such timers will fire
+ * at the same time, rather than at various times spread out. The goal
+ * of this is to have the CPU wake up less, which saves power.
+ *
+ * The return value is the rounded version of the "j" parameter.
+ */
+unsigned long round_jiffies(unsigned long j)
+{
+ return __round_jiffies(j, raw_smp_processor_id());
+}
+EXPORT_SYMBOL_GPL(round_jiffies);
+
+/**
+ * round_jiffies_relative - function to round jiffies to a full second
+ * @j: the time in (relative) jiffies that should be rounded
+ *
+ * round_jiffies_relative rounds a time delta in the future (in jiffies)
+ * up or down to (approximately) full seconds. This is useful for timers
+ * for which the exact time they fire does not matter too much, as long as
+ * they fire approximately every X seconds.
+ *
+ * By rounding these timers to whole seconds, all such timers will fire
+ * at the same time, rather than at various times spread out. The goal
+ * of this is to have the CPU wake up less, which saves power.
+ *
+ * The return value is the rounded version of the "j" parameter.
+ */
+unsigned long round_jiffies_relative(unsigned long j)
+{
+ return __round_jiffies_relative(j, raw_smp_processor_id());
+}
+EXPORT_SYMBOL_GPL(round_jiffies_relative);
+
+
static inline void set_running_timer(tvec_base_t *base,
struct timer_list *timer)
{
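
The round_jiffies() family added above is meant for timers whose period matters but whose exact expiry does not. A short usage sketch with a hypothetical timer, using the 2.6-era timer callback signature:

static struct timer_list housekeeping_timer;	/* hypothetical */

static void housekeeping(unsigned long data)
{
	/* periodic, precision-insensitive work happens here */

	/* rearm roughly every 5 seconds, on a shared whole-second edge */
	mod_timer(&housekeeping_timer, round_jiffies(jiffies + 5 * HZ));
}
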
@@ -714,7 +846,7 @@ static int change_clocksource(void)
clock = new;
clock->cycle_last = now;
printk(KERN_INFO "Time: %s clocksource has been installed.\n",
- clock->name);
+ clock->name);
return 1;
} else if (clock->update_callback) {
return clock->update_callback();
@@ -722,7 +854,10 @@ static int change_clocksource(void)
return 0;
}
#else
-#define change_clocksource() (0)
+static inline int change_clocksource(void)
+{
+ return 0;
+}
#endif
/**
@@ -820,7 +955,8 @@ device_initcall(timekeeping_init_device);
* If the error is already larger, we look ahead even further
* to compensate for late or lost adjustments.
*/
-static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, s64 *offset)
+static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
+ s64 *offset)
{
s64 tick_error, i;
u32 look_ahead, adj;
@@ -844,7 +980,8 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, s64 *
* Now calculate the error in (1 << look_ahead) ticks, but first
* remove the single look ahead already included in the error.
*/
- tick_error = current_tick_length() >> (TICK_LENGTH_SHIFT - clock->shift + 1);
+ tick_error = current_tick_length() >>
+ (TICK_LENGTH_SHIFT - clock->shift + 1);
tick_error -= clock->xtime_interval >> 1;
error = ((error - tick_error) >> look_ahead) + tick_error;
@@ -896,7 +1033,8 @@ static void clocksource_adjust(struct clocksource *clock, s64 offset)
clock->mult += adj;
clock->xtime_interval += interval;
clock->xtime_nsec -= offset;
- clock->error -= (interval - offset) << (TICK_LENGTH_SHIFT - clock->shift);
+ clock->error -= (interval - offset) <<
+ (TICK_LENGTH_SHIFT - clock->shift);
}
/**
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 96f77013d3f0..baacc3691415 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -96,6 +96,15 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
stats->write_char = p->wchar;
stats->read_syscalls = p->syscr;
stats->write_syscalls = p->syscw;
+#ifdef CONFIG_TASK_IO_ACCOUNTING
+ stats->read_bytes = p->ioac.read_bytes;
+ stats->write_bytes = p->ioac.write_bytes;
+ stats->cancelled_write_bytes = p->ioac.cancelled_write_bytes;
+#else
+ stats->read_bytes = 0;
+ stats->write_bytes = 0;
+ stats->cancelled_write_bytes = 0;
+#endif
}
#undef KB
#undef MB
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 6b186750e9be..db49886bfae1 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -85,22 +85,19 @@ static inline int is_single_threaded(struct workqueue_struct *wq)
return list_empty(&wq->list);
}
+/*
+ * Set the workqueue on which a work item is to be run
+ * - Must *only* be called if the pending flag is set
+ */
static inline void set_wq_data(struct work_struct *work, void *wq)
{
- unsigned long new, old, res;
+ unsigned long new;
+
+ BUG_ON(!work_pending(work));
- /* assume the pending flag is already set and that the task has already
- * been queued on this workqueue */
new = (unsigned long) wq | (1UL << WORK_STRUCT_PENDING);
- res = work->management;
- if (res != new) {
- do {
- old = res;
- new = (unsigned long) wq;
- new |= (old & WORK_STRUCT_FLAG_MASK);
- res = cmpxchg(&work->management, old, new);
- } while (res != old);
- }
+ new |= work->management & WORK_STRUCT_FLAG_MASK;
+ work->management = new;
}
static inline void *get_wq_data(struct work_struct *work)
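
set_wq_data()/get_wq_data() pack the workqueue pointer and the flag bits into the single management word; the simplification above is safe because a set PENDING bit gives the setter exclusive ownership of that word, so no cmpxchg() loop is needed. A standalone sketch of the packing, with illustrative names, assuming the pointer is aligned enough to leave the low bits free:

#define DEMO_PENDING_BIT	0UL
#define DEMO_FLAG_MASK		((1UL << 2) - 1)	/* low two bits */

struct work_demo {
	unsigned long management;
};

static void demo_set_data(struct work_demo *w, void *wq)
{
	unsigned long new = (unsigned long)wq | (1UL << DEMO_PENDING_BIT);

	/* keep any other flag bits, replace the pointer part */
	new |= w->management & DEMO_FLAG_MASK;
	w->management = new;
}

static void *demo_get_data(struct work_demo *w)
{
	return (void *)(w->management & ~DEMO_FLAG_MASK);
}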