Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/acct.c             |  26
-rw-r--r-- | kernel/auditsc.c          |   6
-rw-r--r-- | kernel/cpuset.c           |  22
-rw-r--r-- | kernel/exit.c             |  75
-rw-r--r-- | kernel/fork.c             |  81
-rw-r--r-- | kernel/futex.c            |  10
-rw-r--r-- | kernel/irq/proc.c         |   3
-rw-r--r-- | kernel/kallsyms.c         |  16
-rw-r--r-- | kernel/kmod.c             |   2
-rw-r--r-- | kernel/mutex.c            |   9
-rw-r--r-- | kernel/nsproxy.c          |  42
-rw-r--r-- | kernel/pid.c              |  75
-rw-r--r-- | kernel/relay.c            |   4
-rw-r--r-- | kernel/sched.c            | 511
-rw-r--r-- | kernel/signal.c           |  13
-rw-r--r-- | kernel/sys.c              |  23
-rw-r--r-- | kernel/sysctl.c           | 387
-rw-r--r-- | kernel/time/clocksource.c |   8
-rw-r--r-- | kernel/timer.c            | 148
-rw-r--r-- | kernel/tsacct.c           |   9
-rw-r--r-- | kernel/workqueue.c        |  21
21 files changed, 883 insertions, 608 deletions
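
Most of the VFS churn in the diffs that follow is one mechanical conversion: the separate file->f_dentry and file->f_vfsmnt pointers become a single embedded pair, file->f_path.dentry and file->f_path.mnt. Below is a minimal userspace sketch of that refactor; the struct definitions are simplified stand-ins for illustration, not the kernel's real ones.

#include <stdio.h>

struct dentry { const char *d_name; };
struct vfsmount { const char *mnt_devname; };

struct path {                       /* the embedded pair */
	struct vfsmount *mnt;       /* was file->f_vfsmnt */
	struct dentry *dentry;      /* was file->f_dentry */
};

struct file {
	struct path f_path;         /* replaces the two loose pointers */
};

int main(void)
{
	struct dentry d = { .d_name = "acct" };
	struct vfsmount m = { .mnt_devname = "/dev/root" };
	struct file f = { .f_path = { .mnt = &m, .dentry = &d } };

	/* old spelling: f.f_dentry->d_name; new spelling after the refactor: */
	printf("%s on %s\n", f.f_path.dentry->d_name,
	       f.f_path.mnt->mnt_devname);
	return 0;
}

Folding the two fields into one struct path lets the pair be copied, pinned, and released as a unit, which is exactly the substitution repeated across acct.c, auditsc.c, cpuset.c, fork.c, futex.c, and relay.c below.
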
diff --git a/kernel/acct.c b/kernel/acct.c index dc12db8600e7..70d0d88e5554 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -118,7 +118,7 @@ static int check_free_space(struct file *file) spin_unlock(&acct_globals.lock); /* May block */ - if (vfs_statfs(file->f_dentry, &sbuf)) + if (vfs_statfs(file->f_path.dentry, &sbuf)) return res; suspend = sbuf.f_blocks * SUSPEND; resume = sbuf.f_blocks * RESUME; @@ -194,7 +194,7 @@ static void acct_file_reopen(struct file *file) add_timer(&acct_globals.timer); } if (old_acct) { - mnt_unpin(old_acct->f_vfsmnt); + mnt_unpin(old_acct->f_path.mnt); spin_unlock(&acct_globals.lock); do_acct_process(old_acct); filp_close(old_acct, NULL); @@ -212,7 +212,7 @@ static int acct_on(char *name) if (IS_ERR(file)) return PTR_ERR(file); - if (!S_ISREG(file->f_dentry->d_inode->i_mode)) { + if (!S_ISREG(file->f_path.dentry->d_inode->i_mode)) { filp_close(file, NULL); return -EACCES; } @@ -229,11 +229,11 @@ static int acct_on(char *name) } spin_lock(&acct_globals.lock); - mnt_pin(file->f_vfsmnt); + mnt_pin(file->f_path.mnt); acct_file_reopen(file); spin_unlock(&acct_globals.lock); - mntput(file->f_vfsmnt); /* it's pinned, now give up active reference */ + mntput(file->f_path.mnt); /* it's pinned, now give up active reference */ return 0; } @@ -283,7 +283,7 @@ asmlinkage long sys_acct(const char __user *name) void acct_auto_close_mnt(struct vfsmount *m) { spin_lock(&acct_globals.lock); - if (acct_globals.file && acct_globals.file->f_vfsmnt == m) + if (acct_globals.file && acct_globals.file->f_path.mnt == m) acct_file_reopen(NULL); spin_unlock(&acct_globals.lock); } @@ -299,7 +299,7 @@ void acct_auto_close(struct super_block *sb) { spin_lock(&acct_globals.lock); if (acct_globals.file && - acct_globals.file->f_vfsmnt->mnt_sb == sb) { + acct_globals.file->f_path.mnt->mnt_sb == sb) { acct_file_reopen(NULL); } spin_unlock(&acct_globals.lock); @@ -428,6 +428,7 @@ static void do_acct_process(struct file *file) u64 elapsed; u64 run_time; struct timespec uptime; + struct tty_struct *tty; /* * First check to see if there is enough free_space to continue @@ -484,16 +485,9 @@ static void do_acct_process(struct file *file) ac.ac_ppid = current->parent->tgid; #endif - mutex_lock(&tty_mutex); - /* FIXME: Whoever is responsible for current->signal locking needs - to use the same locking all over the kernel and document it */ - read_lock(&tasklist_lock); - ac.ac_tty = current->signal->tty ? - old_encode_dev(tty_devnum(current->signal->tty)) : 0; - read_unlock(&tasklist_lock); - mutex_unlock(&tty_mutex); - spin_lock_irq(&current->sighand->siglock); + tty = current->signal->tty; + ac.ac_tty = tty ? 
old_encode_dev(tty_devnum(tty)) : 0; ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime))); ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime))); ac.ac_flag = pacct->ac_flag; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 40722e26de98..298897559ca4 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -781,8 +781,8 @@ static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk if ((vma->vm_flags & VM_EXECUTABLE) && vma->vm_file) { audit_log_d_path(ab, "exe=", - vma->vm_file->f_dentry, - vma->vm_file->f_vfsmnt); + vma->vm_file->f_path.dentry, + vma->vm_file->f_path.mnt); break; } vma = vma->vm_next; @@ -826,10 +826,12 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts context->return_code); mutex_lock(&tty_mutex); + read_lock(&tasklist_lock); if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name) tty = tsk->signal->tty->name; else tty = "(none)"; + read_unlock(&tasklist_lock); audit_log_format(ab, " a0=%lx a1=%lx a2=%lx a3=%lx items=%d" " ppid=%d pid=%d auid=%u uid=%u gid=%u" diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 0a6b4d89f9a0..2c3b4431472b 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -413,8 +413,8 @@ static struct file_system_type cpuset_fs_type = { * * * When reading/writing to a file: - * - the cpuset to use in file->f_dentry->d_parent->d_fsdata - * - the 'cftype' of the file is file->f_dentry->d_fsdata + * - the cpuset to use in file->f_path.dentry->d_parent->d_fsdata + * - the 'cftype' of the file is file->f_path.dentry->d_fsdata */ struct cftype { @@ -1284,8 +1284,8 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *userbuf, size_t nbytes, loff_t *unused_ppos) { - struct cpuset *cs = __d_cs(file->f_dentry->d_parent); - struct cftype *cft = __d_cft(file->f_dentry); + struct cpuset *cs = __d_cs(file->f_path.dentry->d_parent); + struct cftype *cft = __d_cft(file->f_path.dentry); cpuset_filetype_t type = cft->private; char *buffer; char *pathbuf = NULL; @@ -1367,7 +1367,7 @@ static ssize_t cpuset_file_write(struct file *file, const char __user *buf, size_t nbytes, loff_t *ppos) { ssize_t retval = 0; - struct cftype *cft = __d_cft(file->f_dentry); + struct cftype *cft = __d_cft(file->f_path.dentry); if (!cft) return -ENODEV; @@ -1417,8 +1417,8 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) static ssize_t cpuset_common_file_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos) { - struct cftype *cft = __d_cft(file->f_dentry); - struct cpuset *cs = __d_cs(file->f_dentry->d_parent); + struct cftype *cft = __d_cft(file->f_path.dentry); + struct cpuset *cs = __d_cs(file->f_path.dentry->d_parent); cpuset_filetype_t type = cft->private; char *page; ssize_t retval = 0; @@ -1476,7 +1476,7 @@ static ssize_t cpuset_file_read(struct file *file, char __user *buf, size_t nbyt loff_t *ppos) { ssize_t retval = 0; - struct cftype *cft = __d_cft(file->f_dentry); + struct cftype *cft = __d_cft(file->f_path.dentry); if (!cft) return -ENODEV; @@ -1498,7 +1498,7 @@ static int cpuset_file_open(struct inode *inode, struct file *file) if (err) return err; - cft = __d_cft(file->f_dentry); + cft = __d_cft(file->f_path.dentry); if (!cft) return -ENODEV; if (cft->open) @@ -1511,7 +1511,7 @@ static int cpuset_file_open(struct inode *inode, struct file *file) static int cpuset_file_release(struct inode *inode, struct file *file) { - struct cftype *cft = __d_cft(file->f_dentry); + struct cftype *cft = 
__d_cft(file->f_path.dentry); if (cft->release) return cft->release(inode, file); return 0; @@ -1700,7 +1700,7 @@ static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids) */ static int cpuset_tasks_open(struct inode *unused, struct file *file) { - struct cpuset *cs = __d_cs(file->f_dentry->d_parent); + struct cpuset *cs = __d_cs(file->f_path.dentry->d_parent); struct ctr_struct *ctr; pid_t *pidarray; int npids; diff --git a/kernel/exit.c b/kernel/exit.c index 4e3f919edc48..122fadb972fc 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -13,7 +13,7 @@ #include <linux/completion.h> #include <linux/personality.h> #include <linux/tty.h> -#include <linux/namespace.h> +#include <linux/mnt_namespace.h> #include <linux/key.h> #include <linux/security.h> #include <linux/cpu.h> @@ -22,6 +22,7 @@ #include <linux/file.h> #include <linux/binfmts.h> #include <linux/nsproxy.h> +#include <linux/pid_namespace.h> #include <linux/ptrace.h> #include <linux/profile.h> #include <linux/mount.h> @@ -48,7 +49,6 @@ #include <asm/mmu_context.h> extern void sem_exit (void); -extern struct task_struct *child_reaper; static void exit_mm(struct task_struct * tsk); @@ -189,21 +189,18 @@ repeat: int session_of_pgrp(int pgrp) { struct task_struct *p; - int sid = -1; + int sid = 0; read_lock(&tasklist_lock); - do_each_task_pid(pgrp, PIDTYPE_PGID, p) { - if (p->signal->session > 0) { - sid = p->signal->session; - goto out; - } - } while_each_task_pid(pgrp, PIDTYPE_PGID, p); - p = find_task_by_pid(pgrp); - if (p) - sid = p->signal->session; -out: + + p = find_task_by_pid_type(PIDTYPE_PGID, pgrp); + if (p == NULL) + p = find_task_by_pid(pgrp); + if (p != NULL) + sid = process_session(p); + read_unlock(&tasklist_lock); - + return sid; } @@ -225,8 +222,8 @@ static int will_become_orphaned_pgrp(int pgrp, struct task_struct *ignored_task) || p->exit_state || is_init(p->real_parent)) continue; - if (process_group(p->real_parent) != pgrp - && p->real_parent->signal->session == p->signal->session) { + if (process_group(p->real_parent) != pgrp && + process_session(p->real_parent) == process_session(p)) { ret = 0; break; } @@ -260,7 +257,8 @@ static int has_stopped_jobs(int pgrp) } /** - * reparent_to_init - Reparent the calling kernel thread to the init task. + * reparent_to_init - Reparent the calling kernel thread to the init task + * of the pid space that the thread belongs to. 
* * If a kernel thread is launched as a result of a system call, or if * it ever exits, it should generally reparent itself to init so that @@ -278,8 +276,8 @@ static void reparent_to_init(void) ptrace_unlink(current); /* Reparent to init */ remove_parent(current); - current->parent = child_reaper; - current->real_parent = child_reaper; + current->parent = child_reaper(current); + current->real_parent = child_reaper(current); add_parent(current); /* Set the exit signal to SIGCHLD so we signal init on exit */ @@ -302,9 +300,9 @@ void __set_special_pids(pid_t session, pid_t pgrp) { struct task_struct *curr = current->group_leader; - if (curr->signal->session != session) { + if (process_session(curr) != session) { detach_pid(curr, PIDTYPE_SID); - curr->signal->session = session; + set_signal_session(curr->signal, session); attach_pid(curr, PIDTYPE_SID, session); } if (process_group(curr) != pgrp) { @@ -314,7 +312,7 @@ void __set_special_pids(pid_t session, pid_t pgrp) } } -void set_special_pids(pid_t session, pid_t pgrp) +static void set_special_pids(pid_t session, pid_t pgrp) { write_lock_irq(&tasklist_lock); __set_special_pids(session, pgrp); @@ -384,9 +382,7 @@ void daemonize(const char *name, ...) exit_mm(current); set_special_pids(1, 1); - mutex_lock(&tty_mutex); - current->signal->tty = NULL; - mutex_unlock(&tty_mutex); + proc_clear_tty(current); /* Block and flush all signals */ sigfillset(&blocked); @@ -429,7 +425,7 @@ static void close_files(struct files_struct * files) for (;;) { unsigned long set; i = j * __NFDBITS; - if (i >= fdt->max_fdset || i >= fdt->max_fds) + if (i >= fdt->max_fds) break; set = fdt->open_fds->fds_bits[j++]; while (set) { @@ -470,11 +466,9 @@ void fastcall put_files_struct(struct files_struct *files) * you can free files immediately. */ fdt = files_fdtable(files); - if (fdt == &files->fdtab) - fdt->free_files = files; - else + if (fdt != &files->fdtab) kmem_cache_free(files_cachep, files); - free_fdtable(fdt); + call_rcu(&fdt->rcu, free_fdtable_rcu); } } @@ -649,10 +643,11 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced) * outside, so the child pgrp is now orphaned. */ if ((process_group(p) != process_group(father)) && - (p->signal->session == father->signal->session)) { + (process_session(p) == process_session(father))) { int pgrp = process_group(p); - if (will_become_orphaned_pgrp(pgrp, NULL) && has_stopped_jobs(pgrp)) { + if (will_become_orphaned_pgrp(pgrp, NULL) && + has_stopped_jobs(pgrp)) { __kill_pg_info(SIGHUP, SEND_SIG_PRIV, pgrp); __kill_pg_info(SIGCONT, SEND_SIG_PRIV, pgrp); } @@ -663,7 +658,8 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced) * When we die, we re-parent all our children. * Try to give them to another thread in our thread * group, and if no such member exists, give it to - * the global child reaper process (ie "init") + * the child reaper process (ie "init") in our pid + * space. 
*/ static void forget_original_parent(struct task_struct *father, struct list_head *to_release) @@ -674,7 +670,7 @@ forget_original_parent(struct task_struct *father, struct list_head *to_release) do { reaper = next_thread(reaper); if (reaper == father) { - reaper = child_reaper; + reaper = child_reaper(father); break; } } while (reaper->exit_state); @@ -786,7 +782,7 @@ static void exit_notify(struct task_struct *tsk) t = tsk->real_parent; if ((process_group(t) != process_group(tsk)) && - (t->signal->session == tsk->signal->session) && + (process_session(t) == process_session(tsk)) && will_become_orphaned_pgrp(process_group(tsk), tsk) && has_stopped_jobs(process_group(tsk))) { __kill_pg_info(SIGHUP, SEND_SIG_PRIV, process_group(tsk)); @@ -860,8 +856,13 @@ fastcall NORET_TYPE void do_exit(long code) panic("Aiee, killing interrupt handler!"); if (unlikely(!tsk->pid)) panic("Attempted to kill the idle task!"); - if (unlikely(tsk == child_reaper)) - panic("Attempted to kill init!"); + if (unlikely(tsk == child_reaper(tsk))) { + if (tsk->nsproxy->pid_ns != &init_pid_ns) + tsk->nsproxy->pid_ns->child_reaper = init_pid_ns.child_reaper; + else + panic("Attempted to kill init!"); + } + if (unlikely(current->ptrace & PT_TRACE_EXIT)) { current->ptrace_message = code; diff --git a/kernel/fork.c b/kernel/fork.c index 7f2e31ba33af..d16c566eb645 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -18,7 +18,7 @@ #include <linux/module.h> #include <linux/vmalloc.h> #include <linux/completion.h> -#include <linux/namespace.h> +#include <linux/mnt_namespace.h> #include <linux/personality.h> #include <linux/mempolicy.h> #include <linux/sem.h> @@ -36,6 +36,7 @@ #include <linux/syscalls.h> #include <linux/jiffies.h> #include <linux/futex.h> +#include <linux/task_io_accounting_ops.h> #include <linux/rcupdate.h> #include <linux/ptrace.h> #include <linux/mount.h> @@ -252,7 +253,7 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) anon_vma_link(tmp); file = tmp->vm_file; if (file) { - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file->f_path.dentry->d_inode; get_file(file); if (tmp->vm_flags & VM_DENYWRITE) atomic_dec(&inode->i_writecount); @@ -613,7 +614,7 @@ static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk) static int count_open_files(struct fdtable *fdt) { - int size = fdt->max_fdset; + int size = fdt->max_fds; int i; /* Find the last open fd */ @@ -640,12 +641,10 @@ static struct files_struct *alloc_files(void) newf->next_fd = 0; fdt = &newf->fdtab; fdt->max_fds = NR_OPEN_DEFAULT; - fdt->max_fdset = EMBEDDED_FD_SET_SIZE; fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init; fdt->open_fds = (fd_set *)&newf->open_fds_init; fdt->fd = &newf->fd_array[0]; INIT_RCU_HEAD(&fdt->rcu); - fdt->free_files = NULL; fdt->next = NULL; rcu_assign_pointer(newf->fdt, fdt); out: @@ -661,7 +660,7 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) { struct files_struct *newf; struct file **old_fds, **new_fds; - int open_files, size, i, expand; + int open_files, size, i; struct fdtable *old_fdt, *new_fdt; *errorp = -ENOMEM; @@ -672,25 +671,14 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) spin_lock(&oldf->file_lock); old_fdt = files_fdtable(oldf); new_fdt = files_fdtable(newf); - size = old_fdt->max_fdset; open_files = count_open_files(old_fdt); - expand = 0; /* - * Check whether we need to allocate a larger fd array or fd set. - * Note: we're not a clone task, so the open count won't change. 
+ * Check whether we need to allocate a larger fd array and fd set. + * Note: we're not a clone task, so the open count won't change. */ - if (open_files > new_fdt->max_fdset) { - new_fdt->max_fdset = 0; - expand = 1; - } if (open_files > new_fdt->max_fds) { new_fdt->max_fds = 0; - expand = 1; - } - - /* if the old fdset gets grown now, we'll only copy up to "size" fds */ - if (expand) { spin_unlock(&oldf->file_lock); spin_lock(&newf->file_lock); *errorp = expand_files(newf, open_files-1); @@ -710,8 +698,10 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) old_fds = old_fdt->fd; new_fds = new_fdt->fd; - memcpy(new_fdt->open_fds->fds_bits, old_fdt->open_fds->fds_bits, open_files/8); - memcpy(new_fdt->close_on_exec->fds_bits, old_fdt->close_on_exec->fds_bits, open_files/8); + memcpy(new_fdt->open_fds->fds_bits, + old_fdt->open_fds->fds_bits, open_files/8); + memcpy(new_fdt->close_on_exec->fds_bits, + old_fdt->close_on_exec->fds_bits, open_files/8); for (i = open_files; i != 0; i--) { struct file *f = *old_fds++; @@ -736,22 +726,19 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) /* This is long word aligned thus could use a optimized version */ memset(new_fds, 0, size); - if (new_fdt->max_fdset > open_files) { - int left = (new_fdt->max_fdset-open_files)/8; + if (new_fdt->max_fds > open_files) { + int left = (new_fdt->max_fds-open_files)/8; int start = open_files / (8 * sizeof(unsigned long)); memset(&new_fdt->open_fds->fds_bits[start], 0, left); memset(&new_fdt->close_on_exec->fds_bits[start], 0, left); } -out: return newf; out_release: - free_fdset (new_fdt->close_on_exec, new_fdt->max_fdset); - free_fdset (new_fdt->open_fds, new_fdt->max_fdset); - free_fd_array(new_fdt->fd, new_fdt->max_fds); kmem_cache_free(files_cachep, newf); +out: return NULL; } @@ -1055,6 +1042,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->wchar = 0; /* I/O counter: bytes written */ p->syscr = 0; /* I/O counter: read syscalls */ p->syscw = 0; /* I/O counter: write syscalls */ + task_io_accounting_init(p); acct_clear_integrals(p); p->it_virt_expires = cputime_zero; @@ -1259,9 +1247,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (thread_group_leader(p)) { p->signal->tty = current->signal->tty; p->signal->pgrp = process_group(current); - p->signal->session = current->signal->session; + set_signal_session(p->signal, process_session(current)); attach_pid(p, PIDTYPE_PGID, process_group(p)); - attach_pid(p, PIDTYPE_SID, p->signal->session); + attach_pid(p, PIDTYPE_SID, process_session(p)); list_add_tail_rcu(&p->tasks, &init_task.tasks); __get_cpu_var(process_counts)++; @@ -1525,17 +1513,18 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp) } /* - * Unshare the namespace structure if it is being shared + * Unshare the mnt_namespace structure if it is being shared */ -static int unshare_namespace(unsigned long unshare_flags, struct namespace **new_nsp, struct fs_struct *new_fs) +static int unshare_mnt_namespace(unsigned long unshare_flags, + struct mnt_namespace **new_nsp, struct fs_struct *new_fs) { - struct namespace *ns = current->nsproxy->namespace; + struct mnt_namespace *ns = current->nsproxy->mnt_ns; if ((unshare_flags & CLONE_NEWNS) && ns) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; - *new_nsp = dup_namespace(current, new_fs ? new_fs : current->fs); + *new_nsp = dup_mnt_ns(current, new_fs ? 
new_fs : current->fs); if (!*new_nsp) return -ENOMEM; } @@ -1544,15 +1533,13 @@ static int unshare_namespace(unsigned long unshare_flags, struct namespace **new } /* - * Unsharing of sighand for tasks created with CLONE_SIGHAND is not - * supported yet + * Unsharing of sighand is not supported yet */ static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp) { struct sighand_struct *sigh = current->sighand; - if ((unshare_flags & CLONE_SIGHAND) && - (sigh && atomic_read(&sigh->count) > 1)) + if ((unshare_flags & CLONE_SIGHAND) && atomic_read(&sigh->count) > 1) return -EINVAL; else return 0; @@ -1625,8 +1612,8 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) { int err = 0; struct fs_struct *fs, *new_fs = NULL; - struct namespace *ns, *new_ns = NULL; - struct sighand_struct *sigh, *new_sigh = NULL; + struct mnt_namespace *ns, *new_ns = NULL; + struct sighand_struct *new_sigh = NULL; struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL; struct files_struct *fd, *new_fd = NULL; struct sem_undo_list *new_ulist = NULL; @@ -1647,7 +1634,7 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) goto bad_unshare_out; if ((err = unshare_fs(unshare_flags, &new_fs))) goto bad_unshare_cleanup_thread; - if ((err = unshare_namespace(unshare_flags, &new_ns, new_fs))) + if ((err = unshare_mnt_namespace(unshare_flags, &new_ns, new_fs))) goto bad_unshare_cleanup_fs; if ((err = unshare_sighand(unshare_flags, &new_sigh))) goto bad_unshare_cleanup_ns; @@ -1671,7 +1658,7 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) } } - if (new_fs || new_ns || new_sigh || new_mm || new_fd || new_ulist || + if (new_fs || new_ns || new_mm || new_fd || new_ulist || new_uts || new_ipc) { task_lock(current); @@ -1688,17 +1675,11 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) } if (new_ns) { - ns = current->nsproxy->namespace; - current->nsproxy->namespace = new_ns; + ns = current->nsproxy->mnt_ns; + current->nsproxy->mnt_ns = new_ns; new_ns = ns; } - if (new_sigh) { - sigh = current->sighand; - rcu_assign_pointer(current->sighand, new_sigh); - new_sigh = sigh; - } - if (new_mm) { mm = current->mm; active_mm = current->active_mm; @@ -1756,7 +1737,7 @@ bad_unshare_cleanup_sigh: bad_unshare_cleanup_ns: if (new_ns) - put_namespace(new_ns); + put_mnt_ns(new_ns); bad_unshare_cleanup_fs: if (new_fs) diff --git a/kernel/futex.c b/kernel/futex.c index 95989a3b4168..5a737de857d3 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -166,7 +166,7 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2) /* * Get parameters which are the keys for a futex. * - * For shared mappings, it's (page->index, vma->vm_file->f_dentry->d_inode, + * For shared mappings, it's (page->index, vma->vm_file->f_path.dentry->d_inode, * offset_within_page). For private mappings, it's (uaddr, current->mm). * We can usually work out the index without swapping in the page. * @@ -223,7 +223,7 @@ static int get_futex_key(u32 __user *uaddr, union futex_key *key) /* * Linear file mappings are also simple. */ - key->shared.inode = vma->vm_file->f_dentry->d_inode; + key->shared.inode = vma->vm_file->f_path.dentry->d_inode; key->both.offset++; /* Bit 0 of offset indicates inode-based key. 
*/ if (likely(!(vma->vm_flags & VM_NONLINEAR))) { key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT) @@ -1528,9 +1528,9 @@ static int futex_fd(u32 __user *uaddr, int signal) goto out; } filp->f_op = &futex_fops; - filp->f_vfsmnt = mntget(futex_mnt); - filp->f_dentry = dget(futex_mnt->mnt_root); - filp->f_mapping = filp->f_dentry->d_inode->i_mapping; + filp->f_path.mnt = mntget(futex_mnt); + filp->f_path.dentry = dget(futex_mnt->mnt_root); + filp->f_mapping = filp->f_path.dentry->d_inode->i_mapping; if (signal) { err = __f_setown(filp, task_pid(current), PIDTYPE_PID, 1); diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 9a352667007c..61f5c717a8f5 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -54,7 +54,8 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer, unsigned int irq = (int)(long)data, full_count = count, err; cpumask_t new_value, tmp; - if (!irq_desc[irq].chip->set_affinity || no_irq_affinity) + if (!irq_desc[irq].chip->set_affinity || no_irq_affinity || + CHECK_IRQ_PER_CPU(irq_desc[irq].status)) return -EIO; err = cpumask_parse_user(buffer, count, new_value); diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index ab63cfc42992..6f294ff4f9ee 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -31,14 +31,14 @@ #endif /* These will be re-linked against their real values during the second link stage */ -extern unsigned long kallsyms_addresses[] __attribute__((weak)); -extern unsigned long kallsyms_num_syms __attribute__((weak,section("data"))); -extern u8 kallsyms_names[] __attribute__((weak)); +extern const unsigned long kallsyms_addresses[] __attribute__((weak)); +extern const unsigned long kallsyms_num_syms __attribute__((weak)); +extern const u8 kallsyms_names[] __attribute__((weak)); -extern u8 kallsyms_token_table[] __attribute__((weak)); -extern u16 kallsyms_token_index[] __attribute__((weak)); +extern const u8 kallsyms_token_table[] __attribute__((weak)); +extern const u16 kallsyms_token_index[] __attribute__((weak)); -extern unsigned long kallsyms_markers[] __attribute__((weak)); +extern const unsigned long kallsyms_markers[] __attribute__((weak)); static inline int is_kernel_inittext(unsigned long addr) { @@ -84,7 +84,7 @@ static int is_ksym_addr(unsigned long addr) static unsigned int kallsyms_expand_symbol(unsigned int off, char *result) { int len, skipped_first = 0; - u8 *tptr, *data; + const u8 *tptr, *data; /* get the compressed symbol length from the first symbol byte */ data = &kallsyms_names[off]; @@ -132,7 +132,7 @@ static char kallsyms_get_symbol_type(unsigned int off) * kallsyms array */ static unsigned int get_symbol_offset(unsigned long pos) { - u8 *name; + const u8 *name; int i; /* use the closest marker we have. 
We have markers every 256 positions, diff --git a/kernel/kmod.c b/kernel/kmod.c index 8d2bea09a4ec..3a7379aa31ca 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -25,7 +25,7 @@ #include <linux/kmod.h> #include <linux/smp_lock.h> #include <linux/slab.h> -#include <linux/namespace.h> +#include <linux/mnt_namespace.h> #include <linux/completion.h> #include <linux/file.h> #include <linux/workqueue.h> diff --git a/kernel/mutex.c b/kernel/mutex.c index 8c71cf72a497..e7cbbb82765b 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -206,6 +206,15 @@ mutex_lock_nested(struct mutex *lock, unsigned int subclass) } EXPORT_SYMBOL_GPL(mutex_lock_nested); + +int __sched +mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass) +{ + might_sleep(); + return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, subclass); +} + +EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); #endif /* diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 674aceb7335a..e2ce748e96af 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -17,8 +17,9 @@ #include <linux/version.h> #include <linux/nsproxy.h> #include <linux/init_task.h> -#include <linux/namespace.h> +#include <linux/mnt_namespace.h> #include <linux/utsname.h> +#include <linux/pid_namespace.h> struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); @@ -45,8 +46,10 @@ static inline struct nsproxy *clone_namespaces(struct nsproxy *orig) struct nsproxy *ns; ns = kmemdup(orig, sizeof(struct nsproxy), GFP_KERNEL); - if (ns) + if (ns) { atomic_set(&ns->count, 1); + ns->id = -1; + } return ns; } @@ -60,12 +63,14 @@ struct nsproxy *dup_namespaces(struct nsproxy *orig) struct nsproxy *ns = clone_namespaces(orig); if (ns) { - if (ns->namespace) - get_namespace(ns->namespace); + if (ns->mnt_ns) + get_mnt_ns(ns->mnt_ns); if (ns->uts_ns) get_uts_ns(ns->uts_ns); if (ns->ipc_ns) get_ipc_ns(ns->ipc_ns); + if (ns->pid_ns) + get_pid_ns(ns->pid_ns); } return ns; @@ -97,7 +102,7 @@ int copy_namespaces(int flags, struct task_struct *tsk) tsk->nsproxy = new_ns; - err = copy_namespace(flags, tsk); + err = copy_mnt_ns(flags, tsk); if (err) goto out_ns; @@ -109,16 +114,23 @@ int copy_namespaces(int flags, struct task_struct *tsk) if (err) goto out_ipc; + err = copy_pid_ns(flags, tsk); + if (err) + goto out_pid; + out: put_nsproxy(old_ns); return err; +out_pid: + if (new_ns->ipc_ns) + put_ipc_ns(new_ns->ipc_ns); out_ipc: if (new_ns->uts_ns) put_uts_ns(new_ns->uts_ns); out_uts: - if (new_ns->namespace) - put_namespace(new_ns->namespace); + if (new_ns->mnt_ns) + put_mnt_ns(new_ns->mnt_ns); out_ns: tsk->nsproxy = old_ns; kfree(new_ns); @@ -127,11 +139,13 @@ out_ns: void free_nsproxy(struct nsproxy *ns) { - if (ns->namespace) - put_namespace(ns->namespace); - if (ns->uts_ns) - put_uts_ns(ns->uts_ns); - if (ns->ipc_ns) - put_ipc_ns(ns->ipc_ns); - kfree(ns); + if (ns->mnt_ns) + put_mnt_ns(ns->mnt_ns); + if (ns->uts_ns) + put_uts_ns(ns->uts_ns); + if (ns->ipc_ns) + put_ipc_ns(ns->ipc_ns); + if (ns->pid_ns) + put_pid_ns(ns->pid_ns); + kfree(ns); } diff --git a/kernel/pid.c b/kernel/pid.c index a48879b0b921..2efe9d8d367b 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -26,7 +26,7 @@ #include <linux/init.h> #include <linux/bootmem.h> #include <linux/hash.h> -#include <linux/pspace.h> +#include <linux/pid_namespace.h> #define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift) static struct hlist_head *pid_hash; @@ -43,9 +43,10 @@ int pid_max_max = PID_MAX_LIMIT; #define BITS_PER_PAGE (PAGE_SIZE*8) #define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1) -static inline int mk_pid(struct 
pspace *pspace, struct pidmap *map, int off) +static inline int mk_pid(struct pid_namespace *pid_ns, + struct pidmap *map, int off) { - return (map - pspace->pidmap)*BITS_PER_PAGE + off; + return (map - pid_ns->pidmap)*BITS_PER_PAGE + off; } #define find_next_offset(map, off) \ @@ -57,11 +58,15 @@ static inline int mk_pid(struct pspace *pspace, struct pidmap *map, int off) * value does not cause lots of bitmaps to be allocated, but * the scheme scales to up to 4 million PIDs, runtime. */ -struct pspace init_pspace = { +struct pid_namespace init_pid_ns = { + .kref = { + .refcount = ATOMIC_INIT(2), + }, .pidmap = { [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } }, - .last_pid = 0 + .last_pid = 0, + .child_reaper = &init_task }; /* @@ -80,25 +85,25 @@ struct pspace init_pspace = { static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); -static fastcall void free_pidmap(struct pspace *pspace, int pid) +static fastcall void free_pidmap(struct pid_namespace *pid_ns, int pid) { - struct pidmap *map = pspace->pidmap + pid / BITS_PER_PAGE; + struct pidmap *map = pid_ns->pidmap + pid / BITS_PER_PAGE; int offset = pid & BITS_PER_PAGE_MASK; clear_bit(offset, map->page); atomic_inc(&map->nr_free); } -static int alloc_pidmap(struct pspace *pspace) +static int alloc_pidmap(struct pid_namespace *pid_ns) { - int i, offset, max_scan, pid, last = pspace->last_pid; + int i, offset, max_scan, pid, last = pid_ns->last_pid; struct pidmap *map; pid = last + 1; if (pid >= pid_max) pid = RESERVED_PIDS; offset = pid & BITS_PER_PAGE_MASK; - map = &pspace->pidmap[pid/BITS_PER_PAGE]; + map = &pid_ns->pidmap[pid/BITS_PER_PAGE]; max_scan = (pid_max + BITS_PER_PAGE - 1)/BITS_PER_PAGE - !offset; for (i = 0; i <= max_scan; ++i) { if (unlikely(!map->page)) { @@ -120,11 +125,11 @@ static int alloc_pidmap(struct pspace *pspace) do { if (!test_and_set_bit(offset, map->page)) { atomic_dec(&map->nr_free); - pspace->last_pid = pid; + pid_ns->last_pid = pid; return pid; } offset = find_next_offset(map, offset); - pid = mk_pid(pspace, map, offset); + pid = mk_pid(pid_ns, map, offset); /* * find_next_offset() found a bit, the pid from it * is in-bounds, and if we fell back to the last @@ -135,34 +140,34 @@ static int alloc_pidmap(struct pspace *pspace) (i != max_scan || pid < last || !((last+1) & BITS_PER_PAGE_MASK))); } - if (map < &pspace->pidmap[(pid_max-1)/BITS_PER_PAGE]) { + if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) { ++map; offset = 0; } else { - map = &pspace->pidmap[0]; + map = &pid_ns->pidmap[0]; offset = RESERVED_PIDS; if (unlikely(last == offset)) break; } - pid = mk_pid(pspace, map, offset); + pid = mk_pid(pid_ns, map, offset); } return -1; } -static int next_pidmap(struct pspace *pspace, int last) +static int next_pidmap(struct pid_namespace *pid_ns, int last) { int offset; struct pidmap *map, *end; offset = (last + 1) & BITS_PER_PAGE_MASK; - map = &pspace->pidmap[(last + 1)/BITS_PER_PAGE]; - end = &pspace->pidmap[PIDMAP_ENTRIES]; + map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE]; + end = &pid_ns->pidmap[PIDMAP_ENTRIES]; for (; map < end; map++, offset = 0) { if (unlikely(!map->page)) continue; offset = find_next_bit((map)->page, BITS_PER_PAGE, offset); if (offset < BITS_PER_PAGE) - return mk_pid(pspace, map, offset); + return mk_pid(pid_ns, map, offset); } return -1; } @@ -192,7 +197,7 @@ fastcall void free_pid(struct pid *pid) hlist_del_rcu(&pid->pid_chain); spin_unlock_irqrestore(&pidmap_lock, flags); - free_pidmap(&init_pspace, pid->nr); + free_pidmap(current->nsproxy->pid_ns, 
pid->nr); call_rcu(&pid->rcu, delayed_put_pid); } @@ -206,7 +211,7 @@ struct pid *alloc_pid(void) if (!pid) goto out; - nr = alloc_pidmap(&init_pspace); + nr = alloc_pidmap(current->nsproxy->pid_ns); if (nr < 0) goto out_free; @@ -348,13 +353,33 @@ struct pid *find_ge_pid(int nr) pid = find_pid(nr); if (pid) break; - nr = next_pidmap(&init_pspace, nr); + nr = next_pidmap(current->nsproxy->pid_ns, nr); } while (nr > 0); return pid; } EXPORT_SYMBOL_GPL(find_get_pid); +int copy_pid_ns(int flags, struct task_struct *tsk) +{ + struct pid_namespace *old_ns = tsk->nsproxy->pid_ns; + int err = 0; + + if (!old_ns) + return 0; + + get_pid_ns(old_ns); + return err; +} + +void free_pid_ns(struct kref *kref) +{ + struct pid_namespace *ns; + + ns = container_of(kref, struct pid_namespace, kref); + kfree(ns); +} + /* * The pid hash table is scaled according to the amount of memory in the * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or @@ -382,10 +407,10 @@ void __init pidhash_init(void) void __init pidmap_init(void) { - init_pspace.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL); + init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL); /* Reserve PID 0. We never call free_pidmap(0) */ - set_bit(0, init_pspace.pidmap[0].page); - atomic_dec(&init_pspace.pidmap[0].nr_free); + set_bit(0, init_pid_ns.pidmap[0].page); + atomic_dec(&init_pid_ns.pidmap[0].nr_free); pid_cachep = kmem_cache_create("pid", sizeof(struct pid), __alignof__(struct pid), diff --git a/kernel/relay.c b/kernel/relay.c index 75a3a9a7efc2..818e514729cf 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -959,7 +959,7 @@ static inline ssize_t relay_file_read_subbufs(struct file *filp, if (!desc->count) return 0; - mutex_lock(&filp->f_dentry->d_inode->i_mutex); + mutex_lock(&filp->f_path.dentry->d_inode->i_mutex); do { if (!relay_file_read_avail(buf, *ppos)) break; @@ -979,7 +979,7 @@ static inline ssize_t relay_file_read_subbufs(struct file *filp, *ppos = relay_file_read_end_pos(buf, read_start, ret); } } while (desc->count && ret); - mutex_unlock(&filp->f_dentry->d_inode->i_mutex); + mutex_unlock(&filp->f_path.dentry->d_inode->i_mutex); return desc->written; } diff --git a/kernel/sched.c b/kernel/sched.c index f385eff4682d..8a0afb97af71 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -225,8 +225,10 @@ struct rq { unsigned long nr_uninterruptible; unsigned long expired_timestamp; - unsigned long long timestamp_last_tick; + /* Cached timestamp set by update_cpu_clock() */ + unsigned long long most_recent_timestamp; struct task_struct *curr, *idle; + unsigned long next_balance; struct mm_struct *prev_mm; struct prio_array *active, *expired, arrays[2]; int best_expired_prio; @@ -426,7 +428,7 @@ static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) * bump this up when changing the output format or the meaning of an existing * format, so that tools can adapt (or abort) */ -#define SCHEDSTAT_VERSION 12 +#define SCHEDSTAT_VERSION 14 static int show_schedstat(struct seq_file *seq, void *v) { @@ -464,7 +466,8 @@ static int show_schedstat(struct seq_file *seq, void *v) seq_printf(seq, "domain%d %s", dcnt++, mask_str); for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES; itype++) { - seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu", + seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu " + "%lu", sd->lb_cnt[itype], sd->lb_balanced[itype], sd->lb_failed[itype], @@ -474,11 +477,13 @@ static int show_schedstat(struct seq_file *seq, void *v) sd->lb_nobusyq[itype], sd->lb_nobusyg[itype]); } - seq_printf(seq, " %lu 
%lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n", + seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu" + " %lu %lu %lu\n", sd->alb_cnt, sd->alb_failed, sd->alb_pushed, sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed, sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed, - sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance); + sd->ttwu_wake_remote, sd->ttwu_move_affine, + sd->ttwu_move_balance); } preempt_enable(); #endif @@ -547,7 +552,7 @@ rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies) #endif /* - * rq_lock - lock a given runqueue and disable interrupts. + * this_rq_lock - lock this runqueue and disable interrupts. */ static inline struct rq *this_rq_lock(void) __acquires(rq->lock) @@ -938,13 +943,16 @@ static void activate_task(struct task_struct *p, struct rq *rq, int local) { unsigned long long now; + if (rt_task(p)) + goto out; + now = sched_clock(); #ifdef CONFIG_SMP if (!local) { /* Compensate for drifting sched_clock */ struct rq *this_rq = this_rq(); - now = (now - this_rq->timestamp_last_tick) - + rq->timestamp_last_tick; + now = (now - this_rq->most_recent_timestamp) + + rq->most_recent_timestamp; } #endif @@ -959,8 +967,7 @@ static void activate_task(struct task_struct *p, struct rq *rq, int local) (now - p->timestamp) >> 20); } - if (!rt_task(p)) - p->prio = recalc_task_prio(p, now); + p->prio = recalc_task_prio(p, now); /* * This checks to make sure it's not an uninterruptible task @@ -985,7 +992,7 @@ static void activate_task(struct task_struct *p, struct rq *rq, int local) } } p->timestamp = now; - +out: __activate_task(p, rq); } @@ -1450,7 +1457,9 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) if (this_sd->flags & SD_WAKE_AFFINE) { unsigned long tl = this_load; - unsigned long tl_per_task = cpu_avg_load_per_task(this_cpu); + unsigned long tl_per_task; + + tl_per_task = cpu_avg_load_per_task(this_cpu); /* * If sync wakeup then subtract the (maximum possible) @@ -1688,8 +1697,8 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) * Not the local CPU - must adjust timestamp. This should * get optimised away in the !CONFIG_SMP case. */ - p->timestamp = (p->timestamp - this_rq->timestamp_last_tick) - + rq->timestamp_last_tick; + p->timestamp = (p->timestamp - this_rq->most_recent_timestamp) + + rq->most_recent_timestamp; __activate_task(p, rq); if (TASK_PREEMPTS_CURR(p, rq)) resched_task(rq->curr); @@ -1952,6 +1961,7 @@ static void double_rq_lock(struct rq *rq1, struct rq *rq2) __acquires(rq1->lock) __acquires(rq2->lock) { + BUG_ON(!irqs_disabled()); if (rq1 == rq2) { spin_lock(&rq1->lock); __acquire(rq2->lock); /* Fake it out ;) */ @@ -1991,6 +2001,11 @@ static void double_lock_balance(struct rq *this_rq, struct rq *busiest) __acquires(busiest->lock) __acquires(this_rq->lock) { + if (unlikely(!irqs_disabled())) { + /* printk() doesn't work good under rq->lock */ + spin_unlock(&this_rq->lock); + BUG_ON(1); + } if (unlikely(!spin_trylock(&busiest->lock))) { if (busiest < this_rq) { spin_unlock(&this_rq->lock); @@ -2061,8 +2076,8 @@ static void pull_task(struct rq *src_rq, struct prio_array *src_array, set_task_cpu(p, this_cpu); inc_nr_running(p, this_rq); enqueue_task(p, this_array); - p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) - + this_rq->timestamp_last_tick; + p->timestamp = (p->timestamp - src_rq->most_recent_timestamp) + + this_rq->most_recent_timestamp; /* * Note that idle threads have a prio of MAX_PRIO, for this test * to be always true for them. 
@@ -2098,10 +2113,15 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, * 2) too many balance attempts have failed. */ - if (sd->nr_balance_failed > sd->cache_nice_tries) + if (sd->nr_balance_failed > sd->cache_nice_tries) { +#ifdef CONFIG_SCHEDSTATS + if (task_hot(p, rq->most_recent_timestamp, sd)) + schedstat_inc(sd, lb_hot_gained[idle]); +#endif return 1; + } - if (task_hot(p, rq->timestamp_last_tick, sd)) + if (task_hot(p, rq->most_recent_timestamp, sd)) return 0; return 1; } @@ -2199,11 +2219,6 @@ skip_queue: goto skip_bitmap; } -#ifdef CONFIG_SCHEDSTATS - if (task_hot(tmp, busiest->timestamp_last_tick, sd)) - schedstat_inc(sd, lb_hot_gained[idle]); -#endif - pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); pulled++; rem_load_move -= tmp->load_weight; @@ -2241,7 +2256,7 @@ out: static struct sched_group * find_busiest_group(struct sched_domain *sd, int this_cpu, unsigned long *imbalance, enum idle_type idle, int *sd_idle, - cpumask_t *cpus) + cpumask_t *cpus, int *balance) { struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; unsigned long max_load, avg_load, total_load, this_load, total_pwr; @@ -2270,10 +2285,14 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, unsigned long load, group_capacity; int local_group; int i; + unsigned int balance_cpu = -1, first_idle_cpu = 0; unsigned long sum_nr_running, sum_weighted_load; local_group = cpu_isset(this_cpu, group->cpumask); + if (local_group) + balance_cpu = first_cpu(group->cpumask); + /* Tally up the load of all CPUs in the group */ sum_weighted_load = sum_nr_running = avg_load = 0; @@ -2289,9 +2308,14 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, *sd_idle = 0; /* Bias balancing toward cpus of our domain */ - if (local_group) + if (local_group) { + if (idle_cpu(i) && !first_idle_cpu) { + first_idle_cpu = 1; + balance_cpu = i; + } + load = target_load(i, load_idx); - else + } else load = source_load(i, load_idx); avg_load += load; @@ -2299,6 +2323,16 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, sum_weighted_load += rq->raw_weighted_load; } + /* + * First idle cpu or the first cpu(busiest) in this sched group + * is eligible for doing load balancing at this and above + * domains. 
+ */ + if (local_group && balance_cpu != this_cpu && balance) { + *balance = 0; + goto ret; + } + total_load += avg_load; total_pwr += group->cpu_power; @@ -2458,18 +2492,21 @@ small_imbalance: pwr_now /= SCHED_LOAD_SCALE; /* Amount of load we'd subtract */ - tmp = busiest_load_per_task*SCHED_LOAD_SCALE/busiest->cpu_power; + tmp = busiest_load_per_task * SCHED_LOAD_SCALE / + busiest->cpu_power; if (max_load > tmp) pwr_move += busiest->cpu_power * min(busiest_load_per_task, max_load - tmp); /* Amount of load we'd add */ - if (max_load*busiest->cpu_power < - busiest_load_per_task*SCHED_LOAD_SCALE) - tmp = max_load*busiest->cpu_power/this->cpu_power; + if (max_load * busiest->cpu_power < + busiest_load_per_task * SCHED_LOAD_SCALE) + tmp = max_load * busiest->cpu_power / this->cpu_power; else - tmp = busiest_load_per_task*SCHED_LOAD_SCALE/this->cpu_power; - pwr_move += this->cpu_power*min(this_load_per_task, this_load + tmp); + tmp = busiest_load_per_task * SCHED_LOAD_SCALE / + this->cpu_power; + pwr_move += this->cpu_power * + min(this_load_per_task, this_load + tmp); pwr_move /= SCHED_LOAD_SCALE; /* Move if we gain throughput */ @@ -2490,8 +2527,8 @@ out_balanced: *imbalance = min_load_per_task; return group_min; } -ret: #endif +ret: *imbalance = 0; return NULL; } @@ -2540,17 +2577,17 @@ static inline unsigned long minus_1_or_zero(unsigned long n) /* * Check this_cpu to ensure it is balanced within domain. Attempt to move * tasks if there is an imbalance. - * - * Called with this_rq unlocked. */ static int load_balance(int this_cpu, struct rq *this_rq, - struct sched_domain *sd, enum idle_type idle) + struct sched_domain *sd, enum idle_type idle, + int *balance) { int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; struct sched_group *group; unsigned long imbalance; struct rq *busiest; cpumask_t cpus = CPU_MASK_ALL; + unsigned long flags; /* * When power savings policy is enabled for the parent domain, idle @@ -2566,7 +2603,11 @@ static int load_balance(int this_cpu, struct rq *this_rq, redo: group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, - &cpus); + &cpus, balance); + + if (*balance == 0) + goto out_balanced; + if (!group) { schedstat_inc(sd, lb_nobusyg[idle]); goto out_balanced; @@ -2590,11 +2631,13 @@ redo: * still unbalanced. nr_moved simply stays zero, so it is * correctly treated as an imbalance. 
*/ + local_irq_save(flags); double_rq_lock(this_rq, busiest); nr_moved = move_tasks(this_rq, this_cpu, busiest, minus_1_or_zero(busiest->nr_running), imbalance, sd, idle, &all_pinned); double_rq_unlock(this_rq, busiest); + local_irq_restore(flags); /* All tasks on this runqueue were pinned by CPU affinity */ if (unlikely(all_pinned)) { @@ -2611,13 +2654,13 @@ redo: if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { - spin_lock(&busiest->lock); + spin_lock_irqsave(&busiest->lock, flags); /* don't kick the migration_thread, if the curr * task on busiest cpu can't be moved to this_cpu */ if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) { - spin_unlock(&busiest->lock); + spin_unlock_irqrestore(&busiest->lock, flags); all_pinned = 1; goto out_one_pinned; } @@ -2627,7 +2670,7 @@ redo: busiest->push_cpu = this_cpu; active_balance = 1; } - spin_unlock(&busiest->lock); + spin_unlock_irqrestore(&busiest->lock, flags); if (active_balance) wake_up_process(busiest->migration_thread); @@ -2706,7 +2749,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); redo: group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, - &sd_idle, &cpus); + &sd_idle, &cpus, NULL); if (!group) { schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); goto out_balanced; @@ -2766,14 +2809,28 @@ out_balanced: static void idle_balance(int this_cpu, struct rq *this_rq) { struct sched_domain *sd; + int pulled_task = 0; + unsigned long next_balance = jiffies + 60 * HZ; for_each_domain(this_cpu, sd) { if (sd->flags & SD_BALANCE_NEWIDLE) { /* If we've pulled tasks over stop searching: */ - if (load_balance_newidle(this_cpu, this_rq, sd)) + pulled_task = load_balance_newidle(this_cpu, + this_rq, sd); + if (time_after(next_balance, + sd->last_balance + sd->balance_interval)) + next_balance = sd->last_balance + + sd->balance_interval; + if (pulled_task) break; } } + if (!pulled_task) + /* + * We are going idle. next_balance may be set based on + * a busy processor. So reset next_balance. + */ + this_rq->next_balance = next_balance; } /* @@ -2826,26 +2883,9 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) spin_unlock(&target_rq->lock); } -/* - * rebalance_tick will get called every timer tick, on every CPU. - * - * It checks each scheduling domain to see if it is due to be balanced, - * and initiates a balancing operation if so. - * - * Balancing parameters are set up in arch_init_sched_domains. - */ - -/* Don't have all balancing operations going off at once: */ -static inline unsigned long cpu_offset(int cpu) +static void update_load(struct rq *this_rq) { - return jiffies + cpu * HZ / NR_CPUS; -} - -static void -rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle) -{ - unsigned long this_load, interval, j = cpu_offset(this_cpu); - struct sched_domain *sd; + unsigned long this_load; int i, scale; this_load = this_rq->raw_weighted_load; @@ -2865,6 +2905,32 @@ rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle) new_load += scale-1; this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale; } +} + +/* + * run_rebalance_domains is triggered when needed from the scheduler tick. + * + * It checks each scheduling domain to see if it is due to be balanced, + * and initiates a balancing operation if so. + * + * Balancing parameters are set up in arch_init_sched_domains. 
+ */ +static DEFINE_SPINLOCK(balancing); + +static void run_rebalance_domains(struct softirq_action *h) +{ + int this_cpu = smp_processor_id(), balance = 1; + struct rq *this_rq = cpu_rq(this_cpu); + unsigned long interval; + struct sched_domain *sd; + /* + * We are idle if there are no processes running. This + * is valid even if we are the idle process (SMT). + */ + enum idle_type idle = !this_rq->nr_running ? + SCHED_IDLE : NOT_IDLE; + /* Earliest time when we have to call run_rebalance_domains again */ + unsigned long next_balance = jiffies + 60*HZ; for_each_domain(this_cpu, sd) { if (!(sd->flags & SD_LOAD_BALANCE)) @@ -2879,8 +2945,13 @@ rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle) if (unlikely(!interval)) interval = 1; - if (j - sd->last_balance >= interval) { - if (load_balance(this_cpu, this_rq, sd, idle)) { + if (sd->flags & SD_SERIALIZE) { + if (!spin_trylock(&balancing)) + goto out; + } + + if (time_after_eq(jiffies, sd->last_balance + interval)) { + if (load_balance(this_cpu, this_rq, sd, idle, &balance)) { /* * We've pulled tasks over so either we're no * longer idle, or one of our SMT siblings is @@ -2888,39 +2959,48 @@ rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle) */ idle = NOT_IDLE; } - sd->last_balance += interval; + sd->last_balance = jiffies; } + if (sd->flags & SD_SERIALIZE) + spin_unlock(&balancing); +out: + if (time_after(next_balance, sd->last_balance + interval)) + next_balance = sd->last_balance + interval; + + /* + * Stop the load balance at this level. There is another + * CPU in our sched group which is doing load balancing more + * actively. + */ + if (!balance) + break; } + this_rq->next_balance = next_balance; } #else /* * on UP we do not need to balance between CPUs: */ -static inline void rebalance_tick(int cpu, struct rq *rq, enum idle_type idle) -{ -} static inline void idle_balance(int cpu, struct rq *rq) { } #endif -static inline int wake_priority_sleeper(struct rq *rq) +static inline void wake_priority_sleeper(struct rq *rq) { - int ret = 0; - #ifdef CONFIG_SCHED_SMT + if (!rq->nr_running) + return; + spin_lock(&rq->lock); /* * If an SMT sibling task has been put to sleep for priority * reasons reschedule the idle task to see if it can now run. */ - if (rq->nr_running) { + if (rq->nr_running) resched_task(rq->idle); - ret = 1; - } spin_unlock(&rq->lock); #endif - return ret; } DEFINE_PER_CPU(struct kernel_stat, kstat); @@ -2934,7 +3014,8 @@ EXPORT_PER_CPU_SYMBOL(kstat); static inline void update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now) { - p->sched_time += now - max(p->timestamp, rq->timestamp_last_tick); + p->sched_time += now - p->last_ran; + p->last_ran = rq->most_recent_timestamp = now; } /* @@ -2947,8 +3028,7 @@ unsigned long long current_sched_time(const struct task_struct *p) unsigned long flags; local_irq_save(flags); - ns = max(p->timestamp, task_rq(p)->timestamp_last_tick); - ns = p->sched_time + sched_clock() - ns; + ns = p->sched_time + sched_clock() - p->last_ran; local_irq_restore(flags); return ns; @@ -3048,35 +3128,12 @@ void account_steal_time(struct task_struct *p, cputime_t steal) cpustat->steal = cputime64_add(cpustat->steal, tmp); } -/* - * This function gets called by the timer code, with HZ frequency. - * We call it with interrupts disabled. - * - * It also gets called by the fork code, when changing the parent's - * timeslices. 
- */ -void scheduler_tick(void) +static void task_running_tick(struct rq *rq, struct task_struct *p) { - unsigned long long now = sched_clock(); - struct task_struct *p = current; - int cpu = smp_processor_id(); - struct rq *rq = cpu_rq(cpu); - - update_cpu_clock(p, rq, now); - - rq->timestamp_last_tick = now; - - if (p == rq->idle) { - if (wake_priority_sleeper(rq)) - goto out; - rebalance_tick(cpu, rq, SCHED_IDLE); - return; - } - - /* Task might have expired already, but not scheduled off yet */ if (p->array != rq->active) { + /* Task has expired but was not scheduled yet */ set_tsk_need_resched(p); - goto out; + return; } spin_lock(&rq->lock); /* @@ -3144,8 +3201,34 @@ void scheduler_tick(void) } out_unlock: spin_unlock(&rq->lock); -out: - rebalance_tick(cpu, rq, NOT_IDLE); +} + +/* + * This function gets called by the timer code, with HZ frequency. + * We call it with interrupts disabled. + * + * It also gets called by the fork code, when changing the parent's + * timeslices. + */ +void scheduler_tick(void) +{ + unsigned long long now = sched_clock(); + struct task_struct *p = current; + int cpu = smp_processor_id(); + struct rq *rq = cpu_rq(cpu); + + update_cpu_clock(p, rq, now); + + if (p == rq->idle) + /* Task on the idle queue */ + wake_priority_sleeper(rq); + else + task_running_tick(rq, p); +#ifdef CONFIG_SMP + update_load(rq); + if (time_after_eq(jiffies, rq->next_balance)) + raise_softirq(SCHED_SOFTIRQ); +#endif } #ifdef CONFIG_SCHED_SMT @@ -3291,7 +3374,8 @@ void fastcall add_preempt_count(int val) /* * Spinlock count overflowing soon? */ - DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10); + DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= + PREEMPT_MASK - 10); } EXPORT_SYMBOL(add_preempt_count); @@ -4990,8 +5074,8 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) * afterwards, and pretending it was a local activate. * This way is cleaner and logically correct. */ - p->timestamp = p->timestamp - rq_src->timestamp_last_tick - + rq_dest->timestamp_last_tick; + p->timestamp = p->timestamp - rq_src->most_recent_timestamp + + rq_dest->most_recent_timestamp; deactivate_task(p, rq_src); __activate_task(p, rq_dest); if (TASK_PREEMPTS_CURR(p, rq_dest)) @@ -5067,7 +5151,10 @@ wait_to_die: } #ifdef CONFIG_HOTPLUG_CPU -/* Figure out where task on dead CPU should go, use force if neccessary. */ +/* + * Figure out where task on dead CPU should go, use force if neccessary. + * NOTE: interrupts should be disabled by the caller + */ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) { unsigned long flags; @@ -5187,6 +5274,7 @@ void idle_task_exit(void) mmdrop(mm); } +/* called under rq->lock with disabled interrupts */ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) { struct rq *rq = cpu_rq(dead_cpu); @@ -5203,10 +5291,11 @@ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) * Drop lock around migration; if someone else moves it, * that's OK. No task can be added to this CPU, so iteration is * fine. 
+ * NOTE: interrupts should be left disabled --dev@ */ - spin_unlock_irq(&rq->lock); + spin_unlock(&rq->lock); move_task_off_dead_cpu(dead_cpu, p); - spin_lock_irq(&rq->lock); + spin_lock(&rq->lock); put_task_struct(p); } @@ -5359,16 +5448,19 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) if (!(sd->flags & SD_LOAD_BALANCE)) { printk("does not load-balance\n"); if (sd->parent) - printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent"); + printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" + " has parent"); break; } printk("span %s\n", str); if (!cpu_isset(cpu, sd->span)) - printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu); + printk(KERN_ERR "ERROR: domain->span does not contain " + "CPU%d\n", cpu); if (!cpu_isset(cpu, group->cpumask)) - printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu); + printk(KERN_ERR "ERROR: domain->groups does not contain" + " CPU%d\n", cpu); printk(KERN_DEBUG); for (i = 0; i < level + 2; i++) @@ -5383,7 +5475,8 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) if (!group->cpu_power) { printk("\n"); - printk(KERN_ERR "ERROR: domain->cpu_power not set\n"); + printk(KERN_ERR "ERROR: domain->cpu_power not " + "set\n"); } if (!cpus_weight(group->cpumask)) { @@ -5406,15 +5499,17 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) printk("\n"); if (!cpus_equal(sd->span, groupmask)) - printk(KERN_ERR "ERROR: groups don't span domain->span\n"); + printk(KERN_ERR "ERROR: groups don't span " + "domain->span\n"); level++; sd = sd->parent; + if (!sd) + continue; - if (sd) { - if (!cpus_subset(groupmask, sd->span)) - printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n"); - } + if (!cpus_subset(groupmask, sd->span)) + printk(KERN_ERR "ERROR: parent span is not a superset " + "of domain->span\n"); } while (sd); } @@ -5528,28 +5623,27 @@ static int __init isolated_cpu_setup(char *str) __setup ("isolcpus=", isolated_cpu_setup); /* - * init_sched_build_groups takes an array of groups, the cpumask we wish - * to span, and a pointer to a function which identifies what group a CPU - * belongs to. The return value of group_fn must be a valid index into the - * groups[] array, and must be >= 0 and < NR_CPUS (due to the fact that we - * keep track of groups covered with a cpumask_t). + * init_sched_build_groups takes the cpumask we wish to span, and a pointer + * to a function which identifies what group(along with sched group) a CPU + * belongs to. The return value of group_fn must be a >= 0 and < NR_CPUS + * (due to the fact that we keep track of groups covered with a cpumask_t). * * init_sched_build_groups will build a circular linked list of the groups * covered by the given span, and will set each group's ->cpumask correctly, * and ->cpu_power to 0. 
*/ static void -init_sched_build_groups(struct sched_group groups[], cpumask_t span, - const cpumask_t *cpu_map, - int (*group_fn)(int cpu, const cpumask_t *cpu_map)) +init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map, + int (*group_fn)(int cpu, const cpumask_t *cpu_map, + struct sched_group **sg)) { struct sched_group *first = NULL, *last = NULL; cpumask_t covered = CPU_MASK_NONE; int i; for_each_cpu_mask(i, span) { - int group = group_fn(i, cpu_map); - struct sched_group *sg = &groups[group]; + struct sched_group *sg; + int group = group_fn(i, cpu_map, &sg); int j; if (cpu_isset(i, covered)) @@ -5559,7 +5653,7 @@ init_sched_build_groups(struct sched_group groups[], cpumask_t span, sg->cpu_power = 0; for_each_cpu_mask(j, span) { - if (group_fn(j, cpu_map) != group) + if (group_fn(j, cpu_map, NULL) != group) continue; cpu_set(j, covered); @@ -5733,8 +5827,9 @@ __setup("max_cache_size=", setup_max_cache_size); */ static void touch_cache(void *__cache, unsigned long __size) { - unsigned long size = __size/sizeof(long), chunk1 = size/3, - chunk2 = 2*size/3; + unsigned long size = __size / sizeof(long); + unsigned long chunk1 = size / 3; + unsigned long chunk2 = 2 * size / 3; unsigned long *cache = __cache; int i; @@ -5843,11 +5938,11 @@ measure_cost(int cpu1, int cpu2, void *cache, unsigned int size) */ measure_one(cache, size, cpu1, cpu2); for (i = 0; i < ITERATIONS; i++) - cost1 += measure_one(cache, size - i*1024, cpu1, cpu2); + cost1 += measure_one(cache, size - i * 1024, cpu1, cpu2); measure_one(cache, size, cpu2, cpu1); for (i = 0; i < ITERATIONS; i++) - cost1 += measure_one(cache, size - i*1024, cpu2, cpu1); + cost1 += measure_one(cache, size - i * 1024, cpu2, cpu1); /* * (We measure the non-migrating [cached] cost on both @@ -5857,17 +5952,17 @@ measure_cost(int cpu1, int cpu2, void *cache, unsigned int size) measure_one(cache, size, cpu1, cpu1); for (i = 0; i < ITERATIONS; i++) - cost2 += measure_one(cache, size - i*1024, cpu1, cpu1); + cost2 += measure_one(cache, size - i * 1024, cpu1, cpu1); measure_one(cache, size, cpu2, cpu2); for (i = 0; i < ITERATIONS; i++) - cost2 += measure_one(cache, size - i*1024, cpu2, cpu2); + cost2 += measure_one(cache, size - i * 1024, cpu2, cpu2); /* * Get the per-iteration migration cost: */ - do_div(cost1, 2*ITERATIONS); - do_div(cost2, 2*ITERATIONS); + do_div(cost1, 2 * ITERATIONS); + do_div(cost2, 2 * ITERATIONS); return cost1 - cost2; } @@ -5905,7 +6000,7 @@ static unsigned long long measure_migration_cost(int cpu1, int cpu2) */ cache = vmalloc(max_size); if (!cache) { - printk("could not vmalloc %d bytes for cache!\n", 2*max_size); + printk("could not vmalloc %d bytes for cache!\n", 2 * max_size); return 1000000; /* return 1 msec on very small boxen */ } @@ -5930,7 +6025,8 @@ static unsigned long long measure_migration_cost(int cpu1, int cpu2) avg_fluct = (avg_fluct + fluct)/2; if (migration_debug) - printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): (%8Ld %8Ld)\n", + printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): " + "(%8Ld %8Ld)\n", cpu1, cpu2, size, (long)cost / 1000000, ((long)cost / 100000) % 10, @@ -6025,20 +6121,18 @@ static void calibrate_migration_costs(const cpumask_t *cpu_map) -1 #endif ); - if (system_state == SYSTEM_BOOTING) { - if (num_online_cpus() > 1) { - printk("migration_cost="); - for (distance = 0; distance <= max_distance; distance++) { - if (distance) - printk(","); - printk("%ld", (long)migration_cost[distance] / 1000); - } - printk("\n"); + if (system_state == SYSTEM_BOOTING && num_online_cpus() > 1) 
{ + printk("migration_cost="); + for (distance = 0; distance <= max_distance; distance++) { + if (distance) + printk(","); + printk("%ld", (long)migration_cost[distance] / 1000); } + printk("\n"); } j1 = jiffies; if (migration_debug) - printk("migration: %ld seconds\n", (j1-j0)/HZ); + printk("migration: %ld seconds\n", (j1-j0) / HZ); /* * Move back to the original CPU. NUMA-Q gets confused @@ -6135,10 +6229,13 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0; */ #ifdef CONFIG_SCHED_SMT static DEFINE_PER_CPU(struct sched_domain, cpu_domains); -static struct sched_group sched_group_cpus[NR_CPUS]; +static DEFINE_PER_CPU(struct sched_group, sched_group_cpus); -static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map) +static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, + struct sched_group **sg) { + if (sg) + *sg = &per_cpu(sched_group_cpus, cpu); return cpu; } #endif @@ -6148,39 +6245,52 @@ static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map) */ #ifdef CONFIG_SCHED_MC static DEFINE_PER_CPU(struct sched_domain, core_domains); -static struct sched_group sched_group_core[NR_CPUS]; +static DEFINE_PER_CPU(struct sched_group, sched_group_core); #endif #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) -static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map) +static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map, + struct sched_group **sg) { + int group; cpumask_t mask = cpu_sibling_map[cpu]; cpus_and(mask, mask, *cpu_map); - return first_cpu(mask); + group = first_cpu(mask); + if (sg) + *sg = &per_cpu(sched_group_core, group); + return group; } #elif defined(CONFIG_SCHED_MC) -static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map) +static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map, + struct sched_group **sg) { + if (sg) + *sg = &per_cpu(sched_group_core, cpu); return cpu; } #endif static DEFINE_PER_CPU(struct sched_domain, phys_domains); -static struct sched_group sched_group_phys[NR_CPUS]; +static DEFINE_PER_CPU(struct sched_group, sched_group_phys); -static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map) +static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, + struct sched_group **sg) { + int group; #ifdef CONFIG_SCHED_MC cpumask_t mask = cpu_coregroup_map(cpu); cpus_and(mask, mask, *cpu_map); - return first_cpu(mask); + group = first_cpu(mask); #elif defined(CONFIG_SCHED_SMT) cpumask_t mask = cpu_sibling_map[cpu]; cpus_and(mask, mask, *cpu_map); - return first_cpu(mask); + group = first_cpu(mask); #else - return cpu; + group = cpu; #endif + if (sg) + *sg = &per_cpu(sched_group_phys, group); + return group; } #ifdef CONFIG_NUMA @@ -6193,12 +6303,22 @@ static DEFINE_PER_CPU(struct sched_domain, node_domains); static struct sched_group **sched_group_nodes_bycpu[NR_CPUS]; static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); -static struct sched_group *sched_group_allnodes_bycpu[NR_CPUS]; +static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes); -static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map) +static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map, + struct sched_group **sg) { - return cpu_to_node(cpu); + cpumask_t nodemask = node_to_cpumask(cpu_to_node(cpu)); + int group; + + cpus_and(nodemask, nodemask, *cpu_map); + group = first_cpu(nodemask); + + if (sg) + *sg = &per_cpu(sched_group_allnodes, group); + return group; } + static void init_numa_sched_groups_power(struct sched_group *group_head) { struct sched_group *sg = group_head; @@ -6234,16 
+6354,9 @@ static void free_sched_groups(const cpumask_t *cpu_map) int cpu, i; for_each_cpu_mask(cpu, *cpu_map) { - struct sched_group *sched_group_allnodes - = sched_group_allnodes_bycpu[cpu]; struct sched_group **sched_group_nodes = sched_group_nodes_bycpu[cpu]; - if (sched_group_allnodes) { - kfree(sched_group_allnodes); - sched_group_allnodes_bycpu[cpu] = NULL; - } - if (!sched_group_nodes) continue; @@ -6337,7 +6450,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) struct sched_domain *sd; #ifdef CONFIG_NUMA struct sched_group **sched_group_nodes = NULL; - struct sched_group *sched_group_allnodes = NULL; + int sd_allnodes = 0; /* * Allocate the per-node list of sched groups @@ -6355,7 +6468,6 @@ static int build_sched_domains(const cpumask_t *cpu_map) * Set up domains for cpus specified by the cpu_map. */ for_each_cpu_mask(i, *cpu_map) { - int group; struct sched_domain *sd = NULL, *p; cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); @@ -6364,26 +6476,12 @@ static int build_sched_domains(const cpumask_t *cpu_map) #ifdef CONFIG_NUMA if (cpus_weight(*cpu_map) > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { - if (!sched_group_allnodes) { - sched_group_allnodes - = kmalloc_node(sizeof(struct sched_group) - * MAX_NUMNODES, - GFP_KERNEL, - cpu_to_node(i)); - if (!sched_group_allnodes) { - printk(KERN_WARNING - "Can not alloc allnodes sched group\n"); - goto error; - } - sched_group_allnodes_bycpu[i] - = sched_group_allnodes; - } sd = &per_cpu(allnodes_domains, i); *sd = SD_ALLNODES_INIT; sd->span = *cpu_map; - group = cpu_to_allnodes_group(i, cpu_map); - sd->groups = &sched_group_allnodes[group]; + cpu_to_allnodes_group(i, cpu_map, &sd->groups); p = sd; + sd_allnodes = 1; } else p = NULL; @@ -6398,36 +6496,33 @@ static int build_sched_domains(const cpumask_t *cpu_map) p = sd; sd = &per_cpu(phys_domains, i); - group = cpu_to_phys_group(i, cpu_map); *sd = SD_CPU_INIT; sd->span = nodemask; sd->parent = p; if (p) p->child = sd; - sd->groups = &sched_group_phys[group]; + cpu_to_phys_group(i, cpu_map, &sd->groups); #ifdef CONFIG_SCHED_MC p = sd; sd = &per_cpu(core_domains, i); - group = cpu_to_core_group(i, cpu_map); *sd = SD_MC_INIT; sd->span = cpu_coregroup_map(i); cpus_and(sd->span, sd->span, *cpu_map); sd->parent = p; p->child = sd; - sd->groups = &sched_group_core[group]; + cpu_to_core_group(i, cpu_map, &sd->groups); #endif #ifdef CONFIG_SCHED_SMT p = sd; sd = &per_cpu(cpu_domains, i); - group = cpu_to_cpu_group(i, cpu_map); *sd = SD_SIBLING_INIT; sd->span = cpu_sibling_map[i]; cpus_and(sd->span, sd->span, *cpu_map); sd->parent = p; p->child = sd; - sd->groups = &sched_group_cpus[group]; + cpu_to_cpu_group(i, cpu_map, &sd->groups); #endif } @@ -6439,8 +6534,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) if (i != first_cpu(this_sibling_map)) continue; - init_sched_build_groups(sched_group_cpus, this_sibling_map, - cpu_map, &cpu_to_cpu_group); + init_sched_build_groups(this_sibling_map, cpu_map, &cpu_to_cpu_group); } #endif @@ -6451,8 +6545,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) cpus_and(this_core_map, this_core_map, *cpu_map); if (i != first_cpu(this_core_map)) continue; - init_sched_build_groups(sched_group_core, this_core_map, - cpu_map, &cpu_to_core_group); + init_sched_build_groups(this_core_map, cpu_map, &cpu_to_core_group); } #endif @@ -6465,15 +6558,13 @@ static int build_sched_domains(const cpumask_t *cpu_map) if (cpus_empty(nodemask)) continue; - init_sched_build_groups(sched_group_phys, nodemask, - cpu_map, &cpu_to_phys_group); + 
init_sched_build_groups(nodemask, cpu_map, &cpu_to_phys_group); } #ifdef CONFIG_NUMA /* Set up node groups */ - if (sched_group_allnodes) - init_sched_build_groups(sched_group_allnodes, *cpu_map, - cpu_map, &cpu_to_allnodes_group); + if (sd_allnodes) + init_sched_build_groups(*cpu_map, cpu_map, &cpu_to_allnodes_group); for (i = 0; i < MAX_NUMNODES; i++) { /* Set up node groups */ @@ -6565,10 +6656,10 @@ static int build_sched_domains(const cpumask_t *cpu_map) for (i = 0; i < MAX_NUMNODES; i++) init_numa_sched_groups_power(sched_group_nodes[i]); - if (sched_group_allnodes) { - int group = cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map); - struct sched_group *sg = &sched_group_allnodes[group]; + if (sd_allnodes) { + struct sched_group *sg; + cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg); init_numa_sched_groups_power(sg); } #endif @@ -6847,6 +6938,10 @@ void __init sched_init(void) set_load_weight(&init_task); +#ifdef CONFIG_SMP + open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL); +#endif + #ifdef CONFIG_RT_MUTEXES plist_head_init(&init_task.pi_waiters, &init_task.pi_lock); #endif diff --git a/kernel/signal.c b/kernel/signal.c index ec81defde339..1921ffdc5e77 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -24,6 +24,9 @@ #include <linux/signal.h> #include <linux/capability.h> #include <linux/freezer.h> +#include <linux/pid_namespace.h> +#include <linux/nsproxy.h> + #include <asm/param.h> #include <asm/uaccess.h> #include <asm/unistd.h> @@ -583,7 +586,7 @@ static int check_kill_permission(int sig, struct siginfo *info, error = -EPERM; if ((info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info))) && ((sig != SIGCONT) || - (current->signal->session != t->signal->session)) + (process_session(current) != process_session(t))) && (current->euid ^ t->suid) && (current->euid ^ t->uid) && (current->uid ^ t->suid) && (current->uid ^ t->uid) && !capable(CAP_KILL)) @@ -1877,8 +1880,12 @@ relock: if (sig_kernel_ignore(signr)) /* Default is nothing. */ continue; - /* Init gets no signals it doesn't want. */ - if (current == child_reaper) + /* + * Init of a pid space gets no signals it doesn't want from + * within that pid space. It can of course get signals from + * its parent pid space. 
+ */ + if (current == child_reaper(current)) continue; if (sig_kernel_stop(signr)) { diff --git a/kernel/sys.c b/kernel/sys.c index a0c1a29a507f..c7675c1bfdf2 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1381,7 +1381,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) if (p->real_parent == group_leader) { err = -EPERM; - if (p->signal->session != group_leader->signal->session) + if (process_session(p) != process_session(group_leader)) goto out; err = -EACCES; if (p->did_exec) @@ -1397,16 +1397,13 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) goto out; if (pgid != pid) { - struct task_struct *p; + struct task_struct *g = + find_task_by_pid_type(PIDTYPE_PGID, pgid); - do_each_task_pid(pgid, PIDTYPE_PGID, p) { - if (p->signal->session == group_leader->signal->session) - goto ok_pgid; - } while_each_task_pid(pgid, PIDTYPE_PGID, p); - goto out; + if (!g || process_session(g) != process_session(group_leader)) + goto out; } -ok_pgid: err = security_task_setpgid(p, pgid); if (err) goto out; @@ -1459,7 +1456,7 @@ asmlinkage long sys_getpgrp(void) asmlinkage long sys_getsid(pid_t pid) { if (!pid) - return current->signal->session; + return process_session(current); else { int retval; struct task_struct *p; @@ -1471,7 +1468,7 @@ asmlinkage long sys_getsid(pid_t pid) if (p) { retval = security_task_getsid(p); if (!retval) - retval = p->signal->session; + retval = process_session(p); } read_unlock(&tasklist_lock); return retval; @@ -1484,7 +1481,6 @@ asmlinkage long sys_setsid(void) pid_t session; int err = -EPERM; - mutex_lock(&tty_mutex); write_lock_irq(&tasklist_lock); /* Fail if I am already a session leader */ @@ -1504,12 +1500,15 @@ asmlinkage long sys_setsid(void) group_leader->signal->leader = 1; __set_special_pids(session, session); + + spin_lock(&group_leader->sighand->siglock); group_leader->signal->tty = NULL; group_leader->signal->tty_old_pgrp = 0; + spin_unlock(&group_leader->sighand->siglock); + err = process_group(group_leader); out: write_unlock_irq(&tasklist_lock); - mutex_unlock(&tty_mutex); return err; } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8e9f00fd6d18..130c5ec9ee0b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -92,7 +92,9 @@ extern char modprobe_path[]; extern int sg_big_buff; #endif #ifdef CONFIG_SYSVIPC -static int proc_do_ipc_string(ctl_table *table, int write, struct file *filp, +static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos); +static int proc_ipc_doulongvec_minmax(ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); #endif @@ -131,12 +133,22 @@ extern int max_lock_depth; #ifdef CONFIG_SYSCTL_SYSCALL static int parse_table(int __user *, int, void __user *, size_t __user *, - void __user *, size_t, ctl_table *, void **); + void __user *, size_t, ctl_table *); #endif static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); +static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen); + +#ifdef CONFIG_SYSVIPC +static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen); +#endif + #ifdef CONFIG_PROC_SYSCTL static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); @@ -163,6 +175,40 @@ 
extern ctl_table inotify_table[]; int sysctl_legacy_va_layout; #endif +static void *get_uts(ctl_table *table, int write) +{ + char *which = table->data; +#ifdef CONFIG_UTS_NS + struct uts_namespace *uts_ns = current->nsproxy->uts_ns; + which = (which - (char *)&init_uts_ns) + (char *)uts_ns; +#endif + if (!write) + down_read(&uts_sem); + else + down_write(&uts_sem); + return which; +} + +static void put_uts(ctl_table *table, int write, void *which) +{ + if (!write) + up_read(&uts_sem); + else + up_write(&uts_sem); +} + +#ifdef CONFIG_SYSVIPC +static void *get_ipc(ctl_table *table, int write) +{ + char *which = table->data; + struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns; + which = (which - (char *)&init_ipc_ns) + (char *)ipc_ns; + return which; +} +#else +#define get_ipc(T,W) ((T)->data) +#endif + /* /proc declarations: */ #ifdef CONFIG_PROC_SYSCTL @@ -229,7 +275,6 @@ static ctl_table root_table[] = { }; static ctl_table kern_table[] = { -#ifndef CONFIG_UTS_NS { .ctl_name = KERN_OSTYPE, .procname = "ostype", @@ -237,7 +282,7 @@ static ctl_table kern_table[] = { .maxlen = sizeof(init_uts_ns.name.sysname), .mode = 0444, .proc_handler = &proc_do_uts_string, - .strategy = &sysctl_string, + .strategy = &sysctl_uts_string, }, { .ctl_name = KERN_OSRELEASE, @@ -246,7 +291,7 @@ static ctl_table kern_table[] = { .maxlen = sizeof(init_uts_ns.name.release), .mode = 0444, .proc_handler = &proc_do_uts_string, - .strategy = &sysctl_string, + .strategy = &sysctl_uts_string, }, { .ctl_name = KERN_VERSION, @@ -255,7 +300,7 @@ static ctl_table kern_table[] = { .maxlen = sizeof(init_uts_ns.name.version), .mode = 0444, .proc_handler = &proc_do_uts_string, - .strategy = &sysctl_string, + .strategy = &sysctl_uts_string, }, { .ctl_name = KERN_NODENAME, @@ -264,7 +309,7 @@ static ctl_table kern_table[] = { .maxlen = sizeof(init_uts_ns.name.nodename), .mode = 0644, .proc_handler = &proc_do_uts_string, - .strategy = &sysctl_string, + .strategy = &sysctl_uts_string, }, { .ctl_name = KERN_DOMAINNAME, @@ -273,56 +318,8 @@ static ctl_table kern_table[] = { .maxlen = sizeof(init_uts_ns.name.domainname), .mode = 0644, .proc_handler = &proc_do_uts_string, - .strategy = &sysctl_string, - }, -#else /* !CONFIG_UTS_NS */ - { - .ctl_name = KERN_OSTYPE, - .procname = "ostype", - .data = NULL, - /* could maybe use __NEW_UTS_LEN here? 
*/ - .maxlen = FIELD_SIZEOF(struct new_utsname, sysname), - .mode = 0444, - .proc_handler = &proc_do_uts_string, - .strategy = &sysctl_string, - }, - { - .ctl_name = KERN_OSRELEASE, - .procname = "osrelease", - .data = NULL, - .maxlen = FIELD_SIZEOF(struct new_utsname, release), - .mode = 0444, - .proc_handler = &proc_do_uts_string, - .strategy = &sysctl_string, - }, - { - .ctl_name = KERN_VERSION, - .procname = "version", - .data = NULL, - .maxlen = FIELD_SIZEOF(struct new_utsname, version), - .mode = 0444, - .proc_handler = &proc_do_uts_string, - .strategy = &sysctl_string, - }, - { - .ctl_name = KERN_NODENAME, - .procname = "hostname", - .data = NULL, - .maxlen = FIELD_SIZEOF(struct new_utsname, nodename), - .mode = 0644, - .proc_handler = &proc_do_uts_string, - .strategy = &sysctl_string, - }, - { - .ctl_name = KERN_DOMAINNAME, - .procname = "domainname", - .data = NULL, - .maxlen = FIELD_SIZEOF(struct new_utsname, domainname), - .mode = 0644, - .proc_handler = &proc_do_uts_string, - .strategy = &sysctl_string, + .strategy = &sysctl_uts_string, }, -#endif /* !CONFIG_UTS_NS */ { .ctl_name = KERN_PANIC, .procname = "panic", @@ -481,58 +478,65 @@ static ctl_table kern_table[] = { { .ctl_name = KERN_SHMMAX, .procname = "shmmax", - .data = NULL, - .maxlen = sizeof (size_t), + .data = &init_ipc_ns.shm_ctlmax, + .maxlen = sizeof (init_ipc_ns.shm_ctlmax), .mode = 0644, - .proc_handler = &proc_do_ipc_string, + .proc_handler = &proc_ipc_doulongvec_minmax, + .strategy = sysctl_ipc_data, }, { .ctl_name = KERN_SHMALL, .procname = "shmall", - .data = NULL, - .maxlen = sizeof (size_t), + .data = &init_ipc_ns.shm_ctlall, + .maxlen = sizeof (init_ipc_ns.shm_ctlall), .mode = 0644, - .proc_handler = &proc_do_ipc_string, + .proc_handler = &proc_ipc_doulongvec_minmax, + .strategy = sysctl_ipc_data, }, { .ctl_name = KERN_SHMMNI, .procname = "shmmni", - .data = NULL, - .maxlen = sizeof (int), + .data = &init_ipc_ns.shm_ctlmni, + .maxlen = sizeof (init_ipc_ns.shm_ctlmni), .mode = 0644, - .proc_handler = &proc_do_ipc_string, + .proc_handler = &proc_ipc_dointvec, + .strategy = sysctl_ipc_data, }, { .ctl_name = KERN_MSGMAX, .procname = "msgmax", - .data = NULL, - .maxlen = sizeof (int), + .data = &init_ipc_ns.msg_ctlmax, + .maxlen = sizeof (init_ipc_ns.msg_ctlmax), .mode = 0644, - .proc_handler = &proc_do_ipc_string, + .proc_handler = &proc_ipc_dointvec, + .strategy = sysctl_ipc_data, }, { .ctl_name = KERN_MSGMNI, .procname = "msgmni", - .data = NULL, - .maxlen = sizeof (int), + .data = &init_ipc_ns.msg_ctlmni, + .maxlen = sizeof (init_ipc_ns.msg_ctlmni), .mode = 0644, - .proc_handler = &proc_do_ipc_string, + .proc_handler = &proc_ipc_dointvec, + .strategy = sysctl_ipc_data, }, { .ctl_name = KERN_MSGMNB, .procname = "msgmnb", - .data = NULL, - .maxlen = sizeof (int), + .data = &init_ipc_ns.msg_ctlmnb, + .maxlen = sizeof (init_ipc_ns.msg_ctlmnb), .mode = 0644, - .proc_handler = &proc_do_ipc_string, + .proc_handler = &proc_ipc_dointvec, + .strategy = sysctl_ipc_data, }, { .ctl_name = KERN_SEM, .procname = "sem", - .data = NULL, + .data = &init_ipc_ns.sem_ctls, .maxlen = 4*sizeof (int), .mode = 0644, - .proc_handler = &proc_do_ipc_string, + .proc_handler = &proc_ipc_dointvec, + .strategy = sysctl_ipc_data, }, #endif #ifdef CONFIG_MAGIC_SYSRQ @@ -1239,7 +1243,6 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol do { struct ctl_table_header *head = list_entry(tmp, struct ctl_table_header, ctl_entry); - void *context = NULL; if (!use_table(head)) continue; @@ -1247,9 +1250,7 @@ int 
do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol spin_unlock(&sysctl_lock); error = parse_table(name, nlen, oldval, oldlenp, - newval, newlen, head->ctl_table, - &context); - kfree(context); + newval, newlen, head->ctl_table); spin_lock(&sysctl_lock); unuse_table(head); @@ -1305,7 +1306,7 @@ static inline int ctl_perm(ctl_table *table, int op) static int parse_table(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen, - ctl_table *table, void **context) + ctl_table *table) { int n; repeat: @@ -1325,7 +1326,7 @@ repeat: error = table->strategy( table, name, nlen, oldval, oldlenp, - newval, newlen, context); + newval, newlen); if (error) return error; } @@ -1336,7 +1337,7 @@ repeat: } error = do_sysctl_strategy(table, name, nlen, oldval, oldlenp, - newval, newlen, context); + newval, newlen); return error; } } @@ -1347,7 +1348,7 @@ repeat: int do_sysctl_strategy (ctl_table *table, int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen, void **context) + void __user *newval, size_t newlen) { int op = 0, rc; size_t len; @@ -1361,7 +1362,7 @@ int do_sysctl_strategy (ctl_table *table, if (table->strategy) { rc = table->strategy(table, name, nlen, oldval, oldlenp, - newval, newlen, context); + newval, newlen); if (rc < 0) return rc; if (rc > 0) @@ -1614,7 +1615,7 @@ static ssize_t do_rw_proc(int write, struct file * file, char __user * buf, size_t count, loff_t *ppos) { int op; - struct proc_dir_entry *de = PDE(file->f_dentry->d_inode); + struct proc_dir_entry *de = PDE(file->f_path.dentry->d_inode); struct ctl_table *table; size_t res; ssize_t error = -ENOTDIR; @@ -1753,66 +1754,17 @@ int proc_dostring(ctl_table *table, int write, struct file *filp, * Special case of dostring for the UTS structure. This has locks * to observe. Should this be in kernel/sys.c ???? 
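+ * + * The rewritten handler below resolves the per-namespace string with + * get_uts(), which takes uts_sem for reading on reads and for writing + * on writes, then drops it again via put_uts().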
*/ - -#ifndef CONFIG_UTS_NS -static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - int r; - if (!write) { - down_read(&uts_sem); - r=proc_dostring(table,0,filp,buffer,lenp, ppos); - up_read(&uts_sem); - } else { - down_write(&uts_sem); - r=proc_dostring(table,1,filp,buffer,lenp, ppos); - up_write(&uts_sem); - } - return r; -} -#else /* !CONFIG_UTS_NS */ static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { int r; - struct uts_namespace* uts_ns = current->nsproxy->uts_ns; - char* which; - - switch (table->ctl_name) { - case KERN_OSTYPE: - which = uts_ns->name.sysname; - break; - case KERN_NODENAME: - which = uts_ns->name.nodename; - break; - case KERN_OSRELEASE: - which = uts_ns->name.release; - break; - case KERN_VERSION: - which = uts_ns->name.version; - break; - case KERN_DOMAINNAME: - which = uts_ns->name.domainname; - break; - default: - r = -EINVAL; - goto out; - } - - if (!write) { - down_read(&uts_sem); - r=_proc_do_string(which,table->maxlen,0,filp,buffer,lenp, ppos); - up_read(&uts_sem); - } else { - down_write(&uts_sem); - r=_proc_do_string(which,table->maxlen,1,filp,buffer,lenp, ppos); - up_write(&uts_sem); - } - out: + void *which; + which = get_uts(table, write); + r = _proc_do_string(which, table->maxlen,write,filp,buffer,lenp, ppos); + put_uts(table, write, which); return r; } -#endif /* !CONFIG_UTS_NS */ static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp, int *valp, @@ -1976,9 +1928,6 @@ int proc_dointvec(ctl_table *table, int write, struct file *filp, #define OP_SET 0 #define OP_AND 1 -#define OP_OR 2 -#define OP_MAX 3 -#define OP_MIN 4 static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp, int *valp, @@ -1990,13 +1939,6 @@ static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp, switch(op) { case OP_SET: *valp = val; break; case OP_AND: *valp &= val; break; - case OP_OR: *valp |= val; break; - case OP_MAX: if(*valp < val) - *valp = val; - break; - case OP_MIN: if(*valp > val) - *valp = val; - break; } } else { int val = *valp; @@ -2391,46 +2333,24 @@ int proc_dointvec_ms_jiffies(ctl_table *table, int write, struct file *filp, } #ifdef CONFIG_SYSVIPC -static int proc_do_ipc_string(ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) { - void *data; - struct ipc_namespace *ns; - - ns = current->nsproxy->ipc_ns; - - switch (table->ctl_name) { - case KERN_SHMMAX: - data = &ns->shm_ctlmax; - goto proc_minmax; - case KERN_SHMALL: - data = &ns->shm_ctlall; - goto proc_minmax; - case KERN_SHMMNI: - data = &ns->shm_ctlmni; - break; - case KERN_MSGMAX: - data = &ns->msg_ctlmax; - break; - case KERN_MSGMNI: - data = &ns->msg_ctlmni; - break; - case KERN_MSGMNB: - data = &ns->msg_ctlmnb; - break; - case KERN_SEM: - data = &ns->sem_ctls; - break; - default: - return -EINVAL; - } - - return __do_proc_dointvec(data, table, write, filp, buffer, + void *which; + which = get_ipc(table, write); + return __do_proc_dointvec(which, table, write, filp, buffer, lenp, ppos, NULL, NULL); -proc_minmax: - return __do_proc_doulongvec_minmax(data, table, write, filp, buffer, +} + +static int proc_ipc_doulongvec_minmax(ctl_table *table, int write, + struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) +{ + void *which; + 
which = get_ipc(table, write); + return __do_proc_doulongvec_minmax(which, table, write, filp, buffer, lenp, ppos, 1l, 1l); } + #endif static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp, @@ -2475,6 +2395,17 @@ static int proc_do_ipc_string(ctl_table *table, int write, struct file *filp, { return -ENOSYS; } +static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} +static int proc_ipc_doulongvec_minmax(ctl_table *table, int write, + struct file *filp, void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} #endif int proc_dointvec(ctl_table *table, int write, struct file *filp, @@ -2539,7 +2470,7 @@ int proc_doulongvec_ms_jiffies_minmax(ctl_table *table, int write, /* The generic string strategy routine: */ int sysctl_string(ctl_table *table, int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen, void **context) + void __user *newval, size_t newlen) { if (!table->data || !table->maxlen) return -ENOTDIR; @@ -2585,7 +2516,7 @@ int sysctl_string(ctl_table *table, int __user *name, int nlen, */ int sysctl_intvec(ctl_table *table, int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen, void **context) + void __user *newval, size_t newlen) { if (newval && newlen) { @@ -2621,7 +2552,7 @@ int sysctl_intvec(ctl_table *table, int __user *name, int nlen, /* Strategy function to convert jiffies to seconds */ int sysctl_jiffies(ctl_table *table, int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen, void **context) + void __user *newval, size_t newlen) { if (oldval) { size_t olen; @@ -2649,7 +2580,7 @@ int sysctl_jiffies(ctl_table *table, int __user *name, int nlen, /* Strategy function to convert jiffies to seconds */ int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen, void **context) + void __user *newval, size_t newlen) { if (oldval) { size_t olen; @@ -2674,6 +2605,64 @@ int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen, return 1; } + +/* The generic string strategy routine: */ +static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen) +{ + struct ctl_table uts_table; + int r, write; + write = newval && newlen; + memcpy(&uts_table, table, sizeof(uts_table)); + uts_table.data = get_uts(table, write); + r = sysctl_string(&uts_table, name, nlen, + oldval, oldlenp, newval, newlen); + put_uts(table, write, uts_table.data); + return r; +} + +#ifdef CONFIG_SYSVIPC +/* The generic sysctl ipc data routine. 
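+ * Reads copy out at most table->maxlen bytes of the current ipc + * namespace's value; writes likewise clamp the new value to + * table->maxlen bytes.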
*/ +static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen) +{ + size_t len; + void *data; + + /* Bail out if we don't have a variable */ + if (!table->data || !table->maxlen) + return -ENOTDIR; + + data = get_ipc(table, 1); + if (!data) + return -ENOTDIR; + + if (oldval && oldlenp) { + if (get_user(len, oldlenp)) + return -EFAULT; + if (len) { + if (len > table->maxlen) + len = table->maxlen; + if (copy_to_user(oldval, data, len)) + return -EFAULT; + if (put_user(len, oldlenp)) + return -EFAULT; + } + } + + if (newval && newlen) { + if (newlen > table->maxlen) + newlen = table->maxlen; + + if (copy_from_user(data, newval, newlen)) + return -EFAULT; + } + return 1; +} +#endif + #else /* CONFIG_SYSCTL_SYSCALL */ @@ -2712,32 +2701,44 @@ out: int sysctl_string(ctl_table *table, int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen, void **context) + void __user *newval, size_t newlen) { return -ENOSYS; } int sysctl_intvec(ctl_table *table, int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen, void **context) + void __user *newval, size_t newlen) { return -ENOSYS; } int sysctl_jiffies(ctl_table *table, int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen, void **context) + void __user *newval, size_t newlen) { return -ENOSYS; } int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen, void **context) + void __user *newval, size_t newlen) { return -ENOSYS; } +static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen) +{ + return -ENOSYS; +} +static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen) +{ + return -ENOSYS; +} #endif /* CONFIG_SYSCTL_SYSCALL */ /* diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 74eca5939bd9..22504afc0d34 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -156,7 +156,7 @@ int clocksource_register(struct clocksource *c) /* check if clocksource is already registered */ if (is_registered_source(c)) { printk("register_clocksource: Cannot register %s.
" - "Already registered!", c->name); + "Already registered!", c->name); ret = -EBUSY; } else { /* register it */ @@ -186,6 +186,7 @@ void clocksource_reselect(void) } EXPORT_SYMBOL(clocksource_reselect); +#ifdef CONFIG_SYSFS /** * sysfs_show_current_clocksources - sysfs interface for current clocksource * @dev: unused @@ -275,10 +276,10 @@ sysfs_show_available_clocksources(struct sys_device *dev, char *buf) * Sysfs setup bits: */ static SYSDEV_ATTR(current_clocksource, 0600, sysfs_show_current_clocksources, - sysfs_override_clocksource); + sysfs_override_clocksource); static SYSDEV_ATTR(available_clocksource, 0600, - sysfs_show_available_clocksources, NULL); + sysfs_show_available_clocksources, NULL); static struct sysdev_class clocksource_sysclass = { set_kset_name("clocksource"), @@ -307,6 +308,7 @@ static int __init init_clocksource_sysfs(void) } device_initcall(init_clocksource_sysfs); +#endif /* CONFIG_SYSFS */ /** * boot_override_clocksource - boot clock override diff --git a/kernel/timer.c b/kernel/timer.c index c1c7fbcffec1..0256ab443d8a 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -80,6 +80,138 @@ tvec_base_t boot_tvec_bases; EXPORT_SYMBOL(boot_tvec_bases); static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases; +/** + * __round_jiffies - function to round jiffies to a full second + * @j: the time in (absolute) jiffies that should be rounded + * @cpu: the processor number on which the timeout will happen + * + * __round_jiffies rounds an absolute time in the future (in jiffies) + * up or down to (approximately) full seconds. This is useful for timers + * for which the exact time they fire does not matter too much, as long as + * they fire approximately every X seconds. + * + * By rounding these timers to whole seconds, all such timers will fire + * at the same time, rather than at various times spread out. The goal + * of this is to have the CPU wake up less, which saves power. + * + * The exact rounding is skewed for each processor to avoid all + * processors firing at the exact same time, which could lead + * to lock contention or spurious cache line bouncing. + * + * The return value is the rounded version of the "j" parameter. + */ +unsigned long __round_jiffies(unsigned long j, int cpu) +{ + int rem; + unsigned long original = j; + + /* + * We don't want all cpus firing their timers at once hitting the + * same lock or cachelines, so we skew each extra cpu with an extra + * 3 jiffies. This 3 jiffies came originally from the mm/ code which + * already did this. + * The skew is done by adding 3*cpunr, then round, then subtract this + * extra offset again. + */ + j += cpu * 3; + + rem = j % HZ; + + /* + * If the target jiffie is just after a whole second (which can happen + * due to delays of the timer irq, long irq off times etc etc) then + * we should round down to the whole second, not up. Use 1/4th second + * as cutoff for this rounding as an extreme upper bound for this. 
+ */ + if (rem < HZ/4) /* round down */ + j = j - rem; + else /* round up */ + j = j - rem + HZ; + + /* now that we have rounded, subtract the extra skew again */ + j -= cpu * 3; + + if (j <= jiffies) /* rounding ate our timeout entirely; */ + return original; + return j; +} +EXPORT_SYMBOL_GPL(__round_jiffies); + +/** + * __round_jiffies_relative - function to round jiffies to a full second + * @j: the time in (relative) jiffies that should be rounded + * @cpu: the processor number on which the timeout will happen + * + * __round_jiffies_relative rounds a time delta in the future (in jiffies) + * up or down to (approximately) full seconds. This is useful for timers + * for which the exact time they fire does not matter too much, as long as + * they fire approximately every X seconds. + * + * By rounding these timers to whole seconds, all such timers will fire + * at the same time, rather than at various times spread out. The goal + * of this is to have the CPU wake up less, which saves power. + * + * The exact rounding is skewed for each processor to avoid all + * processors firing at the exact same time, which could lead + * to lock contention or spurious cache line bouncing. + * + * The return value is the rounded version of the "j" parameter. + */ +unsigned long __round_jiffies_relative(unsigned long j, int cpu) +{ + /* + * In theory the following code can skip a jiffy in case jiffies + * increments right between the addition and the later subtraction. + * However since the entire point of this function is to use approximate + * timeouts, it's entirely ok to not handle that. + */ + return __round_jiffies(j + jiffies, cpu) - jiffies; +} +EXPORT_SYMBOL_GPL(__round_jiffies_relative); + +/** + * round_jiffies - function to round jiffies to a full second + * @j: the time in (absolute) jiffies that should be rounded + * + * round_jiffies rounds an absolute time in the future (in jiffies) + * up or down to (approximately) full seconds. This is useful for timers + * for which the exact time they fire does not matter too much, as long as + * they fire approximately every X seconds. + * + * By rounding these timers to whole seconds, all such timers will fire + * at the same time, rather than at various times spread out. The goal + * of this is to have the CPU wake up less, which saves power. + * + * The return value is the rounded version of the "j" parameter. + */ +unsigned long round_jiffies(unsigned long j) +{ + return __round_jiffies(j, raw_smp_processor_id()); +} +EXPORT_SYMBOL_GPL(round_jiffies); + +/** + * round_jiffies_relative - function to round jiffies to a full second + * @j: the time in (relative) jiffies that should be rounded + * + * round_jiffies_relative rounds a time delta in the future (in jiffies) + * up or down to (approximately) full seconds. This is useful for timers + * for which the exact time they fire does not matter too much, as long as + * they fire approximately every X seconds. + * + * By rounding these timers to whole seconds, all such timers will fire + * at the same time, rather than at various times spread out. The goal + * of this is to have the CPU wake up less, which saves power. + * + * The return value is the rounded version of the "j" parameter. 
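+ * + * Example (my_timer being whatever timer the caller owns): rearm a + * housekeeping timer roughly every second and let it batch with other + * second-aligned timers: + * + *	mod_timer(&my_timer, jiffies + round_jiffies_relative(HZ));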
+ */ +unsigned long round_jiffies_relative(unsigned long j) +{ + return __round_jiffies_relative(j, raw_smp_processor_id()); +} +EXPORT_SYMBOL_GPL(round_jiffies_relative); + + static inline void set_running_timer(tvec_base_t *base, struct timer_list *timer) { @@ -714,7 +846,7 @@ static int change_clocksource(void) clock = new; clock->cycle_last = now; printk(KERN_INFO "Time: %s clocksource has been installed.\n", - clock->name); + clock->name); return 1; } else if (clock->update_callback) { return clock->update_callback(); @@ -722,7 +854,10 @@ static int change_clocksource(void) return 0; } #else -#define change_clocksource() (0) +static inline int change_clocksource(void) +{ + return 0; +} #endif /** @@ -820,7 +955,8 @@ device_initcall(timekeeping_init_device); * If the error is already larger, we look ahead even further * to compensate for late or lost adjustments. */ -static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, s64 *offset) +static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, + s64 *offset) { s64 tick_error, i; u32 look_ahead, adj; @@ -844,7 +980,8 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, s64 * * Now calculate the error in (1 << look_ahead) ticks, but first * remove the single look ahead already included in the error. */ - tick_error = current_tick_length() >> (TICK_LENGTH_SHIFT - clock->shift + 1); + tick_error = current_tick_length() >> + (TICK_LENGTH_SHIFT - clock->shift + 1); tick_error -= clock->xtime_interval >> 1; error = ((error - tick_error) >> look_ahead) + tick_error; @@ -896,7 +1033,8 @@ static void clocksource_adjust(struct clocksource *clock, s64 offset) clock->mult += adj; clock->xtime_interval += interval; clock->xtime_nsec -= offset; - clock->error -= (interval - offset) << (TICK_LENGTH_SHIFT - clock->shift); + clock->error -= (interval - offset) << + (TICK_LENGTH_SHIFT - clock->shift); } /** diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 96f77013d3f0..baacc3691415 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -96,6 +96,15 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) stats->write_char = p->wchar; stats->read_syscalls = p->syscr; stats->write_syscalls = p->syscw; +#ifdef CONFIG_TASK_IO_ACCOUNTING + stats->read_bytes = p->ioac.read_bytes; + stats->write_bytes = p->ioac.write_bytes; + stats->cancelled_write_bytes = p->ioac.cancelled_write_bytes; +#else + stats->read_bytes = 0; + stats->write_bytes = 0; + stats->cancelled_write_bytes = 0; +#endif } #undef KB #undef MB diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 6b186750e9be..db49886bfae1 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -85,22 +85,19 @@ static inline int is_single_threaded(struct workqueue_struct *wq) return list_empty(&wq->list); } +/* + * Set the workqueue on which a work item is to be run + * - Must *only* be called if the pending flag is set + */ static inline void set_wq_data(struct work_struct *work, void *wq) { - unsigned long new, old, res; + unsigned long new; + + BUG_ON(!work_pending(work)); - /* assume the pending flag is already set and that the task has already - * been queued on this workqueue */ new = (unsigned long) wq | (1UL << WORK_STRUCT_PENDING); - res = work->management; - if (res != new) { - do { - old = res; - new = (unsigned long) wq; - new |= (old & WORK_STRUCT_FLAG_MASK); - res = cmpxchg(&work->management, old, new); - } while (res != old); - } + new |= work->management & WORK_STRUCT_FLAG_MASK; + work->management = new; } static 
inline void *get_wq_data(struct work_struct *work)