Linux-2.6.12-rc2v2.6.12-rc2

Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip!
author: Linus Torvalds <torvalds@ppc970.osdl.org> 2005-04-16 15:20:36 -0700
committer: Linus Torvalds <torvalds@ppc970.osdl.org> 2005-04-16 15:20:36 -0700
commit: 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch)
tree: 0bba044c4ce775e45a88a51686b5d9f90697ea9d /kernel
67 files changed, 40718 insertions, 0 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
new file mode 100644
index 000000000000..eb88b446c2cc
--- /dev/null
+++ b/kernel/Makefile
@@ -0,0 +1,53 @@
+#
+# Makefile for the linux kernel.
+#
+
+obj-y     = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
+	    exit.o itimer.o time.o softirq.o resource.o \
+	    sysctl.o capability.o ptrace.o timer.o user.o \
+	    signal.o sys.o kmod.o workqueue.o pid.o \
+	    rcupdate.o intermodule.o extable.o params.o posix-timers.o \
+	    kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o
+
+obj-$(CONFIG_FUTEX) += futex.o
+obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
+obj-$(CONFIG_SMP) += cpu.o spinlock.o
+obj-$(CONFIG_UID16) += uid16.o
+obj-$(CONFIG_MODULES) += module.o
+obj-$(CONFIG_KALLSYMS) += kallsyms.o
+obj-$(CONFIG_PM) += power/
+obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
+obj-$(CONFIG_COMPAT) += compat.o
+obj-$(CONFIG_CPUSETS) += cpuset.o
+obj-$(CONFIG_IKCONFIG) += configs.o
+obj-$(CONFIG_IKCONFIG_PROC) += configs.o
+obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
+obj-$(CONFIG_AUDIT) += audit.o
+obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
+obj-$(CONFIG_KPROBES) += kprobes.o
+obj-$(CONFIG_SYSFS) += ksysfs.o
+obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
+obj-$(CONFIG_SECCOMP) += seccomp.o
+
+ifneq ($(CONFIG_IA64),y)
+# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
+# needed for x86 only.  Why this used to be enabled for all architectures is beyond
+# me.  I suspect most platforms don't need this, but until we know that for sure
+# I turn this off for IA-64 only.  Andreas Schwab says it's also needed on m68k
+# to get a correct value for the wait-channel (WCHAN in ps). --davidm
+CFLAGS_sched.o := $(PROFILING) -fno-omit-frame-pointer
+endif
+
+$(obj)/configs.o: $(obj)/config_data.h
+
+# config_data.h contains the same information as ikconfig.h but gzipped.
+# Info from config_data can be extracted from /proc/config*
+targets += config_data.gz
+$(obj)/config_data.gz: .config FORCE
+	$(call if_changed,gzip)
+
+quiet_cmd_ikconfiggz = IKCFG   $@
+      cmd_ikconfiggz = (echo "static const char kernel_config_data[] = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;") > $@
+targets += config_data.h
+$(obj)/config_data.h: $(obj)/config_data.gz FORCE
+	$(call if_changed,ikconfiggz)
diff --git a/kernel/acct.c b/kernel/acct.c
new file mode 100644
index 000000000000..4168f631868e
--- /dev/null
+++ b/kernel/acct.c
@@ -0,0 +1,561 @@
+/*
+ *  linux/kernel/acct.c
+ *
+ *  BSD Process Accounting for Linux
+ *
+ *  Author: Marco van Wieringen <mvw@planets.elm.net>
+ *
+ *  Some code based on ideas and code from:
+ *  Thomas K. Dyas <tdyas@eden.rutgers.edu>
+ *
+ *  This file implements BSD-style process accounting. Whenever any
+ *  process exits, an accounting record of type "struct acct" is
+ *  written to the file specified with the acct() system call. It is
+ *  up to user-level programs to do useful things with the accounting
+ *  log. The kernel just provides the raw accounting information.
+ *
+ * (C) Copyright 1995 - 1997 Marco van Wieringen - ELM Consultancy B.V.
+ *
+ *  Plugged two leaks. 1) It didn't return acct_file into the free_filps if
+ *  the file happened to be read-only. 2) If the accounting was suspended
+ *  due to the lack of space it happily allowed to reopen it and completely
+ *  lost the old acct_file. 3/10/98, Al Viro.
+ *
+ *  Now we silently close acct_file on attempt to reopen. Cleaned sys_acct().
+ *  XTerms and EMACS are manifestations of pure evil. 21/10/98, AV.
+ *
+ *  Fixed a nasty interaction with with sys_umount(). If the accointing
+ *  was suspeneded we failed to stop it on umount(). Messy.
+ *  Another one: remount to readonly didn't stop accounting.
+ *	Question: what should we do if we have CAP_SYS_ADMIN but not
+ *  CAP_SYS_PACCT? Current code does the following: umount returns -EBUSY
+ *  unless we are messing with the root. In that case we are getting a
+ *  real mess with do_remount_sb(). 9/11/98, AV.
+ *
+ *  Fixed a bunch of races (and pair of leaks). Probably not the best way,
+ *  but this one obviously doesn't introduce deadlocks. Later. BTW, found
+ *  one race (and leak) in BSD implementation.
+ *  OK, that's better. ANOTHER race and leak in BSD variant. There always
+ *  is one more bug... 10/11/98, AV.
+ *
+ *	Oh, fsck... Oopsable SMP race in do_process_acct() - we must hold
+ * ->mmap_sem to walk the vma list of current->mm. Nasty, since it leaks
+ * a struct file opened for write. Fixed. 2/6/2000, AV.
+ */
+
+#include <linux/config.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/acct.h>
+#include <linux/file.h>
+#include <linux/tty.h>
+#include <linux/security.h>
+#include <linux/vfs.h>
+#include <linux/jiffies.h>
+#include <linux/times.h>
+#include <linux/syscalls.h>
+#include <asm/uaccess.h>
+#include <asm/div64.h>
+#include <linux/blkdev.h> /* sector_div */
+
+/*
+ * These constants control the amount of freespace that suspend and
+ * resume the process accounting system, and the time delay between
+ * each check.
+ * Turned into sysctl-controllable parameters. AV, 12/11/98
+ */
+
+int acct_parm[3] = {4, 2, 30};
+#define RESUME		(acct_parm[0])	/* >foo% free space - resume */
+#define SUSPEND		(acct_parm[1])	/* <foo% free space - suspend */
+#define ACCT_TIMEOUT	(acct_parm[2])	/* foo second timeout between checks */
+
+/*
+ * External references and all of the globals.
+ */
+static void do_acct_process(long, struct file *);
+
+/*
+ * This structure is used so that all the data protected by lock
+ * can be placed in the same cache line as the lock.  This primes
+ * the cache line to have the data after getting the lock.
+ */
+struct acct_glbs {
+	spinlock_t		lock;
+	volatile int		active;
+	volatile int		needcheck;
+	struct file		*file;
+	struct timer_list	timer;
+};
+
+static struct acct_glbs acct_globals __cacheline_aligned = {SPIN_LOCK_UNLOCKED};
+
+/*
+ * Called whenever the timer says to check the free space.
+ */
+static void acct_timeout(unsigned long unused)
+{
+	acct_globals.needcheck = 1;
+}
+
+/*
+ * Check the amount of free space and suspend/resume accordingly.
+ */
+static int check_free_space(struct file *file)
+{
+	struct kstatfs sbuf;
+	int res;
+	int act;
+	sector_t resume;
+	sector_t suspend;
+
+	spin_lock(&acct_globals.lock);
+	res = acct_globals.active;
+	if (!file || !acct_globals.needcheck)
+		goto out;
+	spin_unlock(&acct_globals.lock);
+
+	/* May block */
+	if (vfs_statfs(file->f_dentry->d_inode->i_sb, &sbuf))
+		return res;
+	suspend = sbuf.f_blocks * SUSPEND;
+	resume = sbuf.f_blocks * RESUME;
+
+	sector_div(suspend, 100);
+	sector_div(resume, 100);
+
+	if (sbuf.f_bavail <= suspend)
+		act = -1;
+	else if (sbuf.f_bavail >= resume)
+		act = 1;
+	else
+		act = 0;
+
+	/*
+	 * If some joker switched acct_globals.file under us we'ld better be
+	 * silent and _not_ touch anything.
+	 */
+	spin_lock(&acct_globals.lock);
+	if (file != acct_globals.file) {
+		if (act)
+			res = act>0;
+		goto out;
+	}
+
+	if (acct_globals.active) {
+		if (act < 0) {
+			acct_globals.active = 0;
+			printk(KERN_INFO "Process accounting paused\n");
+		}
+	} else {
+		if (act > 0) {
+			acct_globals.active = 1;
+			printk(KERN_INFO "Process accounting resumed\n");
+		}
+	}
+
+	del_timer(&acct_globals.timer);
+	acct_globals.needcheck = 0;
+	acct_globals.timer.expires = jiffies + ACCT_TIMEOUT*HZ;
+	add_timer(&acct_globals.timer);
+	res = acct_globals.active;
+out:
+	spin_unlock(&acct_globals.lock);
+	return res;
+}
+
+/*
+ * Close the old accouting file (if currently open) and then replace
+ * it with file (if non-NULL).
+ *
+ * NOTE: acct_globals.lock MUST be held on entry and exit.
+ */
+static void acct_file_reopen(struct file *file)
+{
+	struct file *old_acct = NULL;
+
+	if (acct_globals.file) {
+		old_acct = acct_globals.file;
+		del_timer(&acct_globals.timer);
+		acct_globals.active = 0;
+		acct_globals.needcheck = 0;
+		acct_globals.file = NULL;
+	}
+	if (file) {
+		acct_globals.file = file;
+		acct_globals.needcheck = 0;
+		acct_globals.active = 1;
+		/* It's been deleted if it was used before so this is safe */
+		init_timer(&acct_globals.timer);
+		acct_globals.timer.function = acct_timeout;
+		acct_globals.timer.expires = jiffies + ACCT_TIMEOUT*HZ;
+		add_timer(&acct_globals.timer);
+	}
+	if (old_acct) {
+		spin_unlock(&acct_globals.lock);
+		do_acct_process(0, old_acct);
+		filp_close(old_acct, NULL);
+		spin_lock(&acct_globals.lock);
+	}
+}
+
+/*
+ *  sys_acct() is the only system call needed to implement process
+ *  accounting. It takes the name of the file where accounting records
+ *  should be written. If the filename is NULL, accounting will be
+ *  shutdown.
+ */
+asmlinkage long sys_acct(const char __user *name)
+{
+	struct file *file = NULL;
+	char *tmp;
+	int error;
+
+	if (!capable(CAP_SYS_PACCT))
+		return -EPERM;
+
+	if (name) {
+		tmp = getname(name);
+		if (IS_ERR(tmp)) {
+			return (PTR_ERR(tmp));
+		}
+		/* Difference from BSD - they don't do O_APPEND */
+		file = filp_open(tmp, O_WRONLY|O_APPEND, 0);
+		putname(tmp);
+		if (IS_ERR(file)) {
+			return (PTR_ERR(file));
+		}
+		if (!S_ISREG(file->f_dentry->d_inode->i_mode)) {
+			filp_close(file, NULL);
+			return (-EACCES);
+		}
+
+		if (!file->f_op->write) {
+			filp_close(file, NULL);
+			return (-EIO);
+		}
+	}
+
+	error = security_acct(file);
+	if (error) {
+		if (file)
+			filp_close(file, NULL);
+		return error;
+	}
+
+	spin_lock(&acct_globals.lock);
+	acct_file_reopen(file);
+	spin_unlock(&acct_globals.lock);
+
+	return (0);
+}
+
+/*
+ * If the accouting is turned on for a file in the filesystem pointed
+ * to by sb, turn accouting off.
+ */
+void acct_auto_close(struct super_block *sb)
+{
+	spin_lock(&acct_globals.lock);
+	if (acct_globals.file &&
+	    acct_globals.file->f_dentry->d_inode->i_sb == sb) {
+		acct_file_reopen((struct file *)NULL);
+	}
+	spin_unlock(&acct_globals.lock);
+}
+
+/*
+ *  encode an unsigned long into a comp_t
+ *
+ *  This routine has been adopted from the encode_comp_t() function in
+ *  the kern_acct.c file of the FreeBSD operating system. The encoding
+ *  is a 13-bit fraction with a 3-bit (base 8) exponent.
+ */
+
+#define	MANTSIZE	13			/* 13 bit mantissa. */
+#define	EXPSIZE		3			/* Base 8 (3 bit) exponent. */
+#define	MAXFRACT	((1 << MANTSIZE) - 1)	/* Maximum fractional value. */
+
+static comp_t encode_comp_t(unsigned long value)
+{
+	int exp, rnd;
+
+	exp = rnd = 0;
+	while (value > MAXFRACT) {
+		rnd = value & (1 << (EXPSIZE - 1));	/* Round up? */
+		value >>= EXPSIZE;	/* Base 8 exponent == 3 bit shift. */
+		exp++;
+	}
+
+	/*
+         * If we need to round up, do it (and handle overflow correctly).
+         */
+	if (rnd && (++value > MAXFRACT)) {
+		value >>= EXPSIZE;
+		exp++;
+	}
+
+	/*
+         * Clean it up and polish it off.
+         */
+	exp <<= MANTSIZE;		/* Shift the exponent into place */
+	exp += value;			/* and add on the mantissa. */
+	return exp;
+}
+
+#if ACCT_VERSION==1 || ACCT_VERSION==2
+/*
+ * encode an u64 into a comp2_t (24 bits)
+ *
+ * Format: 5 bit base 2 exponent, 20 bits mantissa.
+ * The leading bit of the mantissa is not stored, but implied for
+ * non-zero exponents.
+ * Largest encodable value is 50 bits.
+ */
+
+#define MANTSIZE2       20                      /* 20 bit mantissa. */
+#define EXPSIZE2        5                       /* 5 bit base 2 exponent. */
+#define MAXFRACT2       ((1ul << MANTSIZE2) - 1) /* Maximum fractional value. */
+#define MAXEXP2         ((1 <<EXPSIZE2) - 1)    /* Maximum exponent. */
+
+static comp2_t encode_comp2_t(u64 value)
+{
+        int exp, rnd;
+
+        exp = (value > (MAXFRACT2>>1));
+        rnd = 0;
+        while (value > MAXFRACT2) {
+                rnd = value & 1;
+                value >>= 1;
+                exp++;
+        }
+
+        /*
+         * If we need to round up, do it (and handle overflow correctly).
+         */
+        if (rnd && (++value > MAXFRACT2)) {
+                value >>= 1;
+                exp++;
+        }
+
+        if (exp > MAXEXP2) {
+                /* Overflow. Return largest representable number instead. */
+                return (1ul << (MANTSIZE2+EXPSIZE2-1)) - 1;
+        } else {
+                return (value & (MAXFRACT2>>1)) | (exp << (MANTSIZE2-1));
+        }
+}
+#endif
+
+#if ACCT_VERSION==3
+/*
+ * encode an u64 into a 32 bit IEEE float
+ */
+static u32 encode_float(u64 value)
+{
+	unsigned exp = 190;
+	unsigned u;
+
+	if (value==0) return 0;
+	while ((s64)value > 0){
+		value <<= 1;
+		exp--;
+	}
+	u = (u32)(value >> 40) & 0x7fffffu;
+	return u | (exp << 23);
+}
+#endif
+
+/*
+ *  Write an accounting entry for an exiting process
+ *
+ *  The acct_process() call is the workhorse of the process
+ *  accounting system. The struct acct is built here and then written
+ *  into the accounting file. This function should only be called from
+ *  do_exit().
+ */
+
+/*
+ *  do_acct_process does all actual work. Caller holds the reference to file.
+ */
+static void do_acct_process(long exitcode, struct file *file)
+{
+	acct_t ac;
+	mm_segment_t fs;
+	unsigned long vsize;
+	unsigned long flim;
+	u64 elapsed;
+	u64 run_time;
+	struct timespec uptime;
+
+	/*
+	 * First check to see if there is enough free_space to continue
+	 * the process accounting system.
+	 */
+	if (!check_free_space(file))
+		return;
+
+	/*
+	 * Fill the accounting struct with the needed info as recorded
+	 * by the different kernel functions.
+	 */
+	memset((caddr_t)&ac, 0, sizeof(acct_t));
+
+	ac.ac_version = ACCT_VERSION | ACCT_BYTEORDER;
+	strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm));
+
+	/* calculate run_time in nsec*/
+	do_posix_clock_monotonic_gettime(&uptime);
+	run_time = (u64)uptime.tv_sec*NSEC_PER_SEC + uptime.tv_nsec;
+	run_time -= (u64)current->start_time.tv_sec*NSEC_PER_SEC
+					+ current->start_time.tv_nsec;
+	/* convert nsec -> AHZ */
+	elapsed = nsec_to_AHZ(run_time);
+#if ACCT_VERSION==3
+	ac.ac_etime = encode_float(elapsed);
+#else
+	ac.ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ?
+	                       (unsigned long) elapsed : (unsigned long) -1l);
+#endif
+#if ACCT_VERSION==1 || ACCT_VERSION==2
+	{
+		/* new enlarged etime field */
+		comp2_t etime = encode_comp2_t(elapsed);
+		ac.ac_etime_hi = etime >> 16;
+		ac.ac_etime_lo = (u16) etime;
+	}
+#endif
+	do_div(elapsed, AHZ);
+	ac.ac_btime = xtime.tv_sec - elapsed;
+	ac.ac_utime = encode_comp_t(jiffies_to_AHZ(
+					    current->signal->utime +
+					    current->group_leader->utime));
+	ac.ac_stime = encode_comp_t(jiffies_to_AHZ(
+					    current->signal->stime +
+					    current->group_leader->stime));
+	/* we really need to bite the bullet and change layout */
+	ac.ac_uid = current->uid;
+	ac.ac_gid = current->gid;
+#if ACCT_VERSION==2
+	ac.ac_ahz = AHZ;
+#endif
+#if ACCT_VERSION==1 || ACCT_VERSION==2
+	/* backward-compatible 16 bit fields */
+	ac.ac_uid16 = current->uid;
+	ac.ac_gid16 = current->gid;
+#endif
+#if ACCT_VERSION==3
+	ac.ac_pid = current->tgid;
+	ac.ac_ppid = current->parent->tgid;
+#endif
+
+	read_lock(&tasklist_lock);	/* pin current->signal */
+	ac.ac_tty = current->signal->tty ?
+		old_encode_dev(tty_devnum(current->signal->tty)) : 0;
+	read_unlock(&tasklist_lock);
+
+	ac.ac_flag = 0;
+	if (current->flags & PF_FORKNOEXEC)
+		ac.ac_flag |= AFORK;
+	if (current->flags & PF_SUPERPRIV)
+		ac.ac_flag |= ASU;
+	if (current->flags & PF_DUMPCORE)
+		ac.ac_flag |= ACORE;
+	if (current->flags & PF_SIGNALED)
+		ac.ac_flag |= AXSIG;
+
+	vsize = 0;
+	if (current->mm) {
+		struct vm_area_struct *vma;
+		down_read(&current->mm->mmap_sem);
+		vma = current->mm->mmap;
+		while (vma) {
+			vsize += vma->vm_end - vma->vm_start;
+			vma = vma->vm_next;
+		}
+		up_read(&current->mm->mmap_sem);
+	}
+	vsize = vsize / 1024;
+	ac.ac_mem = encode_comp_t(vsize);
+	ac.ac_io = encode_comp_t(0 /* current->io_usage */);	/* %% */
+	ac.ac_rw = encode_comp_t(ac.ac_io / 1024);
+	ac.ac_minflt = encode_comp_t(current->signal->min_flt +
+				     current->group_leader->min_flt);
+	ac.ac_majflt = encode_comp_t(current->signal->maj_flt +
+				     current->group_leader->maj_flt);
+	ac.ac_swaps = encode_comp_t(0);
+	ac.ac_exitcode = exitcode;
+
+	/*
+         * Kernel segment override to datasegment and write it
+         * to the accounting file.
+         */
+	fs = get_fs();
+	set_fs(KERNEL_DS);
+	/*
+ 	 * Accounting records are not subject to resource limits.
+ 	 */
+	flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
+	current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
+	file->f_op->write(file, (char *)&ac,
+			       sizeof(acct_t), &file->f_pos);
+	current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim;
+	set_fs(fs);
+}
+
+/*
+ * acct_process - now just a wrapper around do_acct_process
+ */
+void acct_process(long exitcode)
+{
+	struct file *file = NULL;
+
+	/*
+	 * accelerate the common fastpath:
+	 */
+	if (!acct_globals.file)
+		return;
+
+	spin_lock(&acct_globals.lock);
+	file = acct_globals.file;
+	if (unlikely(!file)) {
+		spin_unlock(&acct_globals.lock);
+		return;
+	}
+	get_file(file);
+	spin_unlock(&acct_globals.lock);
+
+	do_acct_process(exitcode, file);
+	fput(file);
+}
+
+
+/*
+ * acct_update_integrals
+ *    -  update mm integral fields in task_struct
+ */
+void acct_update_integrals(struct task_struct *tsk)
+{
+	if (likely(tsk->mm)) {
+		long delta = tsk->stime - tsk->acct_stimexpd;
+
+		if (delta == 0)
+			return;
+		tsk->acct_stimexpd = tsk->stime;
+		tsk->acct_rss_mem1 += delta * get_mm_counter(tsk->mm, rss);
+		tsk->acct_vm_mem1 += delta * tsk->mm->total_vm;
+	}
+}
+
+/*
+ * acct_clear_integrals
+ *    - clear the mm integral fields in task_struct
+ */
+void acct_clear_integrals(struct task_struct *tsk)
+{
+	if (tsk) {
+		tsk->acct_stimexpd = 0;
+		tsk->acct_rss_mem1 = 0;
+		tsk->acct_vm_mem1 = 0;
+	}
+}
diff --git a/kernel/audit.c b/kernel/audit.c
new file mode 100644
index 000000000000..0f84dd7af2c8
--- /dev/null
+++ b/kernel/audit.c
@@ -0,0 +1,839 @@
+/* audit.c -- Auditing support -*- linux-c -*-
+ * Gateway between the kernel (e.g., selinux) and the user-space audit daemon.
+ * System-call specific features have moved to auditsc.c
+ *
+ * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ * Written by Rickard E. (Rik) Faith <faith@redhat.com>
+ *
+ * Goals: 1) Integrate fully with SELinux.
+ *	  2) Minimal run-time overhead:
+ *	     a) Minimal when syscall auditing is disabled (audit_enable=0).
+ *	     b) Small when syscall auditing is enabled and no audit record
+ *		is generated (defer as much work as possible to record
+ *		generation time):
+ *		i) context is allocated,
+ *		ii) names from getname are stored without a copy, and
+ *		iii) inode information stored from path_lookup.
+ *	  3) Ability to disable syscall auditing at boot time (audit=0).
+ *	  4) Usable by other parts of the kernel (if audit_log* is called,
+ *	     then a syscall record will be generated automatically for the
+ *	     current syscall).
+ *	  5) Netlink interface to user-space.
+ *	  6) Support low-overhead kernel-based filtering to minimize the
+ *	     information that must be passed to user-space.
+ *
+ * Example user-space utilities: http://people.redhat.com/faith/audit/
+ */
+
+#include <linux/init.h>
+#include <asm/atomic.h>
+#include <asm/types.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+
+#include <linux/audit.h>
+
+#include <net/sock.h>
+#include <linux/skbuff.h>
+#include <linux/netlink.h>
+
+/* No auditing will take place until audit_initialized != 0.
+ * (Initialization happens after skb_init is called.) */
+static int	audit_initialized;
+
+/* No syscall auditing will take place unless audit_enabled != 0. */
+int		audit_enabled;
+
+/* Default state when kernel boots without any parameters. */
+static int	audit_default;
+
+/* If auditing cannot proceed, audit_failure selects what happens. */
+static int	audit_failure = AUDIT_FAIL_PRINTK;
+
+/* If audit records are to be written to the netlink socket, audit_pid
+ * contains the (non-zero) pid. */
+static int	audit_pid;
+
+/* If audit_limit is non-zero, limit the rate of sending audit records
+ * to that number per second.  This prevents DoS attacks, but results in
+ * audit records being dropped. */
+static int	audit_rate_limit;
+
+/* Number of outstanding audit_buffers allowed. */
+static int	audit_backlog_limit = 64;
+static atomic_t	audit_backlog	    = ATOMIC_INIT(0);
+
+/* Records can be lost in several ways:
+   0) [suppressed in audit_alloc]
+   1) out of memory in audit_log_start [kmalloc of struct audit_buffer]
+   2) out of memory in audit_log_move [alloc_skb]
+   3) suppressed due to audit_rate_limit
+   4) suppressed due to audit_backlog_limit
+*/
+static atomic_t    audit_lost = ATOMIC_INIT(0);
+
+/* The netlink socket. */
+static struct sock *audit_sock;
+
+/* There are two lists of audit buffers.  The txlist contains audit
+ * buffers that cannot be sent immediately to the netlink device because
+ * we are in an irq context (these are sent later in a tasklet).
+ *
+ * The second list is a list of pre-allocated audit buffers (if more
+ * than AUDIT_MAXFREE are in use, the audit buffer is freed instead of
+ * being placed on the freelist). */
+static DEFINE_SPINLOCK(audit_txlist_lock);
+static DEFINE_SPINLOCK(audit_freelist_lock);
+static int	   audit_freelist_count = 0;
+static LIST_HEAD(audit_txlist);
+static LIST_HEAD(audit_freelist);
+
+/* There are three lists of rules -- one to search at task creation
+ * time, one to search at syscall entry time, and another to search at
+ * syscall exit time. */
+static LIST_HEAD(audit_tsklist);
+static LIST_HEAD(audit_entlist);
+static LIST_HEAD(audit_extlist);
+
+/* The netlink socket is only to be read by 1 CPU, which lets us assume
+ * that list additions and deletions never happen simultaneiously in
+ * auditsc.c */
+static DECLARE_MUTEX(audit_netlink_sem);
+
+/* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting
+ * audit records.  Since printk uses a 1024 byte buffer, this buffer
+ * should be at least that large. */
+#define AUDIT_BUFSIZ 1024
+
+/* AUDIT_MAXFREE is the number of empty audit_buffers we keep on the
+ * audit_freelist.  Doing so eliminates many kmalloc/kfree calls. */
+#define AUDIT_MAXFREE  (2*NR_CPUS)
+
+/* The audit_buffer is used when formatting an audit record.  The caller
+ * locks briefly to get the record off the freelist or to allocate the
+ * buffer, and locks briefly to send the buffer to the netlink layer or
+ * to place it on a transmit queue.  Multiple audit_buffers can be in
+ * use simultaneously. */
+struct audit_buffer {
+	struct list_head     list;
+	struct sk_buff_head  sklist;	/* formatted skbs ready to send */
+	struct audit_context *ctx;	/* NULL or associated context */
+	int		     len;	/* used area of tmp */
+	char		     tmp[AUDIT_BUFSIZ];
+
+				/* Pointer to header and contents */
+	struct nlmsghdr      *nlh;
+	int		     total;
+	int		     type;
+	int		     pid;
+	int		     count; /* Times requeued */
+};
+
+void audit_set_type(struct audit_buffer *ab, int type)
+{
+	ab->type = type;
+}
+
+struct audit_entry {
+	struct list_head  list;
+	struct audit_rule rule;
+};
+
+static void audit_log_end_irq(struct audit_buffer *ab);
+static void audit_log_end_fast(struct audit_buffer *ab);
+
+static void audit_panic(const char *message)
+{
+	switch (audit_failure)
+	{
+	case AUDIT_FAIL_SILENT:
+		break;
+	case AUDIT_FAIL_PRINTK:
+		printk(KERN_ERR "audit: %s\n", message);
+		break;
+	case AUDIT_FAIL_PANIC:
+		panic("audit: %s\n", message);
+		break;
+	}
+}
+
+static inline int audit_rate_check(void)
+{
+	static unsigned long	last_check = 0;
+	static int		messages   = 0;
+	static DEFINE_SPINLOCK(lock);
+	unsigned long		flags;
+	unsigned long		now;
+	unsigned long		elapsed;
+	int			retval	   = 0;
+
+	if (!audit_rate_limit) return 1;
+
+	spin_lock_irqsave(&lock, flags);
+	if (++messages < audit_rate_limit) {
+		retval = 1;
+	} else {
+		now     = jiffies;
+		elapsed = now - last_check;
+		if (elapsed > HZ) {
+			last_check = now;
+			messages   = 0;
+			retval     = 1;
+		}
+	}
+	spin_unlock_irqrestore(&lock, flags);
+
+	return retval;
+}
+
+/* Emit at least 1 message per second, even if audit_rate_check is
+ * throttling. */
+void audit_log_lost(const char *message)
+{
+	static unsigned long	last_msg = 0;
+	static DEFINE_SPINLOCK(lock);
+	unsigned long		flags;
+	unsigned long		now;
+	int			print;
+
+	atomic_inc(&audit_lost);
+
+	print = (audit_failure == AUDIT_FAIL_PANIC || !audit_rate_limit);
+
+	if (!print) {
+		spin_lock_irqsave(&lock, flags);
+		now = jiffies;
+		if (now - last_msg > HZ) {
+			print = 1;
+			last_msg = now;
+		}
+		spin_unlock_irqrestore(&lock, flags);
+	}
+
+	if (print) {
+		printk(KERN_WARNING
+		       "audit: audit_lost=%d audit_backlog=%d"
+		       " audit_rate_limit=%d audit_backlog_limit=%d\n",
+		       atomic_read(&audit_lost),
+		       atomic_read(&audit_backlog),
+		       audit_rate_limit,
+		       audit_backlog_limit);
+		audit_panic(message);
+	}
+
+}
+
+static int audit_set_rate_limit(int limit)
+{
+	int old		 = audit_rate_limit;
+	audit_rate_limit = limit;
+	audit_log(current->audit_context, "audit_rate_limit=%d old=%d",
+		  audit_rate_limit, old);
+	return old;
+}
+
+static int audit_set_backlog_limit(int limit)
+{
+	int old		 = audit_backlog_limit;
+	audit_backlog_limit = limit;
+	audit_log(current->audit_context, "audit_backlog_limit=%d old=%d",
+		  audit_backlog_limit, old);
+	return old;
+}
+
+static int audit_set_enabled(int state)
+{
+	int old		 = audit_enabled;
+	if (state != 0 && state != 1)
+		return -EINVAL;
+	audit_enabled = state;
+	audit_log(current->audit_context, "audit_enabled=%d old=%d",
+		  audit_enabled, old);
+	return old;
+}
+
+static int audit_set_failure(int state)
+{
+	int old		 = audit_failure;
+	if (state != AUDIT_FAIL_SILENT
+	    && state != AUDIT_FAIL_PRINTK
+	    && state != AUDIT_FAIL_PANIC)
+		return -EINVAL;
+	audit_failure = state;
+	audit_log(current->audit_context, "audit_failure=%d old=%d",
+		  audit_failure, old);
+	return old;
+}
+
+#ifdef CONFIG_NET
+void audit_send_reply(int pid, int seq, int type, int done, int multi,
+		      void *payload, int size)
+{
+	struct sk_buff	*skb;
+	struct nlmsghdr	*nlh;
+	int		len = NLMSG_SPACE(size);
+	void		*data;
+	int		flags = multi ? NLM_F_MULTI : 0;
+	int		t     = done  ? NLMSG_DONE  : type;
+
+	skb = alloc_skb(len, GFP_KERNEL);
+	if (!skb)
+		goto nlmsg_failure;
+
+	nlh		 = NLMSG_PUT(skb, pid, seq, t, len - sizeof(*nlh));
+	nlh->nlmsg_flags = flags;
+	data		 = NLMSG_DATA(nlh);
+	memcpy(data, payload, size);
+	netlink_unicast(audit_sock, skb, pid, MSG_DONTWAIT);
+	return;
+
+nlmsg_failure:			/* Used by NLMSG_PUT */
+	if (skb)
+		kfree_skb(skb);
+}
+
+/*
+ * Check for appropriate CAP_AUDIT_ capabilities on incoming audit
+ * control messages.
+ */
+static int audit_netlink_ok(kernel_cap_t eff_cap, u16 msg_type)
+{
+	int err = 0;
+
+	switch (msg_type) {
+	case AUDIT_GET:
+	case AUDIT_LIST:
+	case AUDIT_SET:
+	case AUDIT_ADD:
+	case AUDIT_DEL:
+		if (!cap_raised(eff_cap, CAP_AUDIT_CONTROL))
+			err = -EPERM;
+		break;
+	case AUDIT_USER:
+		if (!cap_raised(eff_cap, CAP_AUDIT_WRITE))
+			err = -EPERM;
+		break;
+	default:  /* bad msg */
+		err = -EINVAL;
+	}
+
+	return err;
+}
+
+static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+	u32			uid, pid, seq;
+	void			*data;
+	struct audit_status	*status_get, status_set;
+	int			err;
+	struct audit_buffer	*ab;
+	u16			msg_type = nlh->nlmsg_type;
+
+	err = audit_netlink_ok(NETLINK_CB(skb).eff_cap, msg_type);
+	if (err)
+		return err;
+
+	pid  = NETLINK_CREDS(skb)->pid;
+	uid  = NETLINK_CREDS(skb)->uid;
+	seq  = nlh->nlmsg_seq;
+	data = NLMSG_DATA(nlh);
+
+	switch (msg_type) {
+	case AUDIT_GET:
+		status_set.enabled	 = audit_enabled;
+		status_set.failure	 = audit_failure;
+		status_set.pid		 = audit_pid;
+		status_set.rate_limit	 = audit_rate_limit;
+		status_set.backlog_limit = audit_backlog_limit;
+		status_set.lost		 = atomic_read(&audit_lost);
+		status_set.backlog	 = atomic_read(&audit_backlog);
+		audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_GET, 0, 0,
+				 &status_set, sizeof(status_set));
+		break;
+	case AUDIT_SET:
+		if (nlh->nlmsg_len < sizeof(struct audit_status))
+			return -EINVAL;
+		status_get   = (struct audit_status *)data;
+		if (status_get->mask & AUDIT_STATUS_ENABLED) {
+			err = audit_set_enabled(status_get->enabled);
+			if (err < 0) return err;
+		}
+		if (status_get->mask & AUDIT_STATUS_FAILURE) {
+			err = audit_set_failure(status_get->failure);
+			if (err < 0) return err;
+		}
+		if (status_get->mask & AUDIT_STATUS_PID) {
+			int old   = audit_pid;
+			audit_pid = status_get->pid;
+			audit_log(current->audit_context,
+				  "audit_pid=%d old=%d", audit_pid, old);
+		}
+		if (status_get->mask & AUDIT_STATUS_RATE_LIMIT)
+			audit_set_rate_limit(status_get->rate_limit);
+		if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT)
+			audit_set_backlog_limit(status_get->backlog_limit);
+		break;
+	case AUDIT_USER:
+		ab = audit_log_start(NULL);
+		if (!ab)
+			break;	/* audit_panic has been called */
+		audit_log_format(ab,
+				 "user pid=%d uid=%d length=%d msg='%.1024s'",
+				 pid, uid,
+				 (int)(nlh->nlmsg_len
+				       - ((char *)data - (char *)nlh)),
+				 (char *)data);
+		ab->type = AUDIT_USER;
+		ab->pid  = pid;
+		audit_log_end(ab);
+		break;
+	case AUDIT_ADD:
+	case AUDIT_DEL:
+		if (nlh->nlmsg_len < sizeof(struct audit_rule))
+			return -EINVAL;
+		/* fallthrough */
+	case AUDIT_LIST:
+#ifdef CONFIG_AUDITSYSCALL
+		err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid,
+					   uid, seq, data);
+#else
+		err = -EOPNOTSUPP;
+#endif
+		break;
+	default:
+		err = -EINVAL;
+		break;
+	}
+
+	return err < 0 ? err : 0;
+}
+
+/* Get message from skb (based on rtnetlink_rcv_skb).  Each message is
+ * processed by audit_receive_msg.  Malformed skbs with wrong length are
+ * discarded silently.  */
+static int audit_receive_skb(struct sk_buff *skb)
+{
+	int		err;
+	struct nlmsghdr	*nlh;
+	u32		rlen;
+
+	while (skb->len >= NLMSG_SPACE(0)) {
+		nlh = (struct nlmsghdr *)skb->data;
+		if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len)
+			return 0;
+		rlen = NLMSG_ALIGN(nlh->nlmsg_len);
+		if (rlen > skb->len)
+			rlen = skb->len;
+		if ((err = audit_receive_msg(skb, nlh))) {
+			netlink_ack(skb, nlh, err);
+		} else if (nlh->nlmsg_flags & NLM_F_ACK)
+			netlink_ack(skb, nlh, 0);
+		skb_pull(skb, rlen);
+	}
+	return 0;
+}
+
+/* Receive messages from netlink socket. */
+static void audit_receive(struct sock *sk, int length)
+{
+	struct sk_buff  *skb;
+
+	if (down_trylock(&audit_netlink_sem))
+		return;
+
+				/* FIXME: this must not cause starvation */
+	while ((skb = skb_dequeue(&sk->sk_receive_queue))) {
+		if (audit_receive_skb(skb) && skb->len)
+			skb_queue_head(&sk->sk_receive_queue, skb);
+		else
+			kfree_skb(skb);
+	}
+	up(&audit_netlink_sem);
+}
+
+/* Move data from tmp buffer into an skb.  This is an extra copy, and
+ * that is unfortunate.  However, the copy will only occur when a record
+ * is being written to user space, which is already a high-overhead
+ * operation.  (Elimination of the copy is possible, for example, by
+ * writing directly into a pre-allocated skb, at the cost of wasting
+ * memory. */
+static void audit_log_move(struct audit_buffer *ab)
+{
+	struct sk_buff	*skb;
+	char		*start;
+	int		extra = ab->nlh ? 0 : NLMSG_SPACE(0);
+
+	/* possible resubmission */
+	if (ab->len == 0)
+		return;
+
+	skb = skb_peek(&ab->sklist);
+	if (!skb || skb_tailroom(skb) <= ab->len + extra) {
+		skb = alloc_skb(2 * ab->len + extra, GFP_ATOMIC);
+		if (!skb) {
+			ab->len = 0; /* Lose information in ab->tmp */
+			audit_log_lost("out of memory in audit_log_move");
+			return;
+		}
+		__skb_queue_tail(&ab->sklist, skb);
+		if (!ab->nlh)
+			ab->nlh = (struct nlmsghdr *)skb_put(skb,
+							     NLMSG_SPACE(0));
+	}
+	start = skb_put(skb, ab->len);
+	memcpy(start, ab->tmp, ab->len);
+	ab->len = 0;
+}
+
+/* Iterate over the skbuff in the audit_buffer, sending their contents
+ * to user space. */
+static inline int audit_log_drain(struct audit_buffer *ab)
+{
+	struct sk_buff *skb;
+
+	while ((skb = skb_dequeue(&ab->sklist))) {
+		int retval = 0;
+
+		if (audit_pid) {
+			if (ab->nlh) {
+				ab->nlh->nlmsg_len   = ab->total;
+				ab->nlh->nlmsg_type  = ab->type;
+				ab->nlh->nlmsg_flags = 0;
+				ab->nlh->nlmsg_seq   = 0;
+				ab->nlh->nlmsg_pid   = ab->pid;
+			}
+			skb_get(skb); /* because netlink_* frees */
+			retval = netlink_unicast(audit_sock, skb, audit_pid,
+						 MSG_DONTWAIT);
+		}
+		if (retval == -EAGAIN && ab->count < 5) {
+			++ab->count;
+			skb_queue_tail(&ab->sklist, skb);
+			audit_log_end_irq(ab);
+			return 1;
+		}
+		if (retval < 0) {
+			if (retval == -ECONNREFUSED) {
+				printk(KERN_ERR
+				       "audit: *NO* daemon at audit_pid=%d\n",
+				       audit_pid);
+				audit_pid = 0;
+			} else
+				audit_log_lost("netlink socket too busy");
+		}
+		if (!audit_pid) { /* No daemon */
+			int offset = ab->nlh ? NLMSG_SPACE(0) : 0;
+			int len    = skb->len - offset;
+			printk(KERN_ERR "%*.*s\n",
+			       len, len, skb->data + offset);
+		}
+		kfree_skb(skb);
+		ab->nlh = NULL;
+	}
+	return 0;
+}
+
+/* Initialize audit support at boot time. */
+static int __init audit_init(void)
+{
+	printk(KERN_INFO "audit: initializing netlink socket (%s)\n",
+	       audit_default ? "enabled" : "disabled");
+	audit_sock = netlink_kernel_create(NETLINK_AUDIT, audit_receive);
+	if (!audit_sock)
+		audit_panic("cannot initialize netlink socket");
+
+	audit_initialized = 1;
+	audit_enabled = audit_default;
+	audit_log(NULL, "initialized");
+	return 0;
+}
+
+#else
+/* Without CONFIG_NET, we have no skbuffs.  For now, print what we have
+ * in the buffer. */
+static void audit_log_move(struct audit_buffer *ab)
+{
+	printk(KERN_ERR "%*.*s\n", ab->len, ab->len, ab->tmp);
+	ab->len = 0;
+}
+
+static inline int audit_log_drain(struct audit_buffer *ab)
+{
+	return 0;
+}
+
+/* Initialize audit support at boot time. */
+int __init audit_init(void)
+{
+	printk(KERN_INFO "audit: initializing WITHOUT netlink support\n");
+	audit_sock = NULL;
+	audit_pid  = 0;
+
+	audit_initialized = 1;
+	audit_enabled = audit_default;
+	audit_log(NULL, "initialized");
+	return 0;
+}
+#endif
+
+__initcall(audit_init);
+
+/* Process kernel command-line parameter at boot time.  audit=0 or audit=1. */
+static int __init audit_enable(char *str)
+{
+	audit_default = !!simple_strtol(str, NULL, 0);
+	printk(KERN_INFO "audit: %s%s\n",
+	       audit_default ? "enabled" : "disabled",
+	       audit_initialized ? "" : " (after initialization)");
+	if (audit_initialized)
+		audit_enabled = audit_default;
+	return 0;
+}
+
+__setup("audit=", audit_enable);
+
+
+/* Obtain an audit buffer.  This routine does locking to obtain the
+ * audit buffer, but then no locking is required for calls to
+ * audit_log_*format.  If the tsk is a task that is currently in a
+ * syscall, then the syscall is marked as auditable and an audit record
+ * will be written at syscall exit.  If there is no associated task, tsk
+ * should be NULL. */
+struct audit_buffer *audit_log_start(struct audit_context *ctx)
+{
+	struct audit_buffer	*ab	= NULL;
+	unsigned long		flags;
+	struct timespec		t;
+	int			serial	= 0;
+
+	if (!audit_initialized)
+		return NULL;
+
+	if (audit_backlog_limit
+	    && atomic_read(&audit_backlog) > audit_backlog_limit) {
+		if (audit_rate_check())
+			printk(KERN_WARNING
+			       "audit: audit_backlog=%d > "
+			       "audit_backlog_limit=%d\n",
+			       atomic_read(&audit_backlog),
+			       audit_backlog_limit);
+		audit_log_lost("backlog limit exceeded");
+		return NULL;
+	}
+
+	spin_lock_irqsave(&audit_freelist_lock, flags);
+	if (!list_empty(&audit_freelist)) {
+		ab = list_entry(audit_freelist.next,
+				struct audit_buffer, list);
+		list_del(&ab->list);
+		--audit_freelist_count;
+	}
+	spin_unlock_irqrestore(&audit_freelist_lock, flags);
+
+	if (!ab)
+		ab = kmalloc(sizeof(*ab), GFP_ATOMIC);
+	if (!ab) {
+		audit_log_lost("out of memory in audit_log_start");
+		return NULL;
+	}
+
+	atomic_inc(&audit_backlog);
+	skb_queue_head_init(&ab->sklist);
+
+	ab->ctx   = ctx;
+	ab->len   = 0;
+	ab->nlh   = NULL;
+	ab->total = 0;
+	ab->type  = AUDIT_KERNEL;
+	ab->pid   = 0;
+	ab->count = 0;
+
+#ifdef CONFIG_AUDITSYSCALL
+	if (ab->ctx)
+		audit_get_stamp(ab->ctx, &t, &serial);
+	else
+#endif
+		t = CURRENT_TIME;
+
+	audit_log_format(ab, "audit(%lu.%03lu:%u): ",
+			 t.tv_sec, t.tv_nsec/1000000, serial);
+	return ab;
+}
+
+
+/* Format an audit message into the audit buffer.  If there isn't enough
+ * room in the audit buffer, more room will be allocated and vsnprint
+ * will be called a second time.  Currently, we assume that a printk
+ * can't format message larger than 1024 bytes, so we don't either. */
+static void audit_log_vformat(struct audit_buffer *ab, const char *fmt,
+			      va_list args)
+{
+	int len, avail;
+
+	if (!ab)
+		return;
+
+	avail = sizeof(ab->tmp) - ab->len;
+	if (avail <= 0) {
+		audit_log_move(ab);
+		avail = sizeof(ab->tmp) - ab->len;
+	}
+	len   = vsnprintf(ab->tmp + ab->len, avail, fmt, args);
+	if (len >= avail) {
+		/* The printk buffer is 1024 bytes long, so if we get
+		 * here and AUDIT_BUFSIZ is at least 1024, then we can
+		 * log everything that printk could have logged. */
+		audit_log_move(ab);
+		avail = sizeof(ab->tmp) - ab->len;
+		len   = vsnprintf(ab->tmp + ab->len, avail, fmt, args);
+	}
+	ab->len   += (len < avail) ? len : avail;
+	ab->total += (len < avail) ? len : avail;
+}
+
+/* Format a message into the audit buffer.  All the work is done in
+ * audit_log_vformat. */
+void audit_log_format(struct audit_buffer *ab, const char *fmt, ...)
+{
+	va_list args;
+
+	if (!ab)
+		return;
+	va_start(args, fmt);
+	audit_log_vformat(ab, fmt, args);
+	va_end(args);
+}
+
+/* This is a helper-function to print the d_path without using a static
+ * buffer or allocating another buffer in addition to the one in
+ * audit_buffer. */
+void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
+		      struct dentry *dentry, struct vfsmount *vfsmnt)
+{
+	char *p;
+	int  len, avail;
+
+	if (prefix) audit_log_format(ab, " %s", prefix);
+
+	if (ab->len > 128)
+		audit_log_move(ab);
+	avail = sizeof(ab->tmp) - ab->len;
+	p = d_path(dentry, vfsmnt, ab->tmp + ab->len, avail);
+	if (IS_ERR(p)) {
+		/* FIXME: can we save some information here? */
+		audit_log_format(ab, "<toolong>");
+	} else {
+				/* path isn't at start of buffer */
+		len	   = (ab->tmp + sizeof(ab->tmp) - 1) - p;
+		memmove(ab->tmp + ab->len, p, len);
+		ab->len   += len;
+		ab->total += len;
+	}
+}
+
+/* Remove queued messages from the audit_txlist and send them to userspace. */
+static void audit_tasklet_handler(unsigned long arg)
+{
+	LIST_HEAD(list);
+	struct audit_buffer *ab;
+	unsigned long	    flags;
+
+	spin_lock_irqsave(&audit_txlist_lock, flags);
+	list_splice_init(&audit_txlist, &list);
+	spin_unlock_irqrestore(&audit_txlist_lock, flags);
+
+	while (!list_empty(&list)) {
+		ab = list_entry(list.next, struct audit_buffer, list);
+		list_del(&ab->list);
+		audit_log_end_fast(ab);
+	}
+}
+
+static DECLARE_TASKLET(audit_tasklet, audit_tasklet_handler, 0);
+
+/* The netlink_* functions cannot be called inside an irq context, so
+ * the audit buffer is places on a queue and a tasklet is scheduled to
+ * remove them from the queue outside the irq context.  May be called in
+ * any context. */
+static void audit_log_end_irq(struct audit_buffer *ab)
+{
+	unsigned long flags;
+
+	if (!ab)
+		return;
+	spin_lock_irqsave(&audit_txlist_lock, flags);
+	list_add_tail(&ab->list, &audit_txlist);
+	spin_unlock_irqrestore(&audit_txlist_lock, flags);
+
+	tasklet_schedule(&audit_tasklet);
+}
+
+/* Send the message in the audit buffer directly to user space.  May not
+ * be called in an irq context. */
+static void audit_log_end_fast(struct audit_buffer *ab)
+{
+	unsigned long flags;
+
+	BUG_ON(in_irq());
+	if (!ab)
+		return;
+	if (!audit_rate_check()) {
+		audit_log_lost("rate limit exceeded");
+	} else {
+		audit_log_move(ab);
+		if (audit_log_drain(ab))
+			return;
+	}
+
+	atomic_dec(&audit_backlog);
+	spin_lock_irqsave(&audit_freelist_lock, flags);
+	if (++audit_freelist_count > AUDIT_MAXFREE)
+		kfree(ab);
+	else
+		list_add(&ab->list, &audit_freelist);
+	spin_unlock_irqrestore(&audit_freelist_lock, flags);
+}
+
+/* Send or queue the message in the audit buffer, depending on the
+ * current context.  (A convenience function that may be called in any
+ * context.) */
+void audit_log_end(struct audit_buffer *ab)
+{
+	if (in_irq())
+		audit_log_end_irq(ab);
+	else
+		audit_log_end_fast(ab);
+}
+
+/* Log an audit record.  This is a convenience function that calls
+ * audit_log_start, audit_log_vformat, and audit_log_end.  It may be
+ * called in any context. */
+void audit_log(struct audit_context *ctx, const char *fmt, ...)
+{
+	struct audit_buffer *ab;
+	va_list args;
+
+	ab = audit_log_start(ctx);
+	if (ab) {
+		va_start(args, fmt);
+		audit_log_vformat(ab, fmt, args);
+		va_end(args);
+		audit_log_end(ab);
+	}
+}
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
new file mode 100644
index 000000000000..8c454852d6a5
--- /dev/null
+++ b/kernel/auditsc.c
@@ -0,0 +1,1015 @@
+/* auditsc.c -- System-call auditing support -*- linux-c -*-
+ * Handles all system-call specific auditing features.
+ *
+ * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ * Written by Rickard E. (Rik) Faith <faith@redhat.com>
+ *
+ * Many of the ideas implemented here are from Stephen C. Tweedie,
+ * especially the idea of avoiding a copy by using getname.
+ *
+ * The method for actual interception of syscall entry and exit (not in
+ * this file -- see entry.S) is based on a GPL'd patch written by
+ * okir@suse.de and Copyright 2003 SuSE Linux AG.
+ *
+ */
+
+#include <linux/init.h>
+#include <asm/atomic.h>
+#include <asm/types.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+
+#include <linux/audit.h>
+#include <linux/personality.h>
+#include <linux/time.h>
+#include <asm/unistd.h>
+
+/* 0 = no checking
+   1 = put_count checking
+   2 = verbose put_count checking
+*/
+#define AUDIT_DEBUG 0
+
+/* No syscall auditing will take place unless audit_enabled != 0. */
+extern int audit_enabled;
+
+/* AUDIT_NAMES is the number of slots we reserve in the audit_context
+ * for saving names from getname(). */
+#define AUDIT_NAMES    20
+
+/* AUDIT_NAMES_RESERVED is the number of slots we reserve in the
+ * audit_context from being used for nameless inodes from
+ * path_lookup. */
+#define AUDIT_NAMES_RESERVED 7
+
+/* At task start time, the audit_state is set in the audit_context using
+   a per-task filter.  At syscall entry, the audit_state is augmented by
+   the syscall filter. */
+enum audit_state {
+	AUDIT_DISABLED,		/* Do not create per-task audit_context.
+				 * No syscall-specific audit records can
+				 * be generated. */
+	AUDIT_SETUP_CONTEXT,	/* Create the per-task audit_context,
+				 * but don't necessarily fill it in at
+				 * syscall entry time (i.e., filter
+				 * instead). */
+	AUDIT_BUILD_CONTEXT,	/* Create the per-task audit_context,
+				 * and always fill it in at syscall
+				 * entry time.  This makes a full
+				 * syscall record available if some
+				 * other part of the kernel decides it
+				 * should be recorded. */
+	AUDIT_RECORD_CONTEXT	/* Create the per-task audit_context,
+				 * always fill it in at syscall entry
+				 * time, and always write out the audit
+				 * record at syscall exit time.  */
+};
+
+/* When fs/namei.c:getname() is called, we store the pointer in name and
+ * we don't let putname() free it (instead we free all of the saved
+ * pointers at syscall exit time).
+ *
+ * Further, in fs/namei.c:path_lookup() we store the inode and device. */
+struct audit_names {
+	const char	*name;
+	unsigned long	ino;
+	dev_t		dev;
+	umode_t		mode;
+	uid_t		uid;
+	gid_t		gid;
+	dev_t		rdev;
+};
+
+struct audit_aux_data {
+	struct audit_aux_data	*next;
+	int			type;
+};
+
+#define AUDIT_AUX_IPCPERM	0
+
+struct audit_aux_data_ipcctl {
+	struct audit_aux_data	d;
+	struct ipc_perm		p;
+	unsigned long		qbytes;
+	uid_t			uid;
+	gid_t			gid;
+	mode_t			mode;
+};
+
+
+/* The per-task audit context. */
+struct audit_context {
+	int		    in_syscall;	/* 1 if task is in a syscall */
+	enum audit_state    state;
+	unsigned int	    serial;     /* serial number for record */
+	struct timespec	    ctime;      /* time of syscall entry */
+	uid_t		    loginuid;   /* login uid (identity) */
+	int		    major;      /* syscall number */
+	unsigned long	    argv[4];    /* syscall arguments */
+	int		    return_valid; /* return code is valid */
+	int		    return_code;/* syscall return code */
+	int		    auditable;  /* 1 if record should be written */
+	int		    name_count;
+	struct audit_names  names[AUDIT_NAMES];
+	struct audit_context *previous; /* For nested syscalls */
+	struct audit_aux_data *aux;
+
+				/* Save things to print about task_struct */
+	pid_t		    pid;
+	uid_t		    uid, euid, suid, fsuid;
+	gid_t		    gid, egid, sgid, fsgid;
+	unsigned long	    personality;
+
+#if AUDIT_DEBUG
+	int		    put_count;
+	int		    ino_count;
+#endif
+};
+
+				/* Public API */
+/* There are three lists of rules -- one to search at task creation
+ * time, one to search at syscall entry time, and another to search at
+ * syscall exit time. */
+static LIST_HEAD(audit_tsklist);
+static LIST_HEAD(audit_entlist);
+static LIST_HEAD(audit_extlist);
+
+struct audit_entry {
+	struct list_head  list;
+	struct rcu_head   rcu;
+	struct audit_rule rule;
+};
+
+/* Check to see if two rules are identical.  It is called from
+ * audit_del_rule during AUDIT_DEL. */
+static int audit_compare_rule(struct audit_rule *a, struct audit_rule *b)
+{
+	int i;
+
+	if (a->flags != b->flags)
+		return 1;
+
+	if (a->action != b->action)
+		return 1;
+
+	if (a->field_count != b->field_count)
+		return 1;
+
+	for (i = 0; i < a->field_count; i++) {
+		if (a->fields[i] != b->fields[i]
+		    || a->values[i] != b->values[i])
+			return 1;
+	}
+
+	for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
+		if (a->mask[i] != b->mask[i])
+			return 1;
+
+	return 0;
+}
+
+/* Note that audit_add_rule and audit_del_rule are called via
+ * audit_receive() in audit.c, and are protected by
+ * audit_netlink_sem. */
+static inline int audit_add_rule(struct audit_entry *entry,
+				 struct list_head *list)
+{
+	if (entry->rule.flags & AUDIT_PREPEND) {
+		entry->rule.flags &= ~AUDIT_PREPEND;
+		list_add_rcu(&entry->list, list);
+	} else {
+		list_add_tail_rcu(&entry->list, list);
+	}
+	return 0;
+}
+
+static void audit_free_rule(struct rcu_head *head)
+{
+	struct audit_entry *e = container_of(head, struct audit_entry, rcu);
+	kfree(e);
+}
+
+/* Note that audit_add_rule and audit_del_rule are called via
+ * audit_receive() in audit.c, and are protected by
+ * audit_netlink_sem. */
+static inline int audit_del_rule(struct audit_rule *rule,
+				 struct list_head *list)
+{
+	struct audit_entry  *e;
+
+	/* Do not use the _rcu iterator here, since this is the only
+	 * deletion routine. */
+	list_for_each_entry(e, list, list) {
+		if (!audit_compare_rule(rule, &e->rule)) {
+			list_del_rcu(&e->list);
+			call_rcu(&e->rcu, audit_free_rule);
+			return 0;
+		}
+	}
+	return -EFAULT;		/* No matching rule */
+}
+
+#ifdef CONFIG_NET
+/* Copy rule from user-space to kernel-space.  Called during
+ * AUDIT_ADD. */
+static int audit_copy_rule(struct audit_rule *d, struct audit_rule *s)
+{
+	int i;
+
+	if (s->action != AUDIT_NEVER
+	    && s->action != AUDIT_POSSIBLE
+	    && s->action != AUDIT_ALWAYS)
+		return -1;
+	if (s->field_count < 0 || s->field_count > AUDIT_MAX_FIELDS)
+		return -1;
+
+	d->flags	= s->flags;
+	d->action	= s->action;
+	d->field_count	= s->field_count;
+	for (i = 0; i < d->field_count; i++) {
+		d->fields[i] = s->fields[i];
+		d->values[i] = s->values[i];
+	}
+	for (i = 0; i < AUDIT_BITMASK_SIZE; i++) d->mask[i] = s->mask[i];
+	return 0;
+}
+
+int audit_receive_filter(int type, int pid, int uid, int seq, void *data)
+{
+	u32		   flags;
+	struct audit_entry *entry;
+	int		   err = 0;
+
+	switch (type) {
+	case AUDIT_LIST:
+		/* The *_rcu iterators not needed here because we are
+		   always called with audit_netlink_sem held. */
+		list_for_each_entry(entry, &audit_tsklist, list)
+			audit_send_reply(pid, seq, AUDIT_LIST, 0, 1,
+					 &entry->rule, sizeof(entry->rule));
+		list_for_each_entry(entry, &audit_entlist, list)
+			audit_send_reply(pid, seq, AUDIT_LIST, 0, 1,
+					 &entry->rule, sizeof(entry->rule));
+		list_for_each_entry(entry, &audit_extlist, list)
+			audit_send_reply(pid, seq, AUDIT_LIST, 0, 1,
+					 &entry->rule, sizeof(entry->rule));
+		audit_send_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0);
+		break;
+	case AUDIT_ADD:
+		if (!(entry = kmalloc(sizeof(*entry), GFP_KERNEL)))
+			return -ENOMEM;
+		if (audit_copy_rule(&entry->rule, data)) {
+			kfree(entry);
+			return -EINVAL;
+		}
+		flags = entry->rule.flags;
+		if (!err && (flags & AUDIT_PER_TASK))
+			err = audit_add_rule(entry, &audit_tsklist);
+		if (!err && (flags & AUDIT_AT_ENTRY))
+			err = audit_add_rule(entry, &audit_entlist);
+		if (!err && (flags & AUDIT_AT_EXIT))
+			err = audit_add_rule(entry, &audit_extlist);
+		break;
+	case AUDIT_DEL:
+		flags =((struct audit_rule *)data)->flags;
+		if (!err && (flags & AUDIT_PER_TASK))
+			err = audit_del_rule(data, &audit_tsklist);
+		if (!err && (flags & AUDIT_AT_ENTRY))
+			err = audit_del_rule(data, &audit_entlist);
+		if (!err && (flags & AUDIT_AT_EXIT))
+			err = audit_del_rule(data, &audit_extlist);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return err;
+}
+#endif
+
+/* Compare a task_struct with an audit_rule.  Return 1 on match, 0
+ * otherwise. */
+static int audit_filter_rules(struct task_struct *tsk,
+			      struct audit_rule *rule,
+			      struct audit_context *ctx,
+			      enum audit_state *state)
+{
+	int i, j;
+
+	for (i = 0; i < rule->field_count; i++) {
+		u32 field  = rule->fields[i] & ~AUDIT_NEGATE;
+		u32 value  = rule->values[i];
+		int result = 0;
+
+		switch (field) {
+		case AUDIT_PID:
+			result = (tsk->pid == value);
+			break;
+		case AUDIT_UID:
+			result = (tsk->uid == value);
+			break;
+		case AUDIT_EUID:
+			result = (tsk->euid == value);
+			break;
+		case AUDIT_SUID:
+			result = (tsk->suid == value);
+			break;
+		case AUDIT_FSUID:
+			result = (tsk->fsuid == value);
+			break;
+		case AUDIT_GID:
+			result = (tsk->gid == value);
+			break;
+		case AUDIT_EGID:
+			result = (tsk->egid == value);
+			break;
+		case AUDIT_SGID:
+			result = (tsk->sgid == value);
+			break;
+		case AUDIT_FSGID:
+			result = (tsk->fsgid == value);
+			break;
+		case AUDIT_PERS:
+			result = (tsk->personality == value);
+			break;
+
+		case AUDIT_EXIT:
+			if (ctx && ctx->return_valid)
+				result = (ctx->return_code == value);
+			break;
+		case AUDIT_SUCCESS:
+			if (ctx && ctx->return_valid)
+				result = (ctx->return_code >= 0);
+			break;
+		case AUDIT_DEVMAJOR:
+			if (ctx) {
+				for (j = 0; j < ctx->name_count; j++) {
+					if (MAJOR(ctx->names[j].dev)==value) {
+						++result;
+						break;
+					}
+				}
+			}
+			break;
+		case AUDIT_DEVMINOR:
+			if (ctx) {
+				for (j = 0; j < ctx->name_count; j++) {
+					if (MINOR(ctx->names[j].dev)==value) {
+						++result;
+						break;
+					}
+				}
+			}
+			break;
+		case AUDIT_INODE:
+			if (ctx) {
+				for (j = 0; j < ctx->name_count; j++) {
+					if (ctx->names[j].ino == value) {
+						++result;
+						break;
+					}
+				}
+			}
+			break;
+		case AUDIT_LOGINUID:
+			result = 0;
+			if (ctx)
+				result = (ctx->loginuid == value);
+			break;
+		case AUDIT_ARG0:
+		case AUDIT_ARG1:
+		case AUDIT_ARG2:
+		case AUDIT_ARG3:
+			if (ctx)
+				result = (ctx->argv[field-AUDIT_ARG0]==value);
+			break;
+		}
+
+		if (rule->fields[i] & AUDIT_NEGATE)
+			result = !result;
+		if (!result)
+			return 0;
+	}
+	switch (rule->action) {
+	case AUDIT_NEVER:    *state = AUDIT_DISABLED;	    break;
+	case AUDIT_POSSIBLE: *state = AUDIT_BUILD_CONTEXT;  break;
+	case AUDIT_ALWAYS:   *state = AUDIT_RECORD_CONTEXT; break;
+	}
+	return 1;
+}
+
+/* At process creation time, we can determine if system-call auditing is
+ * completely disabled for this task.  Since we only have the task
+ * structure at this point, we can only check uid and gid.
+ */
+static enum audit_state audit_filter_task(struct task_struct *tsk)
+{
+	struct audit_entry *e;
+	enum audit_state   state;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(e, &audit_tsklist, list) {
+		if (audit_filter_rules(tsk, &e->rule, NULL, &state)) {
+			rcu_read_unlock();
+			return state;
+		}
+	}
+	rcu_read_unlock();
+	return AUDIT_BUILD_CONTEXT;
+}
+
+/* At syscall entry and exit time, this filter is called if the
+ * audit_state is not low enough that auditing cannot take place, but is
+ * also not high enough that we already know we have to write and audit
+ * record (i.e., the state is AUDIT_SETUP_CONTEXT or  AUDIT_BUILD_CONTEXT).
+ */
+static enum audit_state audit_filter_syscall(struct task_struct *tsk,
+					     struct audit_context *ctx,
+					     struct list_head *list)
+{
+	struct audit_entry *e;
+	enum audit_state   state;
+	int		   word = AUDIT_WORD(ctx->major);
+	int		   bit  = AUDIT_BIT(ctx->major);
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(e, list, list) {
+		if ((e->rule.mask[word] & bit) == bit
+ 		    && audit_filter_rules(tsk, &e->rule, ctx, &state)) {
+			rcu_read_unlock();
+			return state;
+		}
+	}
+	rcu_read_unlock();
+	return AUDIT_BUILD_CONTEXT;
+}
+
+/* This should be called with task_lock() held. */
+static inline struct audit_context *audit_get_context(struct task_struct *tsk,
+						      int return_valid,
+						      int return_code)
+{
+	struct audit_context *context = tsk->audit_context;
+
+	if (likely(!context))
+		return NULL;
+	context->return_valid = return_valid;
+	context->return_code  = return_code;
+
+	if (context->in_syscall && !context->auditable) {
+		enum audit_state state;
+		state = audit_filter_syscall(tsk, context, &audit_extlist);
+		if (state == AUDIT_RECORD_CONTEXT)
+			context->auditable = 1;
+	}
+
+	context->pid = tsk->pid;
+	context->uid = tsk->uid;
+	context->gid = tsk->gid;
+	context->euid = tsk->euid;
+	context->suid = tsk->suid;
+	context->fsuid = tsk->fsuid;
+	context->egid = tsk->egid;
+	context->sgid = tsk->sgid;
+	context->fsgid = tsk->fsgid;
+	context->personality = tsk->personality;
+	tsk->audit_context = NULL;
+	return context;
+}
+
+static inline void audit_free_names(struct audit_context *context)
+{
+	int i;
+
+#if AUDIT_DEBUG == 2
+	if (context->auditable
+	    ||context->put_count + context->ino_count != context->name_count) {
+		printk(KERN_ERR "audit.c:%d(:%d): major=%d in_syscall=%d"
+		       " name_count=%d put_count=%d"
+		       " ino_count=%d [NOT freeing]\n",
+		       __LINE__,
+		       context->serial, context->major, context->in_syscall,
+		       context->name_count, context->put_count,
+		       context->ino_count);
+		for (i = 0; i < context->name_count; i++)
+			printk(KERN_ERR "names[%d] = %p = %s\n", i,
+			       context->names[i].name,
+			       context->names[i].name);
+		dump_stack();
+		return;
+	}
+#endif
+#if AUDIT_DEBUG
+	context->put_count  = 0;
+	context->ino_count  = 0;
+#endif
+
+	for (i = 0; i < context->name_count; i++)
+		if (context->names[i].name)
+			__putname(context->names[i].name);
+	context->name_count = 0;
+}
+
+static inline void audit_free_aux(struct audit_context *context)
+{
+	struct audit_aux_data *aux;
+
+	while ((aux = context->aux)) {
+		context->aux = aux->next;
+		kfree(aux);
+	}
+}
+
+static inline void audit_zero_context(struct audit_context *context,
+				      enum audit_state state)
+{
+	uid_t loginuid = context->loginuid;
+
+	memset(context, 0, sizeof(*context));
+	context->state      = state;
+	context->loginuid   = loginuid;
+}
+
+static inline struct audit_context *audit_alloc_context(enum audit_state state)
+{
+	struct audit_context *context;
+
+	if (!(context = kmalloc(sizeof(*context), GFP_KERNEL)))
+		return NULL;
+	audit_zero_context(context, state);
+	return context;
+}
+
+/* Filter on the task information and allocate a per-task audit context
+ * if necessary.  Doing so turns on system call auditing for the
+ * specified task.  This is called from copy_process, so no lock is
+ * needed. */
+int audit_alloc(struct task_struct *tsk)
+{
+	struct audit_context *context;
+	enum audit_state     state;
+
+	if (likely(!audit_enabled))
+		return 0; /* Return if not auditing. */
+
+	state = audit_filter_task(tsk);
+	if (likely(state == AUDIT_DISABLED))
+		return 0;
+
+	if (!(context = audit_alloc_context(state))) {
+		audit_log_lost("out of memory in audit_alloc");
+		return -ENOMEM;
+	}
+
+				/* Preserve login uid */
+	context->loginuid = -1;
+	if (current->audit_context)
+		context->loginuid = current->audit_context->loginuid;
+
+	tsk->audit_context  = context;
+	set_tsk_thread_flag(tsk, TIF_SYSCALL_AUDIT);
+	return 0;
+}
+
+static inline void audit_free_context(struct audit_context *context)
+{
+	struct audit_context *previous;
+	int		     count = 0;
+
+	do {
+		previous = context->previous;
+		if (previous || (count &&  count < 10)) {
+			++count;
+			printk(KERN_ERR "audit(:%d): major=%d name_count=%d:"
+			       " freeing multiple contexts (%d)\n",
+			       context->serial, context->major,
+			       context->name_count, count);
+		}
+		audit_free_names(context);
+		audit_free_aux(context);
+		kfree(context);
+		context  = previous;
+	} while (context);
+	if (count >= 10)
+		printk(KERN_ERR "audit: freed %d contexts\n", count);
+}
+
+static void audit_log_exit(struct audit_context *context)
+{
+	int i;
+	struct audit_buffer *ab;
+
+	ab = audit_log_start(context);
+	if (!ab)
+		return;		/* audit_panic has been called */
+	audit_log_format(ab, "syscall=%d", context->major);
+	if (context->personality != PER_LINUX)
+		audit_log_format(ab, " per=%lx", context->personality);
+	if (context->return_valid)
+		audit_log_format(ab, " exit=%d", context->return_code);
+	audit_log_format(ab,
+		  " a0=%lx a1=%lx a2=%lx a3=%lx items=%d"
+		  " pid=%d loginuid=%d uid=%d gid=%d"
+		  " euid=%d suid=%d fsuid=%d"
+		  " egid=%d sgid=%d fsgid=%d",
+		  context->argv[0],
+		  context->argv[1],
+		  context->argv[2],
+		  context->argv[3],
+		  context->name_count,
+		  context->pid,
+		  context->loginuid,
+		  context->uid,
+		  context->gid,
+		  context->euid, context->suid, context->fsuid,
+		  context->egid, context->sgid, context->fsgid);
+	audit_log_end(ab);
+	while (context->aux) {
+		struct audit_aux_data *aux;
+
+		ab = audit_log_start(context);
+		if (!ab)
+			continue; /* audit_panic has been called */
+
+		aux = context->aux;
+		context->aux = aux->next;
+
+		audit_log_format(ab, "auxitem=%d", aux->type);
+		switch (aux->type) {
+		case AUDIT_AUX_IPCPERM: {
+			struct audit_aux_data_ipcctl *axi = (void *)aux;
+			audit_log_format(ab, 
+					 " qbytes=%lx uid=%d gid=%d mode=%x",
+					 axi->qbytes, axi->uid, axi->gid, axi->mode);
+			}
+		}
+		audit_log_end(ab);
+		kfree(aux);
+	}
+
+	for (i = 0; i < context->name_count; i++) {
+		ab = audit_log_start(context);
+		if (!ab)
+			continue; /* audit_panic has been called */
+		audit_log_format(ab, "item=%d", i);
+		if (context->names[i].name)
+			audit_log_format(ab, " name=%s",
+					 context->names[i].name);
+		if (context->names[i].ino != (unsigned long)-1)
+			audit_log_format(ab, " inode=%lu dev=%02x:%02x mode=%#o"
+					     " uid=%d gid=%d rdev=%02x:%02x",
+					 context->names[i].ino,
+					 MAJOR(context->names[i].dev),
+					 MINOR(context->names[i].dev),
+					 context->names[i].mode,
+					 context->names[i].uid,
+					 context->names[i].gid,
+					 MAJOR(context->names[i].rdev),
+					 MINOR(context->names[i].rdev));
+		audit_log_end(ab);
+	}
+}
+
+/* Free a per-task audit context.  Called from copy_process and
+ * __put_task_struct. */
+void audit_free(struct task_struct *tsk)
+{
+	struct audit_context *context;
+
+	task_lock(tsk);
+	context = audit_get_context(tsk, 0, 0);
+	task_unlock(tsk);
+
+	if (likely(!context))
+		return;
+
+	/* Check for system calls that do not go through the exit
+	 * function (e.g., exit_group), then free context block. */
+	if (context->in_syscall && context->auditable)
+		audit_log_exit(context);
+
+	audit_free_context(context);
+}
+
+/* Compute a serial number for the audit record.  Audit records are
+ * written to user-space as soon as they are generated, so a complete
+ * audit record may be written in several pieces.  The timestamp of the
+ * record and this serial number are used by the user-space daemon to
+ * determine which pieces belong to the same audit record.  The
+ * (timestamp,serial) tuple is unique for each syscall and is live from
+ * syscall entry to syscall exit.
+ *
+ * Atomic values are only guaranteed to be 24-bit, so we count down.
+ *
+ * NOTE: Another possibility is to store the formatted records off the
+ * audit context (for those records that have a context), and emit them
+ * all at syscall exit.  However, this could delay the reporting of
+ * significant errors until syscall exit (or never, if the system
+ * halts). */
+static inline unsigned int audit_serial(void)
+{
+	static atomic_t serial = ATOMIC_INIT(0xffffff);
+	unsigned int a, b;
+
+	do {
+		a = atomic_read(&serial);
+		if (atomic_dec_and_test(&serial))
+			atomic_set(&serial, 0xffffff);
+		b = atomic_read(&serial);
+	} while (b != a - 1);
+
+	return 0xffffff - b;
+}
+
+/* Fill in audit context at syscall entry.  This only happens if the
+ * audit context was created when the task was created and the state or
+ * filters demand the audit context be built.  If the state from the
+ * per-task filter or from the per-syscall filter is AUDIT_RECORD_CONTEXT,
+ * then the record will be written at syscall exit time (otherwise, it
+ * will only be written if another part of the kernel requests that it
+ * be written). */
+void audit_syscall_entry(struct task_struct *tsk, int major,
+			 unsigned long a1, unsigned long a2,
+			 unsigned long a3, unsigned long a4)
+{
+	struct audit_context *context = tsk->audit_context;
+	enum audit_state     state;
+
+	BUG_ON(!context);
+
+	/* This happens only on certain architectures that make system
+	 * calls in kernel_thread via the entry.S interface, instead of
+	 * with direct calls.  (If you are porting to a new
+	 * architecture, hitting this condition can indicate that you
+	 * got the _exit/_leave calls backward in entry.S.)
+	 *
+	 * i386     no
+	 * x86_64   no
+	 * ppc64    yes (see arch/ppc64/kernel/misc.S)
+	 *
+	 * This also happens with vm86 emulation in a non-nested manner
+	 * (entries without exits), so this case must be caught.
+	 */
+	if (context->in_syscall) {
+		struct audit_context *newctx;
+
+#if defined(__NR_vm86) && defined(__NR_vm86old)
+		/* vm86 mode should only be entered once */
+		if (major == __NR_vm86 || major == __NR_vm86old)
+			return;
+#endif
+#if AUDIT_DEBUG
+		printk(KERN_ERR
+		       "audit(:%d) pid=%d in syscall=%d;"
+		       " entering syscall=%d\n",
+		       context->serial, tsk->pid, context->major, major);
+#endif
+		newctx = audit_alloc_context(context->state);
+		if (newctx) {
+			newctx->previous   = context;
+			context		   = newctx;
+			tsk->audit_context = newctx;
+		} else	{
+			/* If we can't alloc a new context, the best we
+			 * can do is to leak memory (any pending putname
+			 * will be lost).  The only other alternative is
+			 * to abandon auditing. */
+			audit_zero_context(context, context->state);
+		}
+	}
+	BUG_ON(context->in_syscall || context->name_count);
+
+	if (!audit_enabled)
+		return;
+
+	context->major      = major;
+	context->argv[0]    = a1;
+	context->argv[1]    = a2;
+	context->argv[2]    = a3;
+	context->argv[3]    = a4;
+
+	state = context->state;
+	if (state == AUDIT_SETUP_CONTEXT || state == AUDIT_BUILD_CONTEXT)
+		state = audit_filter_syscall(tsk, context, &audit_entlist);
+	if (likely(state == AUDIT_DISABLED))
+		return;
+
+	context->serial     = audit_serial();
+	context->ctime      = CURRENT_TIME;
+	context->in_syscall = 1;
+	context->auditable  = !!(state == AUDIT_RECORD_CONTEXT);
+}
+
+/* Tear down after system call.  If the audit context has been marked as
+ * auditable (either because of the AUDIT_RECORD_CONTEXT state from
+ * filtering, or because some other part of the kernel write an audit
+ * message), then write out the syscall information.  In call cases,
+ * free the names stored from getname(). */
+void audit_syscall_exit(struct task_struct *tsk, int return_code)
+{
+	struct audit_context *context;
+
+	get_task_struct(tsk);
+	task_lock(tsk);
+	context = audit_get_context(tsk, 1, return_code);
+	task_unlock(tsk);
+
+	/* Not having a context here is ok, since the parent may have
+	 * called __put_task_struct. */
+	if (likely(!context))
+		return;
+
+	if (context->in_syscall && context->auditable)
+		audit_log_exit(context);
+
+	context->in_syscall = 0;
+	context->auditable  = 0;
+	if (context->previous) {
+		struct audit_context *new_context = context->previous;
+		context->previous  = NULL;
+		audit_free_context(context);
+		tsk->audit_context = new_context;
+	} else {
+		audit_free_names(context);
+		audit_free_aux(context);
+		audit_zero_context(context, context->state);
+		tsk->audit_context = context;
+	}
+	put_task_struct(tsk);
+}
+
+/* Add a name to the list.  Called from fs/namei.c:getname(). */
+void audit_getname(const char *name)
+{
+	struct audit_context *context = current->audit_context;
+
+	if (!context || IS_ERR(name) || !name)
+		return;
+
+	if (!context->in_syscall) {
+#if AUDIT_DEBUG == 2
+		printk(KERN_ERR "%s:%d(:%d): ignoring getname(%p)\n",
+		       __FILE__, __LINE__, context->serial, name);
+		dump_stack();
+#endif
+		return;
+	}
+	BUG_ON(context->name_count >= AUDIT_NAMES);
+	context->names[context->name_count].name = name;
+	context->names[context->name_count].ino  = (unsigned long)-1;
+	++context->name_count;
+}
+
+/* Intercept a putname request.  Called from
+ * include/linux/fs.h:putname().  If we have stored the name from
+ * getname in the audit context, then we delay the putname until syscall
+ * exit. */
+void audit_putname(const char *name)
+{
+	struct audit_context *context = current->audit_context;
+
+	BUG_ON(!context);
+	if (!context->in_syscall) {
+#if AUDIT_DEBUG == 2
+		printk(KERN_ERR "%s:%d(:%d): __putname(%p)\n",
+		       __FILE__, __LINE__, context->serial, name);
+		if (context->name_count) {
+			int i;
+			for (i = 0; i < context->name_count; i++)
+				printk(KERN_ERR "name[%d] = %p = %s\n", i,
+				       context->names[i].name,
+				       context->names[i].name);
+		}
+#endif
+		__putname(name);
+	}
+#if AUDIT_DEBUG
+	else {
+		++context->put_count;
+		if (context->put_count > context->name_count) {
+			printk(KERN_ERR "%s:%d(:%d): major=%d"
+			       " in_syscall=%d putname(%p) name_count=%d"
+			       " put_count=%d\n",
+			       __FILE__, __LINE__,
+			       context->serial, context->major,
+			       context->in_syscall, name, context->name_count,
+			       context->put_count);
+			dump_stack();
+		}
+	}
+#endif
+}
+
+/* Store the inode and device from a lookup.  Called from
+ * fs/namei.c:path_lookup(). */
+void audit_inode(const char *name, const struct inode *inode)
+{
+	int idx;
+	struct audit_context *context = current->audit_context;
+
+	if (!context->in_syscall)
+		return;
+	if (context->name_count
+	    && context->names[context->name_count-1].name
+	    && context->names[context->name_count-1].name == name)
+		idx = context->name_count - 1;
+	else if (context->name_count > 1
+		 && context->names[context->name_count-2].name
+		 && context->names[context->name_count-2].name == name)
+		idx = context->name_count - 2;
+	else {
+		/* FIXME: how much do we care about inodes that have no
+		 * associated name? */
+		if (context->name_count >= AUDIT_NAMES - AUDIT_NAMES_RESERVED)
+			return;
+		idx = context->name_count++;
+		context->names[idx].name = NULL;
+#if AUDIT_DEBUG
+		++context->ino_count;
+#endif
+	}
+	context->names[idx].ino  = inode->i_ino;
+	context->names[idx].dev	 = inode->i_sb->s_dev;
+	context->names[idx].mode = inode->i_mode;
+	context->names[idx].uid  = inode->i_uid;
+	context->names[idx].gid  = inode->i_gid;
+	context->names[idx].rdev = inode->i_rdev;
+}
+
+void audit_get_stamp(struct audit_context *ctx,
+		     struct timespec *t, int *serial)
+{
+	if (ctx) {
+		t->tv_sec  = ctx->ctime.tv_sec;
+		t->tv_nsec = ctx->ctime.tv_nsec;
+		*serial    = ctx->serial;
+		ctx->auditable = 1;
+	} else {
+		*t      = CURRENT_TIME;
+		*serial = 0;
+	}
+}
+
+extern int audit_set_type(struct audit_buffer *ab, int type);
+
+int audit_set_loginuid(struct audit_context *ctx, uid_t loginuid)
+{
+	if (ctx) {
+		struct audit_buffer *ab;
+
+		ab = audit_log_start(NULL);
+		if (ab) {
+			audit_log_format(ab, "login pid=%d uid=%u "
+				"old loginuid=%u new loginuid=%u",
+				ctx->pid, ctx->uid, ctx->loginuid, loginuid);
+			audit_set_type(ab, AUDIT_LOGIN);
+			audit_log_end(ab);
+		}
+		ctx->loginuid = loginuid;
+	}
+	return 0;
+}
+
+uid_t audit_get_loginuid(struct audit_context *ctx)
+{
+	return ctx ? ctx->loginuid : -1;
+}
+
+int audit_ipc_perms(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode)
+{
+	struct audit_aux_data_ipcctl *ax;
+	struct audit_context *context = current->audit_context;
+
+	if (likely(!context))
+		return 0;
+
+	ax = kmalloc(sizeof(*ax), GFP_KERNEL);
+	if (!ax)
+		return -ENOMEM;
+
+	ax->qbytes = qbytes;
+	ax->uid = uid;
+	ax->gid = gid;
+	ax->mode = mode;
+
+	ax->d.type = AUDIT_AUX_IPCPERM;
+	ax->d.next = context->aux;
+	context->aux = (void *)ax;
+	return 0;
+}
diff --git a/kernel/capability.c b/kernel/capability.c
new file mode 100644
index 000000000000..64db1ee820c2
--- /dev/null
+++ b/kernel/capability.c
@@ -0,0 +1,220 @@
+/*
+ * linux/kernel/capability.c
+ *
+ * Copyright (C) 1997  Andrew Main <zefram@fysh.org>
+ *
+ * Integrated into 2.1.97+,  Andrew G. Morgan <morgan@transmeta.com>
+ * 30 May 2002:	Cleanup, Robert M. Love <rml@tech9.net>
+ */ 
+
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/security.h>
+#include <linux/syscalls.h>
+#include <asm/uaccess.h>
+
+unsigned securebits = SECUREBITS_DEFAULT; /* systemwide security settings */
+kernel_cap_t cap_bset = CAP_INIT_EFF_SET;
+
+EXPORT_SYMBOL(securebits);
+EXPORT_SYMBOL(cap_bset);
+
+/*
+ * This lock protects task->cap_* for all tasks including current.
+ * Locking rule: acquire this prior to tasklist_lock.
+ */
+static DEFINE_SPINLOCK(task_capability_lock);
+
+/*
+ * For sys_getproccap() and sys_setproccap(), any of the three
+ * capability set pointers may be NULL -- indicating that that set is
+ * uninteresting and/or not to be changed.
+ */
+
+/*
+ * sys_capget - get the capabilities of a given process.
+ */
+asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr)
+{
+     int ret = 0;
+     pid_t pid;
+     __u32 version;
+     task_t *target;
+     struct __user_cap_data_struct data;
+
+     if (get_user(version, &header->version))
+	     return -EFAULT;
+
+     if (version != _LINUX_CAPABILITY_VERSION) {
+	     if (put_user(_LINUX_CAPABILITY_VERSION, &header->version))
+		     return -EFAULT; 
+             return -EINVAL;
+     }
+
+     if (get_user(pid, &header->pid))
+	     return -EFAULT;
+
+     if (pid < 0) 
+             return -EINVAL;
+
+     spin_lock(&task_capability_lock);
+     read_lock(&tasklist_lock); 
+
+     if (pid && pid != current->pid) {
+	     target = find_task_by_pid(pid);
+	     if (!target) {
+	          ret = -ESRCH;
+	          goto out;
+	     }
+     } else
+	     target = current;
+
+     ret = security_capget(target, &data.effective, &data.inheritable, &data.permitted);
+
+out:
+     read_unlock(&tasklist_lock); 
+     spin_unlock(&task_capability_lock);
+
+     if (!ret && copy_to_user(dataptr, &data, sizeof data))
+          return -EFAULT; 
+
+     return ret;
+}
+
+/*
+ * cap_set_pg - set capabilities for all processes in a given process
+ * group.  We call this holding task_capability_lock and tasklist_lock.
+ */
+static inline int cap_set_pg(int pgrp, kernel_cap_t *effective,
+			      kernel_cap_t *inheritable,
+			      kernel_cap_t *permitted)
+{
+	task_t *g, *target;
+	int ret = -EPERM;
+	int found = 0;
+
+	do_each_task_pid(pgrp, PIDTYPE_PGID, g) {
+		target = g;
+		while_each_thread(g, target) {
+			if (!security_capset_check(target, effective,
+							inheritable,
+							permitted)) {
+				security_capset_set(target, effective,
+							inheritable,
+							permitted);
+				ret = 0;
+			}
+			found = 1;
+		}
+	} while_each_task_pid(pgrp, PIDTYPE_PGID, g);
+
+	if (!found)
+	     ret = 0;
+	return ret;
+}
+
+/*
+ * cap_set_all - set capabilities for all processes other than init
+ * and self.  We call this holding task_capability_lock and tasklist_lock.
+ */
+static inline int cap_set_all(kernel_cap_t *effective,
+			       kernel_cap_t *inheritable,
+			       kernel_cap_t *permitted)
+{
+     task_t *g, *target;
+     int ret = -EPERM;
+     int found = 0;
+
+     do_each_thread(g, target) {
+             if (target == current || target->pid == 1)
+                     continue;
+             found = 1;
+	     if (security_capset_check(target, effective, inheritable,
+						permitted))
+		     continue;
+	     ret = 0;
+	     security_capset_set(target, effective, inheritable, permitted);
+     } while_each_thread(g, target);
+
+     if (!found)
+	     ret = 0;
+     return ret;
+}
+
+/*
+ * sys_capset - set capabilities for a given process, all processes, or all
+ * processes in a given process group.
+ *
+ * The restrictions on setting capabilities are specified as:
+ *
+ * [pid is for the 'target' task.  'current' is the calling task.]
+ *
+ * I: any raised capabilities must be a subset of the (old current) permitted
+ * P: any raised capabilities must be a subset of the (old current) permitted
+ * E: must be set to a subset of (new target) permitted
+ */
+asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
+{
+     kernel_cap_t inheritable, permitted, effective;
+     __u32 version;
+     task_t *target;
+     int ret;
+     pid_t pid;
+
+     if (get_user(version, &header->version))
+	     return -EFAULT; 
+
+     if (version != _LINUX_CAPABILITY_VERSION) {
+	     if (put_user(_LINUX_CAPABILITY_VERSION, &header->version))
+		     return -EFAULT; 
+             return -EINVAL;
+     }
+
+     if (get_user(pid, &header->pid))
+	     return -EFAULT; 
+
+     if (pid && pid != current->pid && !capable(CAP_SETPCAP))
+             return -EPERM;
+
+     if (copy_from_user(&effective, &data->effective, sizeof(effective)) ||
+	 copy_from_user(&inheritable, &data->inheritable, sizeof(inheritable)) ||
+	 copy_from_user(&permitted, &data->permitted, sizeof(permitted)))
+	     return -EFAULT; 
+
+     spin_lock(&task_capability_lock);
+     read_lock(&tasklist_lock);
+
+     if (pid > 0 && pid != current->pid) {
+          target = find_task_by_pid(pid);
+          if (!target) {
+               ret = -ESRCH;
+               goto out;
+          }
+     } else
+               target = current;
+
+     ret = 0;
+
+     /* having verified that the proposed changes are legal,
+           we now put them into effect. */
+     if (pid < 0) {
+             if (pid == -1)  /* all procs other than current and init */
+                     ret = cap_set_all(&effective, &inheritable, &permitted);
+
+             else            /* all procs in process group */
+                     ret = cap_set_pg(-pid, &effective, &inheritable,
+		     					&permitted);
+     } else {
+	     ret = security_capset_check(target, &effective, &inheritable,
+	     						&permitted);
+	     if (!ret)
+		     security_capset_set(target, &effective, &inheritable,
+		     					&permitted);
+     }
+
+out:
+     read_unlock(&tasklist_lock);
+     spin_unlock(&task_capability_lock);
+
+     return ret;
+}
diff --git a/kernel/compat.c b/kernel/compat.c
new file mode 100644
index 000000000000..dad10656bf14
--- /dev/null
+++ b/kernel/compat.c
@@ -0,0 +1,860 @@
+/*
+ *  linux/kernel/compat.c
+ *
+ *  Kernel compatibililty routines for e.g. 32 bit syscall support
+ *  on 64 bit kernels.
+ *
+ *  Copyright (C) 2002-2003 Stephen Rothwell, IBM Corporation
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2 as
+ *  published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <linux/compat.h>
+#include <linux/errno.h>
+#include <linux/time.h>
+#include <linux/signal.h>
+#include <linux/sched.h>	/* for MAX_SCHEDULE_TIMEOUT */
+#include <linux/futex.h>	/* for FUTEX_WAIT */
+#include <linux/syscalls.h>
+#include <linux/unistd.h>
+#include <linux/security.h>
+
+#include <asm/uaccess.h>
+#include <asm/bug.h>
+
+int get_compat_timespec(struct timespec *ts, const struct compat_timespec __user *cts)
+{
+	return (!access_ok(VERIFY_READ, cts, sizeof(*cts)) ||
+			__get_user(ts->tv_sec, &cts->tv_sec) ||
+			__get_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0;
+}
+
+int put_compat_timespec(const struct timespec *ts, struct compat_timespec __user *cts)
+{
+	return (!access_ok(VERIFY_WRITE, cts, sizeof(*cts)) ||
+			__put_user(ts->tv_sec, &cts->tv_sec) ||
+			__put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0;
+}
+
+static long compat_nanosleep_restart(struct restart_block *restart)
+{
+	unsigned long expire = restart->arg0, now = jiffies;
+	struct compat_timespec __user *rmtp;
+
+	/* Did it expire while we handled signals? */
+	if (!time_after(expire, now))
+		return 0;
+
+	current->state = TASK_INTERRUPTIBLE;
+	expire = schedule_timeout(expire - now);
+	if (expire == 0)
+		return 0;
+
+	rmtp = (struct compat_timespec __user *)restart->arg1;
+	if (rmtp) {
+		struct compat_timespec ct;
+		struct timespec t;
+
+		jiffies_to_timespec(expire, &t);
+		ct.tv_sec = t.tv_sec;
+		ct.tv_nsec = t.tv_nsec;
+		if (copy_to_user(rmtp, &ct, sizeof(ct)))
+			return -EFAULT;
+	}
+	/* The 'restart' block is already filled in */
+	return -ERESTART_RESTARTBLOCK;
+}
+
+asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp,
+		struct compat_timespec __user *rmtp)
+{
+	struct timespec t;
+	struct restart_block *restart;
+	unsigned long expire;
+
+	if (get_compat_timespec(&t, rqtp))
+		return -EFAULT;
+
+	if ((t.tv_nsec >= 1000000000L) || (t.tv_nsec < 0) || (t.tv_sec < 0))
+		return -EINVAL;
+
+	expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec);
+	current->state = TASK_INTERRUPTIBLE;
+	expire = schedule_timeout(expire);
+	if (expire == 0)
+		return 0;
+
+	if (rmtp) {
+		jiffies_to_timespec(expire, &t);
+		if (put_compat_timespec(&t, rmtp))
+			return -EFAULT;
+	}
+	restart = &current_thread_info()->restart_block;
+	restart->fn = compat_nanosleep_restart;
+	restart->arg0 = jiffies + expire;
+	restart->arg1 = (unsigned long) rmtp;
+	return -ERESTART_RESTARTBLOCK;
+}
+
+static inline long get_compat_itimerval(struct itimerval *o,
+		struct compat_itimerval __user *i)
+{
+	return (!access_ok(VERIFY_READ, i, sizeof(*i)) ||
+		(__get_user(o->it_interval.tv_sec, &i->it_interval.tv_sec) |
+		 __get_user(o->it_interval.tv_usec, &i->it_interval.tv_usec) |
+		 __get_user(o->it_value.tv_sec, &i->it_value.tv_sec) |
+		 __get_user(o->it_value.tv_usec, &i->it_value.tv_usec)));
+}
+
+static inline long put_compat_itimerval(struct compat_itimerval __user *o,
+		struct itimerval *i)
+{
+	return (!access_ok(VERIFY_WRITE, o, sizeof(*o)) ||
+		(__put_user(i->it_interval.tv_sec, &o->it_interval.tv_sec) |
+		 __put_user(i->it_interval.tv_usec, &o->it_interval.tv_usec) |
+		 __put_user(i->it_value.tv_sec, &o->it_value.tv_sec) |
+		 __put_user(i->it_value.tv_usec, &o->it_value.tv_usec)));
+}
+
+asmlinkage long compat_sys_getitimer(int which,
+		struct compat_itimerval __user *it)
+{
+	struct itimerval kit;
+	int error;
+
+	error = do_getitimer(which, &kit);
+	if (!error && put_compat_itimerval(it, &kit))
+		error = -EFAULT;
+	return error;
+}
+
+asmlinkage long compat_sys_setitimer(int which,
+		struct compat_itimerval __user *in,
+		struct compat_itimerval __user *out)
+{
+	struct itimerval kin, kout;
+	int error;
+
+	if (in) {
+		if (get_compat_itimerval(&kin, in))
+			return -EFAULT;
+	} else
+		memset(&kin, 0, sizeof(kin));
+
+	error = do_setitimer(which, &kin, out ? &kout : NULL);
+	if (error || !out)
+		return error;
+	if (put_compat_itimerval(out, &kout))
+		return -EFAULT;
+	return 0;
+}
+
+asmlinkage long compat_sys_times(struct compat_tms __user *tbuf)
+{
+	/*
+	 *	In the SMP world we might just be unlucky and have one of
+	 *	the times increment as we use it. Since the value is an
+	 *	atomically safe type this is just fine. Conceptually its
+	 *	as if the syscall took an instant longer to occur.
+	 */
+	if (tbuf) {
+		struct compat_tms tmp;
+		struct task_struct *tsk = current;
+		struct task_struct *t;
+		cputime_t utime, stime, cutime, cstime;
+
+		read_lock(&tasklist_lock);
+		utime = tsk->signal->utime;
+		stime = tsk->signal->stime;
+		t = tsk;
+		do {
+			utime = cputime_add(utime, t->utime);
+			stime = cputime_add(stime, t->stime);
+			t = next_thread(t);
+		} while (t != tsk);
+
+		/*
+		 * While we have tasklist_lock read-locked, no dying thread
+		 * can be updating current->signal->[us]time.  Instead,
+		 * we got their counts included in the live thread loop.
+		 * However, another thread can come in right now and
+		 * do a wait call that updates current->signal->c[us]time.
+		 * To make sure we always see that pair updated atomically,
+		 * we take the siglock around fetching them.
+		 */
+		spin_lock_irq(&tsk->sighand->siglock);
+		cutime = tsk->signal->cutime;
+		cstime = tsk->signal->cstime;
+		spin_unlock_irq(&tsk->sighand->siglock);
+		read_unlock(&tasklist_lock);
+
+		tmp.tms_utime = compat_jiffies_to_clock_t(cputime_to_jiffies(utime));
+		tmp.tms_stime = compat_jiffies_to_clock_t(cputime_to_jiffies(stime));
+		tmp.tms_cutime = compat_jiffies_to_clock_t(cputime_to_jiffies(cutime));
+		tmp.tms_cstime = compat_jiffies_to_clock_t(cputime_to_jiffies(cstime));
+		if (copy_to_user(tbuf, &tmp, sizeof(tmp)))
+			return -EFAULT;
+	}
+	return compat_jiffies_to_clock_t(jiffies);
+}
+
+/*
+ * Assumption: old_sigset_t and compat_old_sigset_t are both
+ * types that can be passed to put_user()/get_user().
+ */
+
+asmlinkage long compat_sys_sigpending(compat_old_sigset_t __user *set)
+{
+	old_sigset_t s;
+	long ret;
+	mm_segment_t old_fs = get_fs();
+
+	set_fs(KERNEL_DS);
+	ret = sys_sigpending((old_sigset_t __user *) &s);
+	set_fs(old_fs);
+	if (ret == 0)
+		ret = put_user(s, set);
+	return ret;
+}
+
+asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set,
+		compat_old_sigset_t __user *oset)
+{
+	old_sigset_t s;
+	long ret;
+	mm_segment_t old_fs;
+
+	if (set && get_user(s, set))
+		return -EFAULT;
+	old_fs = get_fs();
+	set_fs(KERNEL_DS);
+	ret = sys_sigprocmask(how,
+			      set ? (old_sigset_t __user *) &s : NULL,
+			      oset ? (old_sigset_t __user *) &s : NULL);
+	set_fs(old_fs);
+	if (ret == 0)
+		if (oset)
+			ret = put_user(s, oset);
+	return ret;
+}
+
+#ifdef CONFIG_FUTEX
+asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, int val,
+		struct compat_timespec __user *utime, u32 __user *uaddr2,
+		int val3)
+{
+	struct timespec t;
+	unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
+	int val2 = 0;
+
+	if ((op == FUTEX_WAIT) && utime) {
+		if (get_compat_timespec(&t, utime))
+			return -EFAULT;
+		timeout = timespec_to_jiffies(&t) + 1;
+	}
+	if (op >= FUTEX_REQUEUE)
+		val2 = (int) (unsigned long) utime;
+
+	return do_futex((unsigned long)uaddr, op, val, timeout,
+			(unsigned long)uaddr2, val2, val3);
+}
+#endif
+
+asmlinkage long compat_sys_setrlimit(unsigned int resource,
+		struct compat_rlimit __user *rlim)
+{
+	struct rlimit r;
+	int ret;
+	mm_segment_t old_fs = get_fs ();
+
+	if (resource >= RLIM_NLIMITS) 
+		return -EINVAL;	
+
+	if (!access_ok(VERIFY_READ, rlim, sizeof(*rlim)) ||
+	    __get_user(r.rlim_cur, &rlim->rlim_cur) ||
+	    __get_user(r.rlim_max, &rlim->rlim_max))
+		return -EFAULT;
+
+	if (r.rlim_cur == COMPAT_RLIM_INFINITY)
+		r.rlim_cur = RLIM_INFINITY;
+	if (r.rlim_max == COMPAT_RLIM_INFINITY)
+		r.rlim_max = RLIM_INFINITY;
+	set_fs(KERNEL_DS);
+	ret = sys_setrlimit(resource, (struct rlimit __user *) &r);
+	set_fs(old_fs);
+	return ret;
+}
+
+#ifdef COMPAT_RLIM_OLD_INFINITY
+
+asmlinkage long compat_sys_old_getrlimit(unsigned int resource,
+		struct compat_rlimit __user *rlim)
+{
+	struct rlimit r;
+	int ret;
+	mm_segment_t old_fs = get_fs();
+
+	set_fs(KERNEL_DS);
+	ret = sys_old_getrlimit(resource, &r);
+	set_fs(old_fs);
+
+	if (!ret) {
+		if (r.rlim_cur > COMPAT_RLIM_OLD_INFINITY)
+			r.rlim_cur = COMPAT_RLIM_INFINITY;
+		if (r.rlim_max > COMPAT_RLIM_OLD_INFINITY)
+			r.rlim_max = COMPAT_RLIM_INFINITY;
+
+		if (!access_ok(VERIFY_WRITE, rlim, sizeof(*rlim)) ||
+		    __put_user(r.rlim_cur, &rlim->rlim_cur) ||
+		    __put_user(r.rlim_max, &rlim->rlim_max))
+			return -EFAULT;
+	}
+	return ret;
+}
+
+#endif
+
+asmlinkage long compat_sys_getrlimit (unsigned int resource,
+		struct compat_rlimit __user *rlim)
+{
+	struct rlimit r;
+	int ret;
+	mm_segment_t old_fs = get_fs();
+
+	set_fs(KERNEL_DS);
+	ret = sys_getrlimit(resource, (struct rlimit __user *) &r);
+	set_fs(old_fs);
+	if (!ret) {
+		if (r.rlim_cur > COMPAT_RLIM_INFINITY)
+			r.rlim_cur = COMPAT_RLIM_INFINITY;
+		if (r.rlim_max > COMPAT_RLIM_INFINITY)
+			r.rlim_max = COMPAT_RLIM_INFINITY;
+
+		if (!access_ok(VERIFY_WRITE, rlim, sizeof(*rlim)) ||
+		    __put_user(r.rlim_cur, &rlim->rlim_cur) ||
+		    __put_user(r.rlim_max, &rlim->rlim_max))
+			return -EFAULT;
+	}
+	return ret;
+}
+
+int put_compat_rusage(const struct rusage *r, struct compat_rusage __user *ru)
+{
+	if (!access_ok(VERIFY_WRITE, ru, sizeof(*ru)) ||
+	    __put_user(r->ru_utime.tv_sec, &ru->ru_utime.tv_sec) ||
+	    __put_user(r->ru_utime.tv_usec, &ru->ru_utime.tv_usec) ||
+	    __put_user(r->ru_stime.tv_sec, &ru->ru_stime.tv_sec) ||
+	    __put_user(r->ru_stime.tv_usec, &ru->ru_stime.tv_usec) ||
+	    __put_user(r->ru_maxrss, &ru->ru_maxrss) ||
+	    __put_user(r->ru_ixrss, &ru->ru_ixrss) ||
+	    __put_user(r->ru_idrss, &ru->ru_idrss) ||
+	    __put_user(r->ru_isrss, &ru->ru_isrss) ||
+	    __put_user(r->ru_minflt, &ru->ru_minflt) ||
+	    __put_user(r->ru_majflt, &ru->ru_majflt) ||
+	    __put_user(r->ru_nswap, &ru->ru_nswap) ||
+	    __put_user(r->ru_inblock, &ru->ru_inblock) ||
+	    __put_user(r->ru_oublock, &ru->ru_oublock) ||
+	    __put_user(r->ru_msgsnd, &ru->ru_msgsnd) ||
+	    __put_user(r->ru_msgrcv, &ru->ru_msgrcv) ||
+	    __put_user(r->ru_nsignals, &ru->ru_nsignals) ||
+	    __put_user(r->ru_nvcsw, &ru->ru_nvcsw) ||
+	    __put_user(r->ru_nivcsw, &ru->ru_nivcsw))
+		return -EFAULT;
+	return 0;
+}
+
+asmlinkage long compat_sys_getrusage(int who, struct compat_rusage __user *ru)
+{
+	struct rusage r;
+	int ret;
+	mm_segment_t old_fs = get_fs();
+
+	set_fs(KERNEL_DS);
+	ret = sys_getrusage(who, (struct rusage __user *) &r);
+	set_fs(old_fs);
+
+	if (ret)
+		return ret;
+
+	if (put_compat_rusage(&r, ru))
+		return -EFAULT;
+
+	return 0;
+}
+
+asmlinkage long
+compat_sys_wait4(compat_pid_t pid, compat_uint_t __user *stat_addr, int options,
+	struct compat_rusage __user *ru)
+{
+	if (!ru) {
+		return sys_wait4(pid, stat_addr, options, NULL);
+	} else {
+		struct rusage r;
+		int ret;
+		unsigned int status;
+		mm_segment_t old_fs = get_fs();
+
+		set_fs (KERNEL_DS);
+		ret = sys_wait4(pid,
+				(stat_addr ?
+				 (unsigned int __user *) &status : NULL),
+				options, (struct rusage __user *) &r);
+		set_fs (old_fs);
+
+		if (ret > 0) {
+			if (put_compat_rusage(&r, ru))
+				return -EFAULT;
+			if (stat_addr && put_user(status, stat_addr))
+				return -EFAULT;
+		}
+		return ret;
+	}
+}
+
+asmlinkage long compat_sys_waitid(int which, compat_pid_t pid,
+		struct compat_siginfo __user *uinfo, int options,
+		struct compat_rusage __user *uru)
+{
+	siginfo_t info;
+	struct rusage ru;
+	long ret;
+	mm_segment_t old_fs = get_fs();
+
+	memset(&info, 0, sizeof(info));
+
+	set_fs(KERNEL_DS);
+	ret = sys_waitid(which, pid, (siginfo_t __user *)&info, options,
+			 uru ? (struct rusage __user *)&ru : NULL);
+	set_fs(old_fs);
+
+	if ((ret < 0) || (info.si_signo == 0))
+		return ret;
+
+	if (uru) {
+		ret = put_compat_rusage(&ru, uru);
+		if (ret)
+			return ret;
+	}
+
+	BUG_ON(info.si_code & __SI_MASK);
+	info.si_code |= __SI_CHLD;
+	return copy_siginfo_to_user32(uinfo, &info);
+}
+
+static int compat_get_user_cpu_mask(compat_ulong_t __user *user_mask_ptr,
+				    unsigned len, cpumask_t *new_mask)
+{
+	unsigned long *k;
+
+	if (len < sizeof(cpumask_t))
+		memset(new_mask, 0, sizeof(cpumask_t));
+	else if (len > sizeof(cpumask_t))
+		len = sizeof(cpumask_t);
+
+	k = cpus_addr(*new_mask);
+	return compat_get_bitmap(k, user_mask_ptr, len * 8);
+}
+
+asmlinkage long compat_sys_sched_setaffinity(compat_pid_t pid,
+					     unsigned int len,
+					     compat_ulong_t __user *user_mask_ptr)
+{
+	cpumask_t new_mask;
+	int retval;
+
+	retval = compat_get_user_cpu_mask(user_mask_ptr, len, &new_mask);
+	if (retval)
+		return retval;
+
+	return sched_setaffinity(pid, new_mask);
+}
+
+asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len,
+					     compat_ulong_t __user *user_mask_ptr)
+{
+	int ret;
+	cpumask_t mask;
+	unsigned long *k;
+	unsigned int min_length = sizeof(cpumask_t);
+
+	if (NR_CPUS <= BITS_PER_COMPAT_LONG)
+		min_length = sizeof(compat_ulong_t);
+
+	if (len < min_length)
+		return -EINVAL;
+
+	ret = sched_getaffinity(pid, &mask);
+	if (ret < 0)
+		return ret;
+
+	k = cpus_addr(mask);
+	ret = compat_put_bitmap(user_mask_ptr, k, min_length * 8);
+	if (ret)
+		return ret;
+
+	return min_length;
+}
+
+static int get_compat_itimerspec(struct itimerspec *dst, 
+				 struct compat_itimerspec __user *src)
+{ 
+	if (get_compat_timespec(&dst->it_interval, &src->it_interval) ||
+	    get_compat_timespec(&dst->it_value, &src->it_value))
+		return -EFAULT;
+	return 0;
+} 
+
+static int put_compat_itimerspec(struct compat_itimerspec __user *dst, 
+				 struct itimerspec *src)
+{ 
+	if (put_compat_timespec(&src->it_interval, &dst->it_interval) ||
+	    put_compat_timespec(&src->it_value, &dst->it_value))
+		return -EFAULT;
+	return 0;
+} 
+
+long compat_sys_timer_settime(timer_t timer_id, int flags,
+			  struct compat_itimerspec __user *new, 
+			  struct compat_itimerspec __user *old)
+{ 
+	long err;
+	mm_segment_t oldfs;
+	struct itimerspec newts, oldts;
+
+	if (!new)
+		return -EINVAL;
+	if (get_compat_itimerspec(&newts, new))
+		return -EFAULT;	
+	oldfs = get_fs();
+	set_fs(KERNEL_DS);
+	err = sys_timer_settime(timer_id, flags,
+				(struct itimerspec __user *) &newts,
+				(struct itimerspec __user *) &oldts);
+	set_fs(oldfs); 
+	if (!err && old && put_compat_itimerspec(old, &oldts))
+		return -EFAULT;
+	return err;
+} 
+
+long compat_sys_timer_gettime(timer_t timer_id,
+		struct compat_itimerspec __user *setting)
+{ 
+	long err;
+	mm_segment_t oldfs;
+	struct itimerspec ts; 
+
+	oldfs = get_fs();
+	set_fs(KERNEL_DS);
+	err = sys_timer_gettime(timer_id,
+				(struct itimerspec __user *) &ts); 
+	set_fs(oldfs); 
+	if (!err && put_compat_itimerspec(setting, &ts))
+		return -EFAULT;
+	return err;
+} 
+
+long compat_sys_clock_settime(clockid_t which_clock,
+		struct compat_timespec __user *tp)
+{
+	long err;
+	mm_segment_t oldfs;
+	struct timespec ts; 
+
+	if (get_compat_timespec(&ts, tp))
+		return -EFAULT; 
+	oldfs = get_fs();
+	set_fs(KERNEL_DS);	
+	err = sys_clock_settime(which_clock,
+				(struct timespec __user *) &ts);
+	set_fs(oldfs);
+	return err;
+} 
+
+long compat_sys_clock_gettime(clockid_t which_clock,
+		struct compat_timespec __user *tp)
+{
+	long err;
+	mm_segment_t oldfs;
+	struct timespec ts; 
+
+	oldfs = get_fs();
+	set_fs(KERNEL_DS);
+	err = sys_clock_gettime(which_clock,
+				(struct timespec __user *) &ts);
+	set_fs(oldfs);
+	if (!err && put_compat_timespec(&ts, tp))
+		return -EFAULT; 
+	return err;
+} 
+
+long compat_sys_clock_getres(clockid_t which_clock,
+		struct compat_timespec __user *tp)
+{
+	long err;
+	mm_segment_t oldfs;
+	struct timespec ts; 
+
+	oldfs = get_fs();
+	set_fs(KERNEL_DS);
+	err = sys_clock_getres(which_clock,
+			       (struct timespec __user *) &ts);
+	set_fs(oldfs);
+	if (!err && tp && put_compat_timespec(&ts, tp))
+		return -EFAULT; 
+	return err;
+} 
+
+long compat_sys_clock_nanosleep(clockid_t which_clock, int flags,
+			    struct compat_timespec __user *rqtp,
+			    struct compat_timespec __user *rmtp)
+{
+	long err;
+	mm_segment_t oldfs;
+	struct timespec in, out; 
+
+	if (get_compat_timespec(&in, rqtp)) 
+		return -EFAULT;
+
+	oldfs = get_fs();
+	set_fs(KERNEL_DS);
+	err = sys_clock_nanosleep(which_clock, flags,
+				  (struct timespec __user *) &in,
+				  (struct timespec __user *) &out);
+	set_fs(oldfs);
+	if ((err == -ERESTART_RESTARTBLOCK) && rmtp &&
+	    put_compat_timespec(&out, rmtp))
+		return -EFAULT;
+	return err;	
+} 
+
+/*
+ * We currently only need the following fields from the sigevent
+ * structure: sigev_value, sigev_signo, sig_notify and (sometimes
+ * sigev_notify_thread_id).  The others are handled in user mode.
+ * We also assume that copying sigev_value.sival_int is sufficient
+ * to keep all the bits of sigev_value.sival_ptr intact.
+ */
+int get_compat_sigevent(struct sigevent *event,
+		const struct compat_sigevent __user *u_event)
+{
+	memset(&event, 0, sizeof(*event));
+	return (!access_ok(VERIFY_READ, u_event, sizeof(*u_event)) ||
+		__get_user(event->sigev_value.sival_int,
+			&u_event->sigev_value.sival_int) ||
+		__get_user(event->sigev_signo, &u_event->sigev_signo) ||
+		__get_user(event->sigev_notify, &u_event->sigev_notify) ||
+		__get_user(event->sigev_notify_thread_id,
+			&u_event->sigev_notify_thread_id))
+		? -EFAULT : 0;
+}
+
+/* timer_create is architecture specific because it needs sigevent conversion */
+
+long compat_get_bitmap(unsigned long *mask, compat_ulong_t __user *umask,
+		       unsigned long bitmap_size)
+{
+	int i, j;
+	unsigned long m;
+	compat_ulong_t um;
+	unsigned long nr_compat_longs;
+
+	/* align bitmap up to nearest compat_long_t boundary */
+	bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG);
+
+	if (!access_ok(VERIFY_READ, umask, bitmap_size / 8))
+		return -EFAULT;
+
+	nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size);
+
+	for (i = 0; i < BITS_TO_LONGS(bitmap_size); i++) {
+		m = 0;
+
+		for (j = 0; j < sizeof(m)/sizeof(um); j++) {
+			/*
+			 * We dont want to read past the end of the userspace
+			 * bitmap. We must however ensure the end of the
+			 * kernel bitmap is zeroed.
+			 */
+			if (nr_compat_longs-- > 0) {
+				if (__get_user(um, umask))
+					return -EFAULT;
+			} else {
+				um = 0;
+			}
+
+			umask++;
+			m |= (long)um << (j * BITS_PER_COMPAT_LONG);
+		}
+		*mask++ = m;
+	}
+
+	return 0;
+}
+
+long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask,
+		       unsigned long bitmap_size)
+{
+	int i, j;
+	unsigned long m;
+	compat_ulong_t um;
+	unsigned long nr_compat_longs;
+
+	/* align bitmap up to nearest compat_long_t boundary */
+	bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG);
+
+	if (!access_ok(VERIFY_WRITE, umask, bitmap_size / 8))
+		return -EFAULT;
+
+	nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size);
+
+	for (i = 0; i < BITS_TO_LONGS(bitmap_size); i++) {
+		m = *mask++;
+
+		for (j = 0; j < sizeof(m)/sizeof(um); j++) {
+			um = m;
+
+			/*
+			 * We dont want to write past the end of the userspace
+			 * bitmap.
+			 */
+			if (nr_compat_longs-- > 0) {
+				if (__put_user(um, umask))
+					return -EFAULT;
+			}
+
+			umask++;
+			m >>= 4*sizeof(um);
+			m >>= 4*sizeof(um);
+		}
+	}
+
+	return 0;
+}
+
+void
+sigset_from_compat (sigset_t *set, compat_sigset_t *compat)
+{
+	switch (_NSIG_WORDS) {
+#if defined (__COMPAT_ENDIAN_SWAP__)
+	case 4: set->sig[3] = compat->sig[7] | (((long)compat->sig[6]) << 32 );
+	case 3: set->sig[2] = compat->sig[5] | (((long)compat->sig[4]) << 32 );
+	case 2: set->sig[1] = compat->sig[3] | (((long)compat->sig[2]) << 32 );
+	case 1: set->sig[0] = compat->sig[1] | (((long)compat->sig[0]) << 32 );
+#else
+	case 4: set->sig[3] = compat->sig[6] | (((long)compat->sig[7]) << 32 );
+	case 3: set->sig[2] = compat->sig[4] | (((long)compat->sig[5]) << 32 );
+	case 2: set->sig[1] = compat->sig[2] | (((long)compat->sig[3]) << 32 );
+	case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 );
+#endif
+	}
+}
+
+asmlinkage long
+compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
+		struct compat_siginfo __user *uinfo,
+		struct compat_timespec __user *uts, compat_size_t sigsetsize)
+{
+	compat_sigset_t s32;
+	sigset_t s;
+	int sig;
+	struct timespec t;
+	siginfo_t info;
+	long ret, timeout = 0;
+
+	if (sigsetsize != sizeof(sigset_t))
+		return -EINVAL;
+
+	if (copy_from_user(&s32, uthese, sizeof(compat_sigset_t)))
+		return -EFAULT;
+	sigset_from_compat(&s, &s32);
+	sigdelsetmask(&s,sigmask(SIGKILL)|sigmask(SIGSTOP));
+	signotset(&s);
+
+	if (uts) {
+		if (get_compat_timespec (&t, uts))
+			return -EFAULT;
+		if (t.tv_nsec >= 1000000000L || t.tv_nsec < 0
+				|| t.tv_sec < 0)
+			return -EINVAL;
+	}
+
+	spin_lock_irq(&current->sighand->siglock);
+	sig = dequeue_signal(current, &s, &info);
+	if (!sig) {
+		timeout = MAX_SCHEDULE_TIMEOUT;
+		if (uts)
+			timeout = timespec_to_jiffies(&t)
+				+(t.tv_sec || t.tv_nsec);
+		if (timeout) {
+			current->real_blocked = current->blocked;
+			sigandsets(&current->blocked, &current->blocked, &s);
+
+			recalc_sigpending();
+			spin_unlock_irq(&current->sighand->siglock);
+
+			current->state = TASK_INTERRUPTIBLE;
+			timeout = schedule_timeout(timeout);
+
+			spin_lock_irq(&current->sighand->siglock);
+			sig = dequeue_signal(current, &s, &info);
+			current->blocked = current->real_blocked;
+			siginitset(&current->real_blocked, 0);
+			recalc_sigpending();
+		}
+	}
+	spin_unlock_irq(&current->sighand->siglock);
+
+	if (sig) {
+		ret = sig;
+		if (uinfo) {
+			if (copy_siginfo_to_user32(uinfo, &info))
+				ret = -EFAULT;
+		}
+	}else {
+		ret = timeout?-EINTR:-EAGAIN;
+	}
+	return ret;
+
+}
+
+#ifdef __ARCH_WANT_COMPAT_SYS_TIME
+
+/* compat_time_t is a 32 bit "long" and needs to get converted. */
+
+asmlinkage long compat_sys_time(compat_time_t __user * tloc)
+{
+	compat_time_t i;
+	struct timeval tv;
+
+	do_gettimeofday(&tv);
+	i = tv.tv_sec;
+
+	if (tloc) {
+		if (put_user(i,tloc))
+			i = -EFAULT;
+	}
+	return i;
+}
+
+asmlinkage long compat_sys_stime(compat_time_t __user *tptr)
+{
+	struct timespec tv;
+	int err;
+
+	if (get_user(tv.tv_sec, tptr))
+		return -EFAULT;
+
+	tv.tv_nsec = 0;
+
+	err = security_settime(&tv, NULL);
+	if (err)
+		return err;
+
+	do_settimeofday(&tv);
+	return 0;
+}
+
+#endif /* __ARCH_WANT_COMPAT_SYS_TIME */
diff --git a/kernel/configs.c b/kernel/configs.c
new file mode 100644
index 000000000000..986f7af31e0a
--- /dev/null
+++ b/kernel/configs.c
@@ -0,0 +1,118 @@
+/*
+ * kernel/configs.c
+ * Echo the kernel .config file used to build the kernel
+ *
+ * Copyright (C) 2002 Khalid Aziz <khalid_aziz@hp.com>
+ * Copyright (C) 2002 Randy Dunlap <rddunlap@osdl.org>
+ * Copyright (C) 2002 Al Stone <ahs3@fc.hp.com>
+ * Copyright (C) 2002 Hewlett-Packard Company
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT.  See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/init.h>
+#include <asm/uaccess.h>
+
+/**************************************************/
+/* the actual current config file                 */
+
+/*
+ * Define kernel_config_data and kernel_config_data_size, which contains the
+ * wrapped and compressed configuration file.  The file is first compressed
+ * with gzip and then bounded by two eight byte magic numbers to allow
+ * extraction from a binary kernel image:
+ *
+ *   IKCFG_ST
+ *   <image>
+ *   IKCFG_ED
+ */
+#define MAGIC_START	"IKCFG_ST"
+#define MAGIC_END	"IKCFG_ED"
+#include "config_data.h"
+
+
+#define MAGIC_SIZE (sizeof(MAGIC_START) - 1)
+#define kernel_config_data_size \
+	(sizeof(kernel_config_data) - 1 - MAGIC_SIZE * 2)
+
+#ifdef CONFIG_IKCONFIG_PROC
+
+/**************************************************/
+/* globals and useful constants                   */
+
+static ssize_t
+ikconfig_read_current(struct file *file, char __user *buf,
+		      size_t len, loff_t * offset)
+{
+	loff_t pos = *offset;
+	ssize_t count;
+
+	if (pos >= kernel_config_data_size)
+		return 0;
+
+	count = min(len, (size_t)(kernel_config_data_size - pos));
+	if (copy_to_user(buf, kernel_config_data + MAGIC_SIZE + pos, count))
+		return -EFAULT;
+
+	*offset += count;
+	return count;
+}
+
+static struct file_operations ikconfig_file_ops = {
+	.owner = THIS_MODULE,
+	.read = ikconfig_read_current,
+};
+
+/***************************************************/
+/* ikconfig_init: start up everything we need to */
+
+static int __init ikconfig_init(void)
+{
+	struct proc_dir_entry *entry;
+
+	/* create the current config file */
+	entry = create_proc_entry("config.gz", S_IFREG | S_IRUGO,
+				  &proc_root);
+	if (!entry)
+		return -ENOMEM;
+
+	entry->proc_fops = &ikconfig_file_ops;
+	entry->size = kernel_config_data_size;
+
+	return 0;
+}
+
+/***************************************************/
+/* ikconfig_cleanup: clean up our mess           */
+
+static void __exit ikconfig_cleanup(void)
+{
+	remove_proc_entry("config.gz", &proc_root);
+}
+
+module_init(ikconfig_init);
+module_exit(ikconfig_cleanup);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Randy Dunlap");
+MODULE_DESCRIPTION("Echo the kernel .config file used to build the kernel");
+
+#endif /* CONFIG_IKCONFIG_PROC */
diff --git a/kernel/cpu.c b/kernel/cpu.c
new file mode 100644
index 000000000000..628f4ccda127
--- /dev/null
+++ b/kernel/cpu.c
@@ -0,0 +1,193 @@
+/* CPU control.
+ * (C) 2001, 2002, 2003, 2004 Rusty Russell
+ *
+ * This code is licenced under the GPL.
+ */
+#include <linux/proc_fs.h>
+#include <linux/smp.h>
+#include <linux/init.h>
+#include <linux/notifier.h>
+#include <linux/sched.h>
+#include <linux/unistd.h>
+#include <linux/cpu.h>
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include <linux/stop_machine.h>
+#include <asm/semaphore.h>
+
+/* This protects CPUs going up and down... */
+DECLARE_MUTEX(cpucontrol);
+
+static struct notifier_block *cpu_chain;
+
+/* Need to know about CPUs going up/down? */
+int register_cpu_notifier(struct notifier_block *nb)
+{
+	int ret;
+
+	if ((ret = down_interruptible(&cpucontrol)) != 0)
+		return ret;
+	ret = notifier_chain_register(&cpu_chain, nb);
+	up(&cpucontrol);
+	return ret;
+}
+EXPORT_SYMBOL(register_cpu_notifier);
+
+void unregister_cpu_notifier(struct notifier_block *nb)
+{
+	down(&cpucontrol);
+	notifier_chain_unregister(&cpu_chain, nb);
+	up(&cpucontrol);
+}
+EXPORT_SYMBOL(unregister_cpu_notifier);
+
+#ifdef CONFIG_HOTPLUG_CPU
+static inline void check_for_tasks(int cpu)
+{
+	struct task_struct *p;
+
+	write_lock_irq(&tasklist_lock);
+	for_each_process(p) {
+		if (task_cpu(p) == cpu &&
+		    (!cputime_eq(p->utime, cputime_zero) ||
+		     !cputime_eq(p->stime, cputime_zero)))
+			printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d\
+				(state = %ld, flags = %lx) \n",
+				 p->comm, p->pid, cpu, p->state, p->flags);
+	}
+	write_unlock_irq(&tasklist_lock);
+}
+
+/* Take this CPU down. */
+static int take_cpu_down(void *unused)
+{
+	int err;
+
+	/* Take offline: makes arch_cpu_down somewhat easier. */
+	cpu_clear(smp_processor_id(), cpu_online_map);
+
+	/* Ensure this CPU doesn't handle any more interrupts. */
+	err = __cpu_disable();
+	if (err < 0)
+		cpu_set(smp_processor_id(), cpu_online_map);
+	else
+		/* Force idle task to run as soon as we yield: it should
+		   immediately notice cpu is offline and die quickly. */
+		sched_idle_next();
+
+	return err;
+}
+
+int cpu_down(unsigned int cpu)
+{
+	int err;
+	struct task_struct *p;
+	cpumask_t old_allowed, tmp;
+
+	if ((err = lock_cpu_hotplug_interruptible()) != 0)
+		return err;
+
+	if (num_online_cpus() == 1) {
+		err = -EBUSY;
+		goto out;
+	}
+
+	if (!cpu_online(cpu)) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	err = notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE,
+						(void *)(long)cpu);
+	if (err == NOTIFY_BAD) {
+		printk("%s: attempt to take down CPU %u failed\n",
+				__FUNCTION__, cpu);
+		err = -EINVAL;
+		goto out;
+	}
+
+	/* Ensure that we are not runnable on dying cpu */
+	old_allowed = current->cpus_allowed;
+	tmp = CPU_MASK_ALL;
+	cpu_clear(cpu, tmp);
+	set_cpus_allowed(current, tmp);
+
+	p = __stop_machine_run(take_cpu_down, NULL, cpu);
+	if (IS_ERR(p)) {
+		/* CPU didn't die: tell everyone.  Can't complain. */
+		if (notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED,
+				(void *)(long)cpu) == NOTIFY_BAD)
+			BUG();
+
+		err = PTR_ERR(p);
+		goto out_allowed;
+	}
+
+	if (cpu_online(cpu))
+		goto out_thread;
+
+	/* Wait for it to sleep (leaving idle task). */
+	while (!idle_cpu(cpu))
+		yield();
+
+	/* This actually kills the CPU. */
+	__cpu_die(cpu);
+
+	/* Move it here so it can run. */
+	kthread_bind(p, get_cpu());
+	put_cpu();
+
+	/* CPU is completely dead: tell everyone.  Too late to complain. */
+	if (notifier_call_chain(&cpu_chain, CPU_DEAD, (void *)(long)cpu)
+	    == NOTIFY_BAD)
+		BUG();
+
+	check_for_tasks(cpu);
+
+out_thread:
+	err = kthread_stop(p);
+out_allowed:
+	set_cpus_allowed(current, old_allowed);
+out:
+	unlock_cpu_hotplug();
+	return err;
+}
+#endif /*CONFIG_HOTPLUG_CPU*/
+
+int __devinit cpu_up(unsigned int cpu)
+{
+	int ret;
+	void *hcpu = (void *)(long)cpu;
+
+	if ((ret = down_interruptible(&cpucontrol)) != 0)
+		return ret;
+
+	if (cpu_online(cpu) || !cpu_present(cpu)) {
+		ret = -EINVAL;
+		goto out;
+	}
+	ret = notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu);
+	if (ret == NOTIFY_BAD) {
+		printk("%s: attempt to bring up CPU %u failed\n",
+				__FUNCTION__, cpu);
+		ret = -EINVAL;
+		goto out_notify;
+	}
+
+	/* Arch-specific enabling code. */
+	ret = __cpu_up(cpu);
+	if (ret != 0)
+		goto out_notify;
+	if (!cpu_online(cpu))
+		BUG();
+
+	/* Now call notifier in preparation. */
+	notifier_call_chain(&cpu_chain, CPU_ONLINE, hcpu);
+
+out_notify:
+	if (ret != 0)
+		notifier_call_chain(&cpu_chain, CPU_UP_CANCELED, hcpu);
+out:
+	up(&cpucontrol);
+	return ret;
+}
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
new file mode 100644
index 000000000000..69792bbe2281
--- /dev/null
+++ b/kernel/cpuset.c
@@ -0,0 +1,1564 @@
+/*
+ *  kernel/cpuset.c
+ *
+ *  Processor and Memory placement constraints for sets of tasks.
+ *
+ *  Copyright (C) 2003 BULL SA.
+ *  Copyright (C) 2004 Silicon Graphics, Inc.
+ *
+ *  Portions derived from Patrick Mochel's sysfs code.
+ *  sysfs is Copyright (c) 2001-3 Patrick Mochel
+ *  Portions Copyright (c) 2004 Silicon Graphics, Inc.
+ *
+ *  2003-10-10 Written by Simon Derr <simon.derr@bull.net>
+ *  2003-10-22 Updates by Stephen Hemminger.
+ *  2004 May-July Rework by Paul Jackson <pj@sgi.com>
+ *
+ *  This file is subject to the terms and conditions of the GNU General Public
+ *  License.  See the file COPYING in the main directory of the Linux
+ *  distribution for more details.
+ */
+
+#include <linux/config.h>
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/cpuset.h>
+#include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/kmod.h>
+#include <linux/list.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/pagemap.h>
+#include <linux/proc_fs.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/smp_lock.h>
+#include <linux/spinlock.h>
+#include <linux/stat.h>
+#include <linux/string.h>
+#include <linux/time.h>
+#include <linux/backing-dev.h>
+#include <linux/sort.h>
+
+#include <asm/uaccess.h>
+#include <asm/atomic.h>
+#include <asm/semaphore.h>
+
+#define CPUSET_SUPER_MAGIC 		0x27e0eb
+
+struct cpuset {
+	unsigned long flags;		/* "unsigned long" so bitops work */
+	cpumask_t cpus_allowed;		/* CPUs allowed to tasks in cpuset */
+	nodemask_t mems_allowed;	/* Memory Nodes allowed to tasks */
+
+	atomic_t count;			/* count tasks using this cpuset */
+
+	/*
+	 * We link our 'sibling' struct into our parents 'children'.
+	 * Our children link their 'sibling' into our 'children'.
+	 */
+	struct list_head sibling;	/* my parents children */
+	struct list_head children;	/* my children */
+
+	struct cpuset *parent;		/* my parent */
+	struct dentry *dentry;		/* cpuset fs entry */
+
+	/*
+	 * Copy of global cpuset_mems_generation as of the most
+	 * recent time this cpuset changed its mems_allowed.
+	 */
+	 int mems_generation;
+};
+
+/* bits in struct cpuset flags field */
+typedef enum {
+	CS_CPU_EXCLUSIVE,
+	CS_MEM_EXCLUSIVE,
+	CS_REMOVED,
+	CS_NOTIFY_ON_RELEASE
+} cpuset_flagbits_t;
+
+/* convenient tests for these bits */
+static inline int is_cpu_exclusive(const struct cpuset *cs)
+{
+	return !!test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
+}
+
+static inline int is_mem_exclusive(const struct cpuset *cs)
+{
+	return !!test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
+}
+
+static inline int is_removed(const struct cpuset *cs)
+{
+	return !!test_bit(CS_REMOVED, &cs->flags);
+}
+
+static inline int notify_on_release(const struct cpuset *cs)
+{
+	return !!test_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
+}
+
+/*
+ * Increment this atomic integer everytime any cpuset changes its
+ * mems_allowed value.  Users of cpusets can track this generation
+ * number, and avoid having to lock and reload mems_allowed unless
+ * the cpuset they're using changes generation.
+ *
+ * A single, global generation is needed because attach_task() could
+ * reattach a task to a different cpuset, which must not have its
+ * generation numbers aliased with those of that tasks previous cpuset.
+ *
+ * Generations are needed for mems_allowed because one task cannot
+ * modify anothers memory placement.  So we must enable every task,
+ * on every visit to __alloc_pages(), to efficiently check whether
+ * its current->cpuset->mems_allowed has changed, requiring an update
+ * of its current->mems_allowed.
+ */
+static atomic_t cpuset_mems_generation = ATOMIC_INIT(1);
+
+static struct cpuset top_cpuset = {
+	.flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
+	.cpus_allowed = CPU_MASK_ALL,
+	.mems_allowed = NODE_MASK_ALL,
+	.count = ATOMIC_INIT(0),
+	.sibling = LIST_HEAD_INIT(top_cpuset.sibling),
+	.children = LIST_HEAD_INIT(top_cpuset.children),
+	.parent = NULL,
+	.dentry = NULL,
+	.mems_generation = 0,
+};
+
+static struct vfsmount *cpuset_mount;
+static struct super_block *cpuset_sb = NULL;
+
+/*
+ * cpuset_sem should be held by anyone who is depending on the children
+ * or sibling lists of any cpuset, or performing non-atomic operations
+ * on the flags or *_allowed values of a cpuset, such as raising the
+ * CS_REMOVED flag bit iff it is not already raised, or reading and
+ * conditionally modifying the *_allowed values.  One kernel global
+ * cpuset semaphore should be sufficient - these things don't change
+ * that much.
+ *
+ * The code that modifies cpusets holds cpuset_sem across the entire
+ * operation, from cpuset_common_file_write() down, single threading
+ * all cpuset modifications (except for counter manipulations from
+ * fork and exit) across the system.  This presumes that cpuset
+ * modifications are rare - better kept simple and safe, even if slow.
+ *
+ * The code that reads cpusets, such as in cpuset_common_file_read()
+ * and below, only holds cpuset_sem across small pieces of code, such
+ * as when reading out possibly multi-word cpumasks and nodemasks, as
+ * the risks are less, and the desire for performance a little greater.
+ * The proc_cpuset_show() routine needs to hold cpuset_sem to insure
+ * that no cs->dentry is NULL, as it walks up the cpuset tree to root.
+ *
+ * The hooks from fork and exit, cpuset_fork() and cpuset_exit(), don't
+ * (usually) grab cpuset_sem.  These are the two most performance
+ * critical pieces of code here.  The exception occurs on exit(),
+ * if the last task using a cpuset exits, and the cpuset was marked
+ * notify_on_release.  In that case, the cpuset_sem is taken, the
+ * path to the released cpuset calculated, and a usermode call made
+ * to /sbin/cpuset_release_agent with the name of the cpuset (path
+ * relative to the root of cpuset file system) as the argument.
+ *
+ * A cpuset can only be deleted if both its 'count' of using tasks is
+ * zero, and its list of 'children' cpusets is empty.  Since all tasks
+ * in the system use _some_ cpuset, and since there is always at least
+ * one task in the system (init, pid == 1), therefore, top_cpuset
+ * always has either children cpusets and/or using tasks.  So no need
+ * for any special hack to ensure that top_cpuset cannot be deleted.
+ */
+
+static DECLARE_MUTEX(cpuset_sem);
+
+/*
+ * A couple of forward declarations required, due to cyclic reference loop:
+ *  cpuset_mkdir -> cpuset_create -> cpuset_populate_dir -> cpuset_add_file
+ *  -> cpuset_create_file -> cpuset_dir_inode_operations -> cpuset_mkdir.
+ */
+
+static int cpuset_mkdir(struct inode *dir, struct dentry *dentry, int mode);
+static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry);
+
+static struct backing_dev_info cpuset_backing_dev_info = {
+	.ra_pages = 0,		/* No readahead */
+	.capabilities	= BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
+};
+
+static struct inode *cpuset_new_inode(mode_t mode)
+{
+	struct inode *inode = new_inode(cpuset_sb);
+
+	if (inode) {
+		inode->i_mode = mode;
+		inode->i_uid = current->fsuid;
+		inode->i_gid = current->fsgid;
+		inode->i_blksize = PAGE_CACHE_SIZE;
+		inode->i_blocks = 0;
+		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+		inode->i_mapping->backing_dev_info = &cpuset_backing_dev_info;
+	}
+	return inode;
+}
+
+static void cpuset_diput(struct dentry *dentry, struct inode *inode)
+{
+	/* is dentry a directory ? if so, kfree() associated cpuset */
+	if (S_ISDIR(inode->i_mode)) {
+		struct cpuset *cs = dentry->d_fsdata;
+		BUG_ON(!(is_removed(cs)));
+		kfree(cs);
+	}
+	iput(inode);
+}
+
+static struct dentry_operations cpuset_dops = {
+	.d_iput = cpuset_diput,
+};
+
+static struct dentry *cpuset_get_dentry(struct dentry *parent, const char *name)
+{
+	struct qstr qstr;
+	struct dentry *d;
+
+	qstr.name = name;
+	qstr.len = strlen(name);
+	qstr.hash = full_name_hash(name, qstr.len);
+	d = lookup_hash(&qstr, parent);
+	if (!IS_ERR(d))
+		d->d_op = &cpuset_dops;
+	return d;
+}
+
+static void remove_dir(struct dentry *d)
+{
+	struct dentry *parent = dget(d->d_parent);
+
+	d_delete(d);
+	simple_rmdir(parent->d_inode, d);
+	dput(parent);
+}
+
+/*
+ * NOTE : the dentry must have been dget()'ed
+ */
+static void cpuset_d_remove_dir(struct dentry *dentry)
+{
+	struct list_head *node;
+
+	spin_lock(&dcache_lock);
+	node = dentry->d_subdirs.next;
+	while (node != &dentry->d_subdirs) {
+		struct dentry *d = list_entry(node, struct dentry, d_child);
+		list_del_init(node);
+		if (d->d_inode) {
+			d = dget_locked(d);
+			spin_unlock(&dcache_lock);
+			d_delete(d);
+			simple_unlink(dentry->d_inode, d);
+			dput(d);
+			spin_lock(&dcache_lock);
+		}
+		node = dentry->d_subdirs.next;
+	}
+	list_del_init(&dentry->d_child);
+	spin_unlock(&dcache_lock);
+	remove_dir(dentry);
+}
+
+static struct super_operations cpuset_ops = {
+	.statfs = simple_statfs,
+	.drop_inode = generic_delete_inode,
+};
+
+static int cpuset_fill_super(struct super_block *sb, void *unused_data,
+							int unused_silent)
+{
+	struct inode *inode;
+	struct dentry *root;
+
+	sb->s_blocksize = PAGE_CACHE_SIZE;
+	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+	sb->s_magic = CPUSET_SUPER_MAGIC;
+	sb->s_op = &cpuset_ops;
+	cpuset_sb = sb;
+
+	inode = cpuset_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR);
+	if (inode) {
+		inode->i_op = &simple_dir_inode_operations;
+		inode->i_fop = &simple_dir_operations;
+		/* directories start off with i_nlink == 2 (for "." entry) */
+		inode->i_nlink++;
+	} else {
+		return -ENOMEM;
+	}
+
+	root = d_alloc_root(inode);
+	if (!root) {
+		iput(inode);
+		return -ENOMEM;
+	}
+	sb->s_root = root;
+	return 0;
+}
+
+static struct super_block *cpuset_get_sb(struct file_system_type *fs_type,
+					int flags, const char *unused_dev_name,
+					void *data)
+{
+	return get_sb_single(fs_type, flags, data, cpuset_fill_super);
+}
+
+static struct file_system_type cpuset_fs_type = {
+	.name = "cpuset",
+	.get_sb = cpuset_get_sb,
+	.kill_sb = kill_litter_super,
+};
+
+/* struct cftype:
+ *
+ * The files in the cpuset filesystem mostly have a very simple read/write
+ * handling, some common function will take care of it. Nevertheless some cases
+ * (read tasks) are special and therefore I define this structure for every
+ * kind of file.
+ *
+ *
+ * When reading/writing to a file:
+ *	- the cpuset to use in file->f_dentry->d_parent->d_fsdata
+ *	- the 'cftype' of the file is file->f_dentry->d_fsdata
+ */
+
+struct cftype {
+	char *name;
+	int private;
+	int (*open) (struct inode *inode, struct file *file);
+	ssize_t (*read) (struct file *file, char __user *buf, size_t nbytes,
+							loff_t *ppos);
+	int (*write) (struct file *file, const char __user *buf, size_t nbytes,
+							loff_t *ppos);
+	int (*release) (struct inode *inode, struct file *file);
+};
+
+static inline struct cpuset *__d_cs(struct dentry *dentry)
+{
+	return dentry->d_fsdata;
+}
+
+static inline struct cftype *__d_cft(struct dentry *dentry)
+{
+	return dentry->d_fsdata;
+}
+
+/*
+ * Call with cpuset_sem held.  Writes path of cpuset into buf.
+ * Returns 0 on success, -errno on error.
+ */
+
+static int cpuset_path(const struct cpuset *cs, char *buf, int buflen)
+{
+	char *start;
+
+	start = buf + buflen;
+
+	*--start = '\0';
+	for (;;) {
+		int len = cs->dentry->d_name.len;
+		if ((start -= len) < buf)
+			return -ENAMETOOLONG;
+		memcpy(start, cs->dentry->d_name.name, len);
+		cs = cs->parent;
+		if (!cs)
+			break;
+		if (!cs->parent)
+			continue;
+		if (--start < buf)
+			return -ENAMETOOLONG;
+		*start = '/';
+	}
+	memmove(buf, start, buf + buflen - start);
+	return 0;
+}
+
+/*
+ * Notify userspace when a cpuset is released, by running
+ * /sbin/cpuset_release_agent with the name of the cpuset (path
+ * relative to the root of cpuset file system) as the argument.
+ *
+ * Most likely, this user command will try to rmdir this cpuset.
+ *
+ * This races with the possibility that some other task will be
+ * attached to this cpuset before it is removed, or that some other
+ * user task will 'mkdir' a child cpuset of this cpuset.  That's ok.
+ * The presumed 'rmdir' will fail quietly if this cpuset is no longer
+ * unused, and this cpuset will be reprieved from its death sentence,
+ * to continue to serve a useful existence.  Next time it's released,
+ * we will get notified again, if it still has 'notify_on_release' set.
+ *
+ * Note final arg to call_usermodehelper() is 0 - that means
+ * don't wait.  Since we are holding the global cpuset_sem here,
+ * and we are asking another thread (started from keventd) to rmdir a
+ * cpuset, we can't wait - or we'd deadlock with the removing thread
+ * on cpuset_sem.
+ */
+
+static int cpuset_release_agent(char *cpuset_str)
+{
+	char *argv[3], *envp[3];
+	int i;
+
+	i = 0;
+	argv[i++] = "/sbin/cpuset_release_agent";
+	argv[i++] = cpuset_str;
+	argv[i] = NULL;
+
+	i = 0;
+	/* minimal command environment */
+	envp[i++] = "HOME=/";
+	envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
+	envp[i] = NULL;
+
+	return call_usermodehelper(argv[0], argv, envp, 0);
+}
+
+/*
+ * Either cs->count of using tasks transitioned to zero, or the
+ * cs->children list of child cpusets just became empty.  If this
+ * cs is notify_on_release() and now both the user count is zero and
+ * the list of children is empty, send notice to user land.
+ */
+
+static void check_for_release(struct cpuset *cs)
+{
+	if (notify_on_release(cs) && atomic_read(&cs->count) == 0 &&
+	    list_empty(&cs->children)) {
+		char *buf;
+
+		buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+		if (!buf)
+			return;
+		if (cpuset_path(cs, buf, PAGE_SIZE) < 0)
+			goto out;
+		cpuset_release_agent(buf);
+out:
+		kfree(buf);
+	}
+}
+
+/*
+ * Return in *pmask the portion of a cpusets's cpus_allowed that
+ * are online.  If none are online, walk up the cpuset hierarchy
+ * until we find one that does have some online cpus.  If we get
+ * all the way to the top and still haven't found any online cpus,
+ * return cpu_online_map.  Or if passed a NULL cs from an exit'ing
+ * task, return cpu_online_map.
+ *
+ * One way or another, we guarantee to return some non-empty subset
+ * of cpu_online_map.
+ *
+ * Call with cpuset_sem held.
+ */
+
+static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask)
+{
+	while (cs && !cpus_intersects(cs->cpus_allowed, cpu_online_map))
+		cs = cs->parent;
+	if (cs)
+		cpus_and(*pmask, cs->cpus_allowed, cpu_online_map);
+	else
+		*pmask = cpu_online_map;
+	BUG_ON(!cpus_intersects(*pmask, cpu_online_map));
+}
+
+/*
+ * Return in *pmask the portion of a cpusets's mems_allowed that
+ * are online.  If none are online, walk up the cpuset hierarchy
+ * until we find one that does have some online mems.  If we get
+ * all the way to the top and still haven't found any online mems,
+ * return node_online_map.
+ *
+ * One way or another, we guarantee to return some non-empty subset
+ * of node_online_map.
+ *
+ * Call with cpuset_sem held.
+ */
+
+static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
+{
+	while (cs && !nodes_intersects(cs->mems_allowed, node_online_map))
+		cs = cs->parent;
+	if (cs)
+		nodes_and(*pmask, cs->mems_allowed, node_online_map);
+	else
+		*pmask = node_online_map;
+	BUG_ON(!nodes_intersects(*pmask, node_online_map));
+}
+
+/*
+ * Refresh current tasks mems_allowed and mems_generation from
+ * current tasks cpuset.  Call with cpuset_sem held.
+ *
+ * Be sure to call refresh_mems() on any cpuset operation which
+ * (1) holds cpuset_sem, and (2) might possibly alloc memory.
+ * Call after obtaining cpuset_sem lock, before any possible
+ * allocation.  Otherwise one risks trying to allocate memory
+ * while the task cpuset_mems_generation is not the same as
+ * the mems_generation in its cpuset, which would deadlock on
+ * cpuset_sem in cpuset_update_current_mems_allowed().
+ *
+ * Since we hold cpuset_sem, once refresh_mems() is called, the
+ * test (current->cpuset_mems_generation != cs->mems_generation)
+ * in cpuset_update_current_mems_allowed() will remain false,
+ * until we drop cpuset_sem.  Anyone else who would change our
+ * cpusets mems_generation needs to lock cpuset_sem first.
+ */
+
+static void refresh_mems(void)
+{
+	struct cpuset *cs = current->cpuset;
+
+	if (current->cpuset_mems_generation != cs->mems_generation) {
+		guarantee_online_mems(cs, &current->mems_allowed);
+		current->cpuset_mems_generation = cs->mems_generation;
+	}
+}
+
+/*
+ * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
+ *
+ * One cpuset is a subset of another if all its allowed CPUs and
+ * Memory Nodes are a subset of the other, and its exclusive flags
+ * are only set if the other's are set.
+ */
+
+static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
+{
+	return	cpus_subset(p->cpus_allowed, q->cpus_allowed) &&
+		nodes_subset(p->mems_allowed, q->mems_allowed) &&
+		is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
+		is_mem_exclusive(p) <= is_mem_exclusive(q);
+}
+
+/*
+ * validate_change() - Used to validate that any proposed cpuset change
+ *		       follows the structural rules for cpusets.
+ *
+ * If we replaced the flag and mask values of the current cpuset
+ * (cur) with those values in the trial cpuset (trial), would
+ * our various subset and exclusive rules still be valid?  Presumes
+ * cpuset_sem held.
+ *
+ * 'cur' is the address of an actual, in-use cpuset.  Operations
+ * such as list traversal that depend on the actual address of the
+ * cpuset in the list must use cur below, not trial.
+ *
+ * 'trial' is the address of bulk structure copy of cur, with
+ * perhaps one or more of the fields cpus_allowed, mems_allowed,
+ * or flags changed to new, trial values.
+ *
+ * Return 0 if valid, -errno if not.
+ */
+
+static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
+{
+	struct cpuset *c, *par;
+
+	/* Each of our child cpusets must be a subset of us */
+	list_for_each_entry(c, &cur->children, sibling) {
+		if (!is_cpuset_subset(c, trial))
+			return -EBUSY;
+	}
+
+	/* Remaining checks don't apply to root cpuset */
+	if ((par = cur->parent) == NULL)
+		return 0;
+
+	/* We must be a subset of our parent cpuset */
+	if (!is_cpuset_subset(trial, par))
+		return -EACCES;
+
+	/* If either I or some sibling (!= me) is exclusive, we can't overlap */
+	list_for_each_entry(c, &par->children, sibling) {
+		if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
+		    c != cur &&
+		    cpus_intersects(trial->cpus_allowed, c->cpus_allowed))
+			return -EINVAL;
+		if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
+		    c != cur &&
+		    nodes_intersects(trial->mems_allowed, c->mems_allowed))
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int update_cpumask(struct cpuset *cs, char *buf)
+{
+	struct cpuset trialcs;
+	int retval;
+
+	trialcs = *cs;
+	retval = cpulist_parse(buf, trialcs.cpus_allowed);
+	if (retval < 0)
+		return retval;
+	cpus_and(trialcs.cpus_allowed, trialcs.cpus_allowed, cpu_online_map);
+	if (cpus_empty(trialcs.cpus_allowed))
+		return -ENOSPC;
+	retval = validate_change(cs, &trialcs);
+	if (retval == 0)
+		cs->cpus_allowed = trialcs.cpus_allowed;
+	return retval;
+}
+
+static int update_nodemask(struct cpuset *cs, char *buf)
+{
+	struct cpuset trialcs;
+	int retval;
+
+	trialcs = *cs;
+	retval = nodelist_parse(buf, trialcs.mems_allowed);
+	if (retval < 0)
+		return retval;
+	nodes_and(trialcs.mems_allowed, trialcs.mems_allowed, node_online_map);
+	if (nodes_empty(trialcs.mems_allowed))
+		return -ENOSPC;
+	retval = validate_change(cs, &trialcs);
+	if (retval == 0) {
+		cs->mems_allowed = trialcs.mems_allowed;
+		atomic_inc(&cpuset_mems_generation);
+		cs->mems_generation = atomic_read(&cpuset_mems_generation);
+	}
+	return retval;
+}
+
+/*
+ * update_flag - read a 0 or a 1 in a file and update associated flag
+ * bit:	the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE,
+ *						CS_NOTIFY_ON_RELEASE)
+ * cs:	the cpuset to update
+ * buf:	the buffer where we read the 0 or 1
+ */
+
+static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
+{
+	int turning_on;
+	struct cpuset trialcs;
+	int err;
+
+	turning_on = (simple_strtoul(buf, NULL, 10) != 0);
+
+	trialcs = *cs;
+	if (turning_on)
+		set_bit(bit, &trialcs.flags);
+	else
+		clear_bit(bit, &trialcs.flags);
+
+	err = validate_change(cs, &trialcs);
+	if (err == 0) {
+		if (turning_on)
+			set_bit(bit, &cs->flags);
+		else
+			clear_bit(bit, &cs->flags);
+	}
+	return err;
+}
+
+static int attach_task(struct cpuset *cs, char *buf)
+{
+	pid_t pid;
+	struct task_struct *tsk;
+	struct cpuset *oldcs;
+	cpumask_t cpus;
+
+	if (sscanf(buf, "%d", &pid) != 1)
+		return -EIO;
+	if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
+		return -ENOSPC;
+
+	if (pid) {
+		read_lock(&tasklist_lock);
+
+		tsk = find_task_by_pid(pid);
+		if (!tsk) {
+			read_unlock(&tasklist_lock);
+			return -ESRCH;
+		}
+
+		get_task_struct(tsk);
+		read_unlock(&tasklist_lock);
+
+		if ((current->euid) && (current->euid != tsk->uid)
+		    && (current->euid != tsk->suid)) {
+			put_task_struct(tsk);
+			return -EACCES;
+		}
+	} else {
+		tsk = current;
+		get_task_struct(tsk);
+	}
+
+	task_lock(tsk);
+	oldcs = tsk->cpuset;
+	if (!oldcs) {
+		task_unlock(tsk);
+		put_task_struct(tsk);
+		return -ESRCH;
+	}
+	atomic_inc(&cs->count);
+	tsk->cpuset = cs;
+	task_unlock(tsk);
+
+	guarantee_online_cpus(cs, &cpus);
+	set_cpus_allowed(tsk, cpus);
+
+	put_task_struct(tsk);
+	if (atomic_dec_and_test(&oldcs->count))
+		check_for_release(oldcs);
+	return 0;
+}
+
+/* The various types of files and directories in a cpuset file system */
+
+typedef enum {
+	FILE_ROOT,
+	FILE_DIR,
+	FILE_CPULIST,
+	FILE_MEMLIST,
+	FILE_CPU_EXCLUSIVE,
+	FILE_MEM_EXCLUSIVE,
+	FILE_NOTIFY_ON_RELEASE,
+	FILE_TASKLIST,
+} cpuset_filetype_t;
+
+static ssize_t cpuset_common_file_write(struct file *file, const char __user *userbuf,
+					size_t nbytes, loff_t *unused_ppos)
+{
+	struct cpuset *cs = __d_cs(file->f_dentry->d_parent);
+	struct cftype *cft = __d_cft(file->f_dentry);
+	cpuset_filetype_t type = cft->private;
+	char *buffer;
+	int retval = 0;
+
+	/* Crude upper limit on largest legitimate cpulist user might write. */
+	if (nbytes > 100 + 6 * NR_CPUS)
+		return -E2BIG;
+
+	/* +1 for nul-terminator */
+	if ((buffer = kmalloc(nbytes + 1, GFP_KERNEL)) == 0)
+		return -ENOMEM;
+
+	if (copy_from_user(buffer, userbuf, nbytes)) {
+		retval = -EFAULT;
+		goto out1;
+	}
+	buffer[nbytes] = 0;	/* nul-terminate */
+
+	down(&cpuset_sem);
+
+	if (is_removed(cs)) {
+		retval = -ENODEV;
+		goto out2;
+	}
+
+	switch (type) {
+	case FILE_CPULIST:
+		retval = update_cpumask(cs, buffer);
+		break;
+	case FILE_MEMLIST:
+		retval = update_nodemask(cs, buffer);
+		break;
+	case FILE_CPU_EXCLUSIVE:
+		retval = update_flag(CS_CPU_EXCLUSIVE, cs, buffer);
+		break;
+	case FILE_MEM_EXCLUSIVE:
+		retval = update_flag(CS_MEM_EXCLUSIVE, cs, buffer);
+		break;
+	case FILE_NOTIFY_ON_RELEASE:
+		retval = update_flag(CS_NOTIFY_ON_RELEASE, cs, buffer);
+		break;
+	case FILE_TASKLIST:
+		retval = attach_task(cs, buffer);
+		break;
+	default:
+		retval = -EINVAL;
+		goto out2;
+	}
+
+	if (retval == 0)
+		retval = nbytes;
+out2:
+	up(&cpuset_sem);
+out1:
+	kfree(buffer);
+	return retval;
+}
+
+static ssize_t cpuset_file_write(struct file *file, const char __user *buf,
+						size_t nbytes, loff_t *ppos)
+{
+	ssize_t retval = 0;
+	struct cftype *cft = __d_cft(file->f_dentry);
+	if (!cft)
+		return -ENODEV;
+
+	/* special function ? */
+	if (cft->write)
+		retval = cft->write(file, buf, nbytes, ppos);
+	else
+		retval = cpuset_common_file_write(file, buf, nbytes, ppos);
+
+	return retval;
+}
+
+/*
+ * These ascii lists should be read in a single call, by using a user
+ * buffer large enough to hold the entire map.  If read in smaller
+ * chunks, there is no guarantee of atomicity.  Since the display format
+ * used, list of ranges of sequential numbers, is variable length,
+ * and since these maps can change value dynamically, one could read
+ * gibberish by doing partial reads while a list was changing.
+ * A single large read to a buffer that crosses a page boundary is
+ * ok, because the result being copied to user land is not recomputed
+ * across a page fault.
+ */
+
+static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
+{
+	cpumask_t mask;
+
+	down(&cpuset_sem);
+	mask = cs->cpus_allowed;
+	up(&cpuset_sem);
+
+	return cpulist_scnprintf(page, PAGE_SIZE, mask);
+}
+
+static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
+{
+	nodemask_t mask;
+
+	down(&cpuset_sem);
+	mask = cs->mems_allowed;
+	up(&cpuset_sem);
+
+	return nodelist_scnprintf(page, PAGE_SIZE, mask);
+}
+
+static ssize_t cpuset_common_file_read(struct file *file, char __user *buf,
+				size_t nbytes, loff_t *ppos)
+{
+	struct cftype *cft = __d_cft(file->f_dentry);
+	struct cpuset *cs = __d_cs(file->f_dentry->d_parent);
+	cpuset_filetype_t type = cft->private;
+	char *page;
+	ssize_t retval = 0;
+	char *s;
+	char *start;
+	size_t n;
+
+	if (!(page = (char *)__get_free_page(GFP_KERNEL)))
+		return -ENOMEM;
+
+	s = page;
+
+	switch (type) {
+	case FILE_CPULIST:
+		s += cpuset_sprintf_cpulist(s, cs);
+		break;
+	case FILE_MEMLIST:
+		s += cpuset_sprintf_memlist(s, cs);
+		break;
+	case FILE_CPU_EXCLUSIVE:
+		*s++ = is_cpu_exclusive(cs) ? '1' : '0';
+		break;
+	case FILE_MEM_EXCLUSIVE:
+		*s++ = is_mem_exclusive(cs) ? '1' : '0';
+		break;
+	case FILE_NOTIFY_ON_RELEASE:
+		*s++ = notify_on_release(cs) ? '1' : '0';
+		break;
+	default:
+		retval = -EINVAL;
+		goto out;
+	}
+	*s++ = '\n';
+	*s = '\0';
+
+	start = page + *ppos;
+	n = s - start;
+	retval = n - copy_to_user(buf, start, min(n, nbytes));
+	*ppos += retval;
+out:
+	free_page((unsigned long)page);
+	return retval;
+}
+
+static ssize_t cpuset_file_read(struct file *file, char __user *buf, size_t nbytes,
+								loff_t *ppos)
+{
+	ssize_t retval = 0;
+	struct cftype *cft = __d_cft(file->f_dentry);
+	if (!cft)
+		return -ENODEV;
+
+	/* special function ? */
+	if (cft->read)
+		retval = cft->read(file, buf, nbytes, ppos);
+	else
+		retval = cpuset_common_file_read(file, buf, nbytes, ppos);
+
+	return retval;
+}
+
+static int cpuset_file_open(struct inode *inode, struct file *file)
+{
+	int err;
+	struct cftype *cft;
+
+	err = generic_file_open(inode, file);
+	if (err)
+		return err;
+
+	cft = __d_cft(file->f_dentry);
+	if (!cft)
+		return -ENODEV;
+	if (cft->open)
+		err = cft->open(inode, file);
+	else
+		err = 0;
+
+	return err;
+}
+
+static int cpuset_file_release(struct inode *inode, struct file *file)
+{
+	struct cftype *cft = __d_cft(file->f_dentry);
+	if (cft->release)
+		return cft->release(inode, file);
+	return 0;
+}
+
+static struct file_operations cpuset_file_operations = {
+	.read = cpuset_file_read,
+	.write = cpuset_file_write,
+	.llseek = generic_file_llseek,
+	.open = cpuset_file_open,
+	.release = cpuset_file_release,
+};
+
+static struct inode_operations cpuset_dir_inode_operations = {
+	.lookup = simple_lookup,
+	.mkdir = cpuset_mkdir,
+	.rmdir = cpuset_rmdir,
+};
+
+static int cpuset_create_file(struct dentry *dentry, int mode)
+{
+	struct inode *inode;
+
+	if (!dentry)
+		return -ENOENT;
+	if (dentry->d_inode)
+		return -EEXIST;
+
+	inode = cpuset_new_inode(mode);
+	if (!inode)
+		return -ENOMEM;
+
+	if (S_ISDIR(mode)) {
+		inode->i_op = &cpuset_dir_inode_operations;
+		inode->i_fop = &simple_dir_operations;
+
+		/* start off with i_nlink == 2 (for "." entry) */
+		inode->i_nlink++;
+	} else if (S_ISREG(mode)) {
+		inode->i_size = 0;
+		inode->i_fop = &cpuset_file_operations;
+	}
+
+	d_instantiate(dentry, inode);
+	dget(dentry);	/* Extra count - pin the dentry in core */
+	return 0;
+}
+
+/*
+ *	cpuset_create_dir - create a directory for an object.
+ *	cs: 	the cpuset we create the directory for.
+ *		It must have a valid ->parent field
+ *		And we are going to fill its ->dentry field.
+ *	name:	The name to give to the cpuset directory. Will be copied.
+ *	mode:	mode to set on new directory.
+ */
+
+static int cpuset_create_dir(struct cpuset *cs, const char *name, int mode)
+{
+	struct dentry *dentry = NULL;
+	struct dentry *parent;
+	int error = 0;
+
+	parent = cs->parent->dentry;
+	dentry = cpuset_get_dentry(parent, name);
+	if (IS_ERR(dentry))
+		return PTR_ERR(dentry);
+	error = cpuset_create_file(dentry, S_IFDIR | mode);
+	if (!error) {
+		dentry->d_fsdata = cs;
+		parent->d_inode->i_nlink++;
+		cs->dentry = dentry;
+	}
+	dput(dentry);
+
+	return error;
+}
+
+static int cpuset_add_file(struct dentry *dir, const struct cftype *cft)
+{
+	struct dentry *dentry;
+	int error;
+
+	down(&dir->d_inode->i_sem);
+	dentry = cpuset_get_dentry(dir, cft->name);
+	if (!IS_ERR(dentry)) {
+		error = cpuset_create_file(dentry, 0644 | S_IFREG);
+		if (!error)
+			dentry->d_fsdata = (void *)cft;
+		dput(dentry);
+	} else
+		error = PTR_ERR(dentry);
+	up(&dir->d_inode->i_sem);
+	return error;
+}
+
+/*
+ * Stuff for reading the 'tasks' file.
+ *
+ * Reading this file can return large amounts of data if a cpuset has
+ * *lots* of attached tasks. So it may need several calls to read(),
+ * but we cannot guarantee that the information we produce is correct
+ * unless we produce it entirely atomically.
+ *
+ * Upon tasks file open(), a struct ctr_struct is allocated, that
+ * will have a pointer to an array (also allocated here).  The struct
+ * ctr_struct * is stored in file->private_data.  Its resources will
+ * be freed by release() when the file is closed.  The array is used
+ * to sprintf the PIDs and then used by read().
+ */
+
+/* cpusets_tasks_read array */
+
+struct ctr_struct {
+	char *buf;
+	int bufsz;
+};
+
+/*
+ * Load into 'pidarray' up to 'npids' of the tasks using cpuset 'cs'.
+ * Return actual number of pids loaded.
+ */
+static inline int pid_array_load(pid_t *pidarray, int npids, struct cpuset *cs)
+{
+	int n = 0;
+	struct task_struct *g, *p;
+
+	read_lock(&tasklist_lock);
+
+	do_each_thread(g, p) {
+		if (p->cpuset == cs) {
+			pidarray[n++] = p->pid;
+			if (unlikely(n == npids))
+				goto array_full;
+		}
+	} while_each_thread(g, p);
+
+array_full:
+	read_unlock(&tasklist_lock);
+	return n;
+}
+
+static int cmppid(const void *a, const void *b)
+{
+	return *(pid_t *)a - *(pid_t *)b;
+}
+
+/*
+ * Convert array 'a' of 'npids' pid_t's to a string of newline separated
+ * decimal pids in 'buf'.  Don't write more than 'sz' chars, but return
+ * count 'cnt' of how many chars would be written if buf were large enough.
+ */
+static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids)
+{
+	int cnt = 0;
+	int i;
+
+	for (i = 0; i < npids; i++)
+		cnt += snprintf(buf + cnt, max(sz - cnt, 0), "%d\n", a[i]);
+	return cnt;
+}
+
+static int cpuset_tasks_open(struct inode *unused, struct file *file)
+{
+	struct cpuset *cs = __d_cs(file->f_dentry->d_parent);
+	struct ctr_struct *ctr;
+	pid_t *pidarray;
+	int npids;
+	char c;
+
+	if (!(file->f_mode & FMODE_READ))
+		return 0;
+
+	ctr = kmalloc(sizeof(*ctr), GFP_KERNEL);
+	if (!ctr)
+		goto err0;
+
+	/*
+	 * If cpuset gets more users after we read count, we won't have
+	 * enough space - tough.  This race is indistinguishable to the
+	 * caller from the case that the additional cpuset users didn't
+	 * show up until sometime later on.
+	 */
+	npids = atomic_read(&cs->count);
+	pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL);
+	if (!pidarray)
+		goto err1;
+
+	npids = pid_array_load(pidarray, npids, cs);
+	sort(pidarray, npids, sizeof(pid_t), cmppid, NULL);
+
+	/* Call pid_array_to_buf() twice, first just to get bufsz */
+	ctr->bufsz = pid_array_to_buf(&c, sizeof(c), pidarray, npids) + 1;
+	ctr->buf = kmalloc(ctr->bufsz, GFP_KERNEL);
+	if (!ctr->buf)
+		goto err2;
+	ctr->bufsz = pid_array_to_buf(ctr->buf, ctr->bufsz, pidarray, npids);
+
+	kfree(pidarray);
+	file->private_data = ctr;
+	return 0;
+
+err2:
+	kfree(pidarray);
+err1:
+	kfree(ctr);
+err0:
+	return -ENOMEM;
+}
+
+static ssize_t cpuset_tasks_read(struct file *file, char __user *buf,
+						size_t nbytes, loff_t *ppos)
+{
+	struct ctr_struct *ctr = file->private_data;
+
+	if (*ppos + nbytes > ctr->bufsz)
+		nbytes = ctr->bufsz - *ppos;
+	if (copy_to_user(buf, ctr->buf + *ppos, nbytes))
+		return -EFAULT;
+	*ppos += nbytes;
+	return nbytes;
+}
+
+static int cpuset_tasks_release(struct inode *unused_inode, struct file *file)
+{
+	struct ctr_struct *ctr;
+
+	if (file->f_mode & FMODE_READ) {
+		ctr = file->private_data;
+		kfree(ctr->buf);
+		kfree(ctr);
+	}
+	return 0;
+}
+
+/*
+ * for the common functions, 'private' gives the type of file
+ */
+
+static struct cftype cft_tasks = {
+	.name = "tasks",
+	.open = cpuset_tasks_open,
+	.read = cpuset_tasks_read,
+	.release = cpuset_tasks_release,
+	.private = FILE_TASKLIST,
+};
+
+static struct cftype cft_cpus = {
+	.name = "cpus",
+	.private = FILE_CPULIST,
+};
+
+static struct cftype cft_mems = {
+	.name = "mems",
+	.private = FILE_MEMLIST,
+};
+
+static struct cftype cft_cpu_exclusive = {
+	.name = "cpu_exclusive",
+	.private = FILE_CPU_EXCLUSIVE,
+};
+
+static struct cftype cft_mem_exclusive = {
+	.name = "mem_exclusive",
+	.private = FILE_MEM_EXCLUSIVE,
+};
+
+static struct cftype cft_notify_on_release = {
+	.name = "notify_on_release",
+	.private = FILE_NOTIFY_ON_RELEASE,
+};
+
+static int cpuset_populate_dir(struct dentry *cs_dentry)
+{
+	int err;
+
+	if ((err = cpuset_add_file(cs_dentry, &cft_cpus)) < 0)
+		return err;
+	if ((err = cpuset_add_file(cs_dentry, &cft_mems)) < 0)
+		return err;
+	if ((err = cpuset_add_file(cs_dentry, &cft_cpu_exclusive)) < 0)
+		return err;
+	if ((err = cpuset_add_file(cs_dentry, &cft_mem_exclusive)) < 0)
+		return err;
+	if ((err = cpuset_add_file(cs_dentry, &cft_notify_on_release)) < 0)
+		return err;
+	if ((err = cpuset_add_file(cs_dentry, &cft_tasks)) < 0)
+		return err;
+	return 0;
+}
+
+/*
+ *	cpuset_create - create a cpuset
+ *	parent:	cpuset that will be parent of the new cpuset.
+ *	name:		name of the new cpuset. Will be strcpy'ed.
+ *	mode:		mode to set on new inode
+ *
+ *	Must be called with the semaphore on the parent inode held
+ */
+
+static long cpuset_create(struct cpuset *parent, const char *name, int mode)
+{
+	struct cpuset *cs;
+	int err;
+
+	cs = kmalloc(sizeof(*cs), GFP_KERNEL);
+	if (!cs)
+		return -ENOMEM;
+
+	down(&cpuset_sem);
+	refresh_mems();
+	cs->flags = 0;
+	if (notify_on_release(parent))
+		set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
+	cs->cpus_allowed = CPU_MASK_NONE;
+	cs->mems_allowed = NODE_MASK_NONE;
+	atomic_set(&cs->count, 0);
+	INIT_LIST_HEAD(&cs->sibling);
+	INIT_LIST_HEAD(&cs->children);
+	atomic_inc(&cpuset_mems_generation);
+	cs->mems_generation = atomic_read(&cpuset_mems_generation);
+
+	cs->parent = parent;
+
+	list_add(&cs->sibling, &cs->parent->children);
+
+	err = cpuset_create_dir(cs, name, mode);
+	if (err < 0)
+		goto err;
+
+	/*
+	 * Release cpuset_sem before cpuset_populate_dir() because it
+	 * will down() this new directory's i_sem and if we race with
+	 * another mkdir, we might deadlock.
+	 */
+	up(&cpuset_sem);
+
+	err = cpuset_populate_dir(cs->dentry);
+	/* If err < 0, we have a half-filled directory - oh well ;) */
+	return 0;
+err:
+	list_del(&cs->sibling);
+	up(&cpuset_sem);
+	kfree(cs);
+	return err;
+}
+
+static int cpuset_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+	struct cpuset *c_parent = dentry->d_parent->d_fsdata;
+
+	/* the vfs holds inode->i_sem already */
+	return cpuset_create(c_parent, dentry->d_name.name, mode | S_IFDIR);
+}
+
+static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
+{
+	struct cpuset *cs = dentry->d_fsdata;
+	struct dentry *d;
+	struct cpuset *parent;
+
+	/* the vfs holds both inode->i_sem already */
+
+	down(&cpuset_sem);
+	refresh_mems();
+	if (atomic_read(&cs->count) > 0) {
+		up(&cpuset_sem);
+		return -EBUSY;
+	}
+	if (!list_empty(&cs->children)) {
+		up(&cpuset_sem);
+		return -EBUSY;
+	}
+	spin_lock(&cs->dentry->d_lock);
+	parent = cs->parent;
+	set_bit(CS_REMOVED, &cs->flags);
+	list_del(&cs->sibling);	/* delete my sibling from parent->children */
+	if (list_empty(&parent->children))
+		check_for_release(parent);
+	d = dget(cs->dentry);
+	cs->dentry = NULL;
+	spin_unlock(&d->d_lock);
+	cpuset_d_remove_dir(d);
+	dput(d);
+	up(&cpuset_sem);
+	return 0;
+}
+
+/**
+ * cpuset_init - initialize cpusets at system boot
+ *
+ * Description: Initialize top_cpuset and the cpuset internal file system,
+ **/
+
+int __init cpuset_init(void)
+{
+	struct dentry *root;
+	int err;
+
+	top_cpuset.cpus_allowed = CPU_MASK_ALL;
+	top_cpuset.mems_allowed = NODE_MASK_ALL;
+
+	atomic_inc(&cpuset_mems_generation);
+	top_cpuset.mems_generation = atomic_read(&cpuset_mems_generation);
+
+	init_task.cpuset = &top_cpuset;
+
+	err = register_filesystem(&cpuset_fs_type);
+	if (err < 0)
+		goto out;
+	cpuset_mount = kern_mount(&cpuset_fs_type);
+	if (IS_ERR(cpuset_mount)) {
+		printk(KERN_ERR "cpuset: could not mount!\n");
+		err = PTR_ERR(cpuset_mount);
+		cpuset_mount = NULL;
+		goto out;
+	}
+	root = cpuset_mount->mnt_sb->s_root;
+	root->d_fsdata = &top_cpuset;
+	root->d_inode->i_nlink++;
+	top_cpuset.dentry = root;
+	root->d_inode->i_op = &cpuset_dir_inode_operations;
+	err = cpuset_populate_dir(root);
+out:
+	return err;
+}
+
+/**
+ * cpuset_init_smp - initialize cpus_allowed
+ *
+ * Description: Finish top cpuset after cpu, node maps are initialized
+ **/
+
+void __init cpuset_init_smp(void)
+{
+	top_cpuset.cpus_allowed = cpu_online_map;
+	top_cpuset.mems_allowed = node_online_map;
+}
+
+/**
+ * cpuset_fork - attach newly forked task to its parents cpuset.
+ * @p: pointer to task_struct of forking parent process.
+ *
+ * Description: By default, on fork, a task inherits its
+ * parents cpuset.  The pointer to the shared cpuset is
+ * automatically copied in fork.c by dup_task_struct().
+ * This cpuset_fork() routine need only increment the usage
+ * counter in that cpuset.
+ **/
+
+void cpuset_fork(struct task_struct *tsk)
+{
+	atomic_inc(&tsk->cpuset->count);
+}
+
+/**
+ * cpuset_exit - detach cpuset from exiting task
+ * @tsk: pointer to task_struct of exiting process
+ *
+ * Description: Detach cpuset from @tsk and release it.
+ *
+ **/
+
+void cpuset_exit(struct task_struct *tsk)
+{
+	struct cpuset *cs;
+
+	task_lock(tsk);
+	cs = tsk->cpuset;
+	tsk->cpuset = NULL;
+	task_unlock(tsk);
+
+	if (atomic_dec_and_test(&cs->count)) {
+		down(&cpuset_sem);
+		check_for_release(cs);
+		up(&cpuset_sem);
+	}
+}
+
+/**
+ * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset.
+ * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
+ *
+ * Description: Returns the cpumask_t cpus_allowed of the cpuset
+ * attached to the specified @tsk.  Guaranteed to return some non-empty
+ * subset of cpu_online_map, even if this means going outside the
+ * tasks cpuset.
+ **/
+
+const cpumask_t cpuset_cpus_allowed(const struct task_struct *tsk)
+{
+	cpumask_t mask;
+
+	down(&cpuset_sem);
+	task_lock((struct task_struct *)tsk);
+	guarantee_online_cpus(tsk->cpuset, &mask);
+	task_unlock((struct task_struct *)tsk);
+	up(&cpuset_sem);
+
+	return mask;
+}
+
+void cpuset_init_current_mems_allowed(void)
+{
+	current->mems_allowed = NODE_MASK_ALL;
+}
+
+/*
+ * If the current tasks cpusets mems_allowed changed behind our backs,
+ * update current->mems_allowed and mems_generation to the new value.
+ * Do not call this routine if in_interrupt().
+ */
+
+void cpuset_update_current_mems_allowed(void)
+{
+	struct cpuset *cs = current->cpuset;
+
+	if (!cs)
+		return;		/* task is exiting */
+	if (current->cpuset_mems_generation != cs->mems_generation) {
+		down(&cpuset_sem);
+		refresh_mems();
+		up(&cpuset_sem);
+	}
+}
+
+void cpuset_restrict_to_mems_allowed(unsigned long *nodes)
+{
+	bitmap_and(nodes, nodes, nodes_addr(current->mems_allowed),
+							MAX_NUMNODES);
+}
+
+/*
+ * Are any of the nodes on zonelist zl allowed in current->mems_allowed?
+ */
+int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
+{
+	int i;
+
+	for (i = 0; zl->zones[i]; i++) {
+		int nid = zl->zones[i]->zone_pgdat->node_id;
+
+		if (node_isset(nid, current->mems_allowed))
+			return 1;
+	}
+	return 0;
+}
+
+/*
+ * Is 'current' valid, and is zone z allowed in current->mems_allowed?
+ */
+int cpuset_zone_allowed(struct zone *z)
+{
+	return in_interrupt() ||
+		node_isset(z->zone_pgdat->node_id, current->mems_allowed);
+}
+
+/*
+ * proc_cpuset_show()
+ *  - Print tasks cpuset path into seq_file.
+ *  - Used for /proc/<pid>/cpuset.
+ */
+
+static int proc_cpuset_show(struct seq_file *m, void *v)
+{
+	struct cpuset *cs;
+	struct task_struct *tsk;
+	char *buf;
+	int retval = 0;
+
+	buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	tsk = m->private;
+	down(&cpuset_sem);
+	task_lock(tsk);
+	cs = tsk->cpuset;
+	task_unlock(tsk);
+	if (!cs) {
+		retval = -EINVAL;
+		goto out;
+	}
+
+	retval = cpuset_path(cs, buf, PAGE_SIZE);
+	if (retval < 0)
+		goto out;
+	seq_puts(m, buf);
+	seq_putc(m, '\n');
+out:
+	up(&cpuset_sem);
+	kfree(buf);
+	return retval;
+}
+
+static int cpuset_open(struct inode *inode, struct file *file)
+{
+	struct task_struct *tsk = PROC_I(inode)->task;
+	return single_open(file, proc_cpuset_show, tsk);
+}
+
+struct file_operations proc_cpuset_operations = {
+	.open		= cpuset_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+/* Display task cpus_allowed, mems_allowed in /proc/<pid>/status file. */
+char *cpuset_task_status_allowed(struct task_struct *task, char *buffer)
+{
+	buffer += sprintf(buffer, "Cpus_allowed:\t");
+	buffer += cpumask_scnprintf(buffer, PAGE_SIZE, task->cpus_allowed);
+	buffer += sprintf(buffer, "\n");
+	buffer += sprintf(buffer, "Mems_allowed:\t");
+	buffer += nodemask_scnprintf(buffer, PAGE_SIZE, task->mems_allowed);
+	buffer += sprintf(buffer, "\n");
+	return buffer;
+}
diff --git a/kernel/dma.c b/kernel/dma.c
new file mode 100644
index 000000000000..aef0a45b7893
--- /dev/null
+++ b/kernel/dma.c
@@ -0,0 +1,158 @@
+/* $Id: dma.c,v 1.7 1994/12/28 03:35:33 root Exp root $
+ * linux/kernel/dma.c: A DMA channel allocator. Inspired by linux/kernel/irq.c.
+ *
+ * Written by Hennus Bergman, 1992.
+ *
+ * 1994/12/26: Changes by Alex Nash to fix a minor bug in /proc/dma.
+ *   In the previous version the reported device could end up being wrong,
+ *   if a device requested a DMA channel that was already in use.
+ *   [It also happened to remove the sizeof(char *) == sizeof(int)
+ *   assumption introduced because of those /proc/dma patches. -- Hennus]
+ */
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
+#include <linux/init.h>
+#include <asm/dma.h>
+#include <asm/system.h>
+
+ 
+
+/* A note on resource allocation:
+ *
+ * All drivers needing DMA channels, should allocate and release them
+ * through the public routines `request_dma()' and `free_dma()'.
+ *
+ * In order to avoid problems, all processes should allocate resources in
+ * the same sequence and release them in the reverse order.
+ *
+ * So, when allocating DMAs and IRQs, first allocate the IRQ, then the DMA.
+ * When releasing them, first release the DMA, then release the IRQ.
+ * If you don't, you may cause allocation requests to fail unnecessarily.
+ * This doesn't really matter now, but it will once we get real semaphores
+ * in the kernel.
+ */
+
+
+DEFINE_SPINLOCK(dma_spin_lock);
+
+/*
+ *	If our port doesn't define this it has no PC like DMA
+ */
+
+#ifdef MAX_DMA_CHANNELS
+
+
+/* Channel n is busy iff dma_chan_busy[n].lock != 0.
+ * DMA0 used to be reserved for DRAM refresh, but apparently not any more...
+ * DMA4 is reserved for cascading.
+ */
+
+struct dma_chan {
+	int  lock;
+	const char *device_id;
+};
+
+static struct dma_chan dma_chan_busy[MAX_DMA_CHANNELS] = {
+	[4] = { 1, "cascade" },
+};
+
+
+int request_dma(unsigned int dmanr, const char * device_id)
+{
+	if (dmanr >= MAX_DMA_CHANNELS)
+		return -EINVAL;
+
+	if (xchg(&dma_chan_busy[dmanr].lock, 1) != 0)
+		return -EBUSY;
+
+	dma_chan_busy[dmanr].device_id = device_id;
+
+	/* old flag was 0, now contains 1 to indicate busy */
+	return 0;
+} /* request_dma */
+
+
+void free_dma(unsigned int dmanr)
+{
+	if (dmanr >= MAX_DMA_CHANNELS) {
+		printk(KERN_WARNING "Trying to free DMA%d\n", dmanr);
+		return;
+	}
+
+	if (xchg(&dma_chan_busy[dmanr].lock, 0) == 0) {
+		printk(KERN_WARNING "Trying to free free DMA%d\n", dmanr);
+		return;
+	}	
+
+} /* free_dma */
+
+#else
+
+int request_dma(unsigned int dmanr, const char *device_id)
+{
+	return -EINVAL;
+}
+
+void free_dma(unsigned int dmanr)
+{
+}
+
+#endif
+
+#ifdef CONFIG_PROC_FS
+
+#ifdef MAX_DMA_CHANNELS
+static int proc_dma_show(struct seq_file *m, void *v)
+{
+	int i;
+
+	for (i = 0 ; i < MAX_DMA_CHANNELS ; i++) {
+		if (dma_chan_busy[i].lock) {
+		    seq_printf(m, "%2d: %s\n", i,
+			       dma_chan_busy[i].device_id);
+		}
+	}
+	return 0;
+}
+#else
+static int proc_dma_show(struct seq_file *m, void *v)
+{
+	seq_puts(m, "No DMA\n");
+	return 0;
+}
+#endif /* MAX_DMA_CHANNELS */
+
+static int proc_dma_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, proc_dma_show, NULL);
+}
+
+static struct file_operations proc_dma_operations = {
+	.open		= proc_dma_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static int __init proc_dma_init(void)
+{
+	struct proc_dir_entry *e;
+
+	e = create_proc_entry("dma", 0, NULL);
+	if (e)
+		e->proc_fops = &proc_dma_operations;
+
+	return 0;
+}
+
+__initcall(proc_dma_init);
+#endif
+
+EXPORT_SYMBOL(request_dma);
+EXPORT_SYMBOL(free_dma);
+EXPORT_SYMBOL(dma_spin_lock);
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c
new file mode 100644
index 000000000000..867d6dbeb574
--- /dev/null
+++ b/kernel/exec_domain.c
@@ -0,0 +1,209 @@
+/*
+ * Handling of different ABIs (personalities).
+ *
+ * We group personalities into execution domains which have their
+ * own handlers for kernel entry points, signal mapping, etc...
+ *
+ * 2001-05-06	Complete rewrite,  Christoph Hellwig (hch@infradead.org)
+ */
+
+#include <linux/config.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/kmod.h>
+#include <linux/module.h>
+#include <linux/personality.h>
+#include <linux/sched.h>
+#include <linux/syscalls.h>
+#include <linux/sysctl.h>
+#include <linux/types.h>
+
+
+static void default_handler(int, struct pt_regs *);
+
+static struct exec_domain *exec_domains = &default_exec_domain;
+static DEFINE_RWLOCK(exec_domains_lock);
+
+
+static u_long ident_map[32] = {
+	0,	1,	2,	3,	4,	5,	6,	7,
+	8,	9,	10,	11,	12,	13,	14,	15,
+	16,	17,	18,	19,	20,	21,	22,	23,
+	24,	25,	26,	27,	28,	29,	30,	31
+};
+
+struct exec_domain default_exec_domain = {
+	.name		= "Linux",		/* name */
+	.handler	= default_handler,	/* lcall7 causes a seg fault. */
+	.pers_low	= 0, 			/* PER_LINUX personality. */
+	.pers_high	= 0,			/* PER_LINUX personality. */
+	.signal_map	= ident_map,		/* Identity map signals. */
+	.signal_invmap	= ident_map,		/*  - both ways. */
+};
+
+
+static void
+default_handler(int segment, struct pt_regs *regp)
+{
+	set_personality(0);
+
+	if (current_thread_info()->exec_domain->handler != default_handler)
+		current_thread_info()->exec_domain->handler(segment, regp);
+	else
+		send_sig(SIGSEGV, current, 1);
+}
+
+static struct exec_domain *
+lookup_exec_domain(u_long personality)
+{
+	struct exec_domain *	ep;
+	u_long			pers = personality(personality);
+		
+	read_lock(&exec_domains_lock);
+	for (ep = exec_domains; ep; ep = ep->next) {
+		if (pers >= ep->pers_low && pers <= ep->pers_high)
+			if (try_module_get(ep->module))
+				goto out;
+	}
+
+#ifdef CONFIG_KMOD
+	read_unlock(&exec_domains_lock);
+	request_module("personality-%ld", pers);
+	read_lock(&exec_domains_lock);
+
+	for (ep = exec_domains; ep; ep = ep->next) {
+		if (pers >= ep->pers_low && pers <= ep->pers_high)
+			if (try_module_get(ep->module))
+				goto out;
+	}
+#endif
+
+	ep = &default_exec_domain;
+out:
+	read_unlock(&exec_domains_lock);
+	return (ep);
+}
+
+int
+register_exec_domain(struct exec_domain *ep)
+{
+	struct exec_domain	*tmp;
+	int			err = -EBUSY;
+
+	if (ep == NULL)
+		return -EINVAL;
+
+	if (ep->next != NULL)
+		return -EBUSY;
+
+	write_lock(&exec_domains_lock);
+	for (tmp = exec_domains; tmp; tmp = tmp->next) {
+		if (tmp == ep)
+			goto out;
+	}
+
+	ep->next = exec_domains;
+	exec_domains = ep;
+	err = 0;
+
+out:
+	write_unlock(&exec_domains_lock);
+	return (err);
+}
+
+int
+unregister_exec_domain(struct exec_domain *ep)
+{
+	struct exec_domain	**epp;
+
+	epp = &exec_domains;
+	write_lock(&exec_domains_lock);
+	for (epp = &exec_domains; *epp; epp = &(*epp)->next) {
+		if (ep == *epp)
+			goto unregister;
+	}
+	write_unlock(&exec_domains_lock);
+	return -EINVAL;
+
+unregister:
+	*epp = ep->next;
+	ep->next = NULL;
+	write_unlock(&exec_domains_lock);
+	return 0;
+}
+
+int
+__set_personality(u_long personality)
+{
+	struct exec_domain	*ep, *oep;
+
+	ep = lookup_exec_domain(personality);
+	if (ep == current_thread_info()->exec_domain) {
+		current->personality = personality;
+		return 0;
+	}
+
+	if (atomic_read(&current->fs->count) != 1) {
+		struct fs_struct *fsp, *ofsp;
+
+		fsp = copy_fs_struct(current->fs);
+		if (fsp == NULL) {
+			module_put(ep->module);
+			return -ENOMEM;
+		}
+
+		task_lock(current);
+		ofsp = current->fs;
+		current->fs = fsp;
+		task_unlock(current);
+
+		put_fs_struct(ofsp);
+	}
+
+	/*
+	 * At that point we are guaranteed to be the sole owner of
+	 * current->fs.
+	 */
+
+	current->personality = personality;
+	oep = current_thread_info()->exec_domain;
+	current_thread_info()->exec_domain = ep;
+	set_fs_altroot();
+
+	module_put(oep->module);
+	return 0;
+}
+
+int
+get_exec_domain_list(char *page)
+{
+	struct exec_domain	*ep;
+	int			len = 0;
+
+	read_lock(&exec_domains_lock);
+	for (ep = exec_domains; ep && len < PAGE_SIZE - 80; ep = ep->next)
+		len += sprintf(page + len, "%d-%d\t%-16s\t[%s]\n",
+			       ep->pers_low, ep->pers_high, ep->name,
+			       module_name(ep->module));
+	read_unlock(&exec_domains_lock);
+	return (len);
+}
+
+asmlinkage long
+sys_personality(u_long personality)
+{
+	u_long old = current->personality;
+
+	if (personality != 0xffffffff) {
+		set_personality(personality);
+		if (current->personality != personality)
+			return -EINVAL;
+	}
+
+	return (long)old;
+}
+
+
+EXPORT_SYMBOL(register_exec_domain);
+EXPORT_SYMBOL(unregister_exec_domain);
+EXPORT_SYMBOL(__set_personality);
diff --git a/kernel/exit.c b/kernel/exit.c
new file mode 100644
index 000000000000..6dd4ebe1dd90
--- /dev/null
+++ b/kernel/exit.c
@@ -0,0 +1,1527 @@
+/*
+ *  linux/kernel/exit.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ */
+
+#include <linux/config.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/interrupt.h>
+#include <linux/smp_lock.h>
+#include <linux/module.h>
+#include <linux/completion.h>
+#include <linux/personality.h>
+#include <linux/tty.h>
+#include <linux/namespace.h>
+#include <linux/key.h>
+#include <linux/security.h>
+#include <linux/cpu.h>
+#include <linux/acct.h>
+#include <linux/file.h>
+#include <linux/binfmts.h>
+#include <linux/ptrace.h>
+#include <linux/profile.h>
+#include <linux/mount.h>
+#include <linux/proc_fs.h>
+#include <linux/mempolicy.h>
+#include <linux/cpuset.h>
+#include <linux/syscalls.h>
+
+#include <asm/uaccess.h>
+#include <asm/unistd.h>
+#include <asm/pgtable.h>
+#include <asm/mmu_context.h>
+
+extern void sem_exit (void);
+extern struct task_struct *child_reaper;
+
+int getrusage(struct task_struct *, int, struct rusage __user *);
+
+static void __unhash_process(struct task_struct *p)
+{
+	nr_threads--;
+	detach_pid(p, PIDTYPE_PID);
+	detach_pid(p, PIDTYPE_TGID);
+	if (thread_group_leader(p)) {
+		detach_pid(p, PIDTYPE_PGID);
+		detach_pid(p, PIDTYPE_SID);
+		if (p->pid)
+			__get_cpu_var(process_counts)--;
+	}
+
+	REMOVE_LINKS(p);
+}
+
+void release_task(struct task_struct * p)
+{
+	int zap_leader;
+	task_t *leader;
+	struct dentry *proc_dentry;
+
+repeat: 
+	atomic_dec(&p->user->processes);
+	spin_lock(&p->proc_lock);
+	proc_dentry = proc_pid_unhash(p);
+	write_lock_irq(&tasklist_lock);
+	if (unlikely(p->ptrace))
+		__ptrace_unlink(p);
+	BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children));
+	__exit_signal(p);
+	__exit_sighand(p);
+	__unhash_process(p);
+
+	/*
+	 * If we are the last non-leader member of the thread
+	 * group, and the leader is zombie, then notify the
+	 * group leader's parent process. (if it wants notification.)
+	 */
+	zap_leader = 0;
+	leader = p->group_leader;
+	if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) {
+		BUG_ON(leader->exit_signal == -1);
+		do_notify_parent(leader, leader->exit_signal);
+		/*
+		 * If we were the last child thread and the leader has
+		 * exited already, and the leader's parent ignores SIGCHLD,
+		 * then we are the one who should release the leader.
+		 *
+		 * do_notify_parent() will have marked it self-reaping in
+		 * that case.
+		 */
+		zap_leader = (leader->exit_signal == -1);
+	}
+
+	sched_exit(p);
+	write_unlock_irq(&tasklist_lock);
+	spin_unlock(&p->proc_lock);
+	proc_pid_flush(proc_dentry);
+	release_thread(p);
+	put_task_struct(p);
+
+	p = leader;
+	if (unlikely(zap_leader))
+		goto repeat;
+}
+
+/* we are using it only for SMP init */
+
+void unhash_process(struct task_struct *p)
+{
+	struct dentry *proc_dentry;
+
+	spin_lock(&p->proc_lock);
+	proc_dentry = proc_pid_unhash(p);
+	write_lock_irq(&tasklist_lock);
+	__unhash_process(p);
+	write_unlock_irq(&tasklist_lock);
+	spin_unlock(&p->proc_lock);
+	proc_pid_flush(proc_dentry);
+}
+
+/*
+ * This checks not only the pgrp, but falls back on the pid if no
+ * satisfactory pgrp is found. I dunno - gdb doesn't work correctly
+ * without this...
+ */
+int session_of_pgrp(int pgrp)
+{
+	struct task_struct *p;
+	int sid = -1;
+
+	read_lock(&tasklist_lock);
+	do_each_task_pid(pgrp, PIDTYPE_PGID, p) {
+		if (p->signal->session > 0) {
+			sid = p->signal->session;
+			goto out;
+		}
+	} while_each_task_pid(pgrp, PIDTYPE_PGID, p);
+	p = find_task_by_pid(pgrp);
+	if (p)
+		sid = p->signal->session;
+out:
+	read_unlock(&tasklist_lock);
+	
+	return sid;
+}
+
+/*
+ * Determine if a process group is "orphaned", according to the POSIX
+ * definition in 2.2.2.52.  Orphaned process groups are not to be affected
+ * by terminal-generated stop signals.  Newly orphaned process groups are
+ * to receive a SIGHUP and a SIGCONT.
+ *
+ * "I ask you, have you ever known what it is to be an orphan?"
+ */
+static int will_become_orphaned_pgrp(int pgrp, task_t *ignored_task)
+{
+	struct task_struct *p;
+	int ret = 1;
+
+	do_each_task_pid(pgrp, PIDTYPE_PGID, p) {
+		if (p == ignored_task
+				|| p->exit_state
+				|| p->real_parent->pid == 1)
+			continue;
+		if (process_group(p->real_parent) != pgrp
+			    && p->real_parent->signal->session == p->signal->session) {
+			ret = 0;
+			break;
+		}
+	} while_each_task_pid(pgrp, PIDTYPE_PGID, p);
+	return ret;	/* (sighing) "Often!" */
+}
+
+int is_orphaned_pgrp(int pgrp)
+{
+	int retval;
+
+	read_lock(&tasklist_lock);
+	retval = will_become_orphaned_pgrp(pgrp, NULL);
+	read_unlock(&tasklist_lock);
+
+	return retval;
+}
+
+static inline int has_stopped_jobs(int pgrp)
+{
+	int retval = 0;
+	struct task_struct *p;
+
+	do_each_task_pid(pgrp, PIDTYPE_PGID, p) {
+		if (p->state != TASK_STOPPED)
+			continue;
+
+		/* If p is stopped by a debugger on a signal that won't
+		   stop it, then don't count p as stopped.  This isn't
+		   perfect but it's a good approximation.  */
+		if (unlikely (p->ptrace)
+		    && p->exit_code != SIGSTOP
+		    && p->exit_code != SIGTSTP
+		    && p->exit_code != SIGTTOU
+		    && p->exit_code != SIGTTIN)
+			continue;
+
+		retval = 1;
+		break;
+	} while_each_task_pid(pgrp, PIDTYPE_PGID, p);
+	return retval;
+}
+
+/**
+ * reparent_to_init() - Reparent the calling kernel thread to the init task.
+ *
+ * If a kernel thread is launched as a result of a system call, or if
+ * it ever exits, it should generally reparent itself to init so that
+ * it is correctly cleaned up on exit.
+ *
+ * The various task state such as scheduling policy and priority may have
+ * been inherited from a user process, so we reset them to sane values here.
+ *
+ * NOTE that reparent_to_init() gives the caller full capabilities.
+ */
+void reparent_to_init(void)
+{
+	write_lock_irq(&tasklist_lock);
+
+	ptrace_unlink(current);
+	/* Reparent to init */
+	REMOVE_LINKS(current);
+	current->parent = child_reaper;
+	current->real_parent = child_reaper;
+	SET_LINKS(current);
+
+	/* Set the exit signal to SIGCHLD so we signal init on exit */
+	current->exit_signal = SIGCHLD;
+
+	if ((current->policy == SCHED_NORMAL) && (task_nice(current) < 0))
+		set_user_nice(current, 0);
+	/* cpus_allowed? */
+	/* rt_priority? */
+	/* signals? */
+	security_task_reparent_to_init(current);
+	memcpy(current->signal->rlim, init_task.signal->rlim,
+	       sizeof(current->signal->rlim));
+	atomic_inc(&(INIT_USER->__count));
+	write_unlock_irq(&tasklist_lock);
+	switch_uid(INIT_USER);
+}
+
+void __set_special_pids(pid_t session, pid_t pgrp)
+{
+	struct task_struct *curr = current;
+
+	if (curr->signal->session != session) {
+		detach_pid(curr, PIDTYPE_SID);
+		curr->signal->session = session;
+		attach_pid(curr, PIDTYPE_SID, session);
+	}
+	if (process_group(curr) != pgrp) {
+		detach_pid(curr, PIDTYPE_PGID);
+		curr->signal->pgrp = pgrp;
+		attach_pid(curr, PIDTYPE_PGID, pgrp);
+	}
+}
+
+void set_special_pids(pid_t session, pid_t pgrp)
+{
+	write_lock_irq(&tasklist_lock);
+	__set_special_pids(session, pgrp);
+	write_unlock_irq(&tasklist_lock);
+}
+
+/*
+ * Let kernel threads use this to say that they
+ * allow a certain signal (since daemonize() will
+ * have disabled all of them by default).
+ */
+int allow_signal(int sig)
+{
+	if (sig < 1 || sig > _NSIG)
+		return -EINVAL;
+
+	spin_lock_irq(&current->sighand->siglock);
+	sigdelset(&current->blocked, sig);
+	if (!current->mm) {
+		/* Kernel threads handle their own signals.
+		   Let the signal code know it'll be handled, so
+		   that they don't get converted to SIGKILL or
+		   just silently dropped */
+		current->sighand->action[(sig)-1].sa.sa_handler = (void __user *)2;
+	}
+	recalc_sigpending();
+	spin_unlock_irq(&current->sighand->siglock);
+	return 0;
+}
+
+EXPORT_SYMBOL(allow_signal);
+
+int disallow_signal(int sig)
+{
+	if (sig < 1 || sig > _NSIG)
+		return -EINVAL;
+
+	spin_lock_irq(&current->sighand->siglock);
+	sigaddset(&current->blocked, sig);
+	recalc_sigpending();
+	spin_unlock_irq(&current->sighand->siglock);
+	return 0;
+}
+
+EXPORT_SYMBOL(disallow_signal);
+
+/*
+ *	Put all the gunge required to become a kernel thread without
+ *	attached user resources in one place where it belongs.
+ */
+
+void daemonize(const char *name, ...)
+{
+	va_list args;
+	struct fs_struct *fs;
+	sigset_t blocked;
+
+	va_start(args, name);
+	vsnprintf(current->comm, sizeof(current->comm), name, args);
+	va_end(args);
+
+	/*
+	 * If we were started as result of loading a module, close all of the
+	 * user space pages.  We don't need them, and if we didn't close them
+	 * they would be locked into memory.
+	 */
+	exit_mm(current);
+
+	set_special_pids(1, 1);
+	down(&tty_sem);
+	current->signal->tty = NULL;
+	up(&tty_sem);
+
+	/* Block and flush all signals */
+	sigfillset(&blocked);
+	sigprocmask(SIG_BLOCK, &blocked, NULL);
+	flush_signals(current);
+
+	/* Become as one with the init task */
+
+	exit_fs(current);	/* current->fs->count--; */
+	fs = init_task.fs;
+	current->fs = fs;
+	atomic_inc(&fs->count);
+ 	exit_files(current);
+	current->files = init_task.files;
+	atomic_inc(&current->files->count);
+
+	reparent_to_init();
+}
+
+EXPORT_SYMBOL(daemonize);
+
+static inline void close_files(struct files_struct * files)
+{
+	int i, j;
+
+	j = 0;
+	for (;;) {
+		unsigned long set;
+		i = j * __NFDBITS;
+		if (i >= files->max_fdset || i >= files->max_fds)
+			break;
+		set = files->open_fds->fds_bits[j++];
+		while (set) {
+			if (set & 1) {
+				struct file * file = xchg(&files->fd[i], NULL);
+				if (file)
+					filp_close(file, files);
+			}
+			i++;
+			set >>= 1;
+		}
+	}
+}
+
+struct files_struct *get_files_struct(struct task_struct *task)
+{
+	struct files_struct *files;
+
+	task_lock(task);
+	files = task->files;
+	if (files)
+		atomic_inc(&files->count);
+	task_unlock(task);
+
+	return files;
+}
+
+void fastcall put_files_struct(struct files_struct *files)
+{
+	if (atomic_dec_and_test(&files->count)) {
+		close_files(files);
+		/*
+		 * Free the fd and fdset arrays if we expanded them.
+		 */
+		if (files->fd != &files->fd_array[0])
+			free_fd_array(files->fd, files->max_fds);
+		if (files->max_fdset > __FD_SETSIZE) {
+			free_fdset(files->open_fds, files->max_fdset);
+			free_fdset(files->close_on_exec, files->max_fdset);
+		}
+		kmem_cache_free(files_cachep, files);
+	}
+}
+
+EXPORT_SYMBOL(put_files_struct);
+
+static inline void __exit_files(struct task_struct *tsk)
+{
+	struct files_struct * files = tsk->files;
+
+	if (files) {
+		task_lock(tsk);
+		tsk->files = NULL;
+		task_unlock(tsk);
+		put_files_struct(files);
+	}
+}
+
+void exit_files(struct task_struct *tsk)
+{
+	__exit_files(tsk);
+}
+
+static inline void __put_fs_struct(struct fs_struct *fs)
+{
+	/* No need to hold fs->lock if we are killing it */
+	if (atomic_dec_and_test(&fs->count)) {
+		dput(fs->root);
+		mntput(fs->rootmnt);
+		dput(fs->pwd);
+		mntput(fs->pwdmnt);
+		if (fs->altroot) {
+			dput(fs->altroot);
+			mntput(fs->altrootmnt);
+		}
+		kmem_cache_free(fs_cachep, fs);
+	}
+}
+
+void put_fs_struct(struct fs_struct *fs)
+{
+	__put_fs_struct(fs);
+}
+
+static inline void __exit_fs(struct task_struct *tsk)
+{
+	struct fs_struct * fs = tsk->fs;
+
+	if (fs) {
+		task_lock(tsk);
+		tsk->fs = NULL;
+		task_unlock(tsk);
+		__put_fs_struct(fs);
+	}
+}
+
+void exit_fs(struct task_struct *tsk)
+{
+	__exit_fs(tsk);
+}
+
+EXPORT_SYMBOL_GPL(exit_fs);
+
+/*
+ * Turn us into a lazy TLB process if we
+ * aren't already..
+ */
+void exit_mm(struct task_struct * tsk)
+{
+	struct mm_struct *mm = tsk->mm;
+
+	mm_release(tsk, mm);
+	if (!mm)
+		return;
+	/*
+	 * Serialize with any possible pending coredump.
+	 * We must hold mmap_sem around checking core_waiters
+	 * and clearing tsk->mm.  The core-inducing thread
+	 * will increment core_waiters for each thread in the
+	 * group with ->mm != NULL.
+	 */
+	down_read(&mm->mmap_sem);
+	if (mm->core_waiters) {
+		up_read(&mm->mmap_sem);
+		down_write(&mm->mmap_sem);
+		if (!--mm->core_waiters)
+			complete(mm->core_startup_done);
+		up_write(&mm->mmap_sem);
+
+		wait_for_completion(&mm->core_done);
+		down_read(&mm->mmap_sem);
+	}
+	atomic_inc(&mm->mm_count);
+	if (mm != tsk->active_mm) BUG();
+	/* more a memory barrier than a real lock */
+	task_lock(tsk);
+	tsk->mm = NULL;
+	up_read(&mm->mmap_sem);
+	enter_lazy_tlb(mm, current);
+	task_unlock(tsk);
+	mmput(mm);
+}
+
+static inline void choose_new_parent(task_t *p, task_t *reaper, task_t *child_reaper)
+{
+	/*
+	 * Make sure we're not reparenting to ourselves and that
+	 * the parent is not a zombie.
+	 */
+	BUG_ON(p == reaper || reaper->exit_state >= EXIT_ZOMBIE);
+	p->real_parent = reaper;
+	if (p->parent == p->real_parent)
+		BUG();
+}
+
+static inline void reparent_thread(task_t *p, task_t *father, int traced)
+{
+	/* We don't want people slaying init.  */
+	if (p->exit_signal != -1)
+		p->exit_signal = SIGCHLD;
+
+	if (p->pdeath_signal)
+		/* We already hold the tasklist_lock here.  */
+		group_send_sig_info(p->pdeath_signal, (void *) 0, p);
+
+	/* Move the child from its dying parent to the new one.  */
+	if (unlikely(traced)) {
+		/* Preserve ptrace links if someone else is tracing this child.  */
+		list_del_init(&p->ptrace_list);
+		if (p->parent != p->real_parent)
+			list_add(&p->ptrace_list, &p->real_parent->ptrace_children);
+	} else {
+		/* If this child is being traced, then we're the one tracing it
+		 * anyway, so let go of it.
+		 */
+		p->ptrace = 0;
+		list_del_init(&p->sibling);
+		p->parent = p->real_parent;
+		list_add_tail(&p->sibling, &p->parent->children);
+
+		/* If we'd notified the old parent about this child's death,
+		 * also notify the new parent.
+		 */
+		if (p->exit_state == EXIT_ZOMBIE && p->exit_signal != -1 &&
+		    thread_group_empty(p))
+			do_notify_parent(p, p->exit_signal);
+		else if (p->state == TASK_TRACED) {
+			/*
+			 * If it was at a trace stop, turn it into
+			 * a normal stop since it's no longer being
+			 * traced.
+			 */
+			ptrace_untrace(p);
+		}
+	}
+
+	/*
+	 * process group orphan check
+	 * Case ii: Our child is in a different pgrp
+	 * than we are, and it was the only connection
+	 * outside, so the child pgrp is now orphaned.
+	 */
+	if ((process_group(p) != process_group(father)) &&
+	    (p->signal->session == father->signal->session)) {
+		int pgrp = process_group(p);
+
+		if (will_become_orphaned_pgrp(pgrp, NULL) && has_stopped_jobs(pgrp)) {
+			__kill_pg_info(SIGHUP, (void *)1, pgrp);
+			__kill_pg_info(SIGCONT, (void *)1, pgrp);
+		}
+	}
+}
+
+/*
+ * When we die, we re-parent all our children.
+ * Try to give them to another thread in our thread
+ * group, and if no such member exists, give it to
+ * the global child reaper process (ie "init")
+ */
+static inline void forget_original_parent(struct task_struct * father,
+					  struct list_head *to_release)
+{
+	struct task_struct *p, *reaper = father;
+	struct list_head *_p, *_n;
+
+	do {
+		reaper = next_thread(reaper);
+		if (reaper == father) {
+			reaper = child_reaper;
+			break;
+		}
+	} while (reaper->exit_state);
+
+	/*
+	 * There are only two places where our children can be:
+	 *
+	 * - in our child list
+	 * - in our ptraced child list
+	 *
+	 * Search them and reparent children.
+	 */
+	list_for_each_safe(_p, _n, &father->children) {
+		int ptrace;
+		p = list_entry(_p,struct task_struct,sibling);
+
+		ptrace = p->ptrace;
+
+		/* if father isn't the real parent, then ptrace must be enabled */
+		BUG_ON(father != p->real_parent && !ptrace);
+
+		if (father == p->real_parent) {
+			/* reparent with a reaper, real father it's us */
+			choose_new_parent(p, reaper, child_reaper);
+			reparent_thread(p, father, 0);
+		} else {
+			/* reparent ptraced task to its real parent */
+			__ptrace_unlink (p);
+			if (p->exit_state == EXIT_ZOMBIE && p->exit_signal != -1 &&
+			    thread_group_empty(p))
+				do_notify_parent(p, p->exit_signal);
+		}
+
+		/*
+		 * if the ptraced child is a zombie with exit_signal == -1
+		 * we must collect it before we exit, or it will remain
+		 * zombie forever since we prevented it from self-reap itself
+		 * while it was being traced by us, to be able to see it in wait4.
+		 */
+		if (unlikely(ptrace && p->exit_state == EXIT_ZOMBIE && p->exit_signal == -1))
+			list_add(&p->ptrace_list, to_release);
+	}
+	list_for_each_safe(_p, _n, &father->ptrace_children) {
+		p = list_entry(_p,struct task_struct,ptrace_list);
+		choose_new_parent(p, reaper, child_reaper);
+		reparent_thread(p, father, 1);
+	}
+}
+
+/*
+ * Send signals to all our closest relatives so that they know
+ * to properly mourn us..
+ */
+static void exit_notify(struct task_struct *tsk)
+{
+	int state;
+	struct task_struct *t;
+	struct list_head ptrace_dead, *_p, *_n;
+
+	if (signal_pending(tsk) && !(tsk->signal->flags & SIGNAL_GROUP_EXIT)
+	    && !thread_group_empty(tsk)) {
+		/*
+		 * This occurs when there was a race between our exit
+		 * syscall and a group signal choosing us as the one to
+		 * wake up.  It could be that we are the only thread
+		 * alerted to check for pending signals, but another thread
+		 * should be woken now to take the signal since we will not.
+		 * Now we'll wake all the threads in the group just to make
+		 * sure someone gets all the pending signals.
+		 */
+		read_lock(&tasklist_lock);
+		spin_lock_irq(&tsk->sighand->siglock);
+		for (t = next_thread(tsk); t != tsk; t = next_thread(t))
+			if (!signal_pending(t) && !(t->flags & PF_EXITING)) {
+				recalc_sigpending_tsk(t);
+				if (signal_pending(t))
+					signal_wake_up(t, 0);
+			}
+		spin_unlock_irq(&tsk->sighand->siglock);
+		read_unlock(&tasklist_lock);
+	}
+
+	write_lock_irq(&tasklist_lock);
+
+	/*
+	 * This does two things:
+	 *
+  	 * A.  Make init inherit all the child processes
+	 * B.  Check to see if any process groups have become orphaned
+	 *	as a result of our exiting, and if they have any stopped
+	 *	jobs, send them a SIGHUP and then a SIGCONT.  (POSIX 3.2.2.2)
+	 */
+
+	INIT_LIST_HEAD(&ptrace_dead);
+	forget_original_parent(tsk, &ptrace_dead);
+	BUG_ON(!list_empty(&tsk->children));
+	BUG_ON(!list_empty(&tsk->ptrace_children));
+
+	/*
+	 * Check to see if any process groups have become orphaned
+	 * as a result of our exiting, and if they have any stopped
+	 * jobs, send them a SIGHUP and then a SIGCONT.  (POSIX 3.2.2.2)
+	 *
+	 * Case i: Our father is in a different pgrp than we are
+	 * and we were the only connection outside, so our pgrp
+	 * is about to become orphaned.
+	 */
+	 
+	t = tsk->real_parent;
+	
+	if ((process_group(t) != process_group(tsk)) &&
+	    (t->signal->session == tsk->signal->session) &&
+	    will_become_orphaned_pgrp(process_group(tsk), tsk) &&
+	    has_stopped_jobs(process_group(tsk))) {
+		__kill_pg_info(SIGHUP, (void *)1, process_group(tsk));
+		__kill_pg_info(SIGCONT, (void *)1, process_group(tsk));
+	}
+
+	/* Let father know we died 
+	 *
+	 * Thread signals are configurable, but you aren't going to use
+	 * that to send signals to arbitary processes. 
+	 * That stops right now.
+	 *
+	 * If the parent exec id doesn't match the exec id we saved
+	 * when we started then we know the parent has changed security
+	 * domain.
+	 *
+	 * If our self_exec id doesn't match our parent_exec_id then
+	 * we have changed execution domain as these two values started
+	 * the same after a fork.
+	 *	
+	 */
+	
+	if (tsk->exit_signal != SIGCHLD && tsk->exit_signal != -1 &&
+	    ( tsk->parent_exec_id != t->self_exec_id  ||
+	      tsk->self_exec_id != tsk->parent_exec_id)
+	    && !capable(CAP_KILL))
+		tsk->exit_signal = SIGCHLD;
+
+
+	/* If something other than our normal parent is ptracing us, then
+	 * send it a SIGCHLD instead of honoring exit_signal.  exit_signal
+	 * only has special meaning to our real parent.
+	 */
+	if (tsk->exit_signal != -1 && thread_group_empty(tsk)) {
+		int signal = tsk->parent == tsk->real_parent ? tsk->exit_signal : SIGCHLD;
+		do_notify_parent(tsk, signal);
+	} else if (tsk->ptrace) {
+		do_notify_parent(tsk, SIGCHLD);
+	}
+
+	state = EXIT_ZOMBIE;
+	if (tsk->exit_signal == -1 &&
+	    (likely(tsk->ptrace == 0) ||
+	     unlikely(tsk->parent->signal->flags & SIGNAL_GROUP_EXIT)))
+		state = EXIT_DEAD;
+	tsk->exit_state = state;
+
+	write_unlock_irq(&tasklist_lock);
+
+	list_for_each_safe(_p, _n, &ptrace_dead) {
+		list_del_init(_p);
+		t = list_entry(_p,struct task_struct,ptrace_list);
+		release_task(t);
+	}
+
+	/* If the process is dead, release it - nobody will wait for it */
+	if (state == EXIT_DEAD)
+		release_task(tsk);
+
+	/* PF_DEAD causes final put_task_struct after we schedule. */
+	preempt_disable();
+	tsk->flags |= PF_DEAD;
+}
+
+fastcall NORET_TYPE void do_exit(long code)
+{
+	struct task_struct *tsk = current;
+	int group_dead;
+
+	profile_task_exit(tsk);
+
+	if (unlikely(in_interrupt()))
+		panic("Aiee, killing interrupt handler!");
+	if (unlikely(!tsk->pid))
+		panic("Attempted to kill the idle task!");
+	if (unlikely(tsk->pid == 1))
+		panic("Attempted to kill init!");
+	if (tsk->io_context)
+		exit_io_context();
+
+	if (unlikely(current->ptrace & PT_TRACE_EXIT)) {
+		current->ptrace_message = code;
+		ptrace_notify((PTRACE_EVENT_EXIT << 8) | SIGTRAP);
+	}
+
+	tsk->flags |= PF_EXITING;
+
+	/*
+	 * Make sure we don't try to process any timer firings
+	 * while we are already exiting.
+	 */
+ 	tsk->it_virt_expires = cputime_zero;
+ 	tsk->it_prof_expires = cputime_zero;
+	tsk->it_sched_expires = 0;
+
+	if (unlikely(in_atomic()))
+		printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",
+				current->comm, current->pid,
+				preempt_count());
+
+	acct_update_integrals(tsk);
+	update_mem_hiwater(tsk);
+	group_dead = atomic_dec_and_test(&tsk->signal->live);
+	if (group_dead) {
+ 		del_timer_sync(&tsk->signal->real_timer);
+		acct_process(code);
+	}
+	exit_mm(tsk);
+
+	exit_sem(tsk);
+	__exit_files(tsk);
+	__exit_fs(tsk);
+	exit_namespace(tsk);
+	exit_thread();
+	cpuset_exit(tsk);
+	exit_keys(tsk);
+
+	if (group_dead && tsk->signal->leader)
+		disassociate_ctty(1);
+
+	module_put(tsk->thread_info->exec_domain->module);
+	if (tsk->binfmt)
+		module_put(tsk->binfmt->module);
+
+	tsk->exit_code = code;
+	exit_notify(tsk);
+#ifdef CONFIG_NUMA
+	mpol_free(tsk->mempolicy);
+	tsk->mempolicy = NULL;
+#endif
+
+	BUG_ON(!(current->flags & PF_DEAD));
+	schedule();
+	BUG();
+	/* Avoid "noreturn function does return".  */
+	for (;;) ;
+}
+
+NORET_TYPE void complete_and_exit(struct completion *comp, long code)
+{
+	if (comp)
+		complete(comp);
+	
+	do_exit(code);
+}
+
+EXPORT_SYMBOL(complete_and_exit);
+
+asmlinkage long sys_exit(int error_code)
+{
+	do_exit((error_code&0xff)<<8);
+}
+
+task_t fastcall *next_thread(const task_t *p)
+{
+	return pid_task(p->pids[PIDTYPE_TGID].pid_list.next, PIDTYPE_TGID);
+}
+
+EXPORT_SYMBOL(next_thread);
+
+/*
+ * Take down every thread in the group.  This is called by fatal signals
+ * as well as by sys_exit_group (below).
+ */
+NORET_TYPE void
+do_group_exit(int exit_code)
+{
+	BUG_ON(exit_code & 0x80); /* core dumps don't get here */
+
+	if (current->signal->flags & SIGNAL_GROUP_EXIT)
+		exit_code = current->signal->group_exit_code;
+	else if (!thread_group_empty(current)) {
+		struct signal_struct *const sig = current->signal;
+		struct sighand_struct *const sighand = current->sighand;
+		read_lock(&tasklist_lock);
+		spin_lock_irq(&sighand->siglock);
+		if (sig->flags & SIGNAL_GROUP_EXIT)
+			/* Another thread got here before we took the lock.  */
+			exit_code = sig->group_exit_code;
+		else {
+			sig->flags = SIGNAL_GROUP_EXIT;
+			sig->group_exit_code = exit_code;
+			zap_other_threads(current);
+		}
+		spin_unlock_irq(&sighand->siglock);
+		read_unlock(&tasklist_lock);
+	}
+
+	do_exit(exit_code);
+	/* NOTREACHED */
+}
+
+/*
+ * this kills every thread in the thread group. Note that any externally
+ * wait4()-ing process will get the correct exit code - even if this
+ * thread is not the thread group leader.
+ */
+asmlinkage void sys_exit_group(int error_code)
+{
+	do_group_exit((error_code & 0xff) << 8);
+}
+
+static int eligible_child(pid_t pid, int options, task_t *p)
+{
+	if (pid > 0) {
+		if (p->pid != pid)
+			return 0;
+	} else if (!pid) {
+		if (process_group(p) != process_group(current))
+			return 0;
+	} else if (pid != -1) {
+		if (process_group(p) != -pid)
+			return 0;
+	}
+
+	/*
+	 * Do not consider detached threads that are
+	 * not ptraced:
+	 */
+	if (p->exit_signal == -1 && !p->ptrace)
+		return 0;
+
+	/* Wait for all children (clone and not) if __WALL is set;
+	 * otherwise, wait for clone children *only* if __WCLONE is
+	 * set; otherwise, wait for non-clone children *only*.  (Note:
+	 * A "clone" child here is one that reports to its parent
+	 * using a signal other than SIGCHLD.) */
+	if (((p->exit_signal != SIGCHLD) ^ ((options & __WCLONE) != 0))
+	    && !(options & __WALL))
+		return 0;
+	/*
+	 * Do not consider thread group leaders that are
+	 * in a non-empty thread group:
+	 */
+	if (current->tgid != p->tgid && delay_group_leader(p))
+		return 2;
+
+	if (security_task_wait(p))
+		return 0;
+
+	return 1;
+}
+
+static int wait_noreap_copyout(task_t *p, pid_t pid, uid_t uid,
+			       int why, int status,
+			       struct siginfo __user *infop,
+			       struct rusage __user *rusagep)
+{
+	int retval = rusagep ? getrusage(p, RUSAGE_BOTH, rusagep) : 0;
+	put_task_struct(p);
+	if (!retval)
+		retval = put_user(SIGCHLD, &infop->si_signo);
+	if (!retval)
+		retval = put_user(0, &infop->si_errno);
+	if (!retval)
+		retval = put_user((short)why, &infop->si_code);
+	if (!retval)
+		retval = put_user(pid, &infop->si_pid);
+	if (!retval)
+		retval = put_user(uid, &infop->si_uid);
+	if (!retval)
+		retval = put_user(status, &infop->si_status);
+	if (!retval)
+		retval = pid;
+	return retval;
+}
+
+/*
+ * Handle sys_wait4 work for one task in state EXIT_ZOMBIE.  We hold
+ * read_lock(&tasklist_lock) on entry.  If we return zero, we still hold
+ * the lock and this task is uninteresting.  If we return nonzero, we have
+ * released the lock and the system call should return.
+ */
+static int wait_task_zombie(task_t *p, int noreap,
+			    struct siginfo __user *infop,
+			    int __user *stat_addr, struct rusage __user *ru)
+{
+	unsigned long state;
+	int retval;
+	int status;
+
+	if (unlikely(noreap)) {
+		pid_t pid = p->pid;
+		uid_t uid = p->uid;
+		int exit_code = p->exit_code;
+		int why, status;
+
+		if (unlikely(p->exit_state != EXIT_ZOMBIE))
+			return 0;
+		if (unlikely(p->exit_signal == -1 && p->ptrace == 0))
+			return 0;
+		get_task_struct(p);
+		read_unlock(&tasklist_lock);
+		if ((exit_code & 0x7f) == 0) {
+			why = CLD_EXITED;
+			status = exit_code >> 8;
+		} else {
+			why = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED;
+			status = exit_code & 0x7f;
+		}
+		return wait_noreap_copyout(p, pid, uid, why,
+					   status, infop, ru);
+	}
+
+	/*
+	 * Try to move the task's state to DEAD
+	 * only one thread is allowed to do this:
+	 */
+	state = xchg(&p->exit_state, EXIT_DEAD);
+	if (state != EXIT_ZOMBIE) {
+		BUG_ON(state != EXIT_DEAD);
+		return 0;
+	}
+	if (unlikely(p->exit_signal == -1 && p->ptrace == 0)) {
+		/*
+		 * This can only happen in a race with a ptraced thread
+		 * dying on another processor.
+		 */
+		return 0;
+	}
+
+	if (likely(p->real_parent == p->parent) && likely(p->signal)) {
+		/*
+		 * The resource counters for the group leader are in its
+		 * own task_struct.  Those for dead threads in the group
+		 * are in its signal_struct, as are those for the child
+		 * processes it has previously reaped.  All these
+		 * accumulate in the parent's signal_struct c* fields.
+		 *
+		 * We don't bother to take a lock here to protect these
+		 * p->signal fields, because they are only touched by
+		 * __exit_signal, which runs with tasklist_lock
+		 * write-locked anyway, and so is excluded here.  We do
+		 * need to protect the access to p->parent->signal fields,
+		 * as other threads in the parent group can be right
+		 * here reaping other children at the same time.
+		 */
+		spin_lock_irq(&p->parent->sighand->siglock);
+		p->parent->signal->cutime =
+			cputime_add(p->parent->signal->cutime,
+			cputime_add(p->utime,
+			cputime_add(p->signal->utime,
+				    p->signal->cutime)));
+		p->parent->signal->cstime =
+			cputime_add(p->parent->signal->cstime,
+			cputime_add(p->stime,
+			cputime_add(p->signal->stime,
+				    p->signal->cstime)));
+		p->parent->signal->cmin_flt +=
+			p->min_flt + p->signal->min_flt + p->signal->cmin_flt;
+		p->parent->signal->cmaj_flt +=
+			p->maj_flt + p->signal->maj_flt + p->signal->cmaj_flt;
+		p->parent->signal->cnvcsw +=
+			p->nvcsw + p->signal->nvcsw + p->signal->cnvcsw;
+		p->parent->signal->cnivcsw +=
+			p->nivcsw + p->signal->nivcsw + p->signal->cnivcsw;
+		spin_unlock_irq(&p->parent->sighand->siglock);
+	}
+
+	/*
+	 * Now we are sure this task is interesting, and no other
+	 * thread can reap it because we set its state to EXIT_DEAD.
+	 */
+	read_unlock(&tasklist_lock);
+
+	retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0;
+	status = (p->signal->flags & SIGNAL_GROUP_EXIT)
+		? p->signal->group_exit_code : p->exit_code;
+	if (!retval && stat_addr)
+		retval = put_user(status, stat_addr);
+	if (!retval && infop)
+		retval = put_user(SIGCHLD, &infop->si_signo);
+	if (!retval && infop)
+		retval = put_user(0, &infop->si_errno);
+	if (!retval && infop) {
+		int why;
+
+		if ((status & 0x7f) == 0) {
+			why = CLD_EXITED;
+			status >>= 8;
+		} else {
+			why = (status & 0x80) ? CLD_DUMPED : CLD_KILLED;
+			status &= 0x7f;
+		}
+		retval = put_user((short)why, &infop->si_code);
+		if (!retval)
+			retval = put_user(status, &infop->si_status);
+	}
+	if (!retval && infop)
+		retval = put_user(p->pid, &infop->si_pid);
+	if (!retval && infop)
+		retval = put_user(p->uid, &infop->si_uid);
+	if (retval) {
+		// TODO: is this safe?
+		p->exit_state = EXIT_ZOMBIE;
+		return retval;
+	}
+	retval = p->pid;
+	if (p->real_parent != p->parent) {
+		write_lock_irq(&tasklist_lock);
+		/* Double-check with lock held.  */
+		if (p->real_parent != p->parent) {
+			__ptrace_unlink(p);
+			// TODO: is this safe?
+			p->exit_state = EXIT_ZOMBIE;
+			/*
+			 * If this is not a detached task, notify the parent.
+			 * If it's still not detached after that, don't release
+			 * it now.
+			 */
+			if (p->exit_signal != -1) {
+				do_notify_parent(p, p->exit_signal);
+				if (p->exit_signal != -1)
+					p = NULL;
+			}
+		}
+		write_unlock_irq(&tasklist_lock);
+	}
+	if (p != NULL)
+		release_task(p);
+	BUG_ON(!retval);
+	return retval;
+}
+
+/*
+ * Handle sys_wait4 work for one task in state TASK_STOPPED.  We hold
+ * read_lock(&tasklist_lock) on entry.  If we return zero, we still hold
+ * the lock and this task is uninteresting.  If we return nonzero, we have
+ * released the lock and the system call should return.
+ */
+static int wait_task_stopped(task_t *p, int delayed_group_leader, int noreap,
+			     struct siginfo __user *infop,
+			     int __user *stat_addr, struct rusage __user *ru)
+{
+	int retval, exit_code;
+
+	if (!p->exit_code)
+		return 0;
+	if (delayed_group_leader && !(p->ptrace & PT_PTRACED) &&
+	    p->signal && p->signal->group_stop_count > 0)
+		/*
+		 * A group stop is in progress and this is the group leader.
+		 * We won't report until all threads have stopped.
+		 */
+		return 0;
+
+	/*
+	 * Now we are pretty sure this task is interesting.
+	 * Make sure it doesn't get reaped out from under us while we
+	 * give up the lock and then examine it below.  We don't want to
+	 * keep holding onto the tasklist_lock while we call getrusage and
+	 * possibly take page faults for user memory.
+	 */
+	get_task_struct(p);
+	read_unlock(&tasklist_lock);
+
+	if (unlikely(noreap)) {
+		pid_t pid = p->pid;
+		uid_t uid = p->uid;
+		int why = (p->ptrace & PT_PTRACED) ? CLD_TRAPPED : CLD_STOPPED;
+
+		exit_code = p->exit_code;
+		if (unlikely(!exit_code) ||
+		    unlikely(p->state > TASK_STOPPED))
+			goto bail_ref;
+		return wait_noreap_copyout(p, pid, uid,
+					   why, (exit_code << 8) | 0x7f,
+					   infop, ru);
+	}
+
+	write_lock_irq(&tasklist_lock);
+
+	/*
+	 * This uses xchg to be atomic with the thread resuming and setting
+	 * it.  It must also be done with the write lock held to prevent a
+	 * race with the EXIT_ZOMBIE case.
+	 */
+	exit_code = xchg(&p->exit_code, 0);
+	if (unlikely(p->exit_state)) {
+		/*
+		 * The task resumed and then died.  Let the next iteration
+		 * catch it in EXIT_ZOMBIE.  Note that exit_code might
+		 * already be zero here if it resumed and did _exit(0).
+		 * The task itself is dead and won't touch exit_code again;
+		 * other processors in this function are locked out.
+		 */
+		p->exit_code = exit_code;
+		exit_code = 0;
+	}
+	if (unlikely(exit_code == 0)) {
+		/*
+		 * Another thread in this function got to it first, or it
+		 * resumed, or it resumed and then died.
+		 */
+		write_unlock_irq(&tasklist_lock);
+bail_ref:
+		put_task_struct(p);
+		/*
+		 * We are returning to the wait loop without having successfully
+		 * removed the process and having released the lock. We cannot
+		 * continue, since the "p" task pointer is potentially stale.
+		 *
+		 * Return -EAGAIN, and do_wait() will restart the loop from the
+		 * beginning. Do _not_ re-acquire the lock.
+		 */
+		return -EAGAIN;
+	}
+
+	/* move to end of parent's list to avoid starvation */
+	remove_parent(p);
+	add_parent(p, p->parent);
+
+	write_unlock_irq(&tasklist_lock);
+
+	retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0;
+	if (!retval && stat_addr)
+		retval = put_user((exit_code << 8) | 0x7f, stat_addr);
+	if (!retval && infop)
+		retval = put_user(SIGCHLD, &infop->si_signo);
+	if (!retval && infop)
+		retval = put_user(0, &infop->si_errno);
+	if (!retval && infop)
+		retval = put_user((short)((p->ptrace & PT_PTRACED)
+					  ? CLD_TRAPPED : CLD_STOPPED),
+				  &infop->si_code);
+	if (!retval && infop)
+		retval = put_user(exit_code, &infop->si_status);
+	if (!retval && infop)
+		retval = put_user(p->pid, &infop->si_pid);
+	if (!retval && infop)
+		retval = put_user(p->uid, &infop->si_uid);
+	if (!retval)
+		retval = p->pid;
+	put_task_struct(p);
+
+	BUG_ON(!retval);
+	return retval;
+}
+
+/*
+ * Handle do_wait work for one task in a live, non-stopped state.
+ * read_lock(&tasklist_lock) on entry.  If we return zero, we still hold
+ * the lock and this task is uninteresting.  If we return nonzero, we have
+ * released the lock and the system call should return.
+ */
+static int wait_task_continued(task_t *p, int noreap,
+			       struct siginfo __user *infop,
+			       int __user *stat_addr, struct rusage __user *ru)
+{
+	int retval;
+	pid_t pid;
+	uid_t uid;
+
+	if (unlikely(!p->signal))
+		return 0;
+
+	if (!(p->signal->flags & SIGNAL_STOP_CONTINUED))
+		return 0;
+
+	spin_lock_irq(&p->sighand->siglock);
+	/* Re-check with the lock held.  */
+	if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) {
+		spin_unlock_irq(&p->sighand->siglock);
+		return 0;
+	}
+	if (!noreap)
+		p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
+	spin_unlock_irq(&p->sighand->siglock);
+
+	pid = p->pid;
+	uid = p->uid;
+	get_task_struct(p);
+	read_unlock(&tasklist_lock);
+
+	if (!infop) {
+		retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0;
+		put_task_struct(p);
+		if (!retval && stat_addr)
+			retval = put_user(0xffff, stat_addr);
+		if (!retval)
+			retval = p->pid;
+	} else {
+		retval = wait_noreap_copyout(p, pid, uid,
+					     CLD_CONTINUED, SIGCONT,
+					     infop, ru);
+		BUG_ON(retval == 0);
+	}
+
+	return retval;
+}
+
+
+static inline int my_ptrace_child(struct task_struct *p)
+{
+	if (!(p->ptrace & PT_PTRACED))
+		return 0;
+	if (!(p->ptrace & PT_ATTACHED))
+		return 1;
+	/*
+	 * This child was PTRACE_ATTACH'd.  We should be seeing it only if
+	 * we are the attacher.  If we are the real parent, this is a race
+	 * inside ptrace_attach.  It is waiting for the tasklist_lock,
+	 * which we have to switch the parent links, but has already set
+	 * the flags in p->ptrace.
+	 */
+	return (p->parent != p->real_parent);
+}
+
+static long do_wait(pid_t pid, int options, struct siginfo __user *infop,
+		    int __user *stat_addr, struct rusage __user *ru)
+{
+	DECLARE_WAITQUEUE(wait, current);
+	struct task_struct *tsk;
+	int flag, retval;
+
+	add_wait_queue(&current->signal->wait_chldexit,&wait);
+repeat:
+	/*
+	 * We will set this flag if we see any child that might later
+	 * match our criteria, even if we are not able to reap it yet.
+	 */
+	flag = 0;
+	current->state = TASK_INTERRUPTIBLE;
+	read_lock(&tasklist_lock);
+	tsk = current;
+	do {
+		struct task_struct *p;
+		struct list_head *_p;
+		int ret;
+
+		list_for_each(_p,&tsk->children) {
+			p = list_entry(_p,struct task_struct,sibling);
+
+			ret = eligible_child(pid, options, p);
+			if (!ret)
+				continue;
+
+			switch (p->state) {
+			case TASK_TRACED:
+				if (!my_ptrace_child(p))
+					continue;
+				/*FALLTHROUGH*/
+			case TASK_STOPPED:
+				/*
+				 * It's stopped now, so it might later
+				 * continue, exit, or stop again.
+				 */
+				flag = 1;
+				if (!(options & WUNTRACED) &&
+				    !my_ptrace_child(p))
+					continue;
+				retval = wait_task_stopped(p, ret == 2,
+							   (options & WNOWAIT),
+							   infop,
+							   stat_addr, ru);
+				if (retval == -EAGAIN)
+					goto repeat;
+				if (retval != 0) /* He released the lock.  */
+					goto end;
+				break;
+			default:
+			// case EXIT_DEAD:
+				if (p->exit_state == EXIT_DEAD)
+					continue;
+			// case EXIT_ZOMBIE:
+				if (p->exit_state == EXIT_ZOMBIE) {
+					/*
+					 * Eligible but we cannot release
+					 * it yet:
+					 */
+					if (ret == 2)
+						goto check_continued;
+					if (!likely(options & WEXITED))
+						continue;
+					retval = wait_task_zombie(
+						p, (options & WNOWAIT),
+						infop, stat_addr, ru);
+					/* He released the lock.  */
+					if (retval != 0)
+						goto end;
+					break;
+				}
+check_continued:
+				/*
+				 * It's running now, so it might later
+				 * exit, stop, or stop and then continue.
+				 */
+				flag = 1;
+				if (!unlikely(options & WCONTINUED))
+					continue;
+				retval = wait_task_continued(
+					p, (options & WNOWAIT),
+					infop, stat_addr, ru);
+				if (retval != 0) /* He released the lock.  */
+					goto end;
+				break;
+			}
+		}
+		if (!flag) {
+			list_for_each(_p, &tsk->ptrace_children) {
+				p = list_entry(_p, struct task_struct,
+						ptrace_list);
+				if (!eligible_child(pid, options, p))
+					continue;
+				flag = 1;
+				break;
+			}
+		}
+		if (options & __WNOTHREAD)
+			break;
+		tsk = next_thread(tsk);
+		if (tsk->signal != current->signal)
+			BUG();
+	} while (tsk != current);
+
+	read_unlock(&tasklist_lock);
+	if (flag) {
+		retval = 0;
+		if (options & WNOHANG)
+			goto end;
+		retval = -ERESTARTSYS;
+		if (signal_pending(current))
+			goto end;
+		schedule();
+		goto repeat;
+	}
+	retval = -ECHILD;
+end:
+	current->state = TASK_RUNNING;
+	remove_wait_queue(&current->signal->wait_chldexit,&wait);
+	if (infop) {
+		if (retval > 0)
+		retval = 0;
+		else {
+			/*
+			 * For a WNOHANG return, clear out all the fields
+			 * we would set so the user can easily tell the
+			 * difference.
+			 */
+			if (!retval)
+				retval = put_user(0, &infop->si_signo);
+			if (!retval)
+				retval = put_user(0, &infop->si_errno);
+			if (!retval)
+				retval = put_user(0, &infop->si_code);
+			if (!retval)
+				retval = put_user(0, &infop->si_pid);
+			if (!retval)
+				retval = put_user(0, &infop->si_uid);
+			if (!retval)
+				retval = put_user(0, &infop->si_status);
+		}
+	}
+	return retval;
+}
+
+asmlinkage long sys_waitid(int which, pid_t pid,
+			   struct siginfo __user *infop, int options,
+			   struct rusage __user *ru)
+{
+	long ret;
+
+	if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED))
+		return -EINVAL;
+	if (!(options & (WEXITED|WSTOPPED|WCONTINUED)))
+		return -EINVAL;
+
+	switch (which) {
+	case P_ALL:
+		pid = -1;
+		break;
+	case P_PID:
+		if (pid <= 0)
+			return -EINVAL;
+		break;
+	case P_PGID:
+		if (pid <= 0)
+			return -EINVAL;
+		pid = -pid;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	ret = do_wait(pid, options, infop, NULL, ru);
+
+	/* avoid REGPARM breakage on x86: */
+	prevent_tail_call(ret);
+	return ret;
+}
+
+asmlinkage long sys_wait4(pid_t pid, int __user *stat_addr,
+			  int options, struct rusage __user *ru)
+{
+	long ret;
+
+	if (options & ~(WNOHANG|WUNTRACED|WCONTINUED|
+			__WNOTHREAD|__WCLONE|__WALL))
+		return -EINVAL;
+	ret = do_wait(pid, options | WEXITED, NULL, stat_addr, ru);
+
+	/* avoid REGPARM breakage on x86: */
+	prevent_tail_call(ret);
+	return ret;
+}
+
+#ifdef __ARCH_WANT_SYS_WAITPID
+
+/*
+ * sys_waitpid() remains for compatibility. waitpid() should be
+ * implemented by calling sys_wait4() from libc.a.
+ */
+asmlinkage long sys_waitpid(pid_t pid, int __user *stat_addr, int options)
+{
+	return sys_wait4(pid, stat_addr, options, NULL);
+}
+
+#endif
diff --git a/kernel/extable.c b/kernel/extable.c
new file mode 100644
index 000000000000..7501b531ceed
--- /dev/null
+++ b/kernel/extable.c
@@ -0,0 +1,67 @@
+/* Rewritten by Rusty Russell, on the backs of many others...
+   Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM.
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+*/
+#include <linux/module.h>
+#include <linux/init.h>
+#include <asm/uaccess.h>
+#include <asm/sections.h>
+
+extern struct exception_table_entry __start___ex_table[];
+extern struct exception_table_entry __stop___ex_table[];
+
+/* Sort the kernel's built-in exception table */
+void __init sort_main_extable(void)
+{
+	sort_extable(__start___ex_table, __stop___ex_table);
+}
+
+/* Given an address, look for it in the exception tables. */
+const struct exception_table_entry *search_exception_tables(unsigned long addr)
+{
+	const struct exception_table_entry *e;
+
+	e = search_extable(__start___ex_table, __stop___ex_table-1, addr);
+	if (!e)
+		e = search_module_extables(addr);
+	return e;
+}
+
+static int core_kernel_text(unsigned long addr)
+{
+	if (addr >= (unsigned long)_stext &&
+	    addr <= (unsigned long)_etext)
+		return 1;
+
+	if (addr >= (unsigned long)_sinittext &&
+	    addr <= (unsigned long)_einittext)
+		return 1;
+	return 0;
+}
+
+int __kernel_text_address(unsigned long addr)
+{
+	if (core_kernel_text(addr))
+		return 1;
+	return __module_text_address(addr) != NULL;
+}
+
+int kernel_text_address(unsigned long addr)
+{
+	if (core_kernel_text(addr))
+		return 1;
+	return module_text_address(addr) != NULL;
+}
diff --git a/kernel/fork.c b/kernel/fork.c
new file mode 100644
index 000000000000..f42a17f88699
--- /dev/null
+++ b/kernel/fork.c
@@ -0,0 +1,1274 @@
+/*
+ *  linux/kernel/fork.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ */
+
+/*
+ *  'fork.c' contains the help-routines for the 'fork' system call
+ * (see also entry.S and others).
+ * Fork is rather simple, once you get the hang of it, but the memory
+ * management can be a bitch. See 'mm/memory.c': 'copy_page_range()'
+ */
+
+#include <linux/config.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/unistd.h>
+#include <linux/smp_lock.h>
+#include <linux/module.h>
+#include <linux/vmalloc.h>
+#include <linux/completion.h>
+#include <linux/namespace.h>
+#include <linux/personality.h>
+#include <linux/mempolicy.h>
+#include <linux/sem.h>
+#include <linux/file.h>
+#include <linux/key.h>
+#include <linux/binfmts.h>
+#include <linux/mman.h>
+#include <linux/fs.h>
+#include <linux/cpu.h>
+#include <linux/cpuset.h>
+#include <linux/security.h>
+#include <linux/swap.h>
+#include <linux/syscalls.h>
+#include <linux/jiffies.h>
+#include <linux/futex.h>
+#include <linux/ptrace.h>
+#include <linux/mount.h>
+#include <linux/audit.h>
+#include <linux/profile.h>
+#include <linux/rmap.h>
+#include <linux/acct.h>
+
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/uaccess.h>
+#include <asm/mmu_context.h>
+#include <asm/cacheflush.h>
+#include <asm/tlbflush.h>
+
+/*
+ * Protected counters by write_lock_irq(&tasklist_lock)
+ */
+unsigned long total_forks;	/* Handle normal Linux uptimes. */
+int nr_threads; 		/* The idle threads do not count.. */
+
+int max_threads;		/* tunable limit on nr_threads */
+
+DEFINE_PER_CPU(unsigned long, process_counts) = 0;
+
+ __cacheline_aligned DEFINE_RWLOCK(tasklist_lock);  /* outer */
+
+EXPORT_SYMBOL(tasklist_lock);
+
+int nr_processes(void)
+{
+	int cpu;
+	int total = 0;
+
+	for_each_online_cpu(cpu)
+		total += per_cpu(process_counts, cpu);
+
+	return total;
+}
+
+#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
+# define alloc_task_struct()	kmem_cache_alloc(task_struct_cachep, GFP_KERNEL)
+# define free_task_struct(tsk)	kmem_cache_free(task_struct_cachep, (tsk))
+static kmem_cache_t *task_struct_cachep;
+#endif
+
+/* SLAB cache for signal_struct structures (tsk->signal) */
+kmem_cache_t *signal_cachep;
+
+/* SLAB cache for sighand_struct structures (tsk->sighand) */
+kmem_cache_t *sighand_cachep;
+
+/* SLAB cache for files_struct structures (tsk->files) */
+kmem_cache_t *files_cachep;
+
+/* SLAB cache for fs_struct structures (tsk->fs) */
+kmem_cache_t *fs_cachep;
+
+/* SLAB cache for vm_area_struct structures */
+kmem_cache_t *vm_area_cachep;
+
+/* SLAB cache for mm_struct structures (tsk->mm) */
+static kmem_cache_t *mm_cachep;
+
+void free_task(struct task_struct *tsk)
+{
+	free_thread_info(tsk->thread_info);
+	free_task_struct(tsk);
+}
+EXPORT_SYMBOL(free_task);
+
+void __put_task_struct(struct task_struct *tsk)
+{
+	WARN_ON(!(tsk->exit_state & (EXIT_DEAD | EXIT_ZOMBIE)));
+	WARN_ON(atomic_read(&tsk->usage));
+	WARN_ON(tsk == current);
+
+	if (unlikely(tsk->audit_context))
+		audit_free(tsk);
+	security_task_free(tsk);
+	free_uid(tsk->user);
+	put_group_info(tsk->group_info);
+
+	if (!profile_handoff_task(tsk))
+		free_task(tsk);
+}
+
+void __init fork_init(unsigned long mempages)
+{
+#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
+#ifndef ARCH_MIN_TASKALIGN
+#define ARCH_MIN_TASKALIGN	L1_CACHE_BYTES
+#endif
+	/* create a slab on which task_structs can be allocated */
+	task_struct_cachep =
+		kmem_cache_create("task_struct", sizeof(struct task_struct),
+			ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL, NULL);
+#endif
+
+	/*
+	 * The default maximum number of threads is set to a safe
+	 * value: the thread structures can take up at most half
+	 * of memory.
+	 */
+	max_threads = mempages / (8 * THREAD_SIZE / PAGE_SIZE);
+
+	/*
+	 * we need to allow at least 20 threads to boot a system
+	 */
+	if(max_threads < 20)
+		max_threads = 20;
+
+	init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
+	init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
+	init_task.signal->rlim[RLIMIT_SIGPENDING] =
+		init_task.signal->rlim[RLIMIT_NPROC];
+}
+
+static struct task_struct *dup_task_struct(struct task_struct *orig)
+{
+	struct task_struct *tsk;
+	struct thread_info *ti;
+
+	prepare_to_copy(orig);
+
+	tsk = alloc_task_struct();
+	if (!tsk)
+		return NULL;
+
+	ti = alloc_thread_info(tsk);
+	if (!ti) {
+		free_task_struct(tsk);
+		return NULL;
+	}
+
+	*ti = *orig->thread_info;
+	*tsk = *orig;
+	tsk->thread_info = ti;
+	ti->task = tsk;
+
+	/* One for us, one for whoever does the "release_task()" (usually parent) */
+	atomic_set(&tsk->usage,2);
+	return tsk;
+}
+
+#ifdef CONFIG_MMU
+static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm)
+{
+	struct vm_area_struct * mpnt, *tmp, **pprev;
+	struct rb_node **rb_link, *rb_parent;
+	int retval;
+	unsigned long charge;
+	struct mempolicy *pol;
+
+	down_write(&oldmm->mmap_sem);
+	flush_cache_mm(current->mm);
+	mm->locked_vm = 0;
+	mm->mmap = NULL;
+	mm->mmap_cache = NULL;
+	mm->free_area_cache = oldmm->mmap_base;
+	mm->map_count = 0;
+	set_mm_counter(mm, rss, 0);
+	set_mm_counter(mm, anon_rss, 0);
+	cpus_clear(mm->cpu_vm_mask);
+	mm->mm_rb = RB_ROOT;
+	rb_link = &mm->mm_rb.rb_node;
+	rb_parent = NULL;
+	pprev = &mm->mmap;
+
+	for (mpnt = current->mm->mmap ; mpnt ; mpnt = mpnt->vm_next) {
+		struct file *file;
+
+		if (mpnt->vm_flags & VM_DONTCOPY) {
+			__vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file,
+							-vma_pages(mpnt));
+			continue;
+		}
+		charge = 0;
+		if (mpnt->vm_flags & VM_ACCOUNT) {
+			unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
+			if (security_vm_enough_memory(len))
+				goto fail_nomem;
+			charge = len;
+		}
+		tmp = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+		if (!tmp)
+			goto fail_nomem;
+		*tmp = *mpnt;
+		pol = mpol_copy(vma_policy(mpnt));
+		retval = PTR_ERR(pol);
+		if (IS_ERR(pol))
+			goto fail_nomem_policy;
+		vma_set_policy(tmp, pol);
+		tmp->vm_flags &= ~VM_LOCKED;
+		tmp->vm_mm = mm;
+		tmp->vm_next = NULL;
+		anon_vma_link(tmp);
+		file = tmp->vm_file;
+		if (file) {
+			struct inode *inode = file->f_dentry->d_inode;
+			get_file(file);
+			if (tmp->vm_flags & VM_DENYWRITE)
+				atomic_dec(&inode->i_writecount);
+      
+			/* insert tmp into the share list, just after mpnt */
+			spin_lock(&file->f_mapping->i_mmap_lock);
+			tmp->vm_truncate_count = mpnt->vm_truncate_count;
+			flush_dcache_mmap_lock(file->f_mapping);
+			vma_prio_tree_add(tmp, mpnt);
+			flush_dcache_mmap_unlock(file->f_mapping);
+			spin_unlock(&file->f_mapping->i_mmap_lock);
+		}
+
+		/*
+		 * Link in the new vma and copy the page table entries:
+		 * link in first so that swapoff can see swap entries,
+		 * and try_to_unmap_one's find_vma find the new vma.
+		 */
+		spin_lock(&mm->page_table_lock);
+		*pprev = tmp;
+		pprev = &tmp->vm_next;
+
+		__vma_link_rb(mm, tmp, rb_link, rb_parent);
+		rb_link = &tmp->vm_rb.rb_right;
+		rb_parent = &tmp->vm_rb;
+
+		mm->map_count++;
+		retval = copy_page_range(mm, current->mm, tmp);
+		spin_unlock(&mm->page_table_lock);
+
+		if (tmp->vm_ops && tmp->vm_ops->open)
+			tmp->vm_ops->open(tmp);
+
+		if (retval)
+			goto out;
+	}
+	retval = 0;
+
+out:
+	flush_tlb_mm(current->mm);
+	up_write(&oldmm->mmap_sem);
+	return retval;
+fail_nomem_policy:
+	kmem_cache_free(vm_area_cachep, tmp);
+fail_nomem:
+	retval = -ENOMEM;
+	vm_unacct_memory(charge);
+	goto out;
+}
+
+static inline int mm_alloc_pgd(struct mm_struct * mm)
+{
+	mm->pgd = pgd_alloc(mm);
+	if (unlikely(!mm->pgd))
+		return -ENOMEM;
+	return 0;
+}
+
+static inline void mm_free_pgd(struct mm_struct * mm)
+{
+	pgd_free(mm->pgd);
+}
+#else
+#define dup_mmap(mm, oldmm)	(0)
+#define mm_alloc_pgd(mm)	(0)
+#define mm_free_pgd(mm)
+#endif /* CONFIG_MMU */
+
+ __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);
+
+#define allocate_mm()	(kmem_cache_alloc(mm_cachep, SLAB_KERNEL))
+#define free_mm(mm)	(kmem_cache_free(mm_cachep, (mm)))
+
+#include <linux/init_task.h>
+
+static struct mm_struct * mm_init(struct mm_struct * mm)
+{
+	atomic_set(&mm->mm_users, 1);
+	atomic_set(&mm->mm_count, 1);
+	init_rwsem(&mm->mmap_sem);
+	INIT_LIST_HEAD(&mm->mmlist);
+	mm->core_waiters = 0;
+	mm->nr_ptes = 0;
+	spin_lock_init(&mm->page_table_lock);
+	rwlock_init(&mm->ioctx_list_lock);
+	mm->ioctx_list = NULL;
+	mm->default_kioctx = (struct kioctx)INIT_KIOCTX(mm->default_kioctx, *mm);
+	mm->free_area_cache = TASK_UNMAPPED_BASE;
+
+	if (likely(!mm_alloc_pgd(mm))) {
+		mm->def_flags = 0;
+		return mm;
+	}
+	free_mm(mm);
+	return NULL;
+}
+
+/*
+ * Allocate and initialize an mm_struct.
+ */
+struct mm_struct * mm_alloc(void)
+{
+	struct mm_struct * mm;
+
+	mm = allocate_mm();
+	if (mm) {
+		memset(mm, 0, sizeof(*mm));
+		mm = mm_init(mm);
+	}
+	return mm;
+}
+
+/*
+ * Called when the last reference to the mm
+ * is dropped: either by a lazy thread or by
+ * mmput. Free the page directory and the mm.
+ */
+void fastcall __mmdrop(struct mm_struct *mm)
+{
+	BUG_ON(mm == &init_mm);
+	mm_free_pgd(mm);
+	destroy_context(mm);
+	free_mm(mm);
+}
+
+/*
+ * Decrement the use count and release all resources for an mm.
+ */
+void mmput(struct mm_struct *mm)
+{
+	if (atomic_dec_and_test(&mm->mm_users)) {
+		exit_aio(mm);
+		exit_mmap(mm);
+		if (!list_empty(&mm->mmlist)) {
+			spin_lock(&mmlist_lock);
+			list_del(&mm->mmlist);
+			spin_unlock(&mmlist_lock);
+		}
+		put_swap_token(mm);
+		mmdrop(mm);
+	}
+}
+EXPORT_SYMBOL_GPL(mmput);
+
+/**
+ * get_task_mm - acquire a reference to the task's mm
+ *
+ * Returns %NULL if the task has no mm.  Checks PF_BORROWED_MM (meaning
+ * this kernel workthread has transiently adopted a user mm with use_mm,
+ * to do its AIO) is not set and if so returns a reference to it, after
+ * bumping up the use count.  User must release the mm via mmput()
+ * after use.  Typically used by /proc and ptrace.
+ */
+struct mm_struct *get_task_mm(struct task_struct *task)
+{
+	struct mm_struct *mm;
+
+	task_lock(task);
+	mm = task->mm;
+	if (mm) {
+		if (task->flags & PF_BORROWED_MM)
+			mm = NULL;
+		else
+			atomic_inc(&mm->mm_users);
+	}
+	task_unlock(task);
+	return mm;
+}
+EXPORT_SYMBOL_GPL(get_task_mm);
+
+/* Please note the differences between mmput and mm_release.
+ * mmput is called whenever we stop holding onto a mm_struct,
+ * error success whatever.
+ *
+ * mm_release is called after a mm_struct has been removed
+ * from the current process.
+ *
+ * This difference is important for error handling, when we
+ * only half set up a mm_struct for a new process and need to restore
+ * the old one.  Because we mmput the new mm_struct before
+ * restoring the old one. . .
+ * Eric Biederman 10 January 1998
+ */
+void mm_release(struct task_struct *tsk, struct mm_struct *mm)
+{
+	struct completion *vfork_done = tsk->vfork_done;
+
+	/* Get rid of any cached register state */
+	deactivate_mm(tsk, mm);
+
+	/* notify parent sleeping on vfork() */
+	if (vfork_done) {
+		tsk->vfork_done = NULL;
+		complete(vfork_done);
+	}
+	if (tsk->clear_child_tid && atomic_read(&mm->mm_users) > 1) {
+		u32 __user * tidptr = tsk->clear_child_tid;
+		tsk->clear_child_tid = NULL;
+
+		/*
+		 * We don't check the error code - if userspace has
+		 * not set up a proper pointer then tough luck.
+		 */
+		put_user(0, tidptr);
+		sys_futex(tidptr, FUTEX_WAKE, 1, NULL, NULL, 0);
+	}
+}
+
+static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
+{
+	struct mm_struct * mm, *oldmm;
+	int retval;
+
+	tsk->min_flt = tsk->maj_flt = 0;
+	tsk->nvcsw = tsk->nivcsw = 0;
+
+	tsk->mm = NULL;
+	tsk->active_mm = NULL;
+
+	/*
+	 * Are we cloning a kernel thread?
+	 *
+	 * We need to steal a active VM for that..
+	 */
+	oldmm = current->mm;
+	if (!oldmm)
+		return 0;
+
+	if (clone_flags & CLONE_VM) {
+		atomic_inc(&oldmm->mm_users);
+		mm = oldmm;
+		/*
+		 * There are cases where the PTL is held to ensure no
+		 * new threads start up in user mode using an mm, which
+		 * allows optimizing out ipis; the tlb_gather_mmu code
+		 * is an example.
+		 */
+		spin_unlock_wait(&oldmm->page_table_lock);
+		goto good_mm;
+	}
+
+	retval = -ENOMEM;
+	mm = allocate_mm();
+	if (!mm)
+		goto fail_nomem;
+
+	/* Copy the current MM stuff.. */
+	memcpy(mm, oldmm, sizeof(*mm));
+	if (!mm_init(mm))
+		goto fail_nomem;
+
+	if (init_new_context(tsk,mm))
+		goto fail_nocontext;
+
+	retval = dup_mmap(mm, oldmm);
+	if (retval)
+		goto free_pt;
+
+	mm->hiwater_rss = get_mm_counter(mm,rss);
+	mm->hiwater_vm = mm->total_vm;
+
+good_mm:
+	tsk->mm = mm;
+	tsk->active_mm = mm;
+	return 0;
+
+free_pt:
+	mmput(mm);
+fail_nomem:
+	return retval;
+
+fail_nocontext:
+	/*
+	 * If init_new_context() failed, we cannot use mmput() to free the mm
+	 * because it calls destroy_context()
+	 */
+	mm_free_pgd(mm);
+	free_mm(mm);
+	return retval;
+}
+
+static inline struct fs_struct *__copy_fs_struct(struct fs_struct *old)
+{
+	struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL);
+	/* We don't need to lock fs - think why ;-) */
+	if (fs) {
+		atomic_set(&fs->count, 1);
+		rwlock_init(&fs->lock);
+		fs->umask = old->umask;
+		read_lock(&old->lock);
+		fs->rootmnt = mntget(old->rootmnt);
+		fs->root = dget(old->root);
+		fs->pwdmnt = mntget(old->pwdmnt);
+		fs->pwd = dget(old->pwd);
+		if (old->altroot) {
+			fs->altrootmnt = mntget(old->altrootmnt);
+			fs->altroot = dget(old->altroot);
+		} else {
+			fs->altrootmnt = NULL;
+			fs->altroot = NULL;
+		}
+		read_unlock(&old->lock);
+	}
+	return fs;
+}
+
+struct fs_struct *copy_fs_struct(struct fs_struct *old)
+{
+	return __copy_fs_struct(old);
+}
+
+EXPORT_SYMBOL_GPL(copy_fs_struct);
+
+static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk)
+{
+	if (clone_flags & CLONE_FS) {
+		atomic_inc(&current->fs->count);
+		return 0;
+	}
+	tsk->fs = __copy_fs_struct(current->fs);
+	if (!tsk->fs)
+		return -ENOMEM;
+	return 0;
+}
+
+static int count_open_files(struct files_struct *files, int size)
+{
+	int i;
+
+	/* Find the last open fd */
+	for (i = size/(8*sizeof(long)); i > 0; ) {
+		if (files->open_fds->fds_bits[--i])
+			break;
+	}
+	i = (i+1) * 8 * sizeof(long);
+	return i;
+}
+
+static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
+{
+	struct files_struct *oldf, *newf;
+	struct file **old_fds, **new_fds;
+	int open_files, size, i, error = 0, expand;
+
+	/*
+	 * A background process may not have any files ...
+	 */
+	oldf = current->files;
+	if (!oldf)
+		goto out;
+
+	if (clone_flags & CLONE_FILES) {
+		atomic_inc(&oldf->count);
+		goto out;
+	}
+
+	/*
+	 * Note: we may be using current for both targets (See exec.c)
+	 * This works because we cache current->files (old) as oldf. Don't
+	 * break this.
+	 */
+	tsk->files = NULL;
+	error = -ENOMEM;
+	newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL);
+	if (!newf) 
+		goto out;
+
+	atomic_set(&newf->count, 1);
+
+	spin_lock_init(&newf->file_lock);
+	newf->next_fd	    = 0;
+	newf->max_fds	    = NR_OPEN_DEFAULT;
+	newf->max_fdset	    = __FD_SETSIZE;
+	newf->close_on_exec = &newf->close_on_exec_init;
+	newf->open_fds	    = &newf->open_fds_init;
+	newf->fd	    = &newf->fd_array[0];
+
+	spin_lock(&oldf->file_lock);
+
+	open_files = count_open_files(oldf, oldf->max_fdset);
+	expand = 0;
+
+	/*
+	 * Check whether we need to allocate a larger fd array or fd set.
+	 * Note: we're not a clone task, so the open count won't  change.
+	 */
+	if (open_files > newf->max_fdset) {
+		newf->max_fdset = 0;
+		expand = 1;
+	}
+	if (open_files > newf->max_fds) {
+		newf->max_fds = 0;
+		expand = 1;
+	}
+
+	/* if the old fdset gets grown now, we'll only copy up to "size" fds */
+	if (expand) {
+		spin_unlock(&oldf->file_lock);
+		spin_lock(&newf->file_lock);
+		error = expand_files(newf, open_files-1);
+		spin_unlock(&newf->file_lock);
+		if (error < 0)
+			goto out_release;
+		spin_lock(&oldf->file_lock);
+	}
+
+	old_fds = oldf->fd;
+	new_fds = newf->fd;
+
+	memcpy(newf->open_fds->fds_bits, oldf->open_fds->fds_bits, open_files/8);
+	memcpy(newf->close_on_exec->fds_bits, oldf->close_on_exec->fds_bits, open_files/8);
+
+	for (i = open_files; i != 0; i--) {
+		struct file *f = *old_fds++;
+		if (f) {
+			get_file(f);
+		} else {
+			/*
+			 * The fd may be claimed in the fd bitmap but not yet
+			 * instantiated in the files array if a sibling thread
+			 * is partway through open().  So make sure that this
+			 * fd is available to the new process.
+			 */
+			FD_CLR(open_files - i, newf->open_fds);
+		}
+		*new_fds++ = f;
+	}
+	spin_unlock(&oldf->file_lock);
+
+	/* compute the remainder to be cleared */
+	size = (newf->max_fds - open_files) * sizeof(struct file *);
+
+	/* This is long word aligned thus could use a optimized version */ 
+	memset(new_fds, 0, size); 
+
+	if (newf->max_fdset > open_files) {
+		int left = (newf->max_fdset-open_files)/8;
+		int start = open_files / (8 * sizeof(unsigned long));
+
+		memset(&newf->open_fds->fds_bits[start], 0, left);
+		memset(&newf->close_on_exec->fds_bits[start], 0, left);
+	}
+
+	tsk->files = newf;
+	error = 0;
+out:
+	return error;
+
+out_release:
+	free_fdset (newf->close_on_exec, newf->max_fdset);
+	free_fdset (newf->open_fds, newf->max_fdset);
+	free_fd_array(newf->fd, newf->max_fds);
+	kmem_cache_free(files_cachep, newf);
+	goto out;
+}
+
+/*
+ *	Helper to unshare the files of the current task.
+ *	We don't want to expose copy_files internals to
+ *	the exec layer of the kernel.
+ */
+
+int unshare_files(void)
+{
+	struct files_struct *files  = current->files;
+	int rc;
+
+	if(!files)
+		BUG();
+
+	/* This can race but the race causes us to copy when we don't
+	   need to and drop the copy */
+	if(atomic_read(&files->count) == 1)
+	{
+		atomic_inc(&files->count);
+		return 0;
+	}
+	rc = copy_files(0, current);
+	if(rc)
+		current->files = files;
+	return rc;
+}
+
+EXPORT_SYMBOL(unshare_files);
+
+static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk)
+{
+	struct sighand_struct *sig;
+
+	if (clone_flags & (CLONE_SIGHAND | CLONE_THREAD)) {
+		atomic_inc(&current->sighand->count);
+		return 0;
+	}
+	sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
+	tsk->sighand = sig;
+	if (!sig)
+		return -ENOMEM;
+	spin_lock_init(&sig->siglock);
+	atomic_set(&sig->count, 1);
+	memcpy(sig->action, current->sighand->action, sizeof(sig->action));
+	return 0;
+}
+
+static inline int copy_signal(unsigned long clone_flags, struct task_struct * tsk)
+{
+	struct signal_struct *sig;
+	int ret;
+
+	if (clone_flags & CLONE_THREAD) {
+		atomic_inc(&current->signal->count);
+		atomic_inc(&current->signal->live);
+		return 0;
+	}
+	sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
+	tsk->signal = sig;
+	if (!sig)
+		return -ENOMEM;
+
+	ret = copy_thread_group_keys(tsk);
+	if (ret < 0) {
+		kmem_cache_free(signal_cachep, sig);
+		return ret;
+	}
+
+	atomic_set(&sig->count, 1);
+	atomic_set(&sig->live, 1);
+	init_waitqueue_head(&sig->wait_chldexit);
+	sig->flags = 0;
+	sig->group_exit_code = 0;
+	sig->group_exit_task = NULL;
+	sig->group_stop_count = 0;
+	sig->curr_target = NULL;
+	init_sigpending(&sig->shared_pending);
+	INIT_LIST_HEAD(&sig->posix_timers);
+
+	sig->it_real_value = sig->it_real_incr = 0;
+	sig->real_timer.function = it_real_fn;
+	sig->real_timer.data = (unsigned long) tsk;
+	init_timer(&sig->real_timer);
+
+	sig->it_virt_expires = cputime_zero;
+	sig->it_virt_incr = cputime_zero;
+	sig->it_prof_expires = cputime_zero;
+	sig->it_prof_incr = cputime_zero;
+
+	sig->tty = current->signal->tty;
+	sig->pgrp = process_group(current);
+	sig->session = current->signal->session;
+	sig->leader = 0;	/* session leadership doesn't inherit */
+	sig->tty_old_pgrp = 0;
+
+	sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
+	sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
+	sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
+	sig->sched_time = 0;
+	INIT_LIST_HEAD(&sig->cpu_timers[0]);
+	INIT_LIST_HEAD(&sig->cpu_timers[1]);
+	INIT_LIST_HEAD(&sig->cpu_timers[2]);
+
+	task_lock(current->group_leader);
+	memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
+	task_unlock(current->group_leader);
+
+	if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) {
+		/*
+		 * New sole thread in the process gets an expiry time
+		 * of the whole CPU time limit.
+		 */
+		tsk->it_prof_expires =
+			secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur);
+	}
+
+	return 0;
+}
+
+static inline void copy_flags(unsigned long clone_flags, struct task_struct *p)
+{
+	unsigned long new_flags = p->flags;
+
+	new_flags &= ~PF_SUPERPRIV;
+	new_flags |= PF_FORKNOEXEC;
+	if (!(clone_flags & CLONE_PTRACE))
+		p->ptrace = 0;
+	p->flags = new_flags;
+}
+
+asmlinkage long sys_set_tid_address(int __user *tidptr)
+{
+	current->clear_child_tid = tidptr;
+
+	return current->pid;
+}
+
+/*
+ * This creates a new process as a copy of the old one,
+ * but does not actually start it yet.
+ *
+ * It copies the registers, and all the appropriate
+ * parts of the process environment (as per the clone
+ * flags). The actual kick-off is left to the caller.
+ */
+static task_t *copy_process(unsigned long clone_flags,
+				 unsigned long stack_start,
+				 struct pt_regs *regs,
+				 unsigned long stack_size,
+				 int __user *parent_tidptr,
+				 int __user *child_tidptr,
+				 int pid)
+{
+	int retval;
+	struct task_struct *p = NULL;
+
+	if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
+		return ERR_PTR(-EINVAL);
+
+	/*
+	 * Thread groups must share signals as well, and detached threads
+	 * can only be started up within the thread group.
+	 */
+	if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
+		return ERR_PTR(-EINVAL);
+
+	/*
+	 * Shared signal handlers imply shared VM. By way of the above,
+	 * thread groups also imply shared VM. Blocking this case allows
+	 * for various simplifications in other code.
+	 */
+	if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
+		return ERR_PTR(-EINVAL);
+
+	retval = security_task_create(clone_flags);
+	if (retval)
+		goto fork_out;
+
+	retval = -ENOMEM;
+	p = dup_task_struct(current);
+	if (!p)
+		goto fork_out;
+
+	retval = -EAGAIN;
+	if (atomic_read(&p->user->processes) >=
+			p->signal->rlim[RLIMIT_NPROC].rlim_cur) {
+		if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
+				p->user != &root_user)
+			goto bad_fork_free;
+	}
+
+	atomic_inc(&p->user->__count);
+	atomic_inc(&p->user->processes);
+	get_group_info(p->group_info);
+
+	/*
+	 * If multiple threads are within copy_process(), then this check
+	 * triggers too late. This doesn't hurt, the check is only there
+	 * to stop root fork bombs.
+	 */
+	if (nr_threads >= max_threads)
+		goto bad_fork_cleanup_count;
+
+	if (!try_module_get(p->thread_info->exec_domain->module))
+		goto bad_fork_cleanup_count;
+
+	if (p->binfmt && !try_module_get(p->binfmt->module))
+		goto bad_fork_cleanup_put_domain;
+
+	p->did_exec = 0;
+	copy_flags(clone_flags, p);
+	p->pid = pid;
+	retval = -EFAULT;
+	if (clone_flags & CLONE_PARENT_SETTID)
+		if (put_user(p->pid, parent_tidptr))
+			goto bad_fork_cleanup;
+
+	p->proc_dentry = NULL;
+
+	INIT_LIST_HEAD(&p->children);
+	INIT_LIST_HEAD(&p->sibling);
+	p->vfork_done = NULL;
+	spin_lock_init(&p->alloc_lock);
+	spin_lock_init(&p->proc_lock);
+
+	clear_tsk_thread_flag(p, TIF_SIGPENDING);
+	init_sigpending(&p->pending);
+
+	p->utime = cputime_zero;
+	p->stime = cputime_zero;
+ 	p->sched_time = 0;
+	p->rchar = 0;		/* I/O counter: bytes read */
+	p->wchar = 0;		/* I/O counter: bytes written */
+	p->syscr = 0;		/* I/O counter: read syscalls */
+	p->syscw = 0;		/* I/O counter: write syscalls */
+	acct_clear_integrals(p);
+
+ 	p->it_virt_expires = cputime_zero;
+	p->it_prof_expires = cputime_zero;
+ 	p->it_sched_expires = 0;
+ 	INIT_LIST_HEAD(&p->cpu_timers[0]);
+ 	INIT_LIST_HEAD(&p->cpu_timers[1]);
+ 	INIT_LIST_HEAD(&p->cpu_timers[2]);
+
+	p->lock_depth = -1;		/* -1 = no lock */
+	do_posix_clock_monotonic_gettime(&p->start_time);
+	p->security = NULL;
+	p->io_context = NULL;
+	p->io_wait = NULL;
+	p->audit_context = NULL;
+#ifdef CONFIG_NUMA
+ 	p->mempolicy = mpol_copy(p->mempolicy);
+ 	if (IS_ERR(p->mempolicy)) {
+ 		retval = PTR_ERR(p->mempolicy);
+ 		p->mempolicy = NULL;
+ 		goto bad_fork_cleanup;
+ 	}
+#endif
+
+	p->tgid = p->pid;
+	if (clone_flags & CLONE_THREAD)
+		p->tgid = current->tgid;
+
+	if ((retval = security_task_alloc(p)))
+		goto bad_fork_cleanup_policy;
+	if ((retval = audit_alloc(p)))
+		goto bad_fork_cleanup_security;
+	/* copy all the process information */
+	if ((retval = copy_semundo(clone_flags, p)))
+		goto bad_fork_cleanup_audit;
+	if ((retval = copy_files(clone_flags, p)))
+		goto bad_fork_cleanup_semundo;
+	if ((retval = copy_fs(clone_flags, p)))
+		goto bad_fork_cleanup_files;
+	if ((retval = copy_sighand(clone_flags, p)))
+		goto bad_fork_cleanup_fs;
+	if ((retval = copy_signal(clone_flags, p)))
+		goto bad_fork_cleanup_sighand;
+	if ((retval = copy_mm(clone_flags, p)))
+		goto bad_fork_cleanup_signal;
+	if ((retval = copy_keys(clone_flags, p)))
+		goto bad_fork_cleanup_mm;
+	if ((retval = copy_namespace(clone_flags, p)))
+		goto bad_fork_cleanup_keys;
+	retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
+	if (retval)
+		goto bad_fork_cleanup_namespace;
+
+	p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
+	/*
+	 * Clear TID on mm_release()?
+	 */
+	p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL;
+
+	/*
+	 * Syscall tracing should be turned off in the child regardless
+	 * of CLONE_PTRACE.
+	 */
+	clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);
+
+	/* Our parent execution domain becomes current domain
+	   These must match for thread signalling to apply */
+	   
+	p->parent_exec_id = p->self_exec_id;
+
+	/* ok, now we should be set up.. */
+	p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL);
+	p->pdeath_signal = 0;
+	p->exit_state = 0;
+
+	/* Perform scheduler related setup */
+	sched_fork(p);
+
+	/*
+	 * Ok, make it visible to the rest of the system.
+	 * We dont wake it up yet.
+	 */
+	p->group_leader = p;
+	INIT_LIST_HEAD(&p->ptrace_children);
+	INIT_LIST_HEAD(&p->ptrace_list);
+
+	/* Need tasklist lock for parent etc handling! */
+	write_lock_irq(&tasklist_lock);
+
+	/*
+	 * The task hasn't been attached yet, so cpus_allowed mask cannot
+	 * have changed. The cpus_allowed mask of the parent may have
+	 * changed after it was copied first time, and it may then move to
+	 * another CPU - so we re-copy it here and set the child's CPU to
+	 * the parent's CPU. This avoids alot of nasty races.
+	 */
+	p->cpus_allowed = current->cpus_allowed;
+	set_task_cpu(p, smp_processor_id());
+
+	/*
+	 * Check for pending SIGKILL! The new thread should not be allowed
+	 * to slip out of an OOM kill. (or normal SIGKILL.)
+	 */
+	if (sigismember(&current->pending.signal, SIGKILL)) {
+		write_unlock_irq(&tasklist_lock);
+		retval = -EINTR;
+		goto bad_fork_cleanup_namespace;
+	}
+
+	/* CLONE_PARENT re-uses the old parent */
+	if (clone_flags & (CLONE_PARENT|CLONE_THREAD))
+		p->real_parent = current->real_parent;
+	else
+		p->real_parent = current;
+	p->parent = p->real_parent;
+
+	if (clone_flags & CLONE_THREAD) {
+		spin_lock(&current->sighand->siglock);
+		/*
+		 * Important: if an exit-all has been started then
+		 * do not create this new thread - the whole thread
+		 * group is supposed to exit anyway.
+		 */
+		if (current->signal->flags & SIGNAL_GROUP_EXIT) {
+			spin_unlock(&current->sighand->siglock);
+			write_unlock_irq(&tasklist_lock);
+			retval = -EAGAIN;
+			goto bad_fork_cleanup_namespace;
+		}
+		p->group_leader = current->group_leader;
+
+		if (current->signal->group_stop_count > 0) {
+			/*
+			 * There is an all-stop in progress for the group.
+			 * We ourselves will stop as soon as we check signals.
+			 * Make the new thread part of that group stop too.
+			 */
+			current->signal->group_stop_count++;
+			set_tsk_thread_flag(p, TIF_SIGPENDING);
+		}
+
+		if (!cputime_eq(current->signal->it_virt_expires,
+				cputime_zero) ||
+		    !cputime_eq(current->signal->it_prof_expires,
+				cputime_zero) ||
+		    current->signal->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY ||
+		    !list_empty(&current->signal->cpu_timers[0]) ||
+		    !list_empty(&current->signal->cpu_timers[1]) ||
+		    !list_empty(&current->signal->cpu_timers[2])) {
+			/*
+			 * Have child wake up on its first tick to check
+			 * for process CPU timers.
+			 */
+			p->it_prof_expires = jiffies_to_cputime(1);
+		}
+
+		spin_unlock(&current->sighand->siglock);
+	}
+
+	SET_LINKS(p);
+	if (unlikely(p->ptrace & PT_PTRACED))
+		__ptrace_link(p, current->parent);
+
+	cpuset_fork(p);
+
+	attach_pid(p, PIDTYPE_PID, p->pid);
+	attach_pid(p, PIDTYPE_TGID, p->tgid);
+	if (thread_group_leader(p)) {
+		attach_pid(p, PIDTYPE_PGID, process_group(p));
+		attach_pid(p, PIDTYPE_SID, p->signal->session);
+		if (p->pid)
+			__get_cpu_var(process_counts)++;
+	}
+
+	nr_threads++;
+	total_forks++;
+	write_unlock_irq(&tasklist_lock);
+	retval = 0;
+
+fork_out:
+	if (retval)
+		return ERR_PTR(retval);
+	return p;
+
+bad_fork_cleanup_namespace:
+	exit_namespace(p);
+bad_fork_cleanup_keys:
+	exit_keys(p);
+bad_fork_cleanup_mm:
+	if (p->mm)
+		mmput(p->mm);
+bad_fork_cleanup_signal:
+	exit_signal(p);
+bad_fork_cleanup_sighand:
+	exit_sighand(p);
+bad_fork_cleanup_fs:
+	exit_fs(p); /* blocking */
+bad_fork_cleanup_files:
+	exit_files(p); /* blocking */
+bad_fork_cleanup_semundo:
+	exit_sem(p);
+bad_fork_cleanup_audit:
+	audit_free(p);
+bad_fork_cleanup_security:
+	security_task_free(p);
+bad_fork_cleanup_policy:
+#ifdef CONFIG_NUMA
+	mpol_free(p->mempolicy);
+#endif
+bad_fork_cleanup:
+	if (p->binfmt)
+		module_put(p->binfmt->module);
+bad_fork_cleanup_put_domain:
+	module_put(p->thread_info->exec_domain->module);
+bad_fork_cleanup_count:
+	put_group_info(p->group_info);
+	atomic_dec(&p->user->processes);
+	free_uid(p->user);
+bad_fork_free:
+	free_task(p);
+	goto fork_out;
+}
+
+struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_regs *regs)
+{
+	memset(regs, 0, sizeof(struct pt_regs));
+	return regs;
+}
+
+task_t * __devinit fork_idle(int cpu)
+{
+	task_t *task;
+	struct pt_regs regs;
+
+	task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL, NULL, 0);
+	if (!task)
+		return ERR_PTR(-ENOMEM);
+	init_idle(task, cpu);
+	unhash_process(task);
+	return task;
+}
+
+static inline int fork_traceflag (unsigned clone_flags)
+{
+	if (clone_flags & CLONE_UNTRACED)
+		return 0;
+	else if (clone_flags & CLONE_VFORK) {
+		if (current->ptrace & PT_TRACE_VFORK)
+			return PTRACE_EVENT_VFORK;
+	} else if ((clone_flags & CSIGNAL) != SIGCHLD) {
+		if (current->ptrace & PT_TRACE_CLONE)
+			return PTRACE_EVENT_CLONE;
+	} else if (current->ptrace & PT_TRACE_FORK)
+		return PTRACE_EVENT_FORK;
+
+	return 0;
+}
+
+/*
+ *  Ok, this is the main fork-routine.
+ *
+ * It copies the process, and if successful kick-starts
+ * it and waits for it to finish using the VM if required.
+ */
+long do_fork(unsigned long clone_flags,
+	      unsigned long stack_start,
+	      struct pt_regs *regs,
+	      unsigned long stack_size,
+	      int __user *parent_tidptr,
+	      int __user *child_tidptr)
+{
+	struct task_struct *p;
+	int trace = 0;
+	long pid = alloc_pidmap();
+
+	if (pid < 0)
+		return -EAGAIN;
+	if (unlikely(current->ptrace)) {
+		trace = fork_traceflag (clone_flags);
+		if (trace)
+			clone_flags |= CLONE_PTRACE;
+	}
+
+	p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr, pid);
+	/*
+	 * Do this prior waking up the new thread - the thread pointer
+	 * might get invalid after that point, if the thread exits quickly.
+	 */
+	if (!IS_ERR(p)) {
+		struct completion vfork;
+
+		if (clone_flags & CLONE_VFORK) {
+			p->vfork_done = &vfork;
+			init_completion(&vfork);
+		}
+
+		if ((p->ptrace & PT_PTRACED) || (clone_flags & CLONE_STOPPED)) {
+			/*
+			 * We'll start up with an immediate SIGSTOP.
+			 */
+			sigaddset(&p->pending.signal, SIGSTOP);
+			set_tsk_thread_flag(p, TIF_SIGPENDING);
+		}
+
+		if (!(clone_flags & CLONE_STOPPED))
+			wake_up_new_task(p, clone_flags);
+		else
+			p->state = TASK_STOPPED;
+
+		if (unlikely (trace)) {
+			current->ptrace_message = pid;
+			ptrace_notify ((trace << 8) | SIGTRAP);
+		}
+
+		if (clone_flags & CLONE_VFORK) {
+			wait_for_completion(&vfork);
+			if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE))
+				ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP);
+		}
+	} else {
+		free_pidmap(pid);
+		pid = PTR_ERR(p);
+	}
+	return pid;
+}
+
+void __init proc_caches_init(void)
+{
+	sighand_cachep = kmem_cache_create("sighand_cache",
+			sizeof(struct sighand_struct), 0,
+			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
+	signal_cachep = kmem_cache_create("signal_cache",
+			sizeof(struct signal_struct), 0,
+			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
+	files_cachep = kmem_cache_create("files_cache", 
+			sizeof(struct files_struct), 0,
+			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
+	fs_cachep = kmem_cache_create("fs_cache", 
+			sizeof(struct fs_struct), 0,
+			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
+	vm_area_cachep = kmem_cache_create("vm_area_struct",
+			sizeof(struct vm_area_struct), 0,
+			SLAB_PANIC, NULL, NULL);
+	mm_cachep = kmem_cache_create("mm_struct",
+			sizeof(struct mm_struct), 0,
+			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
+}
diff --git a/kernel/futex.c b/kernel/futex.c
new file mode 100644
index 000000000000..7b54a672d0ad
--- /dev/null
+++ b/kernel/futex.c
@@ -0,0 +1,798 @@
+/*
+ *  Fast Userspace Mutexes (which I call "Futexes!").
+ *  (C) Rusty Russell, IBM 2002
+ *
+ *  Generalized futexes, futex requeueing, misc fixes by Ingo Molnar
+ *  (C) Copyright 2003 Red Hat Inc, All Rights Reserved
+ *
+ *  Removed page pinning, fix privately mapped COW pages and other cleanups
+ *  (C) Copyright 2003, 2004 Jamie Lokier
+ *
+ *  Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
+ *  enough at me, Linus for the original (flawed) idea, Matthew
+ *  Kirkwood for proof-of-concept implementation.
+ *
+ *  "The futexes are also cursed."
+ *  "But they come in a choice of three flavours!"
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#include <linux/slab.h>
+#include <linux/poll.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/jhash.h>
+#include <linux/init.h>
+#include <linux/futex.h>
+#include <linux/mount.h>
+#include <linux/pagemap.h>
+#include <linux/syscalls.h>
+
+#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
+
+/*
+ * Futexes are matched on equal values of this key.
+ * The key type depends on whether it's a shared or private mapping.
+ * Don't rearrange members without looking at hash_futex().
+ *
+ * offset is aligned to a multiple of sizeof(u32) (== 4) by definition.
+ * We set bit 0 to indicate if it's an inode-based key.
+ */
+union futex_key {
+	struct {
+		unsigned long pgoff;
+		struct inode *inode;
+		int offset;
+	} shared;
+	struct {
+		unsigned long uaddr;
+		struct mm_struct *mm;
+		int offset;
+	} private;
+	struct {
+		unsigned long word;
+		void *ptr;
+		int offset;
+	} both;
+};
+
+/*
+ * We use this hashed waitqueue instead of a normal wait_queue_t, so
+ * we can wake only the relevant ones (hashed queues may be shared).
+ *
+ * A futex_q has a woken state, just like tasks have TASK_RUNNING.
+ * It is considered woken when list_empty(&q->list) || q->lock_ptr == 0.
+ * The order of wakup is always to make the first condition true, then
+ * wake up q->waiters, then make the second condition true.
+ */
+struct futex_q {
+	struct list_head list;
+	wait_queue_head_t waiters;
+
+	/* Which hash list lock to use. */
+	spinlock_t *lock_ptr;
+
+	/* Key which the futex is hashed on. */
+	union futex_key key;
+
+	/* For fd, sigio sent using these. */
+	int fd;
+	struct file *filp;
+};
+
+/*
+ * Split the global futex_lock into every hash list lock.
+ */
+struct futex_hash_bucket {
+       spinlock_t              lock;
+       struct list_head       chain;
+};
+
+static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS];
+
+/* Futex-fs vfsmount entry: */
+static struct vfsmount *futex_mnt;
+
+/*
+ * We hash on the keys returned from get_futex_key (see below).
+ */
+static struct futex_hash_bucket *hash_futex(union futex_key *key)
+{
+	u32 hash = jhash2((u32*)&key->both.word,
+			  (sizeof(key->both.word)+sizeof(key->both.ptr))/4,
+			  key->both.offset);
+	return &futex_queues[hash & ((1 << FUTEX_HASHBITS)-1)];
+}
+
+/*
+ * Return 1 if two futex_keys are equal, 0 otherwise.
+ */
+static inline int match_futex(union futex_key *key1, union futex_key *key2)
+{
+	return (key1->both.word == key2->both.word
+		&& key1->both.ptr == key2->both.ptr
+		&& key1->both.offset == key2->both.offset);
+}
+
+/*
+ * Get parameters which are the keys for a futex.
+ *
+ * For shared mappings, it's (page->index, vma->vm_file->f_dentry->d_inode,
+ * offset_within_page).  For private mappings, it's (uaddr, current->mm).
+ * We can usually work out the index without swapping in the page.
+ *
+ * Returns: 0, or negative error code.
+ * The key words are stored in *key on success.
+ *
+ * Should be called with &current->mm->mmap_sem but NOT any spinlocks.
+ */
+static int get_futex_key(unsigned long uaddr, union futex_key *key)
+{
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma;
+	struct page *page;
+	int err;
+
+	/*
+	 * The futex address must be "naturally" aligned.
+	 */
+	key->both.offset = uaddr % PAGE_SIZE;
+	if (unlikely((key->both.offset % sizeof(u32)) != 0))
+		return -EINVAL;
+	uaddr -= key->both.offset;
+
+	/*
+	 * The futex is hashed differently depending on whether
+	 * it's in a shared or private mapping.  So check vma first.
+	 */
+	vma = find_extend_vma(mm, uaddr);
+	if (unlikely(!vma))
+		return -EFAULT;
+
+	/*
+	 * Permissions.
+	 */
+	if (unlikely((vma->vm_flags & (VM_IO|VM_READ)) != VM_READ))
+		return (vma->vm_flags & VM_IO) ? -EPERM : -EACCES;
+
+	/*
+	 * Private mappings are handled in a simple way.
+	 *
+	 * NOTE: When userspace waits on a MAP_SHARED mapping, even if
+	 * it's a read-only handle, it's expected that futexes attach to
+	 * the object not the particular process.  Therefore we use
+	 * VM_MAYSHARE here, not VM_SHARED which is restricted to shared
+	 * mappings of _writable_ handles.
+	 */
+	if (likely(!(vma->vm_flags & VM_MAYSHARE))) {
+		key->private.mm = mm;
+		key->private.uaddr = uaddr;
+		return 0;
+	}
+
+	/*
+	 * Linear file mappings are also simple.
+	 */
+	key->shared.inode = vma->vm_file->f_dentry->d_inode;
+	key->both.offset++; /* Bit 0 of offset indicates inode-based key. */
+	if (likely(!(vma->vm_flags & VM_NONLINEAR))) {
+		key->shared.pgoff = (((uaddr - vma->vm_start) >> PAGE_SHIFT)
+				     + vma->vm_pgoff);
+		return 0;
+	}
+
+	/*
+	 * We could walk the page table to read the non-linear
+	 * pte, and get the page index without fetching the page
+	 * from swap.  But that's a lot of code to duplicate here
+	 * for a rare case, so we simply fetch the page.
+	 */
+
+	/*
+	 * Do a quick atomic lookup first - this is the fastpath.
+	 */
+	spin_lock(&current->mm->page_table_lock);
+	page = follow_page(mm, uaddr, 0);
+	if (likely(page != NULL)) {
+		key->shared.pgoff =
+			page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+		spin_unlock(&current->mm->page_table_lock);
+		return 0;
+	}
+	spin_unlock(&current->mm->page_table_lock);
+
+	/*
+	 * Do it the general way.
+	 */
+	err = get_user_pages(current, mm, uaddr, 1, 0, 0, &page, NULL);
+	if (err >= 0) {
+		key->shared.pgoff =
+			page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+		put_page(page);
+		return 0;
+	}
+	return err;
+}
+
+/*
+ * Take a reference to the resource addressed by a key.
+ * Can be called while holding spinlocks.
+ *
+ * NOTE: mmap_sem MUST be held between get_futex_key() and calling this
+ * function, if it is called at all.  mmap_sem keeps key->shared.inode valid.
+ */
+static inline void get_key_refs(union futex_key *key)
+{
+	if (key->both.ptr != 0) {
+		if (key->both.offset & 1)
+			atomic_inc(&key->shared.inode->i_count);
+		else
+			atomic_inc(&key->private.mm->mm_count);
+	}
+}
+
+/*
+ * Drop a reference to the resource addressed by a key.
+ * The hash bucket spinlock must not be held.
+ */
+static void drop_key_refs(union futex_key *key)
+{
+	if (key->both.ptr != 0) {
+		if (key->both.offset & 1)
+			iput(key->shared.inode);
+		else
+			mmdrop(key->private.mm);
+	}
+}
+
+static inline int get_futex_value_locked(int *dest, int __user *from)
+{
+	int ret;
+
+	inc_preempt_count();
+	ret = __copy_from_user_inatomic(dest, from, sizeof(int));
+	dec_preempt_count();
+
+	return ret ? -EFAULT : 0;
+}
+
+/*
+ * The hash bucket lock must be held when this is called.
+ * Afterwards, the futex_q must not be accessed.
+ */
+static void wake_futex(struct futex_q *q)
+{
+	list_del_init(&q->list);
+	if (q->filp)
+		send_sigio(&q->filp->f_owner, q->fd, POLL_IN);
+	/*
+	 * The lock in wake_up_all() is a crucial memory barrier after the
+	 * list_del_init() and also before assigning to q->lock_ptr.
+	 */
+	wake_up_all(&q->waiters);
+	/*
+	 * The waiting task can free the futex_q as soon as this is written,
+	 * without taking any locks.  This must come last.
+	 */
+	q->lock_ptr = NULL;
+}
+
+/*
+ * Wake up all waiters hashed on the physical page that is mapped
+ * to this virtual address:
+ */
+static int futex_wake(unsigned long uaddr, int nr_wake)
+{
+	union futex_key key;
+	struct futex_hash_bucket *bh;
+	struct list_head *head;
+	struct futex_q *this, *next;
+	int ret;
+
+	down_read(&current->mm->mmap_sem);
+
+	ret = get_futex_key(uaddr, &key);
+	if (unlikely(ret != 0))
+		goto out;
+
+	bh = hash_futex(&key);
+	spin_lock(&bh->lock);
+	head = &bh->chain;
+
+	list_for_each_entry_safe(this, next, head, list) {
+		if (match_futex (&this->key, &key)) {
+			wake_futex(this);
+			if (++ret >= nr_wake)
+				break;
+		}
+	}
+
+	spin_unlock(&bh->lock);
+out:
+	up_read(&current->mm->mmap_sem);
+	return ret;
+}
+
+/*
+ * Requeue all waiters hashed on one physical page to another
+ * physical page.
+ */
+static int futex_requeue(unsigned long uaddr1, unsigned long uaddr2,
+			 int nr_wake, int nr_requeue, int *valp)
+{
+	union futex_key key1, key2;
+	struct futex_hash_bucket *bh1, *bh2;
+	struct list_head *head1;
+	struct futex_q *this, *next;
+	int ret, drop_count = 0;
+
+ retry:
+	down_read(&current->mm->mmap_sem);
+
+	ret = get_futex_key(uaddr1, &key1);
+	if (unlikely(ret != 0))
+		goto out;
+	ret = get_futex_key(uaddr2, &key2);
+	if (unlikely(ret != 0))
+		goto out;
+
+	bh1 = hash_futex(&key1);
+	bh2 = hash_futex(&key2);
+
+	if (bh1 < bh2)
+		spin_lock(&bh1->lock);
+	spin_lock(&bh2->lock);
+	if (bh1 > bh2)
+		spin_lock(&bh1->lock);
+
+	if (likely(valp != NULL)) {
+		int curval;
+
+		ret = get_futex_value_locked(&curval, (int __user *)uaddr1);
+
+		if (unlikely(ret)) {
+			spin_unlock(&bh1->lock);
+			if (bh1 != bh2)
+				spin_unlock(&bh2->lock);
+
+			/* If we would have faulted, release mmap_sem, fault
+			 * it in and start all over again.
+			 */
+			up_read(&current->mm->mmap_sem);
+
+			ret = get_user(curval, (int __user *)uaddr1);
+
+			if (!ret)
+				goto retry;
+
+			return ret;
+		}
+		if (curval != *valp) {
+			ret = -EAGAIN;
+			goto out_unlock;
+		}
+	}
+
+	head1 = &bh1->chain;
+	list_for_each_entry_safe(this, next, head1, list) {
+		if (!match_futex (&this->key, &key1))
+			continue;
+		if (++ret <= nr_wake) {
+			wake_futex(this);
+		} else {
+			list_move_tail(&this->list, &bh2->chain);
+			this->lock_ptr = &bh2->lock;
+			this->key = key2;
+			get_key_refs(&key2);
+			drop_count++;
+
+			if (ret - nr_wake >= nr_requeue)
+				break;
+			/* Make sure to stop if key1 == key2 */
+			if (head1 == &bh2->chain && head1 != &next->list)
+				head1 = &this->list;
+		}
+	}
+
+out_unlock:
+	spin_unlock(&bh1->lock);
+	if (bh1 != bh2)
+		spin_unlock(&bh2->lock);
+
+	/* drop_key_refs() must be called outside the spinlocks. */
+	while (--drop_count >= 0)
+		drop_key_refs(&key1);
+
+out:
+	up_read(&current->mm->mmap_sem);
+	return ret;
+}
+
+/* The key must be already stored in q->key. */
+static inline struct futex_hash_bucket *
+queue_lock(struct futex_q *q, int fd, struct file *filp)
+{
+	struct futex_hash_bucket *bh;
+
+	q->fd = fd;
+	q->filp = filp;
+
+	init_waitqueue_head(&q->waiters);
+
+	get_key_refs(&q->key);
+	bh = hash_futex(&q->key);
+	q->lock_ptr = &bh->lock;
+
+	spin_lock(&bh->lock);
+	return bh;
+}
+
+static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *bh)
+{
+	list_add_tail(&q->list, &bh->chain);
+	spin_unlock(&bh->lock);
+}
+
+static inline void
+queue_unlock(struct futex_q *q, struct futex_hash_bucket *bh)
+{
+	spin_unlock(&bh->lock);
+	drop_key_refs(&q->key);
+}
+
+/*
+ * queue_me and unqueue_me must be called as a pair, each
+ * exactly once.  They are called with the hashed spinlock held.
+ */
+
+/* The key must be already stored in q->key. */
+static void queue_me(struct futex_q *q, int fd, struct file *filp)
+{
+	struct futex_hash_bucket *bh;
+	bh = queue_lock(q, fd, filp);
+	__queue_me(q, bh);
+}
+
+/* Return 1 if we were still queued (ie. 0 means we were woken) */
+static int unqueue_me(struct futex_q *q)
+{
+	int ret = 0;
+	spinlock_t *lock_ptr;
+
+	/* In the common case we don't take the spinlock, which is nice. */
+ retry:
+	lock_ptr = q->lock_ptr;
+	if (lock_ptr != 0) {
+		spin_lock(lock_ptr);
+		/*
+		 * q->lock_ptr can change between reading it and
+		 * spin_lock(), causing us to take the wrong lock.  This
+		 * corrects the race condition.
+		 *
+		 * Reasoning goes like this: if we have the wrong lock,
+		 * q->lock_ptr must have changed (maybe several times)
+		 * between reading it and the spin_lock().  It can
+		 * change again after the spin_lock() but only if it was
+		 * already changed before the spin_lock().  It cannot,
+		 * however, change back to the original value.  Therefore
+		 * we can detect whether we acquired the correct lock.
+		 */
+		if (unlikely(lock_ptr != q->lock_ptr)) {
+			spin_unlock(lock_ptr);
+			goto retry;
+		}
+		WARN_ON(list_empty(&q->list));
+		list_del(&q->list);
+		spin_unlock(lock_ptr);
+		ret = 1;
+	}
+
+	drop_key_refs(&q->key);
+	return ret;
+}
+
+static int futex_wait(unsigned long uaddr, int val, unsigned long time)
+{
+	DECLARE_WAITQUEUE(wait, current);
+	int ret, curval;
+	struct futex_q q;
+	struct futex_hash_bucket *bh;
+
+ retry:
+	down_read(&current->mm->mmap_sem);
+
+	ret = get_futex_key(uaddr, &q.key);
+	if (unlikely(ret != 0))
+		goto out_release_sem;
+
+	bh = queue_lock(&q, -1, NULL);
+
+	/*
+	 * Access the page AFTER the futex is queued.
+	 * Order is important:
+	 *
+	 *   Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val);
+	 *   Userspace waker:  if (cond(var)) { var = new; futex_wake(&var); }
+	 *
+	 * The basic logical guarantee of a futex is that it blocks ONLY
+	 * if cond(var) is known to be true at the time of blocking, for
+	 * any cond.  If we queued after testing *uaddr, that would open
+	 * a race condition where we could block indefinitely with
+	 * cond(var) false, which would violate the guarantee.
+	 *
+	 * A consequence is that futex_wait() can return zero and absorb
+	 * a wakeup when *uaddr != val on entry to the syscall.  This is
+	 * rare, but normal.
+	 *
+	 * We hold the mmap semaphore, so the mapping cannot have changed
+	 * since we looked it up in get_futex_key.
+	 */
+
+	ret = get_futex_value_locked(&curval, (int __user *)uaddr);
+
+	if (unlikely(ret)) {
+		queue_unlock(&q, bh);
+
+		/* If we would have faulted, release mmap_sem, fault it in and
+		 * start all over again.
+		 */
+		up_read(&current->mm->mmap_sem);
+
+		ret = get_user(curval, (int __user *)uaddr);
+
+		if (!ret)
+			goto retry;
+		return ret;
+	}
+	if (curval != val) {
+		ret = -EWOULDBLOCK;
+		queue_unlock(&q, bh);
+		goto out_release_sem;
+	}
+
+	/* Only actually queue if *uaddr contained val.  */
+	__queue_me(&q, bh);
+
+	/*
+	 * Now the futex is queued and we have checked the data, we
+	 * don't want to hold mmap_sem while we sleep.
+	 */	
+	up_read(&current->mm->mmap_sem);
+
+	/*
+	 * There might have been scheduling since the queue_me(), as we
+	 * cannot hold a spinlock across the get_user() in case it
+	 * faults, and we cannot just set TASK_INTERRUPTIBLE state when
+	 * queueing ourselves into the futex hash.  This code thus has to
+	 * rely on the futex_wake() code removing us from hash when it
+	 * wakes us up.
+	 */
+
+	/* add_wait_queue is the barrier after __set_current_state. */
+	__set_current_state(TASK_INTERRUPTIBLE);
+	add_wait_queue(&q.waiters, &wait);
+	/*
+	 * !list_empty() is safe here without any lock.
+	 * q.lock_ptr != 0 is not safe, because of ordering against wakeup.
+	 */
+	if (likely(!list_empty(&q.list)))
+		time = schedule_timeout(time);
+	__set_current_state(TASK_RUNNING);
+
+	/*
+	 * NOTE: we don't remove ourselves from the waitqueue because
+	 * we are the only user of it.
+	 */
+
+	/* If we were woken (and unqueued), we succeeded, whatever. */
+	if (!unqueue_me(&q))
+		return 0;
+	if (time == 0)
+		return -ETIMEDOUT;
+	/* We expect signal_pending(current), but another thread may
+	 * have handled it for us already. */
+	return -EINTR;
+
+ out_release_sem:
+	up_read(&current->mm->mmap_sem);
+	return ret;
+}
+
+static int futex_close(struct inode *inode, struct file *filp)
+{
+	struct futex_q *q = filp->private_data;
+
+	unqueue_me(q);
+	kfree(q);
+	return 0;
+}
+
+/* This is one-shot: once it's gone off you need a new fd */
+static unsigned int futex_poll(struct file *filp,
+			       struct poll_table_struct *wait)
+{
+	struct futex_q *q = filp->private_data;
+	int ret = 0;
+
+	poll_wait(filp, &q->waiters, wait);
+
+	/*
+	 * list_empty() is safe here without any lock.
+	 * q->lock_ptr != 0 is not safe, because of ordering against wakeup.
+	 */
+	if (list_empty(&q->list))
+		ret = POLLIN | POLLRDNORM;
+
+	return ret;
+}
+
+static struct file_operations futex_fops = {
+	.release	= futex_close,
+	.poll		= futex_poll,
+};
+
+/*
+ * Signal allows caller to avoid the race which would occur if they
+ * set the sigio stuff up afterwards.
+ */
+static int futex_fd(unsigned long uaddr, int signal)
+{
+	struct futex_q *q;
+	struct file *filp;
+	int ret, err;
+
+	ret = -EINVAL;
+	if (signal < 0 || signal > _NSIG)
+		goto out;
+
+	ret = get_unused_fd();
+	if (ret < 0)
+		goto out;
+	filp = get_empty_filp();
+	if (!filp) {
+		put_unused_fd(ret);
+		ret = -ENFILE;
+		goto out;
+	}
+	filp->f_op = &futex_fops;
+	filp->f_vfsmnt = mntget(futex_mnt);
+	filp->f_dentry = dget(futex_mnt->mnt_root);
+	filp->f_mapping = filp->f_dentry->d_inode->i_mapping;
+
+	if (signal) {
+		int err;
+		err = f_setown(filp, current->pid, 1);
+		if (err < 0) {
+			put_unused_fd(ret);
+			put_filp(filp);
+			ret = err;
+			goto out;
+		}
+		filp->f_owner.signum = signal;
+	}
+
+	q = kmalloc(sizeof(*q), GFP_KERNEL);
+	if (!q) {
+		put_unused_fd(ret);
+		put_filp(filp);
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	down_read(&current->mm->mmap_sem);
+	err = get_futex_key(uaddr, &q->key);
+
+	if (unlikely(err != 0)) {
+		up_read(&current->mm->mmap_sem);
+		put_unused_fd(ret);
+		put_filp(filp);
+		kfree(q);
+		return err;
+	}
+
+	/*
+	 * queue_me() must be called before releasing mmap_sem, because
+	 * key->shared.inode needs to be referenced while holding it.
+	 */
+	filp->private_data = q;
+
+	queue_me(q, ret, filp);
+	up_read(&current->mm->mmap_sem);
+
+	/* Now we map fd to filp, so userspace can access it */
+	fd_install(ret, filp);
+out:
+	return ret;
+}
+
+long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
+		unsigned long uaddr2, int val2, int val3)
+{
+	int ret;
+
+	switch (op) {
+	case FUTEX_WAIT:
+		ret = futex_wait(uaddr, val, timeout);
+		break;
+	case FUTEX_WAKE:
+		ret = futex_wake(uaddr, val);
+		break;
+	case FUTEX_FD:
+		/* non-zero val means F_SETOWN(getpid()) & F_SETSIG(val) */
+		ret = futex_fd(uaddr, val);
+		break;
+	case FUTEX_REQUEUE:
+		ret = futex_requeue(uaddr, uaddr2, val, val2, NULL);
+		break;
+	case FUTEX_CMP_REQUEUE:
+		ret = futex_requeue(uaddr, uaddr2, val, val2, &val3);
+		break;
+	default:
+		ret = -ENOSYS;
+	}
+	return ret;
+}
+
+
+asmlinkage long sys_futex(u32 __user *uaddr, int op, int val,
+			  struct timespec __user *utime, u32 __user *uaddr2,
+			  int val3)
+{
+	struct timespec t;
+	unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
+	int val2 = 0;
+
+	if ((op == FUTEX_WAIT) && utime) {
+		if (copy_from_user(&t, utime, sizeof(t)) != 0)
+			return -EFAULT;
+		timeout = timespec_to_jiffies(&t) + 1;
+	}
+	/*
+	 * requeue parameter in 'utime' if op == FUTEX_REQUEUE.
+	 */
+	if (op >= FUTEX_REQUEUE)
+		val2 = (int) (unsigned long) utime;
+
+	return do_futex((unsigned long)uaddr, op, val, timeout,
+			(unsigned long)uaddr2, val2, val3);
+}
+
+static struct super_block *
+futexfs_get_sb(struct file_system_type *fs_type,
+	       int flags, const char *dev_name, void *data)
+{
+	return get_sb_pseudo(fs_type, "futex", NULL, 0xBAD1DEA);
+}
+
+static struct file_system_type futex_fs_type = {
+	.name		= "futexfs",
+	.get_sb		= futexfs_get_sb,
+	.kill_sb	= kill_anon_super,
+};
+
+static int __init init(void)
+{
+	unsigned int i;
+
+	register_filesystem(&futex_fs_type);
+	futex_mnt = kern_mount(&futex_fs_type);
+
+	for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
+		INIT_LIST_HEAD(&futex_queues[i].chain);
+		spin_lock_init(&futex_queues[i].lock);
+	}
+	return 0;
+}
+__initcall(init);
diff --git a/kernel/intermodule.c b/kernel/intermodule.c
new file mode 100644
index 000000000000..388977f3e9b7
--- /dev/null
+++ b/kernel/intermodule.c
@@ -0,0 +1,182 @@
+/* Deprecated, do not use.  Moved from module.c to here. --RR */
+
+/* Written by Keith Owens <kaos@ocs.com.au> Oct 2000 */
+#include <linux/module.h>
+#include <linux/kmod.h>
+#include <linux/spinlock.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+
+/* inter_module functions are always available, even when the kernel is
+ * compiled without modules.  Consumers of inter_module_xxx routines
+ * will always work, even when both are built into the kernel, this
+ * approach removes lots of #ifdefs in mainline code.
+ */
+
+static struct list_head ime_list = LIST_HEAD_INIT(ime_list);
+static DEFINE_SPINLOCK(ime_lock);
+static int kmalloc_failed;
+
+struct inter_module_entry {
+	struct list_head list;
+	const char *im_name;
+	struct module *owner;
+	const void *userdata;
+};
+
+/**
+ * inter_module_register - register a new set of inter module data.
+ * @im_name: an arbitrary string to identify the data, must be unique
+ * @owner: module that is registering the data, always use THIS_MODULE
+ * @userdata: pointer to arbitrary userdata to be registered
+ *
+ * Description: Check that the im_name has not already been registered,
+ * complain if it has.  For new data, add it to the inter_module_entry
+ * list.
+ */
+void inter_module_register(const char *im_name, struct module *owner, const void *userdata)
+{
+	struct list_head *tmp;
+	struct inter_module_entry *ime, *ime_new;
+
+	if (!(ime_new = kmalloc(sizeof(*ime), GFP_KERNEL))) {
+		/* Overloaded kernel, not fatal */
+		printk(KERN_ERR
+			"Aiee, inter_module_register: cannot kmalloc entry for '%s'\n",
+			im_name);
+		kmalloc_failed = 1;
+		return;
+	}
+	memset(ime_new, 0, sizeof(*ime_new));
+	ime_new->im_name = im_name;
+	ime_new->owner = owner;
+	ime_new->userdata = userdata;
+
+	spin_lock(&ime_lock);
+	list_for_each(tmp, &ime_list) {
+		ime = list_entry(tmp, struct inter_module_entry, list);
+		if (strcmp(ime->im_name, im_name) == 0) {
+			spin_unlock(&ime_lock);
+			kfree(ime_new);
+			/* Program logic error, fatal */
+			printk(KERN_ERR "inter_module_register: duplicate im_name '%s'", im_name);
+			BUG();
+		}
+	}
+	list_add(&(ime_new->list), &ime_list);
+	spin_unlock(&ime_lock);
+}
+
+/**
+ * inter_module_unregister - unregister a set of inter module data.
+ * @im_name: an arbitrary string to identify the data, must be unique
+ *
+ * Description: Check that the im_name has been registered, complain if
+ * it has not.  For existing data, remove it from the
+ * inter_module_entry list.
+ */
+void inter_module_unregister(const char *im_name)
+{
+	struct list_head *tmp;
+	struct inter_module_entry *ime;
+
+	spin_lock(&ime_lock);
+	list_for_each(tmp, &ime_list) {
+		ime = list_entry(tmp, struct inter_module_entry, list);
+		if (strcmp(ime->im_name, im_name) == 0) {
+			list_del(&(ime->list));
+			spin_unlock(&ime_lock);
+			kfree(ime);
+			return;
+		}
+	}
+	spin_unlock(&ime_lock);
+	if (kmalloc_failed) {
+		printk(KERN_ERR
+			"inter_module_unregister: no entry for '%s', "
+			"probably caused by previous kmalloc failure\n",
+			im_name);
+		return;
+	}
+	else {
+		/* Program logic error, fatal */
+		printk(KERN_ERR "inter_module_unregister: no entry for '%s'", im_name);
+		BUG();
+	}
+}
+
+/**
+ * inter_module_get - return arbitrary userdata from another module.
+ * @im_name: an arbitrary string to identify the data, must be unique
+ *
+ * Description: If the im_name has not been registered, return NULL.
+ * Try to increment the use count on the owning module, if that fails
+ * then return NULL.  Otherwise return the userdata.
+ */
+static const void *inter_module_get(const char *im_name)
+{
+	struct list_head *tmp;
+	struct inter_module_entry *ime;
+	const void *result = NULL;
+
+	spin_lock(&ime_lock);
+	list_for_each(tmp, &ime_list) {
+		ime = list_entry(tmp, struct inter_module_entry, list);
+		if (strcmp(ime->im_name, im_name) == 0) {
+			if (try_module_get(ime->owner))
+				result = ime->userdata;
+			break;
+		}
+	}
+	spin_unlock(&ime_lock);
+	return(result);
+}
+
+/**
+ * inter_module_get_request - im get with automatic request_module.
+ * @im_name: an arbitrary string to identify the data, must be unique
+ * @modname: module that is expected to register im_name
+ *
+ * Description: If inter_module_get fails, do request_module then retry.
+ */
+const void *inter_module_get_request(const char *im_name, const char *modname)
+{
+	const void *result = inter_module_get(im_name);
+	if (!result) {
+		request_module("%s", modname);
+		result = inter_module_get(im_name);
+	}
+	return(result);
+}
+
+/**
+ * inter_module_put - release use of data from another module.
+ * @im_name: an arbitrary string to identify the data, must be unique
+ *
+ * Description: If the im_name has not been registered, complain,
+ * otherwise decrement the use count on the owning module.
+ */
+void inter_module_put(const char *im_name)
+{
+	struct list_head *tmp;
+	struct inter_module_entry *ime;
+
+	spin_lock(&ime_lock);
+	list_for_each(tmp, &ime_list) {
+		ime = list_entry(tmp, struct inter_module_entry, list);
+		if (strcmp(ime->im_name, im_name) == 0) {
+			if (ime->owner)
+				module_put(ime->owner);
+			spin_unlock(&ime_lock);
+			return;
+		}
+	}
+	spin_unlock(&ime_lock);
+	printk(KERN_ERR "inter_module_put: no entry for '%s'", im_name);
+	BUG();
+}
+
+EXPORT_SYMBOL(inter_module_register);
+EXPORT_SYMBOL(inter_module_unregister);
+EXPORT_SYMBOL(inter_module_get_request);
+EXPORT_SYMBOL(inter_module_put);
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
new file mode 100644
index 000000000000..49378738ff5e
--- /dev/null
+++ b/kernel/irq/Makefile
@@ -0,0 +1,5 @@
+
+obj-y := handle.o manage.o spurious.o
+obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
+obj-$(CONFIG_PROC_FS) += proc.o
+
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
new file mode 100644
index 000000000000..98d62d8efeaf
--- /dev/null
+++ b/kernel/irq/autoprobe.c
@@ -0,0 +1,189 @@
+/*
+ * linux/kernel/irq/autoprobe.c
+ *
+ * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar
+ *
+ * This file contains the interrupt probing code and driver APIs.
+ */
+
+#include <linux/irq.h>
+#include <linux/module.h>
+#include <linux/interrupt.h>
+
+/*
+ * Autodetection depends on the fact that any interrupt that
+ * comes in on to an unassigned handler will get stuck with
+ * "IRQ_WAITING" cleared and the interrupt disabled.
+ */
+static DECLARE_MUTEX(probe_sem);
+
+/**
+ *	probe_irq_on	- begin an interrupt autodetect
+ *
+ *	Commence probing for an interrupt. The interrupts are scanned
+ *	and a mask of potential interrupt lines is returned.
+ *
+ */
+unsigned long probe_irq_on(void)
+{
+	unsigned long val, delay;
+	irq_desc_t *desc;
+	unsigned int i;
+
+	down(&probe_sem);
+	/*
+	 * something may have generated an irq long ago and we want to
+	 * flush such a longstanding irq before considering it as spurious.
+	 */
+	for (i = NR_IRQS-1; i > 0; i--) {
+		desc = irq_desc + i;
+
+		spin_lock_irq(&desc->lock);
+		if (!irq_desc[i].action)
+			irq_desc[i].handler->startup(i);
+		spin_unlock_irq(&desc->lock);
+	}
+
+	/* Wait for longstanding interrupts to trigger. */
+	for (delay = jiffies + HZ/50; time_after(delay, jiffies); )
+		/* about 20ms delay */ barrier();
+
+	/*
+	 * enable any unassigned irqs
+	 * (we must startup again here because if a longstanding irq
+	 * happened in the previous stage, it may have masked itself)
+	 */
+	for (i = NR_IRQS-1; i > 0; i--) {
+		desc = irq_desc + i;
+
+		spin_lock_irq(&desc->lock);
+		if (!desc->action) {
+			desc->status |= IRQ_AUTODETECT | IRQ_WAITING;
+			if (desc->handler->startup(i))
+				desc->status |= IRQ_PENDING;
+		}
+		spin_unlock_irq(&desc->lock);
+	}
+
+	/*
+	 * Wait for spurious interrupts to trigger
+	 */
+	for (delay = jiffies + HZ/10; time_after(delay, jiffies); )
+		/* about 100ms delay */ barrier();
+
+	/*
+	 * Now filter out any obviously spurious interrupts
+	 */
+	val = 0;
+	for (i = 0; i < NR_IRQS; i++) {
+		irq_desc_t *desc = irq_desc + i;
+		unsigned int status;
+
+		spin_lock_irq(&desc->lock);
+		status = desc->status;
+
+		if (status & IRQ_AUTODETECT) {
+			/* It triggered already - consider it spurious. */
+			if (!(status & IRQ_WAITING)) {
+				desc->status = status & ~IRQ_AUTODETECT;
+				desc->handler->shutdown(i);
+			} else
+				if (i < 32)
+					val |= 1 << i;
+		}
+		spin_unlock_irq(&desc->lock);
+	}
+
+	return val;
+}
+
+EXPORT_SYMBOL(probe_irq_on);
+
+/**
+ *	probe_irq_mask - scan a bitmap of interrupt lines
+ *	@val:	mask of interrupts to consider
+ *
+ *	Scan the interrupt lines and return a bitmap of active
+ *	autodetect interrupts. The interrupt probe logic state
+ *	is then returned to its previous value.
+ *
+ *	Note: we need to scan all the irq's even though we will
+ *	only return autodetect irq numbers - just so that we reset
+ *	them all to a known state.
+ */
+unsigned int probe_irq_mask(unsigned long val)
+{
+	unsigned int mask;
+	int i;
+
+	mask = 0;
+	for (i = 0; i < NR_IRQS; i++) {
+		irq_desc_t *desc = irq_desc + i;
+		unsigned int status;
+
+		spin_lock_irq(&desc->lock);
+		status = desc->status;
+
+		if (status & IRQ_AUTODETECT) {
+			if (i < 16 && !(status & IRQ_WAITING))
+				mask |= 1 << i;
+
+			desc->status = status & ~IRQ_AUTODETECT;
+			desc->handler->shutdown(i);
+		}
+		spin_unlock_irq(&desc->lock);
+	}
+	up(&probe_sem);
+
+	return mask & val;
+}
+EXPORT_SYMBOL(probe_irq_mask);
+
+/**
+ *	probe_irq_off	- end an interrupt autodetect
+ *	@val: mask of potential interrupts (unused)
+ *
+ *	Scans the unused interrupt lines and returns the line which
+ *	appears to have triggered the interrupt. If no interrupt was
+ *	found then zero is returned. If more than one interrupt is
+ *	found then minus the first candidate is returned to indicate
+ *	their is doubt.
+ *
+ *	The interrupt probe logic state is returned to its previous
+ *	value.
+ *
+ *	BUGS: When used in a module (which arguably shouldn't happen)
+ *	nothing prevents two IRQ probe callers from overlapping. The
+ *	results of this are non-optimal.
+ */
+int probe_irq_off(unsigned long val)
+{
+	int i, irq_found = 0, nr_irqs = 0;
+
+	for (i = 0; i < NR_IRQS; i++) {
+		irq_desc_t *desc = irq_desc + i;
+		unsigned int status;
+
+		spin_lock_irq(&desc->lock);
+		status = desc->status;
+
+		if (status & IRQ_AUTODETECT) {
+			if (!(status & IRQ_WAITING)) {
+				if (!nr_irqs)
+					irq_found = i;
+				nr_irqs++;
+			}
+			desc->status = status & ~IRQ_AUTODETECT;
+			desc->handler->shutdown(i);
+		}
+		spin_unlock_irq(&desc->lock);
+	}
+	up(&probe_sem);
+
+	if (nr_irqs > 1)
+		irq_found = -irq_found;
+	return irq_found;
+}
+
+EXPORT_SYMBOL(probe_irq_off);
+
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
new file mode 100644
index 000000000000..2fb0e46e11f3
--- /dev/null
+++ b/kernel/irq/handle.c
@@ -0,0 +1,193 @@
+/*
+ * linux/kernel/irq/handle.c
+ *
+ * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar
+ *
+ * This file contains the core interrupt handling code.
+ */
+
+#include <linux/irq.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/interrupt.h>
+#include <linux/kernel_stat.h>
+
+#include "internals.h"
+
+/*
+ * Linux has a controller-independent interrupt architecture.
+ * Every controller has a 'controller-template', that is used
+ * by the main code to do the right thing. Each driver-visible
+ * interrupt source is transparently wired to the apropriate
+ * controller. Thus drivers need not be aware of the
+ * interrupt-controller.
+ *
+ * The code is designed to be easily extended with new/different
+ * interrupt controllers, without having to do assembly magic or
+ * having to touch the generic code.
+ *
+ * Controller mappings for all interrupt sources:
+ */
+irq_desc_t irq_desc[NR_IRQS] __cacheline_aligned = {
+	[0 ... NR_IRQS-1] = {
+		.handler = &no_irq_type,
+		.lock = SPIN_LOCK_UNLOCKED
+	}
+};
+
+/*
+ * Generic 'no controller' code
+ */
+static void end_none(unsigned int irq) { }
+static void enable_none(unsigned int irq) { }
+static void disable_none(unsigned int irq) { }
+static void shutdown_none(unsigned int irq) { }
+static unsigned int startup_none(unsigned int irq) { return 0; }
+
+static void ack_none(unsigned int irq)
+{
+	/*
+	 * 'what should we do if we get a hw irq event on an illegal vector'.
+	 * each architecture has to answer this themself.
+	 */
+	ack_bad_irq(irq);
+}
+
+struct hw_interrupt_type no_irq_type = {
+	.typename = 	"none",
+	.startup = 	startup_none,
+	.shutdown = 	shutdown_none,
+	.enable = 	enable_none,
+	.disable = 	disable_none,
+	.ack = 		ack_none,
+	.end = 		end_none,
+	.set_affinity = NULL
+};
+
+/*
+ * Special, empty irq handler:
+ */
+irqreturn_t no_action(int cpl, void *dev_id, struct pt_regs *regs)
+{
+	return IRQ_NONE;
+}
+
+/*
+ * Have got an event to handle:
+ */
+fastcall int handle_IRQ_event(unsigned int irq, struct pt_regs *regs,
+				struct irqaction *action)
+{
+	int ret, retval = 0, status = 0;
+
+	if (!(action->flags & SA_INTERRUPT))
+		local_irq_enable();
+
+	do {
+		ret = action->handler(irq, action->dev_id, regs);
+		if (ret == IRQ_HANDLED)
+			status |= action->flags;
+		retval |= ret;
+		action = action->next;
+	} while (action);
+
+	if (status & SA_SAMPLE_RANDOM)
+		add_interrupt_randomness(irq);
+	local_irq_disable();
+
+	return retval;
+}
+
+/*
+ * do_IRQ handles all normal device IRQ's (the special
+ * SMP cross-CPU interrupts have their own specific
+ * handlers).
+ */
+fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs)
+{
+	irq_desc_t *desc = irq_desc + irq;
+	struct irqaction * action;
+	unsigned int status;
+
+	kstat_this_cpu.irqs[irq]++;
+	if (desc->status & IRQ_PER_CPU) {
+		irqreturn_t action_ret;
+
+		/*
+		 * No locking required for CPU-local interrupts:
+		 */
+		desc->handler->ack(irq);
+		action_ret = handle_IRQ_event(irq, regs, desc->action);
+		if (!noirqdebug)
+			note_interrupt(irq, desc, action_ret);
+		desc->handler->end(irq);
+		return 1;
+	}
+
+	spin_lock(&desc->lock);
+	desc->handler->ack(irq);
+	/*
+	 * REPLAY is when Linux resends an IRQ that was dropped earlier
+	 * WAITING is used by probe to mark irqs that are being tested
+	 */
+	status = desc->status & ~(IRQ_REPLAY | IRQ_WAITING);
+	status |= IRQ_PENDING; /* we _want_ to handle it */
+
+	/*
+	 * If the IRQ is disabled for whatever reason, we cannot
+	 * use the action we have.
+	 */
+	action = NULL;
+	if (likely(!(status & (IRQ_DISABLED | IRQ_INPROGRESS)))) {
+		action = desc->action;
+		status &= ~IRQ_PENDING; /* we commit to handling */
+		status |= IRQ_INPROGRESS; /* we are handling it */
+	}
+	desc->status = status;
+
+	/*
+	 * If there is no IRQ handler or it was disabled, exit early.
+	 * Since we set PENDING, if another processor is handling
+	 * a different instance of this same irq, the other processor
+	 * will take care of it.
+	 */
+	if (unlikely(!action))
+		goto out;
+
+	/*
+	 * Edge triggered interrupts need to remember
+	 * pending events.
+	 * This applies to any hw interrupts that allow a second
+	 * instance of the same irq to arrive while we are in do_IRQ
+	 * or in the handler. But the code here only handles the _second_
+	 * instance of the irq, not the third or fourth. So it is mostly
+	 * useful for irq hardware that does not mask cleanly in an
+	 * SMP environment.
+	 */
+	for (;;) {
+		irqreturn_t action_ret;
+
+		spin_unlock(&desc->lock);
+
+		action_ret = handle_IRQ_event(irq, regs, action);
+
+		spin_lock(&desc->lock);
+		if (!noirqdebug)
+			note_interrupt(irq, desc, action_ret);
+		if (likely(!(desc->status & IRQ_PENDING)))
+			break;
+		desc->status &= ~IRQ_PENDING;
+	}
+	desc->status &= ~IRQ_INPROGRESS;
+
+out:
+	/*
+	 * The ->end() handler has to deal with interrupts which got
+	 * disabled while the handler was running.
+	 */
+	desc->handler->end(irq);
+	spin_unlock(&desc->lock);
+
+	return 1;
+}
+
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
new file mode 100644
index 000000000000..46feba630266
--- /dev/null
+++ b/kernel/irq/internals.h
@@ -0,0 +1,18 @@
+/*
+ * IRQ subsystem internal functions and variables:
+ */
+
+extern int noirqdebug;
+
+#ifdef CONFIG_PROC_FS
+extern void register_irq_proc(unsigned int irq);
+extern void register_handler_proc(unsigned int irq, struct irqaction *action);
+extern void unregister_handler_proc(unsigned int irq, struct irqaction *action);
+#else
+static inline void register_irq_proc(unsigned int irq) { }
+static inline void register_handler_proc(unsigned int irq,
+					 struct irqaction *action) { }
+static inline void unregister_handler_proc(unsigned int irq,
+					   struct irqaction *action) { }
+#endif
+
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
new file mode 100644
index 000000000000..5202e4c4a5b6
--- /dev/null
+++ b/kernel/irq/manage.c
@@ -0,0 +1,349 @@
+/*
+ * linux/kernel/irq/manage.c
+ *
+ * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar
+ *
+ * This file contains driver APIs to the irq subsystem.
+ */
+
+#include <linux/irq.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/interrupt.h>
+
+#include "internals.h"
+
+#ifdef CONFIG_SMP
+
+cpumask_t irq_affinity[NR_IRQS] = { [0 ... NR_IRQS-1] = CPU_MASK_ALL };
+
+/**
+ *	synchronize_irq - wait for pending IRQ handlers (on other CPUs)
+ *
+ *	This function waits for any pending IRQ handlers for this interrupt
+ *	to complete before returning. If you use this function while
+ *	holding a resource the IRQ handler may need you will deadlock.
+ *
+ *	This function may be called - with care - from IRQ context.
+ */
+void synchronize_irq(unsigned int irq)
+{
+	struct irq_desc *desc = irq_desc + irq;
+
+	while (desc->status & IRQ_INPROGRESS)
+		cpu_relax();
+}
+
+EXPORT_SYMBOL(synchronize_irq);
+
+#endif
+
+/**
+ *	disable_irq_nosync - disable an irq without waiting
+ *	@irq: Interrupt to disable
+ *
+ *	Disable the selected interrupt line.  Disables and Enables are
+ *	nested.
+ *	Unlike disable_irq(), this function does not ensure existing
+ *	instances of the IRQ handler have completed before returning.
+ *
+ *	This function may be called from IRQ context.
+ */
+void disable_irq_nosync(unsigned int irq)
+{
+	irq_desc_t *desc = irq_desc + irq;
+	unsigned long flags;
+
+	spin_lock_irqsave(&desc->lock, flags);
+	if (!desc->depth++) {
+		desc->status |= IRQ_DISABLED;
+		desc->handler->disable(irq);
+	}
+	spin_unlock_irqrestore(&desc->lock, flags);
+}
+
+EXPORT_SYMBOL(disable_irq_nosync);
+
+/**
+ *	disable_irq - disable an irq and wait for completion
+ *	@irq: Interrupt to disable
+ *
+ *	Disable the selected interrupt line.  Enables and Disables are
+ *	nested.
+ *	This function waits for any pending IRQ handlers for this interrupt
+ *	to complete before returning. If you use this function while
+ *	holding a resource the IRQ handler may need you will deadlock.
+ *
+ *	This function may be called - with care - from IRQ context.
+ */
+void disable_irq(unsigned int irq)
+{
+	irq_desc_t *desc = irq_desc + irq;
+
+	disable_irq_nosync(irq);
+	if (desc->action)
+		synchronize_irq(irq);
+}
+
+EXPORT_SYMBOL(disable_irq);
+
+/**
+ *	enable_irq - enable handling of an irq
+ *	@irq: Interrupt to enable
+ *
+ *	Undoes the effect of one call to disable_irq().  If this
+ *	matches the last disable, processing of interrupts on this
+ *	IRQ line is re-enabled.
+ *
+ *	This function may be called from IRQ context.
+ */
+void enable_irq(unsigned int irq)
+{
+	irq_desc_t *desc = irq_desc + irq;
+	unsigned long flags;
+
+	spin_lock_irqsave(&desc->lock, flags);
+	switch (desc->depth) {
+	case 0:
+		WARN_ON(1);
+		break;
+	case 1: {
+		unsigned int status = desc->status & ~IRQ_DISABLED;
+
+		desc->status = status;
+		if ((status & (IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) {
+			desc->status = status | IRQ_REPLAY;
+			hw_resend_irq(desc->handler,irq);
+		}
+		desc->handler->enable(irq);
+		/* fall-through */
+	}
+	default:
+		desc->depth--;
+	}
+	spin_unlock_irqrestore(&desc->lock, flags);
+}
+
+EXPORT_SYMBOL(enable_irq);
+
+/*
+ * Internal function that tells the architecture code whether a
+ * particular irq has been exclusively allocated or is available
+ * for driver use.
+ */
+int can_request_irq(unsigned int irq, unsigned long irqflags)
+{
+	struct irqaction *action;
+
+	if (irq >= NR_IRQS)
+		return 0;
+
+	action = irq_desc[irq].action;
+	if (action)
+		if (irqflags & action->flags & SA_SHIRQ)
+			action = NULL;
+
+	return !action;
+}
+
+/*
+ * Internal function to register an irqaction - typically used to
+ * allocate special interrupts that are part of the architecture.
+ */
+int setup_irq(unsigned int irq, struct irqaction * new)
+{
+	struct irq_desc *desc = irq_desc + irq;
+	struct irqaction *old, **p;
+	unsigned long flags;
+	int shared = 0;
+
+	if (desc->handler == &no_irq_type)
+		return -ENOSYS;
+	/*
+	 * Some drivers like serial.c use request_irq() heavily,
+	 * so we have to be careful not to interfere with a
+	 * running system.
+	 */
+	if (new->flags & SA_SAMPLE_RANDOM) {
+		/*
+		 * This function might sleep, we want to call it first,
+		 * outside of the atomic block.
+		 * Yes, this might clear the entropy pool if the wrong
+		 * driver is attempted to be loaded, without actually
+		 * installing a new handler, but is this really a problem,
+		 * only the sysadmin is able to do this.
+		 */
+		rand_initialize_irq(irq);
+	}
+
+	/*
+	 * The following block of code has to be executed atomically
+	 */
+	spin_lock_irqsave(&desc->lock,flags);
+	p = &desc->action;
+	if ((old = *p) != NULL) {
+		/* Can't share interrupts unless both agree to */
+		if (!(old->flags & new->flags & SA_SHIRQ)) {
+			spin_unlock_irqrestore(&desc->lock,flags);
+			return -EBUSY;
+		}
+
+		/* add new interrupt at end of irq queue */
+		do {
+			p = &old->next;
+			old = *p;
+		} while (old);
+		shared = 1;
+	}
+
+	*p = new;
+
+	if (!shared) {
+		desc->depth = 0;
+		desc->status &= ~(IRQ_DISABLED | IRQ_AUTODETECT |
+				  IRQ_WAITING | IRQ_INPROGRESS);
+		if (desc->handler->startup)
+			desc->handler->startup(irq);
+		else
+			desc->handler->enable(irq);
+	}
+	spin_unlock_irqrestore(&desc->lock,flags);
+
+	new->irq = irq;
+	register_irq_proc(irq);
+	new->dir = NULL;
+	register_handler_proc(irq, new);
+
+	return 0;
+}
+
+/**
+ *	free_irq - free an interrupt
+ *	@irq: Interrupt line to free
+ *	@dev_id: Device identity to free
+ *
+ *	Remove an interrupt handler. The handler is removed and if the
+ *	interrupt line is no longer in use by any driver it is disabled.
+ *	On a shared IRQ the caller must ensure the interrupt is disabled
+ *	on the card it drives before calling this function. The function
+ *	does not return until any executing interrupts for this IRQ
+ *	have completed.
+ *
+ *	This function must not be called from interrupt context.
+ */
+void free_irq(unsigned int irq, void *dev_id)
+{
+	struct irq_desc *desc;
+	struct irqaction **p;
+	unsigned long flags;
+
+	if (irq >= NR_IRQS)
+		return;
+
+	desc = irq_desc + irq;
+	spin_lock_irqsave(&desc->lock,flags);
+	p = &desc->action;
+	for (;;) {
+		struct irqaction * action = *p;
+
+		if (action) {
+			struct irqaction **pp = p;
+
+			p = &action->next;
+			if (action->dev_id != dev_id)
+				continue;
+
+			/* Found it - now remove it from the list of entries */
+			*pp = action->next;
+			if (!desc->action) {
+				desc->status |= IRQ_DISABLED;
+				if (desc->handler->shutdown)
+					desc->handler->shutdown(irq);
+				else
+					desc->handler->disable(irq);
+			}
+			spin_unlock_irqrestore(&desc->lock,flags);
+			unregister_handler_proc(irq, action);
+
+			/* Make sure it's not being used on another CPU */
+			synchronize_irq(irq);
+			kfree(action);
+			return;
+		}
+		printk(KERN_ERR "Trying to free free IRQ%d\n",irq);
+		spin_unlock_irqrestore(&desc->lock,flags);
+		return;
+	}
+}
+
+EXPORT_SYMBOL(free_irq);
+
+/**
+ *	request_irq - allocate an interrupt line
+ *	@irq: Interrupt line to allocate
+ *	@handler: Function to be called when the IRQ occurs
+ *	@irqflags: Interrupt type flags
+ *	@devname: An ascii name for the claiming device
+ *	@dev_id: A cookie passed back to the handler function
+ *
+ *	This call allocates interrupt resources and enables the
+ *	interrupt line and IRQ handling. From the point this
+ *	call is made your handler function may be invoked. Since
+ *	your handler function must clear any interrupt the board
+ *	raises, you must take care both to initialise your hardware
+ *	and to set up the interrupt handler in the right order.
+ *
+ *	Dev_id must be globally unique. Normally the address of the
+ *	device data structure is used as the cookie. Since the handler
+ *	receives this value it makes sense to use it.
+ *
+ *	If your interrupt is shared you must pass a non NULL dev_id
+ *	as this is required when freeing the interrupt.
+ *
+ *	Flags:
+ *
+ *	SA_SHIRQ		Interrupt is shared
+ *	SA_INTERRUPT		Disable local interrupts while processing
+ *	SA_SAMPLE_RANDOM	The interrupt can be used for entropy
+ *
+ */
+int request_irq(unsigned int irq,
+		irqreturn_t (*handler)(int, void *, struct pt_regs *),
+		unsigned long irqflags, const char * devname, void *dev_id)
+{
+	struct irqaction * action;
+	int retval;
+
+	/*
+	 * Sanity-check: shared interrupts must pass in a real dev-ID,
+	 * otherwise we'll have trouble later trying to figure out
+	 * which interrupt is which (messes up the interrupt freeing
+	 * logic etc).
+	 */
+	if ((irqflags & SA_SHIRQ) && !dev_id)
+		return -EINVAL;
+	if (irq >= NR_IRQS)
+		return -EINVAL;
+	if (!handler)
+		return -EINVAL;
+
+	action = kmalloc(sizeof(struct irqaction), GFP_ATOMIC);
+	if (!action)
+		return -ENOMEM;
+
+	action->handler = handler;
+	action->flags = irqflags;
+	cpus_clear(action->mask);
+	action->name = devname;
+	action->next = NULL;
+	action->dev_id = dev_id;
+
+	retval = setup_irq(irq, action);
+	if (retval)
+		kfree(action);
+
+	return retval;
+}
+
+EXPORT_SYMBOL(request_irq);
+
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
new file mode 100644
index 000000000000..85d08daa6600
--- /dev/null
+++ b/kernel/irq/proc.c
@@ -0,0 +1,159 @@
+/*
+ * linux/kernel/irq/proc.c
+ *
+ * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar
+ *
+ * This file contains the /proc/irq/ handling code.
+ */
+
+#include <linux/irq.h>
+#include <linux/proc_fs.h>
+#include <linux/interrupt.h>
+
+static struct proc_dir_entry *root_irq_dir, *irq_dir[NR_IRQS];
+
+#ifdef CONFIG_SMP
+
+/*
+ * The /proc/irq/<irq>/smp_affinity values:
+ */
+static struct proc_dir_entry *smp_affinity_entry[NR_IRQS];
+
+void __attribute__((weak))
+proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
+{
+	irq_affinity[irq] = mask_val;
+	irq_desc[irq].handler->set_affinity(irq, mask_val);
+}
+
+static int irq_affinity_read_proc(char *page, char **start, off_t off,
+				  int count, int *eof, void *data)
+{
+	int len = cpumask_scnprintf(page, count, irq_affinity[(long)data]);
+
+	if (count - len < 2)
+		return -EINVAL;
+	len += sprintf(page + len, "\n");
+	return len;
+}
+
+int no_irq_affinity;
+static int irq_affinity_write_proc(struct file *file, const char __user *buffer,
+				   unsigned long count, void *data)
+{
+	unsigned int irq = (int)(long)data, full_count = count, err;
+	cpumask_t new_value, tmp;
+
+	if (!irq_desc[irq].handler->set_affinity || no_irq_affinity)
+		return -EIO;
+
+	err = cpumask_parse(buffer, count, new_value);
+	if (err)
+		return err;
+
+	/*
+	 * Do not allow disabling IRQs completely - it's a too easy
+	 * way to make the system unusable accidentally :-) At least
+	 * one online CPU still has to be targeted.
+	 */
+	cpus_and(tmp, new_value, cpu_online_map);
+	if (cpus_empty(tmp))
+		return -EINVAL;
+
+	proc_set_irq_affinity(irq, new_value);
+
+	return full_count;
+}
+
+#endif
+
+#define MAX_NAMELEN 128
+
+static int name_unique(unsigned int irq, struct irqaction *new_action)
+{
+	struct irq_desc *desc = irq_desc + irq;
+	struct irqaction *action;
+
+	for (action = desc->action ; action; action = action->next)
+		if ((action != new_action) && action->name &&
+				!strcmp(new_action->name, action->name))
+			return 0;
+	return 1;
+}
+
+void register_handler_proc(unsigned int irq, struct irqaction *action)
+{
+	char name [MAX_NAMELEN];
+
+	if (!irq_dir[irq] || action->dir || !action->name ||
+					!name_unique(irq, action))
+		return;
+
+	memset(name, 0, MAX_NAMELEN);
+	snprintf(name, MAX_NAMELEN, "%s", action->name);
+
+	/* create /proc/irq/1234/handler/ */
+	action->dir = proc_mkdir(name, irq_dir[irq]);
+}
+
+#undef MAX_NAMELEN
+
+#define MAX_NAMELEN 10
+
+void register_irq_proc(unsigned int irq)
+{
+	char name [MAX_NAMELEN];
+
+	if (!root_irq_dir ||
+		(irq_desc[irq].handler == &no_irq_type) ||
+			irq_dir[irq])
+		return;
+
+	memset(name, 0, MAX_NAMELEN);
+	sprintf(name, "%d", irq);
+
+	/* create /proc/irq/1234 */
+	irq_dir[irq] = proc_mkdir(name, root_irq_dir);
+
+#ifdef CONFIG_SMP
+	{
+		struct proc_dir_entry *entry;
+
+		/* create /proc/irq/<irq>/smp_affinity */
+		entry = create_proc_entry("smp_affinity", 0600, irq_dir[irq]);
+
+		if (entry) {
+			entry->nlink = 1;
+			entry->data = (void *)(long)irq;
+			entry->read_proc = irq_affinity_read_proc;
+			entry->write_proc = irq_affinity_write_proc;
+		}
+		smp_affinity_entry[irq] = entry;
+	}
+#endif
+}
+
+#undef MAX_NAMELEN
+
+void unregister_handler_proc(unsigned int irq, struct irqaction *action)
+{
+	if (action->dir)
+		remove_proc_entry(action->dir->name, irq_dir[irq]);
+}
+
+void init_irq_proc(void)
+{
+	int i;
+
+	/* create /proc/irq */
+	root_irq_dir = proc_mkdir("irq", NULL);
+	if (!root_irq_dir)
+		return;
+
+	/*
+	 * Create entries for all existing IRQs.
+	 */
+	for (i = 0; i < NR_IRQS; i++)
+		register_irq_proc(i);
+}
+
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
new file mode 100644
index 000000000000..f6297c306905
--- /dev/null
+++ b/kernel/irq/spurious.c
@@ -0,0 +1,96 @@
+/*
+ * linux/kernel/irq/spurious.c
+ *
+ * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar
+ *
+ * This file contains spurious interrupt handling.
+ */
+
+#include <linux/irq.h>
+#include <linux/module.h>
+#include <linux/kallsyms.h>
+#include <linux/interrupt.h>
+
+/*
+ * If 99,900 of the previous 100,000 interrupts have not been handled
+ * then assume that the IRQ is stuck in some manner. Drop a diagnostic
+ * and try to turn the IRQ off.
+ *
+ * (The other 100-of-100,000 interrupts may have been a correctly
+ *  functioning device sharing an IRQ with the failing one)
+ *
+ * Called under desc->lock
+ */
+
+static void
+__report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret)
+{
+	struct irqaction *action;
+
+	if (action_ret != IRQ_HANDLED && action_ret != IRQ_NONE) {
+		printk(KERN_ERR "irq event %d: bogus return value %x\n",
+				irq, action_ret);
+	} else {
+		printk(KERN_ERR "irq %d: nobody cared!\n", irq);
+	}
+	dump_stack();
+	printk(KERN_ERR "handlers:\n");
+	action = desc->action;
+	while (action) {
+		printk(KERN_ERR "[<%p>]", action->handler);
+		print_symbol(" (%s)",
+			(unsigned long)action->handler);
+		printk("\n");
+		action = action->next;
+	}
+}
+
+void report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret)
+{
+	static int count = 100;
+
+	if (count > 0) {
+		count--;
+		__report_bad_irq(irq, desc, action_ret);
+	}
+}
+
+void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret)
+{
+	if (action_ret != IRQ_HANDLED) {
+		desc->irqs_unhandled++;
+		if (action_ret != IRQ_NONE)
+			report_bad_irq(irq, desc, action_ret);
+	}
+
+	desc->irq_count++;
+	if (desc->irq_count < 100000)
+		return;
+
+	desc->irq_count = 0;
+	if (desc->irqs_unhandled > 99900) {
+		/*
+		 * The interrupt is stuck
+		 */
+		__report_bad_irq(irq, desc, action_ret);
+		/*
+		 * Now kill the IRQ
+		 */
+		printk(KERN_EMERG "Disabling IRQ #%d\n", irq);
+		desc->status |= IRQ_DISABLED;
+		desc->handler->disable(irq);
+	}
+	desc->irqs_unhandled = 0;
+}
+
+int noirqdebug;
+
+int __init noirqdebug_setup(char *str)
+{
+	noirqdebug = 1;
+	printk(KERN_INFO "IRQ lockup detection disabled\n");
+	return 1;
+}
+
+__setup("noirqdebug", noirqdebug_setup);
+
diff --git a/kernel/itimer.c b/kernel/itimer.c
new file mode 100644
index 000000000000..e9a40e947e07
--- /dev/null
+++ b/kernel/itimer.c
@@ -0,0 +1,241 @@
+/*
+ * linux/kernel/itimer.c
+ *
+ * Copyright (C) 1992 Darren Senn
+ */
+
+/* These are all the functions necessary to implement itimers */
+
+#include <linux/mm.h>
+#include <linux/smp_lock.h>
+#include <linux/interrupt.h>
+#include <linux/syscalls.h>
+#include <linux/time.h>
+#include <linux/posix-timers.h>
+
+#include <asm/uaccess.h>
+
+static unsigned long it_real_value(struct signal_struct *sig)
+{
+	unsigned long val = 0;
+	if (timer_pending(&sig->real_timer)) {
+		val = sig->real_timer.expires - jiffies;
+
+		/* look out for negative/zero itimer.. */
+		if ((long) val <= 0)
+			val = 1;
+	}
+	return val;
+}
+
+int do_getitimer(int which, struct itimerval *value)
+{
+	struct task_struct *tsk = current;
+	unsigned long interval, val;
+	cputime_t cinterval, cval;
+
+	switch (which) {
+	case ITIMER_REAL:
+		spin_lock_irq(&tsk->sighand->siglock);
+		interval = tsk->signal->it_real_incr;
+		val = it_real_value(tsk->signal);
+		spin_unlock_irq(&tsk->sighand->siglock);
+		jiffies_to_timeval(val, &value->it_value);
+		jiffies_to_timeval(interval, &value->it_interval);
+		break;
+	case ITIMER_VIRTUAL:
+		read_lock(&tasklist_lock);
+		spin_lock_irq(&tsk->sighand->siglock);
+		cval = tsk->signal->it_virt_expires;
+		cinterval = tsk->signal->it_virt_incr;
+		if (!cputime_eq(cval, cputime_zero)) {
+			struct task_struct *t = tsk;
+			cputime_t utime = tsk->signal->utime;
+			do {
+				utime = cputime_add(utime, t->utime);
+				t = next_thread(t);
+			} while (t != tsk);
+			if (cputime_le(cval, utime)) { /* about to fire */
+				cval = jiffies_to_cputime(1);
+			} else {
+				cval = cputime_sub(cval, utime);
+			}
+		}
+		spin_unlock_irq(&tsk->sighand->siglock);
+		read_unlock(&tasklist_lock);
+		cputime_to_timeval(cval, &value->it_value);
+		cputime_to_timeval(cinterval, &value->it_interval);
+		break;
+	case ITIMER_PROF:
+		read_lock(&tasklist_lock);
+		spin_lock_irq(&tsk->sighand->siglock);
+		cval = tsk->signal->it_prof_expires;
+		cinterval = tsk->signal->it_prof_incr;
+		if (!cputime_eq(cval, cputime_zero)) {
+			struct task_struct *t = tsk;
+			cputime_t ptime = cputime_add(tsk->signal->utime,
+						      tsk->signal->stime);
+			do {
+				ptime = cputime_add(ptime,
+						    cputime_add(t->utime,
+								t->stime));
+				t = next_thread(t);
+			} while (t != tsk);
+			if (cputime_le(cval, ptime)) { /* about to fire */
+				cval = jiffies_to_cputime(1);
+			} else {
+				cval = cputime_sub(cval, ptime);
+			}
+		}
+		spin_unlock_irq(&tsk->sighand->siglock);
+		read_unlock(&tasklist_lock);
+		cputime_to_timeval(cval, &value->it_value);
+		cputime_to_timeval(cinterval, &value->it_interval);
+		break;
+	default:
+		return(-EINVAL);
+	}
+	return 0;
+}
+
+asmlinkage long sys_getitimer(int which, struct itimerval __user *value)
+{
+	int error = -EFAULT;
+	struct itimerval get_buffer;
+
+	if (value) {
+		error = do_getitimer(which, &get_buffer);
+		if (!error &&
+		    copy_to_user(value, &get_buffer, sizeof(get_buffer)))
+			error = -EFAULT;
+	}
+	return error;
+}
+
+/*
+ * Called with P->sighand->siglock held and P->signal->real_timer inactive.
+ * If interval is nonzero, arm the timer for interval ticks from now.
+ */
+static inline void it_real_arm(struct task_struct *p, unsigned long interval)
+{
+	p->signal->it_real_value = interval; /* XXX unnecessary field?? */
+	if (interval == 0)
+		return;
+	if (interval > (unsigned long) LONG_MAX)
+		interval = LONG_MAX;
+	p->signal->real_timer.expires = jiffies + interval;
+	add_timer(&p->signal->real_timer);
+}
+
+void it_real_fn(unsigned long __data)
+{
+	struct task_struct * p = (struct task_struct *) __data;
+
+	send_group_sig_info(SIGALRM, SEND_SIG_PRIV, p);
+
+	/*
+	 * Now restart the timer if necessary.  We don't need any locking
+	 * here because do_setitimer makes sure we have finished running
+	 * before it touches anything.
+	 */
+	it_real_arm(p, p->signal->it_real_incr);
+}
+
+int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue)
+{
+	struct task_struct *tsk = current;
+ 	unsigned long val, interval;
+	cputime_t cval, cinterval, nval, ninterval;
+
+	switch (which) {
+	case ITIMER_REAL:
+		spin_lock_irq(&tsk->sighand->siglock);
+		interval = tsk->signal->it_real_incr;
+		val = it_real_value(tsk->signal);
+		if (val)
+			del_timer_sync(&tsk->signal->real_timer);
+		tsk->signal->it_real_incr =
+			timeval_to_jiffies(&value->it_interval);
+		it_real_arm(tsk, timeval_to_jiffies(&value->it_value));
+		spin_unlock_irq(&tsk->sighand->siglock);
+		if (ovalue) {
+			jiffies_to_timeval(val, &ovalue->it_value);
+			jiffies_to_timeval(interval,
+					   &ovalue->it_interval);
+		}
+		break;
+	case ITIMER_VIRTUAL:
+		nval = timeval_to_cputime(&value->it_value);
+		ninterval = timeval_to_cputime(&value->it_interval);
+		read_lock(&tasklist_lock);
+		spin_lock_irq(&tsk->sighand->siglock);
+		cval = tsk->signal->it_virt_expires;
+		cinterval = tsk->signal->it_virt_incr;
+		if (!cputime_eq(cval, cputime_zero) ||
+		    !cputime_eq(nval, cputime_zero)) {
+			if (cputime_gt(nval, cputime_zero))
+				nval = cputime_add(nval,
+						   jiffies_to_cputime(1));
+			set_process_cpu_timer(tsk, CPUCLOCK_VIRT,
+					      &nval, &cval);
+		}
+		tsk->signal->it_virt_expires = nval;
+		tsk->signal->it_virt_incr = ninterval;
+		spin_unlock_irq(&tsk->sighand->siglock);
+		read_unlock(&tasklist_lock);
+		if (ovalue) {
+			cputime_to_timeval(cval, &ovalue->it_value);
+			cputime_to_timeval(cinterval, &ovalue->it_interval);
+		}
+		break;
+	case ITIMER_PROF:
+		nval = timeval_to_cputime(&value->it_value);
+		ninterval = timeval_to_cputime(&value->it_interval);
+		read_lock(&tasklist_lock);
+		spin_lock_irq(&tsk->sighand->siglock);
+		cval = tsk->signal->it_prof_expires;
+		cinterval = tsk->signal->it_prof_incr;
+		if (!cputime_eq(cval, cputime_zero) ||
+		    !cputime_eq(nval, cputime_zero)) {
+			if (cputime_gt(nval, cputime_zero))
+				nval = cputime_add(nval,
+						   jiffies_to_cputime(1));
+			set_process_cpu_timer(tsk, CPUCLOCK_PROF,
+					      &nval, &cval);
+		}
+		tsk->signal->it_prof_expires = nval;
+		tsk->signal->it_prof_incr = ninterval;
+		spin_unlock_irq(&tsk->sighand->siglock);
+		read_unlock(&tasklist_lock);
+		if (ovalue) {
+			cputime_to_timeval(cval, &ovalue->it_value);
+			cputime_to_timeval(cinterval, &ovalue->it_interval);
+		}
+		break;
+	default:
+		return -EINVAL;
+	}
+	return 0;
+}
+
+asmlinkage long sys_setitimer(int which,
+			      struct itimerval __user *value,
+			      struct itimerval __user *ovalue)
+{
+	struct itimerval set_buffer, get_buffer;
+	int error;
+
+	if (value) {
+		if(copy_from_user(&set_buffer, value, sizeof(set_buffer)))
+			return -EFAULT;
+	} else
+		memset((char *) &set_buffer, 0, sizeof(set_buffer));
+
+	error = do_setitimer(which, &set_buffer, ovalue ? &get_buffer : NULL);
+	if (error || !ovalue)
+		return error;
+
+	if (copy_to_user(ovalue, &get_buffer, sizeof(get_buffer)))
+		return -EFAULT; 
+	return 0;
+}
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
new file mode 100644
index 000000000000..1627f8d6e0cd
--- /dev/null
+++ b/kernel/kallsyms.c
@@ -0,0 +1,411 @@
+/*
+ * kallsyms.c: in-kernel printing of symbolic oopses and stack traces.
+ *
+ * Rewritten and vastly simplified by Rusty Russell for in-kernel
+ * module loader:
+ *   Copyright 2002 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
+ *
+ * ChangeLog:
+ *
+ * (25/Aug/2004) Paulo Marques <pmarques@grupopie.com>
+ *      Changed the compression method from stem compression to "table lookup"
+ *      compression (see scripts/kallsyms.c for a more complete description)
+ */
+#include <linux/kallsyms.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/seq_file.h>
+#include <linux/fs.h>
+#include <linux/err.h>
+#include <linux/proc_fs.h>
+#include <linux/mm.h>
+
+#include <asm/sections.h>
+
+#ifdef CONFIG_KALLSYMS_ALL
+#define all_var 1
+#else
+#define all_var 0
+#endif
+
+/* These will be re-linked against their real values during the second link stage */
+extern unsigned long kallsyms_addresses[] __attribute__((weak));
+extern unsigned long kallsyms_num_syms __attribute__((weak,section("data")));
+extern u8 kallsyms_names[] __attribute__((weak));
+
+extern u8 kallsyms_token_table[] __attribute__((weak));
+extern u16 kallsyms_token_index[] __attribute__((weak));
+
+extern unsigned long kallsyms_markers[] __attribute__((weak));
+
+static inline int is_kernel_inittext(unsigned long addr)
+{
+	if (addr >= (unsigned long)_sinittext
+	    && addr <= (unsigned long)_einittext)
+		return 1;
+	return 0;
+}
+
+static inline int is_kernel_text(unsigned long addr)
+{
+	if (addr >= (unsigned long)_stext && addr <= (unsigned long)_etext)
+		return 1;
+	return in_gate_area_no_task(addr);
+}
+
+static inline int is_kernel(unsigned long addr)
+{
+	if (addr >= (unsigned long)_stext && addr <= (unsigned long)_end)
+		return 1;
+	return in_gate_area_no_task(addr);
+}
+
+/* expand a compressed symbol data into the resulting uncompressed string,
+   given the offset to where the symbol is in the compressed stream */
+static unsigned int kallsyms_expand_symbol(unsigned int off, char *result)
+{
+	int len, skipped_first = 0;
+	u8 *tptr, *data;
+
+	/* get the compressed symbol length from the first symbol byte */
+	data = &kallsyms_names[off];
+	len = *data;
+	data++;
+
+	/* update the offset to return the offset for the next symbol on
+	 * the compressed stream */
+	off += len + 1;
+
+	/* for every byte on the compressed symbol data, copy the table
+	   entry for that byte */
+	while(len) {
+		tptr = &kallsyms_token_table[ kallsyms_token_index[*data] ];
+		data++;
+		len--;
+
+		while (*tptr) {
+			if(skipped_first) {
+				*result = *tptr;
+				result++;
+			} else
+				skipped_first = 1;
+			tptr++;
+		}
+	}
+
+	*result = '\0';
+
+	/* return to offset to the next symbol */
+	return off;
+}
+
+/* get symbol type information. This is encoded as a single char at the
+ * begining of the symbol name */
+static char kallsyms_get_symbol_type(unsigned int off)
+{
+	/* get just the first code, look it up in the token table, and return the
+	 * first char from this token */
+	return kallsyms_token_table[ kallsyms_token_index[ kallsyms_names[off+1] ] ];
+}
+
+
+/* find the offset on the compressed stream given and index in the
+ * kallsyms array */
+static unsigned int get_symbol_offset(unsigned long pos)
+{
+	u8 *name;
+	int i;
+
+	/* use the closest marker we have. We have markers every 256 positions,
+	 * so that should be close enough */
+	name = &kallsyms_names[ kallsyms_markers[pos>>8] ];
+
+	/* sequentially scan all the symbols up to the point we're searching for.
+	 * Every symbol is stored in a [<len>][<len> bytes of data] format, so we
+	 * just need to add the len to the current pointer for every symbol we
+	 * wish to skip */
+	for(i = 0; i < (pos&0xFF); i++)
+		name = name + (*name) + 1;
+
+	return name - kallsyms_names;
+}
+
+/* Lookup the address for this symbol. Returns 0 if not found. */
+unsigned long kallsyms_lookup_name(const char *name)
+{
+	char namebuf[KSYM_NAME_LEN+1];
+	unsigned long i;
+	unsigned int off;
+
+	for (i = 0, off = 0; i < kallsyms_num_syms; i++) {
+		off = kallsyms_expand_symbol(off, namebuf);
+
+		if (strcmp(namebuf, name) == 0)
+			return kallsyms_addresses[i];
+	}
+	return module_kallsyms_lookup_name(name);
+}
+EXPORT_SYMBOL_GPL(kallsyms_lookup_name);
+
+/*
+ * Lookup an address
+ * - modname is set to NULL if it's in the kernel
+ * - we guarantee that the returned name is valid until we reschedule even if
+ *   it resides in a module
+ * - we also guarantee that modname will be valid until rescheduled
+ */
+const char *kallsyms_lookup(unsigned long addr,
+			    unsigned long *symbolsize,
+			    unsigned long *offset,
+			    char **modname, char *namebuf)
+{
+	unsigned long i, low, high, mid;
+	const char *msym;
+
+	/* This kernel should never had been booted. */
+	BUG_ON(!kallsyms_addresses);
+
+	namebuf[KSYM_NAME_LEN] = 0;
+	namebuf[0] = 0;
+
+	if ((all_var && is_kernel(addr)) ||
+	    (!all_var && (is_kernel_text(addr) || is_kernel_inittext(addr)))) {
+		unsigned long symbol_end=0;
+
+		/* do a binary search on the sorted kallsyms_addresses array */
+		low = 0;
+		high = kallsyms_num_syms;
+
+		while (high-low > 1) {
+			mid = (low + high) / 2;
+			if (kallsyms_addresses[mid] <= addr) low = mid;
+			else high = mid;
+		}
+
+		/* search for the first aliased symbol. Aliased symbols are
+		   symbols with the same address */
+		while (low && kallsyms_addresses[low - 1] == kallsyms_addresses[low])
+			--low;
+
+		/* Grab name */
+		kallsyms_expand_symbol(get_symbol_offset(low), namebuf);
+
+		/* Search for next non-aliased symbol */
+		for (i = low + 1; i < kallsyms_num_syms; i++) {
+			if (kallsyms_addresses[i] > kallsyms_addresses[low]) {
+				symbol_end = kallsyms_addresses[i];
+				break;
+			}
+		}
+
+		/* if we found no next symbol, we use the end of the section */
+		if (!symbol_end) {
+			if (is_kernel_inittext(addr))
+				symbol_end = (unsigned long)_einittext;
+			else
+				symbol_end = all_var ? (unsigned long)_end : (unsigned long)_etext;
+		}
+
+		*symbolsize = symbol_end - kallsyms_addresses[low];
+		*modname = NULL;
+		*offset = addr - kallsyms_addresses[low];
+		return namebuf;
+	}
+
+	/* see if it's in a module */
+	msym = module_address_lookup(addr, symbolsize, offset, modname);
+	if (msym)
+		return strncpy(namebuf, msym, KSYM_NAME_LEN);
+
+	return NULL;
+}
+
+/* Replace "%s" in format with address, or returns -errno. */
+void __print_symbol(const char *fmt, unsigned long address)
+{
+	char *modname;
+	const char *name;
+	unsigned long offset, size;
+	char namebuf[KSYM_NAME_LEN+1];
+	char buffer[sizeof("%s+%#lx/%#lx [%s]") + KSYM_NAME_LEN +
+		    2*(BITS_PER_LONG*3/10) + MODULE_NAME_LEN + 1];
+
+	name = kallsyms_lookup(address, &size, &offset, &modname, namebuf);
+
+	if (!name)
+		sprintf(buffer, "0x%lx", address);
+	else {
+		if (modname)
+			sprintf(buffer, "%s+%#lx/%#lx [%s]", name, offset,
+				size, modname);
+		else
+			sprintf(buffer, "%s+%#lx/%#lx", name, offset, size);
+	}
+	printk(fmt, buffer);
+}
+
+/* To avoid using get_symbol_offset for every symbol, we carry prefix along. */
+struct kallsym_iter
+{
+	loff_t pos;
+	struct module *owner;
+	unsigned long value;
+	unsigned int nameoff; /* If iterating in core kernel symbols */
+	char type;
+	char name[KSYM_NAME_LEN+1];
+};
+
+/* Only label it "global" if it is exported. */
+static void upcase_if_global(struct kallsym_iter *iter)
+{
+	if (is_exported(iter->name, iter->owner))
+		iter->type += 'A' - 'a';
+}
+
+static int get_ksymbol_mod(struct kallsym_iter *iter)
+{
+	iter->owner = module_get_kallsym(iter->pos - kallsyms_num_syms,
+					 &iter->value,
+					 &iter->type, iter->name);
+	if (iter->owner == NULL)
+		return 0;
+
+	upcase_if_global(iter);
+	return 1;
+}
+
+/* Returns space to next name. */
+static unsigned long get_ksymbol_core(struct kallsym_iter *iter)
+{
+	unsigned off = iter->nameoff;
+
+	iter->owner = NULL;
+	iter->value = kallsyms_addresses[iter->pos];
+
+	iter->type = kallsyms_get_symbol_type(off);
+
+	off = kallsyms_expand_symbol(off, iter->name);
+
+	return off - iter->nameoff;
+}
+
+static void reset_iter(struct kallsym_iter *iter, loff_t new_pos)
+{
+	iter->name[0] = '\0';
+	iter->nameoff = get_symbol_offset(new_pos);
+	iter->pos = new_pos;
+}
+
+/* Returns false if pos at or past end of file. */
+static int update_iter(struct kallsym_iter *iter, loff_t pos)
+{
+	/* Module symbols can be accessed randomly. */
+	if (pos >= kallsyms_num_syms) {
+		iter->pos = pos;
+		return get_ksymbol_mod(iter);
+	}
+	
+	/* If we're not on the desired position, reset to new position. */
+	if (pos != iter->pos)
+		reset_iter(iter, pos);
+
+	iter->nameoff += get_ksymbol_core(iter);
+	iter->pos++;
+
+	return 1;
+}
+
+static void *s_next(struct seq_file *m, void *p, loff_t *pos)
+{
+	(*pos)++;
+
+	if (!update_iter(m->private, *pos))
+		return NULL;
+	return p;
+}
+
+static void *s_start(struct seq_file *m, loff_t *pos)
+{
+	if (!update_iter(m->private, *pos))
+		return NULL;
+	return m->private;
+}
+
+static void s_stop(struct seq_file *m, void *p)
+{
+}
+
+static int s_show(struct seq_file *m, void *p)
+{
+	struct kallsym_iter *iter = m->private;
+
+	/* Some debugging symbols have no name.  Ignore them. */ 
+	if (!iter->name[0])
+		return 0;
+
+	if (iter->owner)
+		seq_printf(m, "%0*lx %c %s\t[%s]\n",
+			   (int)(2*sizeof(void*)),
+			   iter->value, iter->type, iter->name,
+			   module_name(iter->owner));
+	else
+		seq_printf(m, "%0*lx %c %s\n",
+			   (int)(2*sizeof(void*)),
+			   iter->value, iter->type, iter->name);
+	return 0;
+}
+
+static struct seq_operations kallsyms_op = {
+	.start = s_start,
+	.next = s_next,
+	.stop = s_stop,
+	.show = s_show
+};
+
+static int kallsyms_open(struct inode *inode, struct file *file)
+{
+	/* We keep iterator in m->private, since normal case is to
+	 * s_start from where we left off, so we avoid doing
+	 * using get_symbol_offset for every symbol */
+	struct kallsym_iter *iter;
+	int ret;
+
+	iter = kmalloc(sizeof(*iter), GFP_KERNEL);
+	if (!iter)
+		return -ENOMEM;
+	reset_iter(iter, 0);
+
+	ret = seq_open(file, &kallsyms_op);
+	if (ret == 0)
+		((struct seq_file *)file->private_data)->private = iter;
+	else
+		kfree(iter);
+	return ret;
+}
+
+static int kallsyms_release(struct inode *inode, struct file *file)
+{
+	struct seq_file *m = (struct seq_file *)file->private_data;
+	kfree(m->private);
+	return seq_release(inode, file);
+}
+
+static struct file_operations kallsyms_operations = {
+	.open = kallsyms_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = kallsyms_release,
+};
+
+static int __init kallsyms_init(void)
+{
+	struct proc_dir_entry *entry;
+
+	entry = create_proc_entry("kallsyms", 0444, NULL);
+	if (entry)
+		entry->proc_fops = &kallsyms_operations;
+	return 0;
+}
+__initcall(kallsyms_init);
+
+EXPORT_SYMBOL(__print_symbol);
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
new file mode 100644
index 000000000000..179baafcdd96
--- /dev/null
+++ b/kernel/kfifo.c
@@ -0,0 +1,168 @@
+/*
+ * A simple kernel FIFO implementation.
+ *
+ * Copyright (C) 2004 Stelian Pop <stelian@popies.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/err.h>
+#include <linux/kfifo.h>
+
+/**
+ * kfifo_init - allocates a new FIFO using a preallocated buffer
+ * @buffer: the preallocated buffer to be used.
+ * @size: the size of the internal buffer, this have to be a power of 2.
+ * @gfp_mask: get_free_pages mask, passed to kmalloc()
+ * @lock: the lock to be used to protect the fifo buffer
+ *
+ * Do NOT pass the kfifo to kfifo_free() after use ! Simply free the
+ * struct kfifo with kfree().
+ */
+struct kfifo *kfifo_init(unsigned char *buffer, unsigned int size,
+			 unsigned int __nocast gfp_mask, spinlock_t *lock)
+{
+	struct kfifo *fifo;
+
+	/* size must be a power of 2 */
+	BUG_ON(size & (size - 1));
+
+	fifo = kmalloc(sizeof(struct kfifo), gfp_mask);
+	if (!fifo)
+		return ERR_PTR(-ENOMEM);
+
+	fifo->buffer = buffer;
+	fifo->size = size;
+	fifo->in = fifo->out = 0;
+	fifo->lock = lock;
+
+	return fifo;
+}
+EXPORT_SYMBOL(kfifo_init);
+
+/**
+ * kfifo_alloc - allocates a new FIFO and its internal buffer
+ * @size: the size of the internal buffer to be allocated.
+ * @gfp_mask: get_free_pages mask, passed to kmalloc()
+ * @lock: the lock to be used to protect the fifo buffer
+ *
+ * The size will be rounded-up to a power of 2.
+ */
+struct kfifo *kfifo_alloc(unsigned int size, unsigned int __nocast gfp_mask, spinlock_t *lock)
+{
+	unsigned char *buffer;
+	struct kfifo *ret;
+
+	/*
+	 * round up to the next power of 2, since our 'let the indices
+	 * wrap' tachnique works only in this case.
+	 */
+	if (size & (size - 1)) {
+		BUG_ON(size > 0x80000000);
+		size = roundup_pow_of_two(size);
+	}
+
+	buffer = kmalloc(size, gfp_mask);
+	if (!buffer)
+		return ERR_PTR(-ENOMEM);
+
+	ret = kfifo_init(buffer, size, gfp_mask, lock);
+
+	if (IS_ERR(ret))
+		kfree(buffer);
+
+	return ret;
+}
+EXPORT_SYMBOL(kfifo_alloc);
+
+/**
+ * kfifo_free - frees the FIFO
+ * @fifo: the fifo to be freed.
+ */
+void kfifo_free(struct kfifo *fifo)
+{
+	kfree(fifo->buffer);
+	kfree(fifo);
+}
+EXPORT_SYMBOL(kfifo_free);
+
+/**
+ * __kfifo_put - puts some data into the FIFO, no locking version
+ * @fifo: the fifo to be used.
+ * @buffer: the data to be added.
+ * @len: the length of the data to be added.
+ *
+ * This function copies at most 'len' bytes from the 'buffer' into
+ * the FIFO depending on the free space, and returns the number of
+ * bytes copied.
+ *
+ * Note that with only one concurrent reader and one concurrent
+ * writer, you don't need extra locking to use these functions.
+ */
+unsigned int __kfifo_put(struct kfifo *fifo,
+			 unsigned char *buffer, unsigned int len)
+{
+	unsigned int l;
+
+	len = min(len, fifo->size - fifo->in + fifo->out);
+
+	/* first put the data starting from fifo->in to buffer end */
+	l = min(len, fifo->size - (fifo->in & (fifo->size - 1)));
+	memcpy(fifo->buffer + (fifo->in & (fifo->size - 1)), buffer, l);
+
+	/* then put the rest (if any) at the beginning of the buffer */
+	memcpy(fifo->buffer, buffer + l, len - l);
+
+	fifo->in += len;
+
+	return len;
+}
+EXPORT_SYMBOL(__kfifo_put);
+
+/**
+ * __kfifo_get - gets some data from the FIFO, no locking version
+ * @fifo: the fifo to be used.
+ * @buffer: where the data must be copied.
+ * @len: the size of the destination buffer.
+ *
+ * This function copies at most 'len' bytes from the FIFO into the
+ * 'buffer' and returns the number of copied bytes.
+ *
+ * Note that with only one concurrent reader and one concurrent
+ * writer, you don't need extra locking to use these functions.
+ */
+unsigned int __kfifo_get(struct kfifo *fifo,
+			 unsigned char *buffer, unsigned int len)
+{
+	unsigned int l;
+
+	len = min(len, fifo->in - fifo->out);
+
+	/* first get the data from fifo->out until the end of the buffer */
+	l = min(len, fifo->size - (fifo->out & (fifo->size - 1)));
+	memcpy(buffer, fifo->buffer + (fifo->out & (fifo->size - 1)), l);
+
+	/* then get the rest (if any) from the beginning of the buffer */
+	memcpy(buffer + l, fifo->buffer, len - l);
+
+	fifo->out += len;
+
+	return len;
+}
+EXPORT_SYMBOL(__kfifo_get);
diff --git a/kernel/kmod.c b/kernel/kmod.c
new file mode 100644
index 000000000000..eed53d4f5230
--- /dev/null
+++ b/kernel/kmod.c
@@ -0,0 +1,256 @@
+/*
+	kmod, the new module loader (replaces kerneld)
+	Kirk Petersen
+
+	Reorganized not to be a daemon by Adam Richter, with guidance
+	from Greg Zornetzer.
+
+	Modified to avoid chroot and file sharing problems.
+	Mikael Pettersson
+
+	Limit the concurrent number of kmod modprobes to catch loops from
+	"modprobe needs a service that is in a module".
+	Keith Owens <kaos@ocs.com.au> December 1999
+
+	Unblock all signals when we exec a usermode process.
+	Shuu Yamaguchi <shuu@wondernetworkresources.com> December 2000
+
+	call_usermodehelper wait flag, and remove exec_usermodehelper.
+	Rusty Russell <rusty@rustcorp.com.au>  Jan 2003
+*/
+#define __KERNEL_SYSCALLS__
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/syscalls.h>
+#include <linux/unistd.h>
+#include <linux/kmod.h>
+#include <linux/smp_lock.h>
+#include <linux/slab.h>
+#include <linux/namespace.h>
+#include <linux/completion.h>
+#include <linux/file.h>
+#include <linux/workqueue.h>
+#include <linux/security.h>
+#include <linux/mount.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <asm/uaccess.h>
+
+extern int max_threads;
+
+static struct workqueue_struct *khelper_wq;
+
+#ifdef CONFIG_KMOD
+
+/*
+	modprobe_path is set via /proc/sys.
+*/
+char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe";
+
+/**
+ * request_module - try to load a kernel module
+ * @fmt:     printf style format string for the name of the module
+ * @varargs: arguements as specified in the format string
+ *
+ * Load a module using the user mode module loader. The function returns
+ * zero on success or a negative errno code on failure. Note that a
+ * successful module load does not mean the module did not then unload
+ * and exit on an error of its own. Callers must check that the service
+ * they requested is now available not blindly invoke it.
+ *
+ * If module auto-loading support is disabled then this function
+ * becomes a no-operation.
+ */
+int request_module(const char *fmt, ...)
+{
+	va_list args;
+	char module_name[MODULE_NAME_LEN];
+	unsigned int max_modprobes;
+	int ret;
+	char *argv[] = { modprobe_path, "-q", "--", module_name, NULL };
+	static char *envp[] = { "HOME=/",
+				"TERM=linux",
+				"PATH=/sbin:/usr/sbin:/bin:/usr/bin",
+				NULL };
+	static atomic_t kmod_concurrent = ATOMIC_INIT(0);
+#define MAX_KMOD_CONCURRENT 50	/* Completely arbitrary value - KAO */
+	static int kmod_loop_msg;
+
+	va_start(args, fmt);
+	ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args);
+	va_end(args);
+	if (ret >= MODULE_NAME_LEN)
+		return -ENAMETOOLONG;
+
+	/* If modprobe needs a service that is in a module, we get a recursive
+	 * loop.  Limit the number of running kmod threads to max_threads/2 or
+	 * MAX_KMOD_CONCURRENT, whichever is the smaller.  A cleaner method
+	 * would be to run the parents of this process, counting how many times
+	 * kmod was invoked.  That would mean accessing the internals of the
+	 * process tables to get the command line, proc_pid_cmdline is static
+	 * and it is not worth changing the proc code just to handle this case. 
+	 * KAO.
+	 *
+	 * "trace the ppid" is simple, but will fail if someone's
+	 * parent exits.  I think this is as good as it gets. --RR
+	 */
+	max_modprobes = min(max_threads/2, MAX_KMOD_CONCURRENT);
+	atomic_inc(&kmod_concurrent);
+	if (atomic_read(&kmod_concurrent) > max_modprobes) {
+		/* We may be blaming an innocent here, but unlikely */
+		if (kmod_loop_msg++ < 5)
+			printk(KERN_ERR
+			       "request_module: runaway loop modprobe %s\n",
+			       module_name);
+		atomic_dec(&kmod_concurrent);
+		return -ENOMEM;
+	}
+
+	ret = call_usermodehelper(modprobe_path, argv, envp, 1);
+	atomic_dec(&kmod_concurrent);
+	return ret;
+}
+EXPORT_SYMBOL(request_module);
+#endif /* CONFIG_KMOD */
+
+struct subprocess_info {
+	struct completion *complete;
+	char *path;
+	char **argv;
+	char **envp;
+	int wait;
+	int retval;
+};
+
+/*
+ * This is the task which runs the usermode application
+ */
+static int ____call_usermodehelper(void *data)
+{
+	struct subprocess_info *sub_info = data;
+	int retval;
+
+	/* Unblock all signals. */
+	flush_signals(current);
+	spin_lock_irq(&current->sighand->siglock);
+	flush_signal_handlers(current, 1);
+	sigemptyset(&current->blocked);
+	recalc_sigpending();
+	spin_unlock_irq(&current->sighand->siglock);
+
+	/* We can run anywhere, unlike our parent keventd(). */
+	set_cpus_allowed(current, CPU_MASK_ALL);
+
+	retval = -EPERM;
+	if (current->fs->root)
+		retval = execve(sub_info->path, sub_info->argv,sub_info->envp);
+
+	/* Exec failed? */
+	sub_info->retval = retval;
+	do_exit(0);
+}
+
+/* Keventd can't block, but this (a child) can. */
+static int wait_for_helper(void *data)
+{
+	struct subprocess_info *sub_info = data;
+	pid_t pid;
+	struct k_sigaction sa;
+
+	/* Install a handler: if SIGCLD isn't handled sys_wait4 won't
+	 * populate the status, but will return -ECHILD. */
+	sa.sa.sa_handler = SIG_IGN;
+	sa.sa.sa_flags = 0;
+	siginitset(&sa.sa.sa_mask, sigmask(SIGCHLD));
+	do_sigaction(SIGCHLD, &sa, (struct k_sigaction *)0);
+	allow_signal(SIGCHLD);
+
+	pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD);
+	if (pid < 0) {
+		sub_info->retval = pid;
+	} else {
+		/*
+		 * Normally it is bogus to call wait4() from in-kernel because
+		 * wait4() wants to write the exit code to a userspace address.
+		 * But wait_for_helper() always runs as keventd, and put_user()
+		 * to a kernel address works OK for kernel threads, due to their
+		 * having an mm_segment_t which spans the entire address space.
+		 *
+		 * Thus the __user pointer cast is valid here.
+		 */
+		sys_wait4(pid, (int __user *) &sub_info->retval, 0, NULL);
+	}
+
+	complete(sub_info->complete);
+	return 0;
+}
+
+/* This is run by khelper thread  */
+static void __call_usermodehelper(void *data)
+{
+	struct subprocess_info *sub_info = data;
+	pid_t pid;
+
+	/* CLONE_VFORK: wait until the usermode helper has execve'd
+	 * successfully We need the data structures to stay around
+	 * until that is done.  */
+	if (sub_info->wait)
+		pid = kernel_thread(wait_for_helper, sub_info,
+				    CLONE_FS | CLONE_FILES | SIGCHLD);
+	else
+		pid = kernel_thread(____call_usermodehelper, sub_info,
+				    CLONE_VFORK | SIGCHLD);
+
+	if (pid < 0) {
+		sub_info->retval = pid;
+		complete(sub_info->complete);
+	} else if (!sub_info->wait)
+		complete(sub_info->complete);
+}
+
+/**
+ * call_usermodehelper - start a usermode application
+ * @path: pathname for the application
+ * @argv: null-terminated argument list
+ * @envp: null-terminated environment list
+ * @wait: wait for the application to finish and return status.
+ *
+ * Runs a user-space application.  The application is started
+ * asynchronously if wait is not set, and runs as a child of keventd.
+ * (ie. it runs with full root capabilities).
+ *
+ * Must be called from process context.  Returns a negative error code
+ * if program was not execed successfully, or 0.
+ */
+int call_usermodehelper(char *path, char **argv, char **envp, int wait)
+{
+	DECLARE_COMPLETION(done);
+	struct subprocess_info sub_info = {
+		.complete	= &done,
+		.path		= path,
+		.argv		= argv,
+		.envp		= envp,
+		.wait		= wait,
+		.retval		= 0,
+	};
+	DECLARE_WORK(work, __call_usermodehelper, &sub_info);
+
+	if (!khelper_wq)
+		return -EBUSY;
+
+	if (path[0] == '\0')
+		return 0;
+
+	queue_work(khelper_wq, &work);
+	wait_for_completion(&done);
+	return sub_info.retval;
+}
+EXPORT_SYMBOL(call_usermodehelper);
+
+void __init usermodehelper_init(void)
+{
+	khelper_wq = create_singlethread_workqueue("khelper");
+	BUG_ON(!khelper_wq);
+}
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
new file mode 100644
index 000000000000..1d5dd1337bd1
--- /dev/null
+++ b/kernel/kprobes.c
@@ -0,0 +1,157 @@
+/*
+ *  Kernel Probes (KProbes)
+ *  kernel/kprobes.c
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2002, 2004
+ *
+ * 2002-Oct	Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel
+ *		Probes initial implementation (includes suggestions from
+ *		Rusty Russell).
+ * 2004-Aug	Updated by Prasanna S Panchamukhi <prasanna@in.ibm.com> with
+ *		hlists and exceptions notifier as suggested by Andi Kleen.
+ * 2004-July	Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes
+ *		interface to access function arguments.
+ * 2004-Sep	Prasanna S Panchamukhi <prasanna@in.ibm.com> Changed Kprobes
+ *		exceptions notifier to be first on the priority list.
+ */
+#include <linux/kprobes.h>
+#include <linux/spinlock.h>
+#include <linux/hash.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <asm/cacheflush.h>
+#include <asm/errno.h>
+#include <asm/kdebug.h>
+
+#define KPROBE_HASH_BITS 6
+#define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS)
+
+static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
+
+unsigned int kprobe_cpu = NR_CPUS;
+static DEFINE_SPINLOCK(kprobe_lock);
+
+/* Locks kprobe: irqs must be disabled */
+void lock_kprobes(void)
+{
+	spin_lock(&kprobe_lock);
+	kprobe_cpu = smp_processor_id();
+}
+
+void unlock_kprobes(void)
+{
+	kprobe_cpu = NR_CPUS;
+	spin_unlock(&kprobe_lock);
+}
+
+/* You have to be holding the kprobe_lock */
+struct kprobe *get_kprobe(void *addr)
+{
+	struct hlist_head *head;
+	struct hlist_node *node;
+
+	head = &kprobe_table[hash_ptr(addr, KPROBE_HASH_BITS)];
+	hlist_for_each(node, head) {
+		struct kprobe *p = hlist_entry(node, struct kprobe, hlist);
+		if (p->addr == addr)
+			return p;
+	}
+	return NULL;
+}
+
+int register_kprobe(struct kprobe *p)
+{
+	int ret = 0;
+	unsigned long flags = 0;
+
+	if ((ret = arch_prepare_kprobe(p)) != 0) {
+		goto rm_kprobe;
+	}
+	spin_lock_irqsave(&kprobe_lock, flags);
+	INIT_HLIST_NODE(&p->hlist);
+	if (get_kprobe(p->addr)) {
+		ret = -EEXIST;
+		goto out;
+	}
+	arch_copy_kprobe(p);
+
+	hlist_add_head(&p->hlist,
+		       &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
+
+	p->opcode = *p->addr;
+	*p->addr = BREAKPOINT_INSTRUCTION;
+	flush_icache_range((unsigned long) p->addr,
+			   (unsigned long) p->addr + sizeof(kprobe_opcode_t));
+out:
+	spin_unlock_irqrestore(&kprobe_lock, flags);
+rm_kprobe:
+	if (ret == -EEXIST)
+		arch_remove_kprobe(p);
+	return ret;
+}
+
+void unregister_kprobe(struct kprobe *p)
+{
+	unsigned long flags;
+	arch_remove_kprobe(p);
+	spin_lock_irqsave(&kprobe_lock, flags);
+	*p->addr = p->opcode;
+	hlist_del(&p->hlist);
+	flush_icache_range((unsigned long) p->addr,
+			   (unsigned long) p->addr + sizeof(kprobe_opcode_t));
+	spin_unlock_irqrestore(&kprobe_lock, flags);
+}
+
+static struct notifier_block kprobe_exceptions_nb = {
+	.notifier_call = kprobe_exceptions_notify,
+	.priority = 0x7fffffff /* we need to notified first */
+};
+
+int register_jprobe(struct jprobe *jp)
+{
+	/* Todo: Verify probepoint is a function entry point */
+	jp->kp.pre_handler = setjmp_pre_handler;
+	jp->kp.break_handler = longjmp_break_handler;
+
+	return register_kprobe(&jp->kp);
+}
+
+void unregister_jprobe(struct jprobe *jp)
+{
+	unregister_kprobe(&jp->kp);
+}
+
+static int __init init_kprobes(void)
+{
+	int i, err = 0;
+
+	/* FIXME allocate the probe table, currently defined statically */
+	/* initialize all list heads */
+	for (i = 0; i < KPROBE_TABLE_SIZE; i++)
+		INIT_HLIST_HEAD(&kprobe_table[i]);
+
+	err = register_die_notifier(&kprobe_exceptions_nb);
+	return err;
+}
+
+__initcall(init_kprobes);
+
+EXPORT_SYMBOL_GPL(register_kprobe);
+EXPORT_SYMBOL_GPL(unregister_kprobe);
+EXPORT_SYMBOL_GPL(register_jprobe);
+EXPORT_SYMBOL_GPL(unregister_jprobe);
+EXPORT_SYMBOL_GPL(jprobe_return);
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
new file mode 100644
index 000000000000..1f064a63f8cf
--- /dev/null
+++ b/kernel/ksysfs.c
@@ -0,0 +1,57 @@
+/*
+ * kernel/ksysfs.c - sysfs attributes in /sys/kernel, which
+ * 		     are not related to any other subsystem
+ *
+ * Copyright (C) 2004 Kay Sievers <kay.sievers@vrfy.org>
+ * 
+ * This file is release under the GPLv2
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/kobject.h>
+#include <linux/string.h>
+#include <linux/sysfs.h>
+#include <linux/module.h>
+#include <linux/init.h>
+
+#define KERNEL_ATTR_RO(_name) \
+static struct subsys_attribute _name##_attr = __ATTR_RO(_name)
+
+#define KERNEL_ATTR_RW(_name) \
+static struct subsys_attribute _name##_attr = \
+	__ATTR(_name, 0644, _name##_show, _name##_store)
+
+#ifdef CONFIG_HOTPLUG
+static ssize_t hotplug_seqnum_show(struct subsystem *subsys, char *page)
+{
+	return sprintf(page, "%llu\n", (unsigned long long)hotplug_seqnum);
+}
+KERNEL_ATTR_RO(hotplug_seqnum);
+#endif
+
+decl_subsys(kernel, NULL, NULL);
+EXPORT_SYMBOL_GPL(kernel_subsys);
+
+static struct attribute * kernel_attrs[] = {
+#ifdef CONFIG_HOTPLUG
+	&hotplug_seqnum_attr.attr,
+#endif
+	NULL
+};
+
+static struct attribute_group kernel_attr_group = {
+	.attrs = kernel_attrs,
+};
+
+static int __init ksysfs_init(void)
+{
+	int error = subsystem_register(&kernel_subsys);
+	if (!error)
+		error = sysfs_create_group(&kernel_subsys.kset.kobj,
+					   &kernel_attr_group);
+
+	return error;
+}
+
+core_initcall(ksysfs_init);
diff --git a/kernel/kthread.c b/kernel/kthread.c
new file mode 100644
index 000000000000..e377e2244103
--- /dev/null
+++ b/kernel/kthread.c
@@ -0,0 +1,202 @@
+/* Kernel thread helper functions.
+ *   Copyright (C) 2004 IBM Corporation, Rusty Russell.
+ *
+ * Creation is done via keventd, so that we get a clean environment
+ * even if we're invoked from userspace (think modprobe, hotplug cpu,
+ * etc.).
+ */
+#include <linux/sched.h>
+#include <linux/kthread.h>
+#include <linux/completion.h>
+#include <linux/err.h>
+#include <linux/unistd.h>
+#include <linux/file.h>
+#include <linux/module.h>
+#include <asm/semaphore.h>
+
+/*
+ * We dont want to execute off keventd since it might
+ * hold a semaphore our callers hold too:
+ */
+static struct workqueue_struct *helper_wq;
+
+struct kthread_create_info
+{
+	/* Information passed to kthread() from keventd. */
+	int (*threadfn)(void *data);
+	void *data;
+	struct completion started;
+
+	/* Result passed back to kthread_create() from keventd. */
+	struct task_struct *result;
+	struct completion done;
+};
+
+struct kthread_stop_info
+{
+	struct task_struct *k;
+	int err;
+	struct completion done;
+};
+
+/* Thread stopping is done by setthing this var: lock serializes
+ * multiple kthread_stop calls. */
+static DECLARE_MUTEX(kthread_stop_lock);
+static struct kthread_stop_info kthread_stop_info;
+
+int kthread_should_stop(void)
+{
+	return (kthread_stop_info.k == current);
+}
+EXPORT_SYMBOL(kthread_should_stop);
+
+static void kthread_exit_files(void)
+{
+	struct fs_struct *fs;
+	struct task_struct *tsk = current;
+
+	exit_fs(tsk);		/* current->fs->count--; */
+	fs = init_task.fs;
+	tsk->fs = fs;
+	atomic_inc(&fs->count);
+ 	exit_files(tsk);
+	current->files = init_task.files;
+	atomic_inc(&tsk->files->count);
+}
+
+static int kthread(void *_create)
+{
+	struct kthread_create_info *create = _create;
+	int (*threadfn)(void *data);
+	void *data;
+	sigset_t blocked;
+	int ret = -EINTR;
+
+	kthread_exit_files();
+
+	/* Copy data: it's on keventd's stack */
+	threadfn = create->threadfn;
+	data = create->data;
+
+	/* Block and flush all signals (in case we're not from keventd). */
+	sigfillset(&blocked);
+	sigprocmask(SIG_BLOCK, &blocked, NULL);
+	flush_signals(current);
+
+	/* By default we can run anywhere, unlike keventd. */
+	set_cpus_allowed(current, CPU_MASK_ALL);
+
+	/* OK, tell user we're spawned, wait for stop or wakeup */
+	__set_current_state(TASK_INTERRUPTIBLE);
+	complete(&create->started);
+	schedule();
+
+	if (!kthread_should_stop())
+		ret = threadfn(data);
+
+	/* It might have exited on its own, w/o kthread_stop.  Check. */
+	if (kthread_should_stop()) {
+		kthread_stop_info.err = ret;
+		complete(&kthread_stop_info.done);
+	}
+	return 0;
+}
+
+/* We are keventd: create a thread. */
+static void keventd_create_kthread(void *_create)
+{
+	struct kthread_create_info *create = _create;
+	int pid;
+
+	/* We want our own signal handler (we take no signals by default). */
+	pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD);
+	if (pid < 0) {
+		create->result = ERR_PTR(pid);
+	} else {
+		wait_for_completion(&create->started);
+		create->result = find_task_by_pid(pid);
+	}
+	complete(&create->done);
+}
+
+struct task_struct *kthread_create(int (*threadfn)(void *data),
+				   void *data,
+				   const char namefmt[],
+				   ...)
+{
+	struct kthread_create_info create;
+	DECLARE_WORK(work, keventd_create_kthread, &create);
+
+	create.threadfn = threadfn;
+	create.data = data;
+	init_completion(&create.started);
+	init_completion(&create.done);
+
+	/*
+	 * The workqueue needs to start up first:
+	 */
+	if (!helper_wq)
+		work.func(work.data);
+	else {
+		queue_work(helper_wq, &work);
+		wait_for_completion(&create.done);
+	}
+	if (!IS_ERR(create.result)) {
+		va_list args;
+		va_start(args, namefmt);
+		vsnprintf(create.result->comm, sizeof(create.result->comm),
+			  namefmt, args);
+		va_end(args);
+	}
+
+	return create.result;
+}
+EXPORT_SYMBOL(kthread_create);
+
+void kthread_bind(struct task_struct *k, unsigned int cpu)
+{
+	BUG_ON(k->state != TASK_INTERRUPTIBLE);
+	/* Must have done schedule() in kthread() before we set_task_cpu */
+	wait_task_inactive(k);
+	set_task_cpu(k, cpu);
+	k->cpus_allowed = cpumask_of_cpu(cpu);
+}
+EXPORT_SYMBOL(kthread_bind);
+
+int kthread_stop(struct task_struct *k)
+{
+	int ret;
+
+	down(&kthread_stop_lock);
+
+	/* It could exit after stop_info.k set, but before wake_up_process. */
+	get_task_struct(k);
+
+	/* Must init completion *before* thread sees kthread_stop_info.k */
+	init_completion(&kthread_stop_info.done);
+	wmb();
+
+	/* Now set kthread_should_stop() to true, and wake it up. */
+	kthread_stop_info.k = k;
+	wake_up_process(k);
+	put_task_struct(k);
+
+	/* Once it dies, reset stop ptr, gather result and we're done. */
+	wait_for_completion(&kthread_stop_info.done);
+	kthread_stop_info.k = NULL;
+	ret = kthread_stop_info.err;
+	up(&kthread_stop_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL(kthread_stop);
+
+static __init int helper_init(void)
+{
+	helper_wq = create_singlethread_workqueue("kthread");
+	BUG_ON(!helper_wq);
+
+	return 0;
+}
+core_initcall(helper_init);
+
diff --git a/kernel/module.c b/kernel/module.c
new file mode 100644
index 000000000000..2dbfa0773faf
--- /dev/null
+++ b/kernel/module.c
@@ -0,0 +1,2108 @@
+/* Rewritten by Rusty Russell, on the backs of many others...
+   Copyright (C) 2002 Richard Henderson
+   Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM.
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+*/
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/moduleloader.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/elf.h>
+#include <linux/seq_file.h>
+#include <linux/syscalls.h>
+#include <linux/fcntl.h>
+#include <linux/rcupdate.h>
+#include <linux/cpu.h>
+#include <linux/moduleparam.h>
+#include <linux/errno.h>
+#include <linux/err.h>
+#include <linux/vermagic.h>
+#include <linux/notifier.h>
+#include <linux/stop_machine.h>
+#include <linux/device.h>
+#include <asm/uaccess.h>
+#include <asm/semaphore.h>
+#include <asm/cacheflush.h>
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(fmt , a...)
+#endif
+
+#ifndef ARCH_SHF_SMALL
+#define ARCH_SHF_SMALL 0
+#endif
+
+/* If this is set, the section belongs in the init part of the module */
+#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1))
+
+/* Protects module list */
+static DEFINE_SPINLOCK(modlist_lock);
+
+/* List of modules, protected by module_mutex AND modlist_lock */
+static DECLARE_MUTEX(module_mutex);
+static LIST_HEAD(modules);
+
+static DECLARE_MUTEX(notify_mutex);
+static struct notifier_block * module_notify_list;
+
+int register_module_notifier(struct notifier_block * nb)
+{
+	int err;
+	down(&notify_mutex);
+	err = notifier_chain_register(&module_notify_list, nb);
+	up(&notify_mutex);
+	return err;
+}
+EXPORT_SYMBOL(register_module_notifier);
+
+int unregister_module_notifier(struct notifier_block * nb)
+{
+	int err;
+	down(&notify_mutex);
+	err = notifier_chain_unregister(&module_notify_list, nb);
+	up(&notify_mutex);
+	return err;
+}
+EXPORT_SYMBOL(unregister_module_notifier);
+
+/* We require a truly strong try_module_get() */
+static inline int strong_try_module_get(struct module *mod)
+{
+	if (mod && mod->state == MODULE_STATE_COMING)
+		return 0;
+	return try_module_get(mod);
+}
+
+/* A thread that wants to hold a reference to a module only while it
+ * is running can call ths to safely exit.
+ * nfsd and lockd use this.
+ */
+void __module_put_and_exit(struct module *mod, long code)
+{
+	module_put(mod);
+	do_exit(code);
+}
+EXPORT_SYMBOL(__module_put_and_exit);
+	
+/* Find a module section: 0 means not found. */
+static unsigned int find_sec(Elf_Ehdr *hdr,
+			     Elf_Shdr *sechdrs,
+			     const char *secstrings,
+			     const char *name)
+{
+	unsigned int i;
+
+	for (i = 1; i < hdr->e_shnum; i++)
+		/* Alloc bit cleared means "ignore it." */
+		if ((sechdrs[i].sh_flags & SHF_ALLOC)
+		    && strcmp(secstrings+sechdrs[i].sh_name, name) == 0)
+			return i;
+	return 0;
+}
+
+/* Provided by the linker */
+extern const struct kernel_symbol __start___ksymtab[];
+extern const struct kernel_symbol __stop___ksymtab[];
+extern const struct kernel_symbol __start___ksymtab_gpl[];
+extern const struct kernel_symbol __stop___ksymtab_gpl[];
+extern const unsigned long __start___kcrctab[];
+extern const unsigned long __start___kcrctab_gpl[];
+
+#ifndef CONFIG_MODVERSIONS
+#define symversion(base, idx) NULL
+#else
+#define symversion(base, idx) ((base) ? ((base) + (idx)) : NULL)
+#endif
+
+/* Find a symbol, return value, crc and module which owns it */
+static unsigned long __find_symbol(const char *name,
+				   struct module **owner,
+				   const unsigned long **crc,
+				   int gplok)
+{
+	struct module *mod;
+	unsigned int i;
+
+	/* Core kernel first. */ 
+	*owner = NULL;
+	for (i = 0; __start___ksymtab+i < __stop___ksymtab; i++) {
+		if (strcmp(__start___ksymtab[i].name, name) == 0) {
+			*crc = symversion(__start___kcrctab, i);
+			return __start___ksymtab[i].value;
+		}
+	}
+	if (gplok) {
+		for (i = 0; __start___ksymtab_gpl+i<__stop___ksymtab_gpl; i++)
+			if (strcmp(__start___ksymtab_gpl[i].name, name) == 0) {
+				*crc = symversion(__start___kcrctab_gpl, i);
+				return __start___ksymtab_gpl[i].value;
+			}
+	}
+
+	/* Now try modules. */ 
+	list_for_each_entry(mod, &modules, list) {
+		*owner = mod;
+		for (i = 0; i < mod->num_syms; i++)
+			if (strcmp(mod->syms[i].name, name) == 0) {
+				*crc = symversion(mod->crcs, i);
+				return mod->syms[i].value;
+			}
+
+		if (gplok) {
+			for (i = 0; i < mod->num_gpl_syms; i++) {
+				if (strcmp(mod->gpl_syms[i].name, name) == 0) {
+					*crc = symversion(mod->gpl_crcs, i);
+					return mod->gpl_syms[i].value;
+				}
+			}
+		}
+	}
+	DEBUGP("Failed to find symbol %s\n", name);
+ 	return 0;
+}
+
+/* Find a symbol in this elf symbol table */
+static unsigned long find_local_symbol(Elf_Shdr *sechdrs,
+				       unsigned int symindex,
+				       const char *strtab,
+				       const char *name)
+{
+	unsigned int i;
+	Elf_Sym *sym = (void *)sechdrs[symindex].sh_addr;
+
+	/* Search (defined) internal symbols first. */
+	for (i = 1; i < sechdrs[symindex].sh_size/sizeof(*sym); i++) {
+		if (sym[i].st_shndx != SHN_UNDEF
+		    && strcmp(name, strtab + sym[i].st_name) == 0)
+			return sym[i].st_value;
+	}
+	return 0;
+}
+
+/* Search for module by name: must hold module_mutex. */
+static struct module *find_module(const char *name)
+{
+	struct module *mod;
+
+	list_for_each_entry(mod, &modules, list) {
+		if (strcmp(mod->name, name) == 0)
+			return mod;
+	}
+	return NULL;
+}
+
+#ifdef CONFIG_SMP
+/* Number of blocks used and allocated. */
+static unsigned int pcpu_num_used, pcpu_num_allocated;
+/* Size of each block.  -ve means used. */
+static int *pcpu_size;
+
+static int split_block(unsigned int i, unsigned short size)
+{
+	/* Reallocation required? */
+	if (pcpu_num_used + 1 > pcpu_num_allocated) {
+		int *new = kmalloc(sizeof(new[0]) * pcpu_num_allocated*2,
+				   GFP_KERNEL);
+		if (!new)
+			return 0;
+
+		memcpy(new, pcpu_size, sizeof(new[0])*pcpu_num_allocated);
+		pcpu_num_allocated *= 2;
+		kfree(pcpu_size);
+		pcpu_size = new;
+	}
+
+	/* Insert a new subblock */
+	memmove(&pcpu_size[i+1], &pcpu_size[i],
+		sizeof(pcpu_size[0]) * (pcpu_num_used - i));
+	pcpu_num_used++;
+
+	pcpu_size[i+1] -= size;
+	pcpu_size[i] = size;
+	return 1;
+}
+
+static inline unsigned int block_size(int val)
+{
+	if (val < 0)
+		return -val;
+	return val;
+}
+
+/* Created by linker magic */
+extern char __per_cpu_start[], __per_cpu_end[];
+
+static void *percpu_modalloc(unsigned long size, unsigned long align)
+{
+	unsigned long extra;
+	unsigned int i;
+	void *ptr;
+
+	BUG_ON(align > SMP_CACHE_BYTES);
+
+	ptr = __per_cpu_start;
+	for (i = 0; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) {
+		/* Extra for alignment requirement. */
+		extra = ALIGN((unsigned long)ptr, align) - (unsigned long)ptr;
+		BUG_ON(i == 0 && extra != 0);
+
+		if (pcpu_size[i] < 0 || pcpu_size[i] < extra + size)
+			continue;
+
+		/* Transfer extra to previous block. */
+		if (pcpu_size[i-1] < 0)
+			pcpu_size[i-1] -= extra;
+		else
+			pcpu_size[i-1] += extra;
+		pcpu_size[i] -= extra;
+		ptr += extra;
+
+		/* Split block if warranted */
+		if (pcpu_size[i] - size > sizeof(unsigned long))
+			if (!split_block(i, size))
+				return NULL;
+
+		/* Mark allocated */
+		pcpu_size[i] = -pcpu_size[i];
+		return ptr;
+	}
+
+	printk(KERN_WARNING "Could not allocate %lu bytes percpu data\n",
+	       size);
+	return NULL;
+}
+
+static void percpu_modfree(void *freeme)
+{
+	unsigned int i;
+	void *ptr = __per_cpu_start + block_size(pcpu_size[0]);
+
+	/* First entry is core kernel percpu data. */
+	for (i = 1; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) {
+		if (ptr == freeme) {
+			pcpu_size[i] = -pcpu_size[i];
+			goto free;
+		}
+	}
+	BUG();
+
+ free:
+	/* Merge with previous? */
+	if (pcpu_size[i-1] >= 0) {
+		pcpu_size[i-1] += pcpu_size[i];
+		pcpu_num_used--;
+		memmove(&pcpu_size[i], &pcpu_size[i+1],
+			(pcpu_num_used - i) * sizeof(pcpu_size[0]));
+		i--;
+	}
+	/* Merge with next? */
+	if (i+1 < pcpu_num_used && pcpu_size[i+1] >= 0) {
+		pcpu_size[i] += pcpu_size[i+1];
+		pcpu_num_used--;
+		memmove(&pcpu_size[i+1], &pcpu_size[i+2],
+			(pcpu_num_used - (i+1)) * sizeof(pcpu_size[0]));
+	}
+}
+
+static unsigned int find_pcpusec(Elf_Ehdr *hdr,
+				 Elf_Shdr *sechdrs,
+				 const char *secstrings)
+{
+	return find_sec(hdr, sechdrs, secstrings, ".data.percpu");
+}
+
+static int percpu_modinit(void)
+{
+	pcpu_num_used = 2;
+	pcpu_num_allocated = 2;
+	pcpu_size = kmalloc(sizeof(pcpu_size[0]) * pcpu_num_allocated,
+			    GFP_KERNEL);
+	/* Static in-kernel percpu data (used). */
+	pcpu_size[0] = -ALIGN(__per_cpu_end-__per_cpu_start, SMP_CACHE_BYTES);
+	/* Free room. */
+	pcpu_size[1] = PERCPU_ENOUGH_ROOM + pcpu_size[0];
+	if (pcpu_size[1] < 0) {
+		printk(KERN_ERR "No per-cpu room for modules.\n");
+		pcpu_num_used = 1;
+	}
+
+	return 0;
+}	
+__initcall(percpu_modinit);
+#else /* ... !CONFIG_SMP */
+static inline void *percpu_modalloc(unsigned long size, unsigned long align)
+{
+	return NULL;
+}
+static inline void percpu_modfree(void *pcpuptr)
+{
+	BUG();
+}
+static inline unsigned int find_pcpusec(Elf_Ehdr *hdr,
+					Elf_Shdr *sechdrs,
+					const char *secstrings)
+{
+	return 0;
+}
+static inline void percpu_modcopy(void *pcpudst, const void *src,
+				  unsigned long size)
+{
+	/* pcpusec should be 0, and size of that section should be 0. */
+	BUG_ON(size != 0);
+}
+#endif /* CONFIG_SMP */
+
+#ifdef CONFIG_MODULE_UNLOAD
+/* Init the unload section of the module. */
+static void module_unload_init(struct module *mod)
+{
+	unsigned int i;
+
+	INIT_LIST_HEAD(&mod->modules_which_use_me);
+	for (i = 0; i < NR_CPUS; i++)
+		local_set(&mod->ref[i].count, 0);
+	/* Hold reference count during initialization. */
+	local_set(&mod->ref[_smp_processor_id()].count, 1);
+	/* Backwards compatibility macros put refcount during init. */
+	mod->waiter = current;
+}
+
+/* modules using other modules */
+struct module_use
+{
+	struct list_head list;
+	struct module *module_which_uses;
+};
+
+/* Does a already use b? */
+static int already_uses(struct module *a, struct module *b)
+{
+	struct module_use *use;
+
+	list_for_each_entry(use, &b->modules_which_use_me, list) {
+		if (use->module_which_uses == a) {
+			DEBUGP("%s uses %s!\n", a->name, b->name);
+			return 1;
+		}
+	}
+	DEBUGP("%s does not use %s!\n", a->name, b->name);
+	return 0;
+}
+
+/* Module a uses b */
+static int use_module(struct module *a, struct module *b)
+{
+	struct module_use *use;
+	if (b == NULL || already_uses(a, b)) return 1;
+
+	if (!strong_try_module_get(b))
+		return 0;
+
+	DEBUGP("Allocating new usage for %s.\n", a->name);
+	use = kmalloc(sizeof(*use), GFP_ATOMIC);
+	if (!use) {
+		printk("%s: out of memory loading\n", a->name);
+		module_put(b);
+		return 0;
+	}
+
+	use->module_which_uses = a;
+	list_add(&use->list, &b->modules_which_use_me);
+	return 1;
+}
+
+/* Clear the unload stuff of the module. */
+static void module_unload_free(struct module *mod)
+{
+	struct module *i;
+
+	list_for_each_entry(i, &modules, list) {
+		struct module_use *use;
+
+		list_for_each_entry(use, &i->modules_which_use_me, list) {
+			if (use->module_which_uses == mod) {
+				DEBUGP("%s unusing %s\n", mod->name, i->name);
+				module_put(i);
+				list_del(&use->list);
+				kfree(use);
+				/* There can be at most one match. */
+				break;
+			}
+		}
+	}
+}
+
+#ifdef CONFIG_MODULE_FORCE_UNLOAD
+static inline int try_force(unsigned int flags)
+{
+	int ret = (flags & O_TRUNC);
+	if (ret)
+		tainted |= TAINT_FORCED_MODULE;
+	return ret;
+}
+#else
+static inline int try_force(unsigned int flags)
+{
+	return 0;
+}
+#endif /* CONFIG_MODULE_FORCE_UNLOAD */
+
+struct stopref
+{
+	struct module *mod;
+	int flags;
+	int *forced;
+};
+
+/* Whole machine is stopped with interrupts off when this runs. */
+static int __try_stop_module(void *_sref)
+{
+	struct stopref *sref = _sref;
+
+	/* If it's not unused, quit unless we are told to block. */
+	if ((sref->flags & O_NONBLOCK) && module_refcount(sref->mod) != 0) {
+		if (!(*sref->forced = try_force(sref->flags)))
+			return -EWOULDBLOCK;
+	}
+
+	/* Mark it as dying. */
+	sref->mod->state = MODULE_STATE_GOING;
+	return 0;
+}
+
+static int try_stop_module(struct module *mod, int flags, int *forced)
+{
+	struct stopref sref = { mod, flags, forced };
+
+	return stop_machine_run(__try_stop_module, &sref, NR_CPUS);
+}
+
+unsigned int module_refcount(struct module *mod)
+{
+	unsigned int i, total = 0;
+
+	for (i = 0; i < NR_CPUS; i++)
+		total += local_read(&mod->ref[i].count);
+	return total;
+}
+EXPORT_SYMBOL(module_refcount);
+
+/* This exists whether we can unload or not */
+static void free_module(struct module *mod);
+
+static void wait_for_zero_refcount(struct module *mod)
+{
+	/* Since we might sleep for some time, drop the semaphore first */
+	up(&module_mutex);
+	for (;;) {
+		DEBUGP("Looking at refcount...\n");
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		if (module_refcount(mod) == 0)
+			break;
+		schedule();
+	}
+	current->state = TASK_RUNNING;
+	down(&module_mutex);
+}
+
+asmlinkage long
+sys_delete_module(const char __user *name_user, unsigned int flags)
+{
+	struct module *mod;
+	char name[MODULE_NAME_LEN];
+	int ret, forced = 0;
+
+	if (!capable(CAP_SYS_MODULE))
+		return -EPERM;
+
+	if (strncpy_from_user(name, name_user, MODULE_NAME_LEN-1) < 0)
+		return -EFAULT;
+	name[MODULE_NAME_LEN-1] = '\0';
+
+	if (down_interruptible(&module_mutex) != 0)
+		return -EINTR;
+
+	mod = find_module(name);
+	if (!mod) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	if (!list_empty(&mod->modules_which_use_me)) {
+		/* Other modules depend on us: get rid of them first. */
+		ret = -EWOULDBLOCK;
+		goto out;
+	}
+
+	/* Doing init or already dying? */
+	if (mod->state != MODULE_STATE_LIVE) {
+		/* FIXME: if (force), slam module count and wake up
+                   waiter --RR */
+		DEBUGP("%s already dying\n", mod->name);
+		ret = -EBUSY;
+		goto out;
+	}
+
+	/* If it has an init func, it must have an exit func to unload */
+	if ((mod->init != NULL && mod->exit == NULL)
+	    || mod->unsafe) {
+		forced = try_force(flags);
+		if (!forced) {
+			/* This module can't be removed */
+			ret = -EBUSY;
+			goto out;
+		}
+	}
+
+	/* Set this up before setting mod->state */
+	mod->waiter = current;
+
+	/* Stop the machine so refcounts can't move and disable module. */
+	ret = try_stop_module(mod, flags, &forced);
+	if (ret != 0)
+		goto out;
+
+	/* Never wait if forced. */
+	if (!forced && module_refcount(mod) != 0)
+		wait_for_zero_refcount(mod);
+
+	/* Final destruction now noone is using it. */
+	if (mod->exit != NULL) {
+		up(&module_mutex);
+		mod->exit();
+		down(&module_mutex);
+	}
+	free_module(mod);
+
+ out:
+	up(&module_mutex);
+	return ret;
+}
+
+static void print_unload_info(struct seq_file *m, struct module *mod)
+{
+	struct module_use *use;
+	int printed_something = 0;
+
+	seq_printf(m, " %u ", module_refcount(mod));
+
+	/* Always include a trailing , so userspace can differentiate
+           between this and the old multi-field proc format. */
+	list_for_each_entry(use, &mod->modules_which_use_me, list) {
+		printed_something = 1;
+		seq_printf(m, "%s,", use->module_which_uses->name);
+	}
+
+	if (mod->unsafe) {
+		printed_something = 1;
+		seq_printf(m, "[unsafe],");
+	}
+
+	if (mod->init != NULL && mod->exit == NULL) {
+		printed_something = 1;
+		seq_printf(m, "[permanent],");
+	}
+
+	if (!printed_something)
+		seq_printf(m, "-");
+}
+
+void __symbol_put(const char *symbol)
+{
+	struct module *owner;
+	unsigned long flags;
+	const unsigned long *crc;
+
+	spin_lock_irqsave(&modlist_lock, flags);
+	if (!__find_symbol(symbol, &owner, &crc, 1))
+		BUG();
+	module_put(owner);
+	spin_unlock_irqrestore(&modlist_lock, flags);
+}
+EXPORT_SYMBOL(__symbol_put);
+
+void symbol_put_addr(void *addr)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&modlist_lock, flags);
+	if (!kernel_text_address((unsigned long)addr))
+		BUG();
+
+	module_put(module_text_address((unsigned long)addr));
+	spin_unlock_irqrestore(&modlist_lock, flags);
+}
+EXPORT_SYMBOL_GPL(symbol_put_addr);
+
+static ssize_t show_refcnt(struct module_attribute *mattr,
+			   struct module *mod, char *buffer)
+{
+	/* sysfs holds a reference */
+	return sprintf(buffer, "%u\n", module_refcount(mod)-1);
+}
+
+static struct module_attribute refcnt = {
+	.attr = { .name = "refcnt", .mode = 0444, .owner = THIS_MODULE },
+	.show = show_refcnt,
+};
+
+#else /* !CONFIG_MODULE_UNLOAD */
+static void print_unload_info(struct seq_file *m, struct module *mod)
+{
+	/* We don't know the usage count, or what modules are using. */
+	seq_printf(m, " - -");
+}
+
+static inline void module_unload_free(struct module *mod)
+{
+}
+
+static inline int use_module(struct module *a, struct module *b)
+{
+	return strong_try_module_get(b);
+}
+
+static inline void module_unload_init(struct module *mod)
+{
+}
+#endif /* CONFIG_MODULE_UNLOAD */
+
+#ifdef CONFIG_OBSOLETE_MODPARM
+/* Bounds checking done below */
+static int obsparm_copy_string(const char *val, struct kernel_param *kp)
+{
+	strcpy(kp->arg, val);
+	return 0;
+}
+
+int set_obsolete(const char *val, struct kernel_param *kp)
+{
+	unsigned int min, max;
+	unsigned int size, maxsize;
+	int dummy;
+	char *endp;
+	const char *p;
+	struct obsolete_modparm *obsparm = kp->arg;
+
+	if (!val) {
+		printk(KERN_ERR "Parameter %s needs an argument\n", kp->name);
+		return -EINVAL;
+	}
+
+	/* type is: [min[-max]]{b,h,i,l,s} */
+	p = obsparm->type;
+	min = simple_strtol(p, &endp, 10);
+	if (endp == obsparm->type)
+		min = max = 1;
+	else if (*endp == '-') {
+		p = endp+1;
+		max = simple_strtol(p, &endp, 10);
+	} else
+		max = min;
+	switch (*endp) {
+	case 'b':
+		return param_array(kp->name, val, min, max, obsparm->addr,
+				   1, param_set_byte, &dummy);
+	case 'h':
+		return param_array(kp->name, val, min, max, obsparm->addr,
+				   sizeof(short), param_set_short, &dummy);
+	case 'i':
+		return param_array(kp->name, val, min, max, obsparm->addr,
+				   sizeof(int), param_set_int, &dummy);
+	case 'l':
+		return param_array(kp->name, val, min, max, obsparm->addr,
+				   sizeof(long), param_set_long, &dummy);
+	case 's':
+		return param_array(kp->name, val, min, max, obsparm->addr,
+				   sizeof(char *), param_set_charp, &dummy);
+
+	case 'c':
+		/* Undocumented: 1-5c50 means 1-5 strings of up to 49 chars,
+		   and the decl is "char xxx[5][50];" */
+		p = endp+1;
+		maxsize = simple_strtol(p, &endp, 10);
+		/* We check lengths here (yes, this is a hack). */
+		p = val;
+		while (p[size = strcspn(p, ",")]) {
+			if (size >= maxsize) 
+				goto oversize;
+			p += size+1;
+		}
+		if (size >= maxsize) 
+			goto oversize;
+		return param_array(kp->name, val, min, max, obsparm->addr,
+				   maxsize, obsparm_copy_string, &dummy);
+	}
+	printk(KERN_ERR "Unknown obsolete parameter type %s\n", obsparm->type);
+	return -EINVAL;
+ oversize:
+	printk(KERN_ERR
+	       "Parameter %s doesn't fit in %u chars.\n", kp->name, maxsize);
+	return -EINVAL;
+}
+
+static int obsolete_params(const char *name,
+			   char *args,
+			   struct obsolete_modparm obsparm[],
+			   unsigned int num,
+			   Elf_Shdr *sechdrs,
+			   unsigned int symindex,
+			   const char *strtab)
+{
+	struct kernel_param *kp;
+	unsigned int i;
+	int ret;
+
+	kp = kmalloc(sizeof(kp[0]) * num, GFP_KERNEL);
+	if (!kp)
+		return -ENOMEM;
+
+	for (i = 0; i < num; i++) {
+		char sym_name[128 + sizeof(MODULE_SYMBOL_PREFIX)];
+
+		snprintf(sym_name, sizeof(sym_name), "%s%s",
+			 MODULE_SYMBOL_PREFIX, obsparm[i].name);
+
+		kp[i].name = obsparm[i].name;
+		kp[i].perm = 000;
+		kp[i].set = set_obsolete;
+		kp[i].get = NULL;
+		obsparm[i].addr
+			= (void *)find_local_symbol(sechdrs, symindex, strtab,
+						    sym_name);
+		if (!obsparm[i].addr) {
+			printk("%s: falsely claims to have parameter %s\n",
+			       name, obsparm[i].name);
+			ret = -EINVAL;
+			goto out;
+		}
+		kp[i].arg = &obsparm[i];
+	}
+
+	ret = parse_args(name, args, kp, num, NULL);
+ out:
+	kfree(kp);
+	return ret;
+}
+#else
+static int obsolete_params(const char *name,
+			   char *args,
+			   struct obsolete_modparm obsparm[],
+			   unsigned int num,
+			   Elf_Shdr *sechdrs,
+			   unsigned int symindex,
+			   const char *strtab)
+{
+	if (num != 0)
+		printk(KERN_WARNING "%s: Ignoring obsolete parameters\n",
+		       name);
+	return 0;
+}
+#endif /* CONFIG_OBSOLETE_MODPARM */
+
+static const char vermagic[] = VERMAGIC_STRING;
+
+#ifdef CONFIG_MODVERSIONS
+static int check_version(Elf_Shdr *sechdrs,
+			 unsigned int versindex,
+			 const char *symname,
+			 struct module *mod, 
+			 const unsigned long *crc)
+{
+	unsigned int i, num_versions;
+	struct modversion_info *versions;
+
+	/* Exporting module didn't supply crcs?  OK, we're already tainted. */
+	if (!crc)
+		return 1;
+
+	versions = (void *) sechdrs[versindex].sh_addr;
+	num_versions = sechdrs[versindex].sh_size
+		/ sizeof(struct modversion_info);
+
+	for (i = 0; i < num_versions; i++) {
+		if (strcmp(versions[i].name, symname) != 0)
+			continue;
+
+		if (versions[i].crc == *crc)
+			return 1;
+		printk("%s: disagrees about version of symbol %s\n",
+		       mod->name, symname);
+		DEBUGP("Found checksum %lX vs module %lX\n",
+		       *crc, versions[i].crc);
+		return 0;
+	}
+	/* Not in module's version table.  OK, but that taints the kernel. */
+	if (!(tainted & TAINT_FORCED_MODULE)) {
+		printk("%s: no version for \"%s\" found: kernel tainted.\n",
+		       mod->name, symname);
+		tainted |= TAINT_FORCED_MODULE;
+	}
+	return 1;
+}
+
+static inline int check_modstruct_version(Elf_Shdr *sechdrs,
+					  unsigned int versindex,
+					  struct module *mod)
+{
+	const unsigned long *crc;
+	struct module *owner;
+
+	if (!__find_symbol("struct_module", &owner, &crc, 1))
+		BUG();
+	return check_version(sechdrs, versindex, "struct_module", mod,
+			     crc);
+}
+
+/* First part is kernel version, which we ignore. */
+static inline int same_magic(const char *amagic, const char *bmagic)
+{
+	amagic += strcspn(amagic, " ");
+	bmagic += strcspn(bmagic, " ");
+	return strcmp(amagic, bmagic) == 0;
+}
+#else
+static inline int check_version(Elf_Shdr *sechdrs,
+				unsigned int versindex,
+				const char *symname,
+				struct module *mod, 
+				const unsigned long *crc)
+{
+	return 1;
+}
+
+static inline int check_modstruct_version(Elf_Shdr *sechdrs,
+					  unsigned int versindex,
+					  struct module *mod)
+{
+	return 1;
+}
+
+static inline int same_magic(const char *amagic, const char *bmagic)
+{
+	return strcmp(amagic, bmagic) == 0;
+}
+#endif /* CONFIG_MODVERSIONS */
+
+/* Resolve a symbol for this module.  I.e. if we find one, record usage.
+   Must be holding module_mutex. */
+static unsigned long resolve_symbol(Elf_Shdr *sechdrs,
+				    unsigned int versindex,
+				    const char *name,
+				    struct module *mod)
+{
+	struct module *owner;
+	unsigned long ret;
+	const unsigned long *crc;
+
+	spin_lock_irq(&modlist_lock);
+	ret = __find_symbol(name, &owner, &crc, mod->license_gplok);
+	if (ret) {
+		/* use_module can fail due to OOM, or module unloading */
+		if (!check_version(sechdrs, versindex, name, mod, crc) ||
+		    !use_module(mod, owner))
+			ret = 0;
+	}
+	spin_unlock_irq(&modlist_lock);
+	return ret;
+}
+
+
+/*
+ * /sys/module/foo/sections stuff
+ * J. Corbet <corbet@lwn.net>
+ */
+#ifdef CONFIG_KALLSYMS
+static ssize_t module_sect_show(struct module_attribute *mattr,
+				struct module *mod, char *buf)
+{
+	struct module_sect_attr *sattr =
+		container_of(mattr, struct module_sect_attr, mattr);
+	return sprintf(buf, "0x%lx\n", sattr->address);
+}
+
+static void add_sect_attrs(struct module *mod, unsigned int nsect,
+		char *secstrings, Elf_Shdr *sechdrs)
+{
+	unsigned int nloaded = 0, i, size[2];
+	struct module_sect_attrs *sect_attrs;
+	struct module_sect_attr *sattr;
+	struct attribute **gattr;
+	
+	/* Count loaded sections and allocate structures */
+	for (i = 0; i < nsect; i++)
+		if (sechdrs[i].sh_flags & SHF_ALLOC)
+			nloaded++;
+	size[0] = ALIGN(sizeof(*sect_attrs)
+			+ nloaded * sizeof(sect_attrs->attrs[0]),
+			sizeof(sect_attrs->grp.attrs[0]));
+	size[1] = (nloaded + 1) * sizeof(sect_attrs->grp.attrs[0]);
+	if (! (sect_attrs = kmalloc(size[0] + size[1], GFP_KERNEL)))
+		return;
+
+	/* Setup section attributes. */
+	sect_attrs->grp.name = "sections";
+	sect_attrs->grp.attrs = (void *)sect_attrs + size[0];
+
+	sattr = &sect_attrs->attrs[0];
+	gattr = &sect_attrs->grp.attrs[0];
+	for (i = 0; i < nsect; i++) {
+		if (! (sechdrs[i].sh_flags & SHF_ALLOC))
+			continue;
+		sattr->address = sechdrs[i].sh_addr;
+		strlcpy(sattr->name, secstrings + sechdrs[i].sh_name,
+			MODULE_SECT_NAME_LEN);
+		sattr->mattr.show = module_sect_show;
+		sattr->mattr.store = NULL;
+		sattr->mattr.attr.name = sattr->name;
+		sattr->mattr.attr.owner = mod;
+		sattr->mattr.attr.mode = S_IRUGO;
+		*(gattr++) = &(sattr++)->mattr.attr;
+	}
+	*gattr = NULL;
+
+	if (sysfs_create_group(&mod->mkobj.kobj, &sect_attrs->grp))
+		goto out;
+
+	mod->sect_attrs = sect_attrs;
+	return;
+  out:
+	kfree(sect_attrs);
+}
+
+static void remove_sect_attrs(struct module *mod)
+{
+	if (mod->sect_attrs) {
+		sysfs_remove_group(&mod->mkobj.kobj,
+				   &mod->sect_attrs->grp);
+		/* We are positive that no one is using any sect attrs
+		 * at this point.  Deallocate immediately. */
+		kfree(mod->sect_attrs);
+		mod->sect_attrs = NULL;
+	}
+}
+
+
+#else
+static inline void add_sect_attrs(struct module *mod, unsigned int nsect,
+		char *sectstrings, Elf_Shdr *sechdrs)
+{
+}
+
+static inline void remove_sect_attrs(struct module *mod)
+{
+}
+#endif /* CONFIG_KALLSYMS */
+
+
+#ifdef CONFIG_MODULE_UNLOAD
+static inline int module_add_refcnt_attr(struct module *mod)
+{
+	return sysfs_create_file(&mod->mkobj.kobj, &refcnt.attr);
+}
+static void module_remove_refcnt_attr(struct module *mod)
+{
+	return sysfs_remove_file(&mod->mkobj.kobj, &refcnt.attr);
+}
+#else
+static inline int module_add_refcnt_attr(struct module *mod)
+{
+	return 0;
+}
+static void module_remove_refcnt_attr(struct module *mod)
+{
+}
+#endif
+
+
+static int mod_sysfs_setup(struct module *mod,
+			   struct kernel_param *kparam,
+			   unsigned int num_params)
+{
+	int err;
+
+	memset(&mod->mkobj.kobj, 0, sizeof(mod->mkobj.kobj));
+	err = kobject_set_name(&mod->mkobj.kobj, "%s", mod->name);
+	if (err)
+		goto out;
+	kobj_set_kset_s(&mod->mkobj, module_subsys);
+	mod->mkobj.mod = mod;
+	err = kobject_register(&mod->mkobj.kobj);
+	if (err)
+		goto out;
+
+	err = module_add_refcnt_attr(mod);
+	if (err)
+		goto out_unreg;
+
+	err = module_param_sysfs_setup(mod, kparam, num_params);
+	if (err)
+		goto out_unreg;
+
+	return 0;
+
+out_unreg:
+	kobject_unregister(&mod->mkobj.kobj);
+out:
+	return err;
+}
+
+static void mod_kobject_remove(struct module *mod)
+{
+	module_remove_refcnt_attr(mod);
+	module_param_sysfs_remove(mod);
+
+	kobject_unregister(&mod->mkobj.kobj);
+}
+
+/*
+ * unlink the module with the whole machine is stopped with interrupts off
+ * - this defends against kallsyms not taking locks
+ */
+static int __unlink_module(void *_mod)
+{
+	struct module *mod = _mod;
+	list_del(&mod->list);
+	return 0;
+}
+
+/* Free a module, remove from lists, etc (must hold module mutex). */
+static void free_module(struct module *mod)
+{
+	/* Delete from various lists */
+	stop_machine_run(__unlink_module, mod, NR_CPUS);
+	remove_sect_attrs(mod);
+	mod_kobject_remove(mod);
+
+	/* Arch-specific cleanup. */
+	module_arch_cleanup(mod);
+
+	/* Module unload stuff */
+	module_unload_free(mod);
+
+	/* This may be NULL, but that's OK */
+	module_free(mod, mod->module_init);
+	kfree(mod->args);
+	if (mod->percpu)
+		percpu_modfree(mod->percpu);
+
+	/* Finally, free the core (containing the module structure) */
+	module_free(mod, mod->module_core);
+}
+
+void *__symbol_get(const char *symbol)
+{
+	struct module *owner;
+	unsigned long value, flags;
+	const unsigned long *crc;
+
+	spin_lock_irqsave(&modlist_lock, flags);
+	value = __find_symbol(symbol, &owner, &crc, 1);
+	if (value && !strong_try_module_get(owner))
+		value = 0;
+	spin_unlock_irqrestore(&modlist_lock, flags);
+
+	return (void *)value;
+}
+EXPORT_SYMBOL_GPL(__symbol_get);
+
+/* Change all symbols so that sh_value encodes the pointer directly. */
+static int simplify_symbols(Elf_Shdr *sechdrs,
+			    unsigned int symindex,
+			    const char *strtab,
+			    unsigned int versindex,
+			    unsigned int pcpuindex,
+			    struct module *mod)
+{
+	Elf_Sym *sym = (void *)sechdrs[symindex].sh_addr;
+	unsigned long secbase;
+	unsigned int i, n = sechdrs[symindex].sh_size / sizeof(Elf_Sym);
+	int ret = 0;
+
+	for (i = 1; i < n; i++) {
+		switch (sym[i].st_shndx) {
+		case SHN_COMMON:
+			/* We compiled with -fno-common.  These are not
+			   supposed to happen.  */
+			DEBUGP("Common symbol: %s\n", strtab + sym[i].st_name);
+			printk("%s: please compile with -fno-common\n",
+			       mod->name);
+			ret = -ENOEXEC;
+			break;
+
+		case SHN_ABS:
+			/* Don't need to do anything */
+			DEBUGP("Absolute symbol: 0x%08lx\n",
+			       (long)sym[i].st_value);
+			break;
+
+		case SHN_UNDEF:
+			sym[i].st_value
+			  = resolve_symbol(sechdrs, versindex,
+					   strtab + sym[i].st_name, mod);
+
+			/* Ok if resolved.  */
+			if (sym[i].st_value != 0)
+				break;
+			/* Ok if weak.  */
+			if (ELF_ST_BIND(sym[i].st_info) == STB_WEAK)
+				break;
+
+			printk(KERN_WARNING "%s: Unknown symbol %s\n",
+			       mod->name, strtab + sym[i].st_name);
+			ret = -ENOENT;
+			break;
+
+		default:
+			/* Divert to percpu allocation if a percpu var. */
+			if (sym[i].st_shndx == pcpuindex)
+				secbase = (unsigned long)mod->percpu;
+			else
+				secbase = sechdrs[sym[i].st_shndx].sh_addr;
+			sym[i].st_value += secbase;
+			break;
+		}
+	}
+
+	return ret;
+}
+
+/* Update size with this section: return offset. */
+static long get_offset(unsigned long *size, Elf_Shdr *sechdr)
+{
+	long ret;
+
+	ret = ALIGN(*size, sechdr->sh_addralign ?: 1);
+	*size = ret + sechdr->sh_size;
+	return ret;
+}
+
+/* Lay out the SHF_ALLOC sections in a way not dissimilar to how ld
+   might -- code, read-only data, read-write data, small data.  Tally
+   sizes, and place the offsets into sh_entsize fields: high bit means it
+   belongs in init. */
+static void layout_sections(struct module *mod,
+			    const Elf_Ehdr *hdr,
+			    Elf_Shdr *sechdrs,
+			    const char *secstrings)
+{
+	static unsigned long const masks[][2] = {
+		/* NOTE: all executable code must be the first section
+		 * in this array; otherwise modify the text_size
+		 * finder in the two loops below */
+		{ SHF_EXECINSTR | SHF_ALLOC, ARCH_SHF_SMALL },
+		{ SHF_ALLOC, SHF_WRITE | ARCH_SHF_SMALL },
+		{ SHF_WRITE | SHF_ALLOC, ARCH_SHF_SMALL },
+		{ ARCH_SHF_SMALL | SHF_ALLOC, 0 }
+	};
+	unsigned int m, i;
+
+	for (i = 0; i < hdr->e_shnum; i++)
+		sechdrs[i].sh_entsize = ~0UL;
+
+	DEBUGP("Core section allocation order:\n");
+	for (m = 0; m < ARRAY_SIZE(masks); ++m) {
+		for (i = 0; i < hdr->e_shnum; ++i) {
+			Elf_Shdr *s = &sechdrs[i];
+
+			if ((s->sh_flags & masks[m][0]) != masks[m][0]
+			    || (s->sh_flags & masks[m][1])
+			    || s->sh_entsize != ~0UL
+			    || strncmp(secstrings + s->sh_name,
+				       ".init", 5) == 0)
+				continue;
+			s->sh_entsize = get_offset(&mod->core_size, s);
+			DEBUGP("\t%s\n", secstrings + s->sh_name);
+		}
+		if (m == 0)
+			mod->core_text_size = mod->core_size;
+	}
+
+	DEBUGP("Init section allocation order:\n");
+	for (m = 0; m < ARRAY_SIZE(masks); ++m) {
+		for (i = 0; i < hdr->e_shnum; ++i) {
+			Elf_Shdr *s = &sechdrs[i];
+
+			if ((s->sh_flags & masks[m][0]) != masks[m][0]
+			    || (s->sh_flags & masks[m][1])
+			    || s->sh_entsize != ~0UL
+			    || strncmp(secstrings + s->sh_name,
+				       ".init", 5) != 0)
+				continue;
+			s->sh_entsize = (get_offset(&mod->init_size, s)
+					 | INIT_OFFSET_MASK);
+			DEBUGP("\t%s\n", secstrings + s->sh_name);
+		}
+		if (m == 0)
+			mod->init_text_size = mod->init_size;
+	}
+}
+
+static inline int license_is_gpl_compatible(const char *license)
+{
+	return (strcmp(license, "GPL") == 0
+		|| strcmp(license, "GPL v2") == 0
+		|| strcmp(license, "GPL and additional rights") == 0
+		|| strcmp(license, "Dual BSD/GPL") == 0
+		|| strcmp(license, "Dual MPL/GPL") == 0);
+}
+
+static void set_license(struct module *mod, const char *license)
+{
+	if (!license)
+		license = "unspecified";
+
+	mod->license_gplok = license_is_gpl_compatible(license);
+	if (!mod->license_gplok && !(tainted & TAINT_PROPRIETARY_MODULE)) {
+		printk(KERN_WARNING "%s: module license '%s' taints kernel.\n",
+		       mod->name, license);
+		tainted |= TAINT_PROPRIETARY_MODULE;
+	}
+}
+
+/* Parse tag=value strings from .modinfo section */
+static char *next_string(char *string, unsigned long *secsize)
+{
+	/* Skip non-zero chars */
+	while (string[0]) {
+		string++;
+		if ((*secsize)-- <= 1)
+			return NULL;
+	}
+
+	/* Skip any zero padding. */
+	while (!string[0]) {
+		string++;
+		if ((*secsize)-- <= 1)
+			return NULL;
+	}
+	return string;
+}
+
+static char *get_modinfo(Elf_Shdr *sechdrs,
+			 unsigned int info,
+			 const char *tag)
+{
+	char *p;
+	unsigned int taglen = strlen(tag);
+	unsigned long size = sechdrs[info].sh_size;
+
+	for (p = (char *)sechdrs[info].sh_addr; p; p = next_string(p, &size)) {
+		if (strncmp(p, tag, taglen) == 0 && p[taglen] == '=')
+			return p + taglen + 1;
+	}
+	return NULL;
+}
+
+#ifdef CONFIG_KALLSYMS
+int is_exported(const char *name, const struct module *mod)
+{
+	unsigned int i;
+
+	if (!mod) {
+		for (i = 0; __start___ksymtab+i < __stop___ksymtab; i++)
+			if (strcmp(__start___ksymtab[i].name, name) == 0)
+				return 1;
+		return 0;
+	}
+	for (i = 0; i < mod->num_syms; i++)
+		if (strcmp(mod->syms[i].name, name) == 0)
+			return 1;
+	return 0;
+}
+
+/* As per nm */
+static char elf_type(const Elf_Sym *sym,
+		     Elf_Shdr *sechdrs,
+		     const char *secstrings,
+		     struct module *mod)
+{
+	if (ELF_ST_BIND(sym->st_info) == STB_WEAK) {
+		if (ELF_ST_TYPE(sym->st_info) == STT_OBJECT)
+			return 'v';
+		else
+			return 'w';
+	}
+	if (sym->st_shndx == SHN_UNDEF)
+		return 'U';
+	if (sym->st_shndx == SHN_ABS)
+		return 'a';
+	if (sym->st_shndx >= SHN_LORESERVE)
+		return '?';
+	if (sechdrs[sym->st_shndx].sh_flags & SHF_EXECINSTR)
+		return 't';
+	if (sechdrs[sym->st_shndx].sh_flags & SHF_ALLOC
+	    && sechdrs[sym->st_shndx].sh_type != SHT_NOBITS) {
+		if (!(sechdrs[sym->st_shndx].sh_flags & SHF_WRITE))
+			return 'r';
+		else if (sechdrs[sym->st_shndx].sh_flags & ARCH_SHF_SMALL)
+			return 'g';
+		else
+			return 'd';
+	}
+	if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) {
+		if (sechdrs[sym->st_shndx].sh_flags & ARCH_SHF_SMALL)
+			return 's';
+		else
+			return 'b';
+	}
+	if (strncmp(secstrings + sechdrs[sym->st_shndx].sh_name,
+		    ".debug", strlen(".debug")) == 0)
+		return 'n';
+	return '?';
+}
+
+static void add_kallsyms(struct module *mod,
+			 Elf_Shdr *sechdrs,
+			 unsigned int symindex,
+			 unsigned int strindex,
+			 const char *secstrings)
+{
+	unsigned int i;
+
+	mod->symtab = (void *)sechdrs[symindex].sh_addr;
+	mod->num_symtab = sechdrs[symindex].sh_size / sizeof(Elf_Sym);
+	mod->strtab = (void *)sechdrs[strindex].sh_addr;
+
+	/* Set types up while we still have access to sections. */
+	for (i = 0; i < mod->num_symtab; i++)
+		mod->symtab[i].st_info
+			= elf_type(&mod->symtab[i], sechdrs, secstrings, mod);
+}
+#else
+static inline void add_kallsyms(struct module *mod,
+				Elf_Shdr *sechdrs,
+				unsigned int symindex,
+				unsigned int strindex,
+				const char *secstrings)
+{
+}
+#endif /* CONFIG_KALLSYMS */
+
+/* Allocate and load the module: note that size of section 0 is always
+   zero, and we rely on this for optional sections. */
+static struct module *load_module(void __user *umod,
+				  unsigned long len,
+				  const char __user *uargs)
+{
+	Elf_Ehdr *hdr;
+	Elf_Shdr *sechdrs;
+	char *secstrings, *args, *modmagic, *strtab = NULL;
+	unsigned int i, symindex = 0, strindex = 0, setupindex, exindex,
+		exportindex, modindex, obsparmindex, infoindex, gplindex,
+		crcindex, gplcrcindex, versindex, pcpuindex;
+	long arglen;
+	struct module *mod;
+	long err = 0;
+	void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
+	struct exception_table_entry *extable;
+
+	DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n",
+	       umod, len, uargs);
+	if (len < sizeof(*hdr))
+		return ERR_PTR(-ENOEXEC);
+
+	/* Suck in entire file: we'll want most of it. */
+	/* vmalloc barfs on "unusual" numbers.  Check here */
+	if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL)
+		return ERR_PTR(-ENOMEM);
+	if (copy_from_user(hdr, umod, len) != 0) {
+		err = -EFAULT;
+		goto free_hdr;
+	}
+
+	/* Sanity checks against insmoding binaries or wrong arch,
+           weird elf version */
+	if (memcmp(hdr->e_ident, ELFMAG, 4) != 0
+	    || hdr->e_type != ET_REL
+	    || !elf_check_arch(hdr)
+	    || hdr->e_shentsize != sizeof(*sechdrs)) {
+		err = -ENOEXEC;
+		goto free_hdr;
+	}
+
+	if (len < hdr->e_shoff + hdr->e_shnum * sizeof(Elf_Shdr))
+		goto truncated;
+
+	/* Convenience variables */
+	sechdrs = (void *)hdr + hdr->e_shoff;
+	secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
+	sechdrs[0].sh_addr = 0;
+
+	for (i = 1; i < hdr->e_shnum; i++) {
+		if (sechdrs[i].sh_type != SHT_NOBITS
+		    && len < sechdrs[i].sh_offset + sechdrs[i].sh_size)
+			goto truncated;
+
+		/* Mark all sections sh_addr with their address in the
+		   temporary image. */
+		sechdrs[i].sh_addr = (size_t)hdr + sechdrs[i].sh_offset;
+
+		/* Internal symbols and strings. */
+		if (sechdrs[i].sh_type == SHT_SYMTAB) {
+			symindex = i;
+			strindex = sechdrs[i].sh_link;
+			strtab = (char *)hdr + sechdrs[strindex].sh_offset;
+		}
+#ifndef CONFIG_MODULE_UNLOAD
+		/* Don't load .exit sections */
+		if (strncmp(secstrings+sechdrs[i].sh_name, ".exit", 5) == 0)
+			sechdrs[i].sh_flags &= ~(unsigned long)SHF_ALLOC;
+#endif
+	}
+
+	modindex = find_sec(hdr, sechdrs, secstrings,
+			    ".gnu.linkonce.this_module");
+	if (!modindex) {
+		printk(KERN_WARNING "No module found in object\n");
+		err = -ENOEXEC;
+		goto free_hdr;
+	}
+	mod = (void *)sechdrs[modindex].sh_addr;
+
+	if (symindex == 0) {
+		printk(KERN_WARNING "%s: module has no symbols (stripped?)\n",
+		       mod->name);
+		err = -ENOEXEC;
+		goto free_hdr;
+	}
+
+	/* Optional sections */
+	exportindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab");
+	gplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl");
+	crcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab");
+	gplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl");
+	setupindex = find_sec(hdr, sechdrs, secstrings, "__param");
+	exindex = find_sec(hdr, sechdrs, secstrings, "__ex_table");
+	obsparmindex = find_sec(hdr, sechdrs, secstrings, "__obsparm");
+	versindex = find_sec(hdr, sechdrs, secstrings, "__versions");
+	infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo");
+	pcpuindex = find_pcpusec(hdr, sechdrs, secstrings);
+
+	/* Don't keep modinfo section */
+	sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
+#ifdef CONFIG_KALLSYMS
+	/* Keep symbol and string tables for decoding later. */
+	sechdrs[symindex].sh_flags |= SHF_ALLOC;
+	sechdrs[strindex].sh_flags |= SHF_ALLOC;
+#endif
+
+	/* Check module struct version now, before we try to use module. */
+	if (!check_modstruct_version(sechdrs, versindex, mod)) {
+		err = -ENOEXEC;
+		goto free_hdr;
+	}
+
+	modmagic = get_modinfo(sechdrs, infoindex, "vermagic");
+	/* This is allowed: modprobe --force will invalidate it. */
+	if (!modmagic) {
+		tainted |= TAINT_FORCED_MODULE;
+		printk(KERN_WARNING "%s: no version magic, tainting kernel.\n",
+		       mod->name);
+	} else if (!same_magic(modmagic, vermagic)) {
+		printk(KERN_ERR "%s: version magic '%s' should be '%s'\n",
+		       mod->name, modmagic, vermagic);
+		err = -ENOEXEC;
+		goto free_hdr;
+	}
+
+	/* Now copy in args */
+	arglen = strlen_user(uargs);
+	if (!arglen) {
+		err = -EFAULT;
+		goto free_hdr;
+	}
+	args = kmalloc(arglen, GFP_KERNEL);
+	if (!args) {
+		err = -ENOMEM;
+		goto free_hdr;
+	}
+	if (copy_from_user(args, uargs, arglen) != 0) {
+		err = -EFAULT;
+		goto free_mod;
+	}
+
+	if (find_module(mod->name)) {
+		err = -EEXIST;
+		goto free_mod;
+	}
+
+	mod->state = MODULE_STATE_COMING;
+
+	/* Allow arches to frob section contents and sizes.  */
+	err = module_frob_arch_sections(hdr, sechdrs, secstrings, mod);
+	if (err < 0)
+		goto free_mod;
+
+	if (pcpuindex) {
+		/* We have a special allocation for this section. */
+		percpu = percpu_modalloc(sechdrs[pcpuindex].sh_size,
+					 sechdrs[pcpuindex].sh_addralign);
+		if (!percpu) {
+			err = -ENOMEM;
+			goto free_mod;
+		}
+		sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
+		mod->percpu = percpu;
+	}
+
+	/* Determine total sizes, and put offsets in sh_entsize.  For now
+	   this is done generically; there doesn't appear to be any
+	   special cases for the architectures. */
+	layout_sections(mod, hdr, sechdrs, secstrings);
+
+	/* Do the allocs. */
+	ptr = module_alloc(mod->core_size);
+	if (!ptr) {
+		err = -ENOMEM;
+		goto free_percpu;
+	}
+	memset(ptr, 0, mod->core_size);
+	mod->module_core = ptr;
+
+	ptr = module_alloc(mod->init_size);
+	if (!ptr && mod->init_size) {
+		err = -ENOMEM;
+		goto free_core;
+	}
+	memset(ptr, 0, mod->init_size);
+	mod->module_init = ptr;
+
+	/* Transfer each section which specifies SHF_ALLOC */
+	DEBUGP("final section addresses:\n");
+	for (i = 0; i < hdr->e_shnum; i++) {
+		void *dest;
+
+		if (!(sechdrs[i].sh_flags & SHF_ALLOC))
+			continue;
+
+		if (sechdrs[i].sh_entsize & INIT_OFFSET_MASK)
+			dest = mod->module_init
+				+ (sechdrs[i].sh_entsize & ~INIT_OFFSET_MASK);
+		else
+			dest = mod->module_core + sechdrs[i].sh_entsize;
+
+		if (sechdrs[i].sh_type != SHT_NOBITS)
+			memcpy(dest, (void *)sechdrs[i].sh_addr,
+			       sechdrs[i].sh_size);
+		/* Update sh_addr to point to copy in image. */
+		sechdrs[i].sh_addr = (unsigned long)dest;
+		DEBUGP("\t0x%lx %s\n", sechdrs[i].sh_addr, secstrings + sechdrs[i].sh_name);
+	}
+	/* Module has been moved. */
+	mod = (void *)sechdrs[modindex].sh_addr;
+
+	/* Now we've moved module, initialize linked lists, etc. */
+	module_unload_init(mod);
+
+	/* Set up license info based on the info section */
+	set_license(mod, get_modinfo(sechdrs, infoindex, "license"));
+
+	/* Fix up syms, so that st_value is a pointer to location. */
+	err = simplify_symbols(sechdrs, symindex, strtab, versindex, pcpuindex,
+			       mod);
+	if (err < 0)
+		goto cleanup;
+
+	/* Set up EXPORTed & EXPORT_GPLed symbols (section 0 is 0 length) */
+	mod->num_syms = sechdrs[exportindex].sh_size / sizeof(*mod->syms);
+	mod->syms = (void *)sechdrs[exportindex].sh_addr;
+	if (crcindex)
+		mod->crcs = (void *)sechdrs[crcindex].sh_addr;
+	mod->num_gpl_syms = sechdrs[gplindex].sh_size / sizeof(*mod->gpl_syms);
+	mod->gpl_syms = (void *)sechdrs[gplindex].sh_addr;
+	if (gplcrcindex)
+		mod->gpl_crcs = (void *)sechdrs[gplcrcindex].sh_addr;
+
+#ifdef CONFIG_MODVERSIONS
+	if ((mod->num_syms && !crcindex) || 
+	    (mod->num_gpl_syms && !gplcrcindex)) {
+		printk(KERN_WARNING "%s: No versions for exported symbols."
+		       " Tainting kernel.\n", mod->name);
+		tainted |= TAINT_FORCED_MODULE;
+	}
+#endif
+
+	/* Now do relocations. */
+	for (i = 1; i < hdr->e_shnum; i++) {
+		const char *strtab = (char *)sechdrs[strindex].sh_addr;
+		unsigned int info = sechdrs[i].sh_info;
+
+		/* Not a valid relocation section? */
+		if (info >= hdr->e_shnum)
+			continue;
+
+		/* Don't bother with non-allocated sections */
+		if (!(sechdrs[info].sh_flags & SHF_ALLOC))
+			continue;
+
+		if (sechdrs[i].sh_type == SHT_REL)
+			err = apply_relocate(sechdrs, strtab, symindex, i,mod);
+		else if (sechdrs[i].sh_type == SHT_RELA)
+			err = apply_relocate_add(sechdrs, strtab, symindex, i,
+						 mod);
+		if (err < 0)
+			goto cleanup;
+	}
+
+  	/* Set up and sort exception table */
+	mod->num_exentries = sechdrs[exindex].sh_size / sizeof(*mod->extable);
+	mod->extable = extable = (void *)sechdrs[exindex].sh_addr;
+	sort_extable(extable, extable + mod->num_exentries);
+
+	/* Finally, copy percpu area over. */
+	percpu_modcopy(mod->percpu, (void *)sechdrs[pcpuindex].sh_addr,
+		       sechdrs[pcpuindex].sh_size);
+
+	add_kallsyms(mod, sechdrs, symindex, strindex, secstrings);
+
+	err = module_finalize(hdr, sechdrs, mod);
+	if (err < 0)
+		goto cleanup;
+
+	mod->args = args;
+	if (obsparmindex) {
+		err = obsolete_params(mod->name, mod->args,
+				      (struct obsolete_modparm *)
+				      sechdrs[obsparmindex].sh_addr,
+				      sechdrs[obsparmindex].sh_size
+				      / sizeof(struct obsolete_modparm),
+				      sechdrs, symindex,
+				      (char *)sechdrs[strindex].sh_addr);
+		if (setupindex)
+			printk(KERN_WARNING "%s: Ignoring new-style "
+			       "parameters in presence of obsolete ones\n",
+			       mod->name);
+	} else {
+		/* Size of section 0 is 0, so this works well if no params */
+		err = parse_args(mod->name, mod->args,
+				 (struct kernel_param *)
+				 sechdrs[setupindex].sh_addr,
+				 sechdrs[setupindex].sh_size
+				 / sizeof(struct kernel_param),
+				 NULL);
+	}
+	if (err < 0)
+		goto arch_cleanup;
+
+	err = mod_sysfs_setup(mod, 
+			      (struct kernel_param *)
+			      sechdrs[setupindex].sh_addr,
+			      sechdrs[setupindex].sh_size
+			      / sizeof(struct kernel_param));
+	if (err < 0)
+		goto arch_cleanup;
+	add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
+
+	/* Get rid of temporary copy */
+	vfree(hdr);
+
+	/* Done! */
+	return mod;
+
+ arch_cleanup:
+	module_arch_cleanup(mod);
+ cleanup:
+	module_unload_free(mod);
+	module_free(mod, mod->module_init);
+ free_core:
+	module_free(mod, mod->module_core);
+ free_percpu:
+	if (percpu)
+		percpu_modfree(percpu);
+ free_mod:
+	kfree(args);
+ free_hdr:
+	vfree(hdr);
+	if (err < 0) return ERR_PTR(err);
+	else return ptr;
+
+ truncated:
+	printk(KERN_ERR "Module len %lu truncated\n", len);
+	err = -ENOEXEC;
+	goto free_hdr;
+}
+
+/*
+ * link the module with the whole machine is stopped with interrupts off
+ * - this defends against kallsyms not taking locks
+ */
+static int __link_module(void *_mod)
+{
+	struct module *mod = _mod;
+	list_add(&mod->list, &modules);
+	return 0;
+}
+
+/* This is where the real work happens */
+asmlinkage long
+sys_init_module(void __user *umod,
+		unsigned long len,
+		const char __user *uargs)
+{
+	struct module *mod;
+	int ret = 0;
+
+	/* Must have permission */
+	if (!capable(CAP_SYS_MODULE))
+		return -EPERM;
+
+	/* Only one module load at a time, please */
+	if (down_interruptible(&module_mutex) != 0)
+		return -EINTR;
+
+	/* Do all the hard work */
+	mod = load_module(umod, len, uargs);
+	if (IS_ERR(mod)) {
+		up(&module_mutex);
+		return PTR_ERR(mod);
+	}
+
+	/* Flush the instruction cache, since we've played with text */
+	if (mod->module_init)
+		flush_icache_range((unsigned long)mod->module_init,
+				   (unsigned long)mod->module_init
+				   + mod->init_size);
+	flush_icache_range((unsigned long)mod->module_core,
+			   (unsigned long)mod->module_core + mod->core_size);
+
+	/* Now sew it into the lists.  They won't access us, since
+           strong_try_module_get() will fail. */
+	stop_machine_run(__link_module, mod, NR_CPUS);
+
+	/* Drop lock so they can recurse */
+	up(&module_mutex);
+
+	down(&notify_mutex);
+	notifier_call_chain(&module_notify_list, MODULE_STATE_COMING, mod);
+	up(&notify_mutex);
+
+	/* Start the module */
+	if (mod->init != NULL)
+		ret = mod->init();
+	if (ret < 0) {
+		/* Init routine failed: abort.  Try to protect us from
+                   buggy refcounters. */
+		mod->state = MODULE_STATE_GOING;
+		synchronize_kernel();
+		if (mod->unsafe)
+			printk(KERN_ERR "%s: module is now stuck!\n",
+			       mod->name);
+		else {
+			module_put(mod);
+			down(&module_mutex);
+			free_module(mod);
+			up(&module_mutex);
+		}
+		return ret;
+	}
+
+	/* Now it's a first class citizen! */
+	down(&module_mutex);
+	mod->state = MODULE_STATE_LIVE;
+	/* Drop initial reference. */
+	module_put(mod);
+	module_free(mod, mod->module_init);
+	mod->module_init = NULL;
+	mod->init_size = 0;
+	mod->init_text_size = 0;
+	up(&module_mutex);
+
+	return 0;
+}
+
+static inline int within(unsigned long addr, void *start, unsigned long size)
+{
+	return ((void *)addr >= start && (void *)addr < start + size);
+}
+
+#ifdef CONFIG_KALLSYMS
+/*
+ * This ignores the intensely annoying "mapping symbols" found
+ * in ARM ELF files: $a, $t and $d.
+ */
+static inline int is_arm_mapping_symbol(const char *str)
+{
+	return str[0] == '$' && strchr("atd", str[1]) 
+	       && (str[2] == '\0' || str[2] == '.');
+}
+
+static const char *get_ksymbol(struct module *mod,
+			       unsigned long addr,
+			       unsigned long *size,
+			       unsigned long *offset)
+{
+	unsigned int i, best = 0;
+	unsigned long nextval;
+
+	/* At worse, next value is at end of module */
+	if (within(addr, mod->module_init, mod->init_size))
+		nextval = (unsigned long)mod->module_init+mod->init_text_size;
+	else 
+		nextval = (unsigned long)mod->module_core+mod->core_text_size;
+
+	/* Scan for closest preceeding symbol, and next symbol. (ELF
+           starts real symbols at 1). */
+	for (i = 1; i < mod->num_symtab; i++) {
+		if (mod->symtab[i].st_shndx == SHN_UNDEF)
+			continue;
+
+		/* We ignore unnamed symbols: they're uninformative
+		 * and inserted at a whim. */
+		if (mod->symtab[i].st_value <= addr
+		    && mod->symtab[i].st_value > mod->symtab[best].st_value
+		    && *(mod->strtab + mod->symtab[i].st_name) != '\0'
+		    && !is_arm_mapping_symbol(mod->strtab + mod->symtab[i].st_name))
+			best = i;
+		if (mod->symtab[i].st_value > addr
+		    && mod->symtab[i].st_value < nextval
+		    && *(mod->strtab + mod->symtab[i].st_name) != '\0'
+		    && !is_arm_mapping_symbol(mod->strtab + mod->symtab[i].st_name))
+			nextval = mod->symtab[i].st_value;
+	}
+
+	if (!best)
+		return NULL;
+
+	*size = nextval - mod->symtab[best].st_value;
+	*offset = addr - mod->symtab[best].st_value;
+	return mod->strtab + mod->symtab[best].st_name;
+}
+
+/* For kallsyms to ask for address resolution.  NULL means not found.
+   We don't lock, as this is used for oops resolution and races are a
+   lesser concern. */
+const char *module_address_lookup(unsigned long addr,
+				  unsigned long *size,
+				  unsigned long *offset,
+				  char **modname)
+{
+	struct module *mod;
+
+	list_for_each_entry(mod, &modules, list) {
+		if (within(addr, mod->module_init, mod->init_size)
+		    || within(addr, mod->module_core, mod->core_size)) {
+			*modname = mod->name;
+			return get_ksymbol(mod, addr, size, offset);
+		}
+	}
+	return NULL;
+}
+
+struct module *module_get_kallsym(unsigned int symnum,
+				  unsigned long *value,
+				  char *type,
+				  char namebuf[128])
+{
+	struct module *mod;
+
+	down(&module_mutex);
+	list_for_each_entry(mod, &modules, list) {
+		if (symnum < mod->num_symtab) {
+			*value = mod->symtab[symnum].st_value;
+			*type = mod->symtab[symnum].st_info;
+			strncpy(namebuf,
+				mod->strtab + mod->symtab[symnum].st_name,
+				127);
+			up(&module_mutex);
+			return mod;
+		}
+		symnum -= mod->num_symtab;
+	}
+	up(&module_mutex);
+	return NULL;
+}
+
+static unsigned long mod_find_symname(struct module *mod, const char *name)
+{
+	unsigned int i;
+
+	for (i = 0; i < mod->num_symtab; i++)
+		if (strcmp(name, mod->strtab+mod->symtab[i].st_name) == 0)
+			return mod->symtab[i].st_value;
+	return 0;
+}
+
+/* Look for this name: can be of form module:name. */
+unsigned long module_kallsyms_lookup_name(const char *name)
+{
+	struct module *mod;
+	char *colon;
+	unsigned long ret = 0;
+
+	/* Don't lock: we're in enough trouble already. */
+	if ((colon = strchr(name, ':')) != NULL) {
+		*colon = '\0';
+		if ((mod = find_module(name)) != NULL)
+			ret = mod_find_symname(mod, colon+1);
+		*colon = ':';
+	} else {
+		list_for_each_entry(mod, &modules, list)
+			if ((ret = mod_find_symname(mod, name)) != 0)
+				break;
+	}
+	return ret;
+}
+#endif /* CONFIG_KALLSYMS */
+
+/* Called by the /proc file system to return a list of modules. */
+static void *m_start(struct seq_file *m, loff_t *pos)
+{
+	struct list_head *i;
+	loff_t n = 0;
+
+	down(&module_mutex);
+	list_for_each(i, &modules) {
+		if (n++ == *pos)
+			break;
+	}
+	if (i == &modules)
+		return NULL;
+	return i;
+}
+
+static void *m_next(struct seq_file *m, void *p, loff_t *pos)
+{
+	struct list_head *i = p;
+	(*pos)++;
+	if (i->next == &modules)
+		return NULL;
+	return i->next;
+}
+
+static void m_stop(struct seq_file *m, void *p)
+{
+	up(&module_mutex);
+}
+
+static int m_show(struct seq_file *m, void *p)
+{
+	struct module *mod = list_entry(p, struct module, list);
+	seq_printf(m, "%s %lu",
+		   mod->name, mod->init_size + mod->core_size);
+	print_unload_info(m, mod);
+
+	/* Informative for users. */
+	seq_printf(m, " %s",
+		   mod->state == MODULE_STATE_GOING ? "Unloading":
+		   mod->state == MODULE_STATE_COMING ? "Loading":
+		   "Live");
+	/* Used by oprofile and other similar tools. */
+	seq_printf(m, " 0x%p", mod->module_core);
+
+	seq_printf(m, "\n");
+	return 0;
+}
+
+/* Format: modulename size refcount deps address
+
+   Where refcount is a number or -, and deps is a comma-separated list
+   of depends or -.
+*/
+struct seq_operations modules_op = {
+	.start	= m_start,
+	.next	= m_next,
+	.stop	= m_stop,
+	.show	= m_show
+};
+
+/* Given an address, look for it in the module exception tables. */
+const struct exception_table_entry *search_module_extables(unsigned long addr)
+{
+	unsigned long flags;
+	const struct exception_table_entry *e = NULL;
+	struct module *mod;
+
+	spin_lock_irqsave(&modlist_lock, flags);
+	list_for_each_entry(mod, &modules, list) {
+		if (mod->num_exentries == 0)
+			continue;
+				
+		e = search_extable(mod->extable,
+				   mod->extable + mod->num_exentries - 1,
+				   addr);
+		if (e)
+			break;
+	}
+	spin_unlock_irqrestore(&modlist_lock, flags);
+
+	/* Now, if we found one, we are running inside it now, hence
+           we cannot unload the module, hence no refcnt needed. */
+	return e;
+}
+
+/* Is this a valid kernel address?  We don't grab the lock: we are oopsing. */
+struct module *__module_text_address(unsigned long addr)
+{
+	struct module *mod;
+
+	list_for_each_entry(mod, &modules, list)
+		if (within(addr, mod->module_init, mod->init_text_size)
+		    || within(addr, mod->module_core, mod->core_text_size))
+			return mod;
+	return NULL;
+}
+
+struct module *module_text_address(unsigned long addr)
+{
+	struct module *mod;
+	unsigned long flags;
+
+	spin_lock_irqsave(&modlist_lock, flags);
+	mod = __module_text_address(addr);
+	spin_unlock_irqrestore(&modlist_lock, flags);
+
+	return mod;
+}
+
+/* Don't grab lock, we're oopsing. */
+void print_modules(void)
+{
+	struct module *mod;
+
+	printk("Modules linked in:");
+	list_for_each_entry(mod, &modules, list)
+		printk(" %s", mod->name);
+	printk("\n");
+}
+
+void module_add_driver(struct module *mod, struct device_driver *drv)
+{
+	if (!mod || !drv)
+		return;
+
+	/* Don't check return code; this call is idempotent */
+	sysfs_create_link(&drv->kobj, &mod->mkobj.kobj, "module");
+}
+EXPORT_SYMBOL(module_add_driver);
+
+void module_remove_driver(struct device_driver *drv)
+{
+	if (!drv)
+		return;
+	sysfs_remove_link(&drv->kobj, "module");
+}
+EXPORT_SYMBOL(module_remove_driver);
+
+#ifdef CONFIG_MODVERSIONS
+/* Generate the signature for struct module here, too, for modversions. */
+void struct_module(struct module *mod) { return; }
+EXPORT_SYMBOL(struct_module);
+#endif
diff --git a/kernel/panic.c b/kernel/panic.c
new file mode 100644
index 000000000000..0fa3f3a66fb6
--- /dev/null
+++ b/kernel/panic.c
@@ -0,0 +1,157 @@
+/*
+ *  linux/kernel/panic.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ */
+
+/*
+ * This function is used through-out the kernel (including mm and fs)
+ * to indicate a major problem.
+ */
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/reboot.h>
+#include <linux/notifier.h>
+#include <linux/init.h>
+#include <linux/sysrq.h>
+#include <linux/interrupt.h>
+#include <linux/nmi.h>
+
+int panic_timeout;
+int panic_on_oops;
+int tainted;
+
+EXPORT_SYMBOL(panic_timeout);
+
+struct notifier_block *panic_notifier_list;
+
+EXPORT_SYMBOL(panic_notifier_list);
+
+static int __init panic_setup(char *str)
+{
+	panic_timeout = simple_strtoul(str, NULL, 0);
+	return 1;
+}
+__setup("panic=", panic_setup);
+
+static long no_blink(long time)
+{
+	return 0;
+}
+
+/* Returns how long it waited in ms */
+long (*panic_blink)(long time);
+EXPORT_SYMBOL(panic_blink);
+
+/**
+ *	panic - halt the system
+ *	@fmt: The text string to print
+ *
+ *	Display a message, then perform cleanups.
+ *
+ *	This function never returns.
+ */
+ 
+NORET_TYPE void panic(const char * fmt, ...)
+{
+	long i;
+	static char buf[1024];
+	va_list args;
+#if defined(CONFIG_ARCH_S390)
+        unsigned long caller = (unsigned long) __builtin_return_address(0);
+#endif
+
+	bust_spinlocks(1);
+	va_start(args, fmt);
+	vsnprintf(buf, sizeof(buf), fmt, args);
+	va_end(args);
+	printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf);
+	bust_spinlocks(0);
+
+#ifdef CONFIG_SMP
+	smp_send_stop();
+#endif
+
+	notifier_call_chain(&panic_notifier_list, 0, buf);
+
+	if (!panic_blink)
+		panic_blink = no_blink;
+
+	if (panic_timeout > 0)
+	{
+		/*
+	 	 * Delay timeout seconds before rebooting the machine. 
+		 * We can't use the "normal" timers since we just panicked..
+	 	 */
+		printk(KERN_EMERG "Rebooting in %d seconds..",panic_timeout);
+		for (i = 0; i < panic_timeout*1000; ) {
+			touch_nmi_watchdog();
+			i += panic_blink(i);
+			mdelay(1);
+			i++;
+		}
+		/*
+		 *	Should we run the reboot notifier. For the moment Im
+		 *	choosing not too. It might crash, be corrupt or do
+		 *	more harm than good for other reasons.
+		 */
+		machine_restart(NULL);
+	}
+#ifdef __sparc__
+	{
+		extern int stop_a_enabled;
+		/* Make sure the user can actually press L1-A */
+		stop_a_enabled = 1;
+		printk(KERN_EMERG "Press L1-A to return to the boot prom\n");
+	}
+#endif
+#if defined(CONFIG_ARCH_S390)
+        disabled_wait(caller);
+#endif
+	local_irq_enable();
+	for (i = 0;;) {
+		i += panic_blink(i);
+		mdelay(1);
+		i++;
+	}
+}
+
+EXPORT_SYMBOL(panic);
+
+/**
+ *	print_tainted - return a string to represent the kernel taint state.
+ *
+ *  'P' - Proprietary module has been loaded.
+ *  'F' - Module has been forcibly loaded.
+ *  'S' - SMP with CPUs not designed for SMP.
+ *  'R' - User forced a module unload.
+ *  'M' - Machine had a machine check experience.
+ *  'B' - System has hit bad_page.
+ *
+ *	The string is overwritten by the next call to print_taint().
+ */
+ 
+const char *print_tainted(void)
+{
+	static char buf[20];
+	if (tainted) {
+		snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c",
+			tainted & TAINT_PROPRIETARY_MODULE ? 'P' : 'G',
+			tainted & TAINT_FORCED_MODULE ? 'F' : ' ',
+			tainted & TAINT_UNSAFE_SMP ? 'S' : ' ',
+			tainted & TAINT_FORCED_RMMOD ? 'R' : ' ',
+ 			tainted & TAINT_MACHINE_CHECK ? 'M' : ' ',
+			tainted & TAINT_BAD_PAGE ? 'B' : ' ');
+	}
+	else
+		snprintf(buf, sizeof(buf), "Not tainted");
+	return(buf);
+}
+
+void add_taint(unsigned flag)
+{
+	tainted |= flag;
+}
+EXPORT_SYMBOL(add_taint);
diff --git a/kernel/params.c b/kernel/params.c
new file mode 100644
index 000000000000..5538608bd339
--- /dev/null
+++ b/kernel/params.c
@@ -0,0 +1,721 @@
+/* Helpers for initial module or kernel cmdline parsing
+   Copyright (C) 2001 Rusty Russell.
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+*/
+#include <linux/config.h>
+#include <linux/moduleparam.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/err.h>
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(fmt, a...)
+#endif
+
+static inline int dash2underscore(char c)
+{
+	if (c == '-')
+		return '_';
+	return c;
+}
+
+static inline int parameq(const char *input, const char *paramname)
+{
+	unsigned int i;
+	for (i = 0; dash2underscore(input[i]) == paramname[i]; i++)
+		if (input[i] == '\0')
+			return 1;
+	return 0;
+}
+
+static int parse_one(char *param,
+		     char *val,
+		     struct kernel_param *params, 
+		     unsigned num_params,
+		     int (*handle_unknown)(char *param, char *val))
+{
+	unsigned int i;
+
+	/* Find parameter */
+	for (i = 0; i < num_params; i++) {
+		if (parameq(param, params[i].name)) {
+			DEBUGP("They are equal!  Calling %p\n",
+			       params[i].set);
+			return params[i].set(val, &params[i]);
+		}
+	}
+
+	if (handle_unknown) {
+		DEBUGP("Unknown argument: calling %p\n", handle_unknown);
+		return handle_unknown(param, val);
+	}
+
+	DEBUGP("Unknown argument `%s'\n", param);
+	return -ENOENT;
+}
+
+/* You can use " around spaces, but can't escape ". */
+/* Hyphens and underscores equivalent in parameter names. */
+static char *next_arg(char *args, char **param, char **val)
+{
+	unsigned int i, equals = 0;
+	int in_quote = 0, quoted = 0;
+	char *next;
+
+	/* Chew any extra spaces */
+	while (*args == ' ') args++;
+	if (*args == '"') {
+		args++;
+		in_quote = 1;
+		quoted = 1;
+	}
+
+	for (i = 0; args[i]; i++) {
+		if (args[i] == ' ' && !in_quote)
+			break;
+		if (equals == 0) {
+			if (args[i] == '=')
+				equals = i;
+		}
+		if (args[i] == '"')
+			in_quote = !in_quote;
+	}
+
+	*param = args;
+	if (!equals)
+		*val = NULL;
+	else {
+		args[equals] = '\0';
+		*val = args + equals + 1;
+
+		/* Don't include quotes in value. */
+		if (**val == '"') {
+			(*val)++;
+			if (args[i-1] == '"')
+				args[i-1] = '\0';
+		}
+		if (quoted && args[i-1] == '"')
+			args[i-1] = '\0';
+	}
+
+	if (args[i]) {
+		args[i] = '\0';
+		next = args + i + 1;
+	} else
+		next = args + i;
+	return next;
+}
+
+/* Args looks like "foo=bar,bar2 baz=fuz wiz". */
+int parse_args(const char *name,
+	       char *args,
+	       struct kernel_param *params,
+	       unsigned num,
+	       int (*unknown)(char *param, char *val))
+{
+	char *param, *val;
+
+	DEBUGP("Parsing ARGS: %s\n", args);
+
+	while (*args) {
+		int ret;
+
+		args = next_arg(args, &param, &val);
+		ret = parse_one(param, val, params, num, unknown);
+		switch (ret) {
+		case -ENOENT:
+			printk(KERN_ERR "%s: Unknown parameter `%s'\n",
+			       name, param);
+			return ret;
+		case -ENOSPC:
+			printk(KERN_ERR
+			       "%s: `%s' too large for parameter `%s'\n",
+			       name, val ?: "", param);
+			return ret;
+		case 0:
+			break;
+		default:
+			printk(KERN_ERR
+			       "%s: `%s' invalid for parameter `%s'\n",
+			       name, val ?: "", param);
+			return ret;
+		}
+	}
+
+	/* All parsed OK. */
+	return 0;
+}
+
+/* Lazy bastard, eh? */
+#define STANDARD_PARAM_DEF(name, type, format, tmptype, strtolfn)      	\
+	int param_set_##name(const char *val, struct kernel_param *kp)	\
+	{								\
+		char *endp;						\
+		tmptype l;						\
+									\
+		if (!val) return -EINVAL;				\
+		l = strtolfn(val, &endp, 0);				\
+		if (endp == val || ((type)l != l))			\
+			return -EINVAL;					\
+		*((type *)kp->arg) = l;					\
+		return 0;						\
+	}								\
+	int param_get_##name(char *buffer, struct kernel_param *kp)	\
+	{								\
+		return sprintf(buffer, format, *((type *)kp->arg));	\
+	}
+
+STANDARD_PARAM_DEF(byte, unsigned char, "%c", unsigned long, simple_strtoul);
+STANDARD_PARAM_DEF(short, short, "%hi", long, simple_strtol);
+STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", unsigned long, simple_strtoul);
+STANDARD_PARAM_DEF(int, int, "%i", long, simple_strtol);
+STANDARD_PARAM_DEF(uint, unsigned int, "%u", unsigned long, simple_strtoul);
+STANDARD_PARAM_DEF(long, long, "%li", long, simple_strtol);
+STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, simple_strtoul);
+
+int param_set_charp(const char *val, struct kernel_param *kp)
+{
+	if (!val) {
+		printk(KERN_ERR "%s: string parameter expected\n",
+		       kp->name);
+		return -EINVAL;
+	}
+
+	if (strlen(val) > 1024) {
+		printk(KERN_ERR "%s: string parameter too long\n",
+		       kp->name);
+		return -ENOSPC;
+	}
+
+	*(char **)kp->arg = (char *)val;
+	return 0;
+}
+
+int param_get_charp(char *buffer, struct kernel_param *kp)
+{
+	return sprintf(buffer, "%s", *((char **)kp->arg));
+}
+
+int param_set_bool(const char *val, struct kernel_param *kp)
+{
+	/* No equals means "set"... */
+	if (!val) val = "1";
+
+	/* One of =[yYnN01] */
+	switch (val[0]) {
+	case 'y': case 'Y': case '1':
+		*(int *)kp->arg = 1;
+		return 0;
+	case 'n': case 'N': case '0':
+		*(int *)kp->arg = 0;
+		return 0;
+	}
+	return -EINVAL;
+}
+
+int param_get_bool(char *buffer, struct kernel_param *kp)
+{
+	/* Y and N chosen as being relatively non-coder friendly */
+	return sprintf(buffer, "%c", (*(int *)kp->arg) ? 'Y' : 'N');
+}
+
+int param_set_invbool(const char *val, struct kernel_param *kp)
+{
+	int boolval, ret;
+	struct kernel_param dummy = { .arg = &boolval };
+
+	ret = param_set_bool(val, &dummy);
+	if (ret == 0)
+		*(int *)kp->arg = !boolval;
+	return ret;
+}
+
+int param_get_invbool(char *buffer, struct kernel_param *kp)
+{
+	int val;
+	struct kernel_param dummy = { .arg = &val };
+
+	val = !*(int *)kp->arg;
+	return param_get_bool(buffer, &dummy);
+}
+
+/* We cheat here and temporarily mangle the string. */
+int param_array(const char *name,
+		const char *val,
+		unsigned int min, unsigned int max,
+		void *elem, int elemsize,
+		int (*set)(const char *, struct kernel_param *kp),
+		int *num)
+{
+	int ret;
+	struct kernel_param kp;
+	char save;
+
+	/* Get the name right for errors. */
+	kp.name = name;
+	kp.arg = elem;
+
+	/* No equals sign? */
+	if (!val) {
+		printk(KERN_ERR "%s: expects arguments\n", name);
+		return -EINVAL;
+	}
+
+	*num = 0;
+	/* We expect a comma-separated list of values. */
+	do {
+		int len;
+
+		if (*num == max) {
+			printk(KERN_ERR "%s: can only take %i arguments\n",
+			       name, max);
+			return -EINVAL;
+		}
+		len = strcspn(val, ",");
+
+		/* nul-terminate and parse */
+		save = val[len];
+		((char *)val)[len] = '\0';
+		ret = set(val, &kp);
+
+		if (ret != 0)
+			return ret;
+		kp.arg += elemsize;
+		val += len+1;
+		(*num)++;
+	} while (save == ',');
+
+	if (*num < min) {
+		printk(KERN_ERR "%s: needs at least %i arguments\n",
+		       name, min);
+		return -EINVAL;
+	}
+	return 0;
+}
+
+int param_array_set(const char *val, struct kernel_param *kp)
+{
+	struct kparam_array *arr = kp->arg;
+
+	return param_array(kp->name, val, 1, arr->max, arr->elem,
+			   arr->elemsize, arr->set, arr->num ?: &arr->max);
+}
+
+int param_array_get(char *buffer, struct kernel_param *kp)
+{
+	int i, off, ret;
+	struct kparam_array *arr = kp->arg;
+	struct kernel_param p;
+
+	p = *kp;
+	for (i = off = 0; i < (arr->num ? *arr->num : arr->max); i++) {
+		if (i)
+			buffer[off++] = ',';
+		p.arg = arr->elem + arr->elemsize * i;
+		ret = arr->get(buffer + off, &p);
+		if (ret < 0)
+			return ret;
+		off += ret;
+	}
+	buffer[off] = '\0';
+	return off;
+}
+
+int param_set_copystring(const char *val, struct kernel_param *kp)
+{
+	struct kparam_string *kps = kp->arg;
+
+	if (strlen(val)+1 > kps->maxlen) {
+		printk(KERN_ERR "%s: string doesn't fit in %u chars.\n",
+		       kp->name, kps->maxlen-1);
+		return -ENOSPC;
+	}
+	strcpy(kps->string, val);
+	return 0;
+}
+
+int param_get_string(char *buffer, struct kernel_param *kp)
+{
+	struct kparam_string *kps = kp->arg;
+	return strlcpy(buffer, kps->string, kps->maxlen);
+}
+
+/* sysfs output in /sys/modules/XYZ/parameters/ */
+
+extern struct kernel_param __start___param[], __stop___param[];
+
+#define MAX_KBUILD_MODNAME KOBJ_NAME_LEN
+
+struct param_attribute
+{
+	struct module_attribute mattr;
+	struct kernel_param *param;
+};
+
+struct module_param_attrs
+{
+	struct attribute_group grp;
+	struct param_attribute attrs[0];
+};
+
+#define to_param_attr(n) container_of(n, struct param_attribute, mattr);
+
+static ssize_t param_attr_show(struct module_attribute *mattr,
+			       struct module *mod, char *buf)
+{
+	int count;
+	struct param_attribute *attribute = to_param_attr(mattr);
+
+	if (!attribute->param->get)
+		return -EPERM;
+
+	count = attribute->param->get(buf, attribute->param);
+	if (count > 0) {
+		strcat(buf, "\n");
+		++count;
+	}
+	return count;
+}
+
+/* sysfs always hands a nul-terminated string in buf.  We rely on that. */
+static ssize_t param_attr_store(struct module_attribute *mattr,
+				struct module *owner,
+				const char *buf, size_t len)
+{
+ 	int err;
+	struct param_attribute *attribute = to_param_attr(mattr);
+
+	if (!attribute->param->set)
+		return -EPERM;
+
+	err = attribute->param->set(buf, attribute->param);
+	if (!err)
+		return len;
+	return err;
+}
+
+#ifdef CONFIG_MODULES
+#define __modinit
+#else
+#define __modinit __init
+#endif
+
+/*
+ * param_sysfs_setup - setup sysfs support for one module or KBUILD_MODNAME
+ * @mk: struct module_kobject (contains parent kobject)
+ * @kparam: array of struct kernel_param, the actual parameter definitions
+ * @num_params: number of entries in array
+ * @name_skip: offset where the parameter name start in kparam[].name. Needed for built-in "modules"
+ *
+ * Create a kobject for a (per-module) group of parameters, and create files
+ * in sysfs. A pointer to the param_kobject is returned on success,
+ * NULL if there's no parameter to export, or other ERR_PTR(err).
+ */
+static __modinit struct module_param_attrs *
+param_sysfs_setup(struct module_kobject *mk,
+		  struct kernel_param *kparam,
+		  unsigned int num_params,
+		  unsigned int name_skip)
+{
+	struct module_param_attrs *mp;
+	unsigned int valid_attrs = 0;
+	unsigned int i, size[2];
+	struct param_attribute *pattr;
+	struct attribute **gattr;
+	int err;
+
+	for (i=0; i<num_params; i++) {
+		if (kparam[i].perm)
+			valid_attrs++;
+	}
+
+	if (!valid_attrs)
+		return NULL;
+
+	size[0] = ALIGN(sizeof(*mp) +
+			valid_attrs * sizeof(mp->attrs[0]),
+			sizeof(mp->grp.attrs[0]));
+	size[1] = (valid_attrs + 1) * sizeof(mp->grp.attrs[0]);
+
+	mp = kmalloc(size[0] + size[1], GFP_KERNEL);
+	if (!mp)
+		return ERR_PTR(-ENOMEM);
+
+	mp->grp.name = "parameters";
+	mp->grp.attrs = (void *)mp + size[0];
+
+	pattr = &mp->attrs[0];
+	gattr = &mp->grp.attrs[0];
+	for (i = 0; i < num_params; i++) {
+		struct kernel_param *kp = &kparam[i];
+		if (kp->perm) {
+			pattr->param = kp;
+			pattr->mattr.show = param_attr_show;
+			pattr->mattr.store = param_attr_store;
+			pattr->mattr.attr.name = (char *)&kp->name[name_skip];
+			pattr->mattr.attr.owner = mk->mod;
+			pattr->mattr.attr.mode = kp->perm;
+			*(gattr++) = &(pattr++)->mattr.attr;
+		}
+	}
+	*gattr = NULL;
+
+	if ((err = sysfs_create_group(&mk->kobj, &mp->grp))) {
+		kfree(mp);
+		return ERR_PTR(err);
+	}
+	return mp;
+}
+
+
+#ifdef CONFIG_MODULES
+
+/*
+ * module_param_sysfs_setup - setup sysfs support for one module
+ * @mod: module
+ * @kparam: module parameters (array)
+ * @num_params: number of module parameters
+ *
+ * Adds sysfs entries for module parameters, and creates a link from
+ * /sys/module/[mod->name]/parameters to /sys/parameters/[mod->name]/
+ */
+int module_param_sysfs_setup(struct module *mod,
+			     struct kernel_param *kparam,
+			     unsigned int num_params)
+{
+	struct module_param_attrs *mp;
+
+	mp = param_sysfs_setup(&mod->mkobj, kparam, num_params, 0);
+	if (IS_ERR(mp))
+		return PTR_ERR(mp);
+
+	mod->param_attrs = mp;
+	return 0;
+}
+
+/*
+ * module_param_sysfs_remove - remove sysfs support for one module
+ * @mod: module
+ *
+ * Remove sysfs entries for module parameters and the corresponding
+ * kobject.
+ */
+void module_param_sysfs_remove(struct module *mod)
+{
+	if (mod->param_attrs) {
+		sysfs_remove_group(&mod->mkobj.kobj,
+				   &mod->param_attrs->grp);
+		/* We are positive that no one is using any param
+		 * attrs at this point.  Deallocate immediately. */
+		kfree(mod->param_attrs);
+		mod->param_attrs = NULL;
+	}
+}
+#endif
+
+/*
+ * kernel_param_sysfs_setup - wrapper for built-in params support
+ */
+static void __init kernel_param_sysfs_setup(const char *name,
+					    struct kernel_param *kparam,
+					    unsigned int num_params,
+					    unsigned int name_skip)
+{
+	struct module_kobject *mk;
+
+	mk = kmalloc(sizeof(struct module_kobject), GFP_KERNEL);
+	memset(mk, 0, sizeof(struct module_kobject));
+
+	mk->mod = THIS_MODULE;
+	kobj_set_kset_s(mk, module_subsys);
+	kobject_set_name(&mk->kobj, name);
+	kobject_register(&mk->kobj);
+
+	/* no need to keep the kobject if no parameter is exported */
+	if (!param_sysfs_setup(mk, kparam, num_params, name_skip)) {
+		kobject_unregister(&mk->kobj);
+		kfree(mk);
+	}
+}
+
+/*
+ * param_sysfs_builtin - add contents in /sys/parameters for built-in modules
+ *
+ * Add module_parameters to sysfs for "modules" built into the kernel.
+ *
+ * The "module" name (KBUILD_MODNAME) is stored before a dot, the
+ * "parameter" name is stored behind a dot in kernel_param->name. So,
+ * extract the "module" name for all built-in kernel_param-eters,
+ * and for all who have the same, call kernel_param_sysfs_setup.
+ */
+static void __init param_sysfs_builtin(void)
+{
+	struct kernel_param *kp, *kp_begin = NULL;
+	unsigned int i, name_len, count = 0;
+	char modname[MAX_KBUILD_MODNAME + 1] = "";
+
+	for (i=0; i < __stop___param - __start___param; i++) {
+		char *dot;
+
+		kp = &__start___param[i];
+
+		/* We do not handle args without periods. */
+		dot = memchr(kp->name, '.', MAX_KBUILD_MODNAME);
+		if (!dot) {
+			DEBUGP("couldn't find period in %s\n", kp->name);
+			continue;
+		}
+		name_len = dot - kp->name;
+
+ 		/* new kbuild_modname? */
+		if (strlen(modname) != name_len
+		    || strncmp(modname, kp->name, name_len) != 0) {
+			/* add a new kobject for previous kernel_params. */
+			if (count)
+				kernel_param_sysfs_setup(modname,
+							 kp_begin,
+							 count,
+							 strlen(modname)+1);
+
+			strncpy(modname, kp->name, name_len);
+			modname[name_len] = '\0';
+			count = 0;
+			kp_begin = kp;
+		}
+		count++;
+	}
+
+	/* last kernel_params need to be registered as well */
+	if (count)
+		kernel_param_sysfs_setup(modname, kp_begin, count,
+					 strlen(modname)+1);
+}
+
+
+/* module-related sysfs stuff */
+#ifdef CONFIG_MODULES
+
+#define to_module_attr(n) container_of(n, struct module_attribute, attr);
+#define to_module_kobject(n) container_of(n, struct module_kobject, kobj);
+
+static ssize_t module_attr_show(struct kobject *kobj,
+				struct attribute *attr,
+				char *buf)
+{
+	struct module_attribute *attribute;
+	struct module_kobject *mk;
+	int ret;
+
+	attribute = to_module_attr(attr);
+	mk = to_module_kobject(kobj);
+
+	if (!attribute->show)
+		return -EPERM;
+
+	if (!try_module_get(mk->mod))
+		return -ENODEV;
+
+	ret = attribute->show(attribute, mk->mod, buf);
+
+	module_put(mk->mod);
+
+	return ret;
+}
+
+static ssize_t module_attr_store(struct kobject *kobj,
+				struct attribute *attr,
+				const char *buf, size_t len)
+{
+	struct module_attribute *attribute;
+	struct module_kobject *mk;
+	int ret;
+
+	attribute = to_module_attr(attr);
+	mk = to_module_kobject(kobj);
+
+	if (!attribute->store)
+		return -EPERM;
+
+	if (!try_module_get(mk->mod))
+		return -ENODEV;
+
+	ret = attribute->store(attribute, mk->mod, buf, len);
+
+	module_put(mk->mod);
+
+	return ret;
+}
+
+static struct sysfs_ops module_sysfs_ops = {
+	.show = module_attr_show,
+	.store = module_attr_store,
+};
+
+#else
+static struct sysfs_ops module_sysfs_ops = {
+	.show = NULL,
+	.store = NULL,
+};
+#endif
+
+static struct kobj_type module_ktype = {
+	.sysfs_ops =	&module_sysfs_ops,
+};
+
+decl_subsys(module, &module_ktype, NULL);
+
+/*
+ * param_sysfs_init - wrapper for built-in params support
+ */
+static int __init param_sysfs_init(void)
+{
+	subsystem_register(&module_subsys);
+
+	param_sysfs_builtin();
+
+	return 0;
+}
+__initcall(param_sysfs_init);
+
+EXPORT_SYMBOL(param_set_byte);
+EXPORT_SYMBOL(param_get_byte);
+EXPORT_SYMBOL(param_set_short);
+EXPORT_SYMBOL(param_get_short);
+EXPORT_SYMBOL(param_set_ushort);
+EXPORT_SYMBOL(param_get_ushort);
+EXPORT_SYMBOL(param_set_int);
+EXPORT_SYMBOL(param_get_int);
+EXPORT_SYMBOL(param_set_uint);
+EXPORT_SYMBOL(param_get_uint);
+EXPORT_SYMBOL(param_set_long);
+EXPORT_SYMBOL(param_get_long);
+EXPORT_SYMBOL(param_set_ulong);
+EXPORT_SYMBOL(param_get_ulong);
+EXPORT_SYMBOL(param_set_charp);
+EXPORT_SYMBOL(param_get_charp);
+EXPORT_SYMBOL(param_set_bool);
+EXPORT_SYMBOL(param_get_bool);
+EXPORT_SYMBOL(param_set_invbool);
+EXPORT_SYMBOL(param_get_invbool);
+EXPORT_SYMBOL(param_array_set);
+EXPORT_SYMBOL(param_array_get);
+EXPORT_SYMBOL(param_set_copystring);
+EXPORT_SYMBOL(param_get_string);
diff --git a/kernel/pid.c b/kernel/pid.c
new file mode 100644
index 000000000000..edba31c681ac
--- /dev/null
+++ b/kernel/pid.c
@@ -0,0 +1,292 @@
+/*
+ * Generic pidhash and scalable, time-bounded PID allocator
+ *
+ * (C) 2002-2003 William Irwin, IBM
+ * (C) 2004 William Irwin, Oracle
+ * (C) 2002-2004 Ingo Molnar, Red Hat
+ *
+ * pid-structures are backing objects for tasks sharing a given ID to chain
+ * against. There is very little to them aside from hashing them and
+ * parking tasks using given ID's on a list.
+ *
+ * The hash is always changed with the tasklist_lock write-acquired,
+ * and the hash is only accessed with the tasklist_lock at least
+ * read-acquired, so there's no additional SMP locking needed here.
+ *
+ * We have a list of bitmap pages, which bitmaps represent the PID space.
+ * Allocating and freeing PIDs is completely lockless. The worst-case
+ * allocation scenario when all but one out of 1 million PIDs possible are
+ * allocated already: the scanning of 32 list entries and at most PAGE_SIZE
+ * bytes. The typical fastpath is a single successful setbit. Freeing is O(1).
+ */
+
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/hash.h>
+
+#define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift)
+static struct hlist_head *pid_hash[PIDTYPE_MAX];
+static int pidhash_shift;
+
+int pid_max = PID_MAX_DEFAULT;
+int last_pid;
+
+#define RESERVED_PIDS		300
+
+int pid_max_min = RESERVED_PIDS + 1;
+int pid_max_max = PID_MAX_LIMIT;
+
+#define PIDMAP_ENTRIES		((PID_MAX_LIMIT + 8*PAGE_SIZE - 1)/PAGE_SIZE/8)
+#define BITS_PER_PAGE		(PAGE_SIZE*8)
+#define BITS_PER_PAGE_MASK	(BITS_PER_PAGE-1)
+#define mk_pid(map, off)	(((map) - pidmap_array)*BITS_PER_PAGE + (off))
+#define find_next_offset(map, off)					\
+		find_next_zero_bit((map)->page, BITS_PER_PAGE, off)
+
+/*
+ * PID-map pages start out as NULL, they get allocated upon
+ * first use and are never deallocated. This way a low pid_max
+ * value does not cause lots of bitmaps to be allocated, but
+ * the scheme scales to up to 4 million PIDs, runtime.
+ */
+typedef struct pidmap {
+	atomic_t nr_free;
+	void *page;
+} pidmap_t;
+
+static pidmap_t pidmap_array[PIDMAP_ENTRIES] =
+	 { [ 0 ... PIDMAP_ENTRIES-1 ] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } };
+
+static  __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
+
+fastcall void free_pidmap(int pid)
+{
+	pidmap_t *map = pidmap_array + pid / BITS_PER_PAGE;
+	int offset = pid & BITS_PER_PAGE_MASK;
+
+	clear_bit(offset, map->page);
+	atomic_inc(&map->nr_free);
+}
+
+int alloc_pidmap(void)
+{
+	int i, offset, max_scan, pid, last = last_pid;
+	pidmap_t *map;
+
+	pid = last + 1;
+	if (pid >= pid_max)
+		pid = RESERVED_PIDS;
+	offset = pid & BITS_PER_PAGE_MASK;
+	map = &pidmap_array[pid/BITS_PER_PAGE];
+	max_scan = (pid_max + BITS_PER_PAGE - 1)/BITS_PER_PAGE - !offset;
+	for (i = 0; i <= max_scan; ++i) {
+		if (unlikely(!map->page)) {
+			unsigned long page = get_zeroed_page(GFP_KERNEL);
+			/*
+			 * Free the page if someone raced with us
+			 * installing it:
+			 */
+			spin_lock(&pidmap_lock);
+			if (map->page)
+				free_page(page);
+			else
+				map->page = (void *)page;
+			spin_unlock(&pidmap_lock);
+			if (unlikely(!map->page))
+				break;
+		}
+		if (likely(atomic_read(&map->nr_free))) {
+			do {
+				if (!test_and_set_bit(offset, map->page)) {
+					atomic_dec(&map->nr_free);
+					last_pid = pid;
+					return pid;
+				}
+				offset = find_next_offset(map, offset);
+				pid = mk_pid(map, offset);
+			/*
+			 * find_next_offset() found a bit, the pid from it
+			 * is in-bounds, and if we fell back to the last
+			 * bitmap block and the final block was the same
+			 * as the starting point, pid is before last_pid.
+			 */
+			} while (offset < BITS_PER_PAGE && pid < pid_max &&
+					(i != max_scan || pid < last ||
+					    !((last+1) & BITS_PER_PAGE_MASK)));
+		}
+		if (map < &pidmap_array[(pid_max-1)/BITS_PER_PAGE]) {
+			++map;
+			offset = 0;
+		} else {
+			map = &pidmap_array[0];
+			offset = RESERVED_PIDS;
+			if (unlikely(last == offset))
+				break;
+		}
+		pid = mk_pid(map, offset);
+	}
+	return -1;
+}
+
+struct pid * fastcall find_pid(enum pid_type type, int nr)
+{
+	struct hlist_node *elem;
+	struct pid *pid;
+
+	hlist_for_each_entry(pid, elem,
+			&pid_hash[type][pid_hashfn(nr)], pid_chain) {
+		if (pid->nr == nr)
+			return pid;
+	}
+	return NULL;
+}
+
+int fastcall attach_pid(task_t *task, enum pid_type type, int nr)
+{
+	struct pid *pid, *task_pid;
+
+	task_pid = &task->pids[type];
+	pid = find_pid(type, nr);
+	if (pid == NULL) {
+		hlist_add_head(&task_pid->pid_chain,
+				&pid_hash[type][pid_hashfn(nr)]);
+		INIT_LIST_HEAD(&task_pid->pid_list);
+	} else {
+		INIT_HLIST_NODE(&task_pid->pid_chain);
+		list_add_tail(&task_pid->pid_list, &pid->pid_list);
+	}
+	task_pid->nr = nr;
+
+	return 0;
+}
+
+static fastcall int __detach_pid(task_t *task, enum pid_type type)
+{
+	struct pid *pid, *pid_next;
+	int nr = 0;
+
+	pid = &task->pids[type];
+	if (!hlist_unhashed(&pid->pid_chain)) {
+		hlist_del(&pid->pid_chain);
+
+		if (list_empty(&pid->pid_list))
+			nr = pid->nr;
+		else {
+			pid_next = list_entry(pid->pid_list.next,
+						struct pid, pid_list);
+			/* insert next pid from pid_list to hash */
+			hlist_add_head(&pid_next->pid_chain,
+				&pid_hash[type][pid_hashfn(pid_next->nr)]);
+		}
+	}
+
+	list_del(&pid->pid_list);
+	pid->nr = 0;
+
+	return nr;
+}
+
+void fastcall detach_pid(task_t *task, enum pid_type type)
+{
+	int tmp, nr;
+
+	nr = __detach_pid(task, type);
+	if (!nr)
+		return;
+
+	for (tmp = PIDTYPE_MAX; --tmp >= 0; )
+		if (tmp != type && find_pid(tmp, nr))
+			return;
+
+	free_pidmap(nr);
+}
+
+task_t *find_task_by_pid_type(int type, int nr)
+{
+	struct pid *pid;
+
+	pid = find_pid(type, nr);
+	if (!pid)
+		return NULL;
+
+	return pid_task(&pid->pid_list, type);
+}
+
+EXPORT_SYMBOL(find_task_by_pid_type);
+
+/*
+ * This function switches the PIDs if a non-leader thread calls
+ * sys_execve() - this must be done without releasing the PID.
+ * (which a detach_pid() would eventually do.)
+ */
+void switch_exec_pids(task_t *leader, task_t *thread)
+{
+	__detach_pid(leader, PIDTYPE_PID);
+	__detach_pid(leader, PIDTYPE_TGID);
+	__detach_pid(leader, PIDTYPE_PGID);
+	__detach_pid(leader, PIDTYPE_SID);
+
+	__detach_pid(thread, PIDTYPE_PID);
+	__detach_pid(thread, PIDTYPE_TGID);
+
+	leader->pid = leader->tgid = thread->pid;
+	thread->pid = thread->tgid;
+
+	attach_pid(thread, PIDTYPE_PID, thread->pid);
+	attach_pid(thread, PIDTYPE_TGID, thread->tgid);
+	attach_pid(thread, PIDTYPE_PGID, thread->signal->pgrp);
+	attach_pid(thread, PIDTYPE_SID, thread->signal->session);
+	list_add_tail(&thread->tasks, &init_task.tasks);
+
+	attach_pid(leader, PIDTYPE_PID, leader->pid);
+	attach_pid(leader, PIDTYPE_TGID, leader->tgid);
+	attach_pid(leader, PIDTYPE_PGID, leader->signal->pgrp);
+	attach_pid(leader, PIDTYPE_SID, leader->signal->session);
+}
+
+/*
+ * The pid hash table is scaled according to the amount of memory in the
+ * machine.  From a minimum of 16 slots up to 4096 slots at one gigabyte or
+ * more.
+ */
+void __init pidhash_init(void)
+{
+	int i, j, pidhash_size;
+	unsigned long megabytes = nr_kernel_pages >> (20 - PAGE_SHIFT);
+
+	pidhash_shift = max(4, fls(megabytes * 4));
+	pidhash_shift = min(12, pidhash_shift);
+	pidhash_size = 1 << pidhash_shift;
+
+	printk("PID hash table entries: %d (order: %d, %Zd bytes)\n",
+		pidhash_size, pidhash_shift,
+		PIDTYPE_MAX * pidhash_size * sizeof(struct hlist_head));
+
+	for (i = 0; i < PIDTYPE_MAX; i++) {
+		pid_hash[i] = alloc_bootmem(pidhash_size *
+					sizeof(*(pid_hash[i])));
+		if (!pid_hash[i])
+			panic("Could not alloc pidhash!\n");
+		for (j = 0; j < pidhash_size; j++)
+			INIT_HLIST_HEAD(&pid_hash[i][j]);
+	}
+}
+
+void __init pidmap_init(void)
+{
+	int i;
+
+	pidmap_array->page = (void *)get_zeroed_page(GFP_KERNEL);
+	set_bit(0, pidmap_array->page);
+	atomic_dec(&pidmap_array->nr_free);
+
+	/*
+	 * Allocate PID 0, and hash it via all PID types:
+	 */
+
+	for (i = 0; i < PIDTYPE_MAX; i++)
+		attach_pid(current, i, 0);
+}
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
new file mode 100644
index 000000000000..ad85d3f0dcc4
--- /dev/null
+++ b/kernel/posix-cpu-timers.c
@@ -0,0 +1,1559 @@
+/*
+ * Implement CPU time clocks for the POSIX clock interface.
+ */
+
+#include <linux/sched.h>
+#include <linux/posix-timers.h>
+#include <asm/uaccess.h>
+#include <linux/errno.h>
+
+static int check_clock(clockid_t which_clock)
+{
+	int error = 0;
+	struct task_struct *p;
+	const pid_t pid = CPUCLOCK_PID(which_clock);
+
+	if (CPUCLOCK_WHICH(which_clock) >= CPUCLOCK_MAX)
+		return -EINVAL;
+
+	if (pid == 0)
+		return 0;
+
+	read_lock(&tasklist_lock);
+	p = find_task_by_pid(pid);
+	if (!p || (CPUCLOCK_PERTHREAD(which_clock) ?
+		   p->tgid != current->tgid : p->tgid != pid)) {
+		error = -EINVAL;
+	}
+	read_unlock(&tasklist_lock);
+
+	return error;
+}
+
+static inline union cpu_time_count
+timespec_to_sample(clockid_t which_clock, const struct timespec *tp)
+{
+	union cpu_time_count ret;
+	ret.sched = 0;		/* high half always zero when .cpu used */
+	if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
+		ret.sched = tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec;
+	} else {
+		ret.cpu = timespec_to_cputime(tp);
+	}
+	return ret;
+}
+
+static void sample_to_timespec(clockid_t which_clock,
+			       union cpu_time_count cpu,
+			       struct timespec *tp)
+{
+	if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
+		tp->tv_sec = div_long_long_rem(cpu.sched,
+					       NSEC_PER_SEC, &tp->tv_nsec);
+	} else {
+		cputime_to_timespec(cpu.cpu, tp);
+	}
+}
+
+static inline int cpu_time_before(clockid_t which_clock,
+				  union cpu_time_count now,
+				  union cpu_time_count then)
+{
+	if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
+		return now.sched < then.sched;
+	}  else {
+		return cputime_lt(now.cpu, then.cpu);
+	}
+}
+static inline void cpu_time_add(clockid_t which_clock,
+				union cpu_time_count *acc,
+			        union cpu_time_count val)
+{
+	if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
+		acc->sched += val.sched;
+	}  else {
+		acc->cpu = cputime_add(acc->cpu, val.cpu);
+	}
+}
+static inline union cpu_time_count cpu_time_sub(clockid_t which_clock,
+						union cpu_time_count a,
+						union cpu_time_count b)
+{
+	if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
+		a.sched -= b.sched;
+	}  else {
+		a.cpu = cputime_sub(a.cpu, b.cpu);
+	}
+	return a;
+}
+
+/*
+ * Update expiry time from increment, and increase overrun count,
+ * given the current clock sample.
+ */
+static inline void bump_cpu_timer(struct k_itimer *timer,
+				  union cpu_time_count now)
+{
+	int i;
+
+	if (timer->it.cpu.incr.sched == 0)
+		return;
+
+	if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) {
+		unsigned long long delta, incr;
+
+		if (now.sched < timer->it.cpu.expires.sched)
+			return;
+		incr = timer->it.cpu.incr.sched;
+		delta = now.sched + incr - timer->it.cpu.expires.sched;
+		/* Don't use (incr*2 < delta), incr*2 might overflow. */
+		for (i = 0; incr < delta - incr; i++)
+			incr = incr << 1;
+		for (; i >= 0; incr >>= 1, i--) {
+			if (delta <= incr)
+				continue;
+			timer->it.cpu.expires.sched += incr;
+			timer->it_overrun += 1 << i;
+			delta -= incr;
+		}
+	} else {
+		cputime_t delta, incr;
+
+		if (cputime_lt(now.cpu, timer->it.cpu.expires.cpu))
+			return;
+		incr = timer->it.cpu.incr.cpu;
+		delta = cputime_sub(cputime_add(now.cpu, incr),
+				    timer->it.cpu.expires.cpu);
+		/* Don't use (incr*2 < delta), incr*2 might overflow. */
+		for (i = 0; cputime_lt(incr, cputime_sub(delta, incr)); i++)
+			     incr = cputime_add(incr, incr);
+		for (; i >= 0; incr = cputime_halve(incr), i--) {
+			if (cputime_le(delta, incr))
+				continue;
+			timer->it.cpu.expires.cpu =
+				cputime_add(timer->it.cpu.expires.cpu, incr);
+			timer->it_overrun += 1 << i;
+			delta = cputime_sub(delta, incr);
+		}
+	}
+}
+
+static inline cputime_t prof_ticks(struct task_struct *p)
+{
+	return cputime_add(p->utime, p->stime);
+}
+static inline cputime_t virt_ticks(struct task_struct *p)
+{
+	return p->utime;
+}
+static inline unsigned long long sched_ns(struct task_struct *p)
+{
+	return (p == current) ? current_sched_time(p) : p->sched_time;
+}
+
+int posix_cpu_clock_getres(clockid_t which_clock, struct timespec *tp)
+{
+	int error = check_clock(which_clock);
+	if (!error) {
+		tp->tv_sec = 0;
+		tp->tv_nsec = ((NSEC_PER_SEC + HZ - 1) / HZ);
+		if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
+			/*
+			 * If sched_clock is using a cycle counter, we
+			 * don't have any idea of its true resolution
+			 * exported, but it is much more than 1s/HZ.
+			 */
+			tp->tv_nsec = 1;
+		}
+	}
+	return error;
+}
+
+int posix_cpu_clock_set(clockid_t which_clock, const struct timespec *tp)
+{
+	/*
+	 * You can never reset a CPU clock, but we check for other errors
+	 * in the call before failing with EPERM.
+	 */
+	int error = check_clock(which_clock);
+	if (error == 0) {
+		error = -EPERM;
+	}
+	return error;
+}
+
+
+/*
+ * Sample a per-thread clock for the given task.
+ */
+static int cpu_clock_sample(clockid_t which_clock, struct task_struct *p,
+			    union cpu_time_count *cpu)
+{
+	switch (CPUCLOCK_WHICH(which_clock)) {
+	default:
+		return -EINVAL;
+	case CPUCLOCK_PROF:
+		cpu->cpu = prof_ticks(p);
+		break;
+	case CPUCLOCK_VIRT:
+		cpu->cpu = virt_ticks(p);
+		break;
+	case CPUCLOCK_SCHED:
+		cpu->sched = sched_ns(p);
+		break;
+	}
+	return 0;
+}
+
+/*
+ * Sample a process (thread group) clock for the given group_leader task.
+ * Must be called with tasklist_lock held for reading.
+ * Must be called with tasklist_lock held for reading, and p->sighand->siglock.
+ */
+static int cpu_clock_sample_group_locked(unsigned int clock_idx,
+					 struct task_struct *p,
+					 union cpu_time_count *cpu)
+{
+	struct task_struct *t = p;
+ 	switch (clock_idx) {
+	default:
+		return -EINVAL;
+	case CPUCLOCK_PROF:
+		cpu->cpu = cputime_add(p->signal->utime, p->signal->stime);
+		do {
+			cpu->cpu = cputime_add(cpu->cpu, prof_ticks(t));
+			t = next_thread(t);
+		} while (t != p);
+		break;
+	case CPUCLOCK_VIRT:
+		cpu->cpu = p->signal->utime;
+		do {
+			cpu->cpu = cputime_add(cpu->cpu, virt_ticks(t));
+			t = next_thread(t);
+		} while (t != p);
+		break;
+	case CPUCLOCK_SCHED:
+		cpu->sched = p->signal->sched_time;
+		/* Add in each other live thread.  */
+		while ((t = next_thread(t)) != p) {
+			cpu->sched += t->sched_time;
+		}
+		if (p->tgid == current->tgid) {
+			/*
+			 * We're sampling ourselves, so include the
+			 * cycles not yet banked.  We still omit
+			 * other threads running on other CPUs,
+			 * so the total can always be behind as
+			 * much as max(nthreads-1,ncpus) * (NSEC_PER_SEC/HZ).
+			 */
+			cpu->sched += current_sched_time(current);
+		} else {
+			cpu->sched += p->sched_time;
+		}
+		break;
+	}
+	return 0;
+}
+
+/*
+ * Sample a process (thread group) clock for the given group_leader task.
+ * Must be called with tasklist_lock held for reading.
+ */
+static int cpu_clock_sample_group(clockid_t which_clock,
+				  struct task_struct *p,
+				  union cpu_time_count *cpu)
+{
+	int ret;
+	unsigned long flags;
+	spin_lock_irqsave(&p->sighand->siglock, flags);
+	ret = cpu_clock_sample_group_locked(CPUCLOCK_WHICH(which_clock), p,
+					    cpu);
+	spin_unlock_irqrestore(&p->sighand->siglock, flags);
+	return ret;
+}
+
+
+int posix_cpu_clock_get(clockid_t which_clock, struct timespec *tp)
+{
+	const pid_t pid = CPUCLOCK_PID(which_clock);
+	int error = -EINVAL;
+	union cpu_time_count rtn;
+
+	if (pid == 0) {
+		/*
+		 * Special case constant value for our own clocks.
+		 * We don't have to do any lookup to find ourselves.
+		 */
+		if (CPUCLOCK_PERTHREAD(which_clock)) {
+			/*
+			 * Sampling just ourselves we can do with no locking.
+			 */
+			error = cpu_clock_sample(which_clock,
+						 current, &rtn);
+		} else {
+			read_lock(&tasklist_lock);
+			error = cpu_clock_sample_group(which_clock,
+						       current, &rtn);
+			read_unlock(&tasklist_lock);
+		}
+	} else {
+		/*
+		 * Find the given PID, and validate that the caller
+		 * should be able to see it.
+		 */
+		struct task_struct *p;
+		read_lock(&tasklist_lock);
+		p = find_task_by_pid(pid);
+		if (p) {
+			if (CPUCLOCK_PERTHREAD(which_clock)) {
+				if (p->tgid == current->tgid) {
+					error = cpu_clock_sample(which_clock,
+								 p, &rtn);
+				}
+			} else if (p->tgid == pid && p->signal) {
+				error = cpu_clock_sample_group(which_clock,
+							       p, &rtn);
+			}
+		}
+		read_unlock(&tasklist_lock);
+	}
+
+	if (error)
+		return error;
+	sample_to_timespec(which_clock, rtn, tp);
+	return 0;
+}
+
+
+/*
+ * Validate the clockid_t for a new CPU-clock timer, and initialize the timer.
+ * This is called from sys_timer_create with the new timer already locked.
+ */
+int posix_cpu_timer_create(struct k_itimer *new_timer)
+{
+	int ret = 0;
+	const pid_t pid = CPUCLOCK_PID(new_timer->it_clock);
+	struct task_struct *p;
+
+	if (CPUCLOCK_WHICH(new_timer->it_clock) >= CPUCLOCK_MAX)
+		return -EINVAL;
+
+	INIT_LIST_HEAD(&new_timer->it.cpu.entry);
+	new_timer->it.cpu.incr.sched = 0;
+	new_timer->it.cpu.expires.sched = 0;
+
+	read_lock(&tasklist_lock);
+	if (CPUCLOCK_PERTHREAD(new_timer->it_clock)) {
+		if (pid == 0) {
+			p = current;
+		} else {
+			p = find_task_by_pid(pid);
+			if (p && p->tgid != current->tgid)
+				p = NULL;
+		}
+	} else {
+		if (pid == 0) {
+			p = current->group_leader;
+		} else {
+			p = find_task_by_pid(pid);
+			if (p && p->tgid != pid)
+				p = NULL;
+		}
+	}
+	new_timer->it.cpu.task = p;
+	if (p) {
+		get_task_struct(p);
+	} else {
+		ret = -EINVAL;
+	}
+	read_unlock(&tasklist_lock);
+
+	return ret;
+}
+
+/*
+ * Clean up a CPU-clock timer that is about to be destroyed.
+ * This is called from timer deletion with the timer already locked.
+ * If we return TIMER_RETRY, it's necessary to release the timer's lock
+ * and try again.  (This happens when the timer is in the middle of firing.)
+ */
+int posix_cpu_timer_del(struct k_itimer *timer)
+{
+	struct task_struct *p = timer->it.cpu.task;
+
+	if (timer->it.cpu.firing)
+		return TIMER_RETRY;
+
+	if (unlikely(p == NULL))
+		return 0;
+
+	if (!list_empty(&timer->it.cpu.entry)) {
+		read_lock(&tasklist_lock);
+		if (unlikely(p->signal == NULL)) {
+			/*
+			 * We raced with the reaping of the task.
+			 * The deletion should have cleared us off the list.
+			 */
+			BUG_ON(!list_empty(&timer->it.cpu.entry));
+		} else {
+			/*
+			 * Take us off the task's timer list.
+			 */
+			spin_lock(&p->sighand->siglock);
+			list_del(&timer->it.cpu.entry);
+			spin_unlock(&p->sighand->siglock);
+		}
+		read_unlock(&tasklist_lock);
+	}
+	put_task_struct(p);
+
+	return 0;
+}
+
+/*
+ * Clean out CPU timers still ticking when a thread exited.  The task
+ * pointer is cleared, and the expiry time is replaced with the residual
+ * time for later timer_gettime calls to return.
+ * This must be called with the siglock held.
+ */
+static void cleanup_timers(struct list_head *head,
+			   cputime_t utime, cputime_t stime,
+			   unsigned long long sched_time)
+{
+	struct cpu_timer_list *timer, *next;
+	cputime_t ptime = cputime_add(utime, stime);
+
+	list_for_each_entry_safe(timer, next, head, entry) {
+		timer->task = NULL;
+		list_del_init(&timer->entry);
+		if (cputime_lt(timer->expires.cpu, ptime)) {
+			timer->expires.cpu = cputime_zero;
+		} else {
+			timer->expires.cpu = cputime_sub(timer->expires.cpu,
+							 ptime);
+		}
+	}
+
+	++head;
+	list_for_each_entry_safe(timer, next, head, entry) {
+		timer->task = NULL;
+		list_del_init(&timer->entry);
+		if (cputime_lt(timer->expires.cpu, utime)) {
+			timer->expires.cpu = cputime_zero;
+		} else {
+			timer->expires.cpu = cputime_sub(timer->expires.cpu,
+							 utime);
+		}
+	}
+
+	++head;
+	list_for_each_entry_safe(timer, next, head, entry) {
+		timer->task = NULL;
+		list_del_init(&timer->entry);
+		if (timer->expires.sched < sched_time) {
+			timer->expires.sched = 0;
+		} else {
+			timer->expires.sched -= sched_time;
+		}
+	}
+}
+
+/*
+ * These are both called with the siglock held, when the current thread
+ * is being reaped.  When the final (leader) thread in the group is reaped,
+ * posix_cpu_timers_exit_group will be called after posix_cpu_timers_exit.
+ */
+void posix_cpu_timers_exit(struct task_struct *tsk)
+{
+	cleanup_timers(tsk->cpu_timers,
+		       tsk->utime, tsk->stime, tsk->sched_time);
+
+}
+void posix_cpu_timers_exit_group(struct task_struct *tsk)
+{
+	cleanup_timers(tsk->signal->cpu_timers,
+		       cputime_add(tsk->utime, tsk->signal->utime),
+		       cputime_add(tsk->stime, tsk->signal->stime),
+		       tsk->sched_time + tsk->signal->sched_time);
+}
+
+
+/*
+ * Set the expiry times of all the threads in the process so one of them
+ * will go off before the process cumulative expiry total is reached.
+ */
+static void process_timer_rebalance(struct task_struct *p,
+				    unsigned int clock_idx,
+				    union cpu_time_count expires,
+				    union cpu_time_count val)
+{
+	cputime_t ticks, left;
+	unsigned long long ns, nsleft;
+ 	struct task_struct *t = p;
+	unsigned int nthreads = atomic_read(&p->signal->live);
+
+	switch (clock_idx) {
+	default:
+		BUG();
+		break;
+	case CPUCLOCK_PROF:
+		left = cputime_div(cputime_sub(expires.cpu, val.cpu),
+				   nthreads);
+		do {
+			if (!unlikely(t->exit_state)) {
+				ticks = cputime_add(prof_ticks(t), left);
+				if (cputime_eq(t->it_prof_expires,
+					       cputime_zero) ||
+				    cputime_gt(t->it_prof_expires, ticks)) {
+					t->it_prof_expires = ticks;
+				}
+			}
+			t = next_thread(t);
+		} while (t != p);
+		break;
+	case CPUCLOCK_VIRT:
+		left = cputime_div(cputime_sub(expires.cpu, val.cpu),
+				   nthreads);
+		do {
+			if (!unlikely(t->exit_state)) {
+				ticks = cputime_add(virt_ticks(t), left);
+				if (cputime_eq(t->it_virt_expires,
+					       cputime_zero) ||
+				    cputime_gt(t->it_virt_expires, ticks)) {
+					t->it_virt_expires = ticks;
+				}
+			}
+			t = next_thread(t);
+		} while (t != p);
+		break;
+	case CPUCLOCK_SCHED:
+		nsleft = expires.sched - val.sched;
+		do_div(nsleft, nthreads);
+		do {
+			if (!unlikely(t->exit_state)) {
+				ns = t->sched_time + nsleft;
+				if (t->it_sched_expires == 0 ||
+				    t->it_sched_expires > ns) {
+					t->it_sched_expires = ns;
+				}
+			}
+			t = next_thread(t);
+		} while (t != p);
+		break;
+	}
+}
+
+static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now)
+{
+	/*
+	 * That's all for this thread or process.
+	 * We leave our residual in expires to be reported.
+	 */
+	put_task_struct(timer->it.cpu.task);
+	timer->it.cpu.task = NULL;
+	timer->it.cpu.expires = cpu_time_sub(timer->it_clock,
+					     timer->it.cpu.expires,
+					     now);
+}
+
+/*
+ * Insert the timer on the appropriate list before any timers that
+ * expire later.  This must be called with the tasklist_lock held
+ * for reading, and interrupts disabled.
+ */
+static void arm_timer(struct k_itimer *timer, union cpu_time_count now)
+{
+	struct task_struct *p = timer->it.cpu.task;
+	struct list_head *head, *listpos;
+	struct cpu_timer_list *const nt = &timer->it.cpu;
+	struct cpu_timer_list *next;
+	unsigned long i;
+
+	head = (CPUCLOCK_PERTHREAD(timer->it_clock) ?
+		p->cpu_timers : p->signal->cpu_timers);
+	head += CPUCLOCK_WHICH(timer->it_clock);
+
+	BUG_ON(!irqs_disabled());
+	spin_lock(&p->sighand->siglock);
+
+	listpos = head;
+	if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) {
+		list_for_each_entry(next, head, entry) {
+			if (next->expires.sched > nt->expires.sched) {
+				listpos = &next->entry;
+				break;
+			}
+		}
+	} else {
+		list_for_each_entry(next, head, entry) {
+			if (cputime_gt(next->expires.cpu, nt->expires.cpu)) {
+				listpos = &next->entry;
+				break;
+			}
+		}
+	}
+	list_add(&nt->entry, listpos);
+
+	if (listpos == head) {
+		/*
+		 * We are the new earliest-expiring timer.
+		 * If we are a thread timer, there can always
+		 * be a process timer telling us to stop earlier.
+		 */
+
+		if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
+			switch (CPUCLOCK_WHICH(timer->it_clock)) {
+			default:
+				BUG();
+			case CPUCLOCK_PROF:
+				if (cputime_eq(p->it_prof_expires,
+					       cputime_zero) ||
+				    cputime_gt(p->it_prof_expires,
+					       nt->expires.cpu))
+					p->it_prof_expires = nt->expires.cpu;
+				break;
+			case CPUCLOCK_VIRT:
+				if (cputime_eq(p->it_virt_expires,
+					       cputime_zero) ||
+				    cputime_gt(p->it_virt_expires,
+					       nt->expires.cpu))
+					p->it_virt_expires = nt->expires.cpu;
+				break;
+			case CPUCLOCK_SCHED:
+				if (p->it_sched_expires == 0 ||
+				    p->it_sched_expires > nt->expires.sched)
+					p->it_sched_expires = nt->expires.sched;
+				break;
+			}
+		} else {
+			/*
+			 * For a process timer, we must balance
+			 * all the live threads' expirations.
+			 */
+			switch (CPUCLOCK_WHICH(timer->it_clock)) {
+			default:
+				BUG();
+			case CPUCLOCK_VIRT:
+				if (!cputime_eq(p->signal->it_virt_expires,
+						cputime_zero) &&
+				    cputime_lt(p->signal->it_virt_expires,
+					       timer->it.cpu.expires.cpu))
+					break;
+				goto rebalance;
+			case CPUCLOCK_PROF:
+				if (!cputime_eq(p->signal->it_prof_expires,
+						cputime_zero) &&
+				    cputime_lt(p->signal->it_prof_expires,
+					       timer->it.cpu.expires.cpu))
+					break;
+				i = p->signal->rlim[RLIMIT_CPU].rlim_cur;
+				if (i != RLIM_INFINITY &&
+				    i <= cputime_to_secs(timer->it.cpu.expires.cpu))
+					break;
+				goto rebalance;
+			case CPUCLOCK_SCHED:
+			rebalance:
+				process_timer_rebalance(
+					timer->it.cpu.task,
+					CPUCLOCK_WHICH(timer->it_clock),
+					timer->it.cpu.expires, now);
+				break;
+			}
+		}
+	}
+
+	spin_unlock(&p->sighand->siglock);
+}
+
+/*
+ * The timer is locked, fire it and arrange for its reload.
+ */
+static void cpu_timer_fire(struct k_itimer *timer)
+{
+	if (unlikely(timer->sigq == NULL)) {
+		/*
+		 * This a special case for clock_nanosleep,
+		 * not a normal timer from sys_timer_create.
+		 */
+		wake_up_process(timer->it_process);
+		timer->it.cpu.expires.sched = 0;
+	} else if (timer->it.cpu.incr.sched == 0) {
+		/*
+		 * One-shot timer.  Clear it as soon as it's fired.
+		 */
+		posix_timer_event(timer, 0);
+		timer->it.cpu.expires.sched = 0;
+	} else if (posix_timer_event(timer, ++timer->it_requeue_pending)) {
+		/*
+		 * The signal did not get queued because the signal
+		 * was ignored, so we won't get any callback to
+		 * reload the timer.  But we need to keep it
+		 * ticking in case the signal is deliverable next time.
+		 */
+		posix_cpu_timer_schedule(timer);
+	}
+}
+
+/*
+ * Guts of sys_timer_settime for CPU timers.
+ * This is called with the timer locked and interrupts disabled.
+ * If we return TIMER_RETRY, it's necessary to release the timer's lock
+ * and try again.  (This happens when the timer is in the middle of firing.)
+ */
+int posix_cpu_timer_set(struct k_itimer *timer, int flags,
+			struct itimerspec *new, struct itimerspec *old)
+{
+	struct task_struct *p = timer->it.cpu.task;
+	union cpu_time_count old_expires, new_expires, val;
+	int ret;
+
+	if (unlikely(p == NULL)) {
+		/*
+		 * Timer refers to a dead task's clock.
+		 */
+		return -ESRCH;
+	}
+
+	new_expires = timespec_to_sample(timer->it_clock, &new->it_value);
+
+	read_lock(&tasklist_lock);
+	/*
+	 * We need the tasklist_lock to protect against reaping that
+	 * clears p->signal.  If p has just been reaped, we can no
+	 * longer get any information about it at all.
+	 */
+	if (unlikely(p->signal == NULL)) {
+		read_unlock(&tasklist_lock);
+		put_task_struct(p);
+		timer->it.cpu.task = NULL;
+		return -ESRCH;
+	}
+
+	/*
+	 * Disarm any old timer after extracting its expiry time.
+	 */
+	BUG_ON(!irqs_disabled());
+	spin_lock(&p->sighand->siglock);
+	old_expires = timer->it.cpu.expires;
+	list_del_init(&timer->it.cpu.entry);
+	spin_unlock(&p->sighand->siglock);
+
+	/*
+	 * We need to sample the current value to convert the new
+	 * value from to relative and absolute, and to convert the
+	 * old value from absolute to relative.  To set a process
+	 * timer, we need a sample to balance the thread expiry
+	 * times (in arm_timer).  With an absolute time, we must
+	 * check if it's already passed.  In short, we need a sample.
+	 */
+	if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
+		cpu_clock_sample(timer->it_clock, p, &val);
+	} else {
+		cpu_clock_sample_group(timer->it_clock, p, &val);
+	}
+
+	if (old) {
+		if (old_expires.sched == 0) {
+			old->it_value.tv_sec = 0;
+			old->it_value.tv_nsec = 0;
+		} else {
+			/*
+			 * Update the timer in case it has
+			 * overrun already.  If it has,
+			 * we'll report it as having overrun
+			 * and with the next reloaded timer
+			 * already ticking, though we are
+			 * swallowing that pending
+			 * notification here to install the
+			 * new setting.
+			 */
+			bump_cpu_timer(timer, val);
+			if (cpu_time_before(timer->it_clock, val,
+					    timer->it.cpu.expires)) {
+				old_expires = cpu_time_sub(
+					timer->it_clock,
+					timer->it.cpu.expires, val);
+				sample_to_timespec(timer->it_clock,
+						   old_expires,
+						   &old->it_value);
+			} else {
+				old->it_value.tv_nsec = 1;
+				old->it_value.tv_sec = 0;
+			}
+		}
+	}
+
+	if (unlikely(timer->it.cpu.firing)) {
+		/*
+		 * We are colliding with the timer actually firing.
+		 * Punt after filling in the timer's old value, and
+		 * disable this firing since we are already reporting
+		 * it as an overrun (thanks to bump_cpu_timer above).
+		 */
+		read_unlock(&tasklist_lock);
+		timer->it.cpu.firing = -1;
+		ret = TIMER_RETRY;
+		goto out;
+	}
+
+	if (new_expires.sched != 0 && !(flags & TIMER_ABSTIME)) {
+		cpu_time_add(timer->it_clock, &new_expires, val);
+	}
+
+	/*
+	 * Install the new expiry time (or zero).
+	 * For a timer with no notification action, we don't actually
+	 * arm the timer (we'll just fake it for timer_gettime).
+	 */
+	timer->it.cpu.expires = new_expires;
+	if (new_expires.sched != 0 &&
+	    (timer->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE &&
+	    cpu_time_before(timer->it_clock, val, new_expires)) {
+		arm_timer(timer, val);
+	}
+
+	read_unlock(&tasklist_lock);
+
+	/*
+	 * Install the new reload setting, and
+	 * set up the signal and overrun bookkeeping.
+	 */
+	timer->it.cpu.incr = timespec_to_sample(timer->it_clock,
+						&new->it_interval);
+
+	/*
+	 * This acts as a modification timestamp for the timer,
+	 * so any automatic reload attempt will punt on seeing
+	 * that we have reset the timer manually.
+	 */
+	timer->it_requeue_pending = (timer->it_requeue_pending + 2) &
+		~REQUEUE_PENDING;
+	timer->it_overrun_last = 0;
+	timer->it_overrun = -1;
+
+	if (new_expires.sched != 0 &&
+	    (timer->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE &&
+	    !cpu_time_before(timer->it_clock, val, new_expires)) {
+		/*
+		 * The designated time already passed, so we notify
+		 * immediately, even if the thread never runs to
+		 * accumulate more time on this clock.
+		 */
+		cpu_timer_fire(timer);
+	}
+
+	ret = 0;
+ out:
+	if (old) {
+		sample_to_timespec(timer->it_clock,
+				   timer->it.cpu.incr, &old->it_interval);
+	}
+	return ret;
+}
+
+void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
+{
+	union cpu_time_count now;
+	struct task_struct *p = timer->it.cpu.task;
+	int clear_dead;
+
+	/*
+	 * Easy part: convert the reload time.
+	 */
+	sample_to_timespec(timer->it_clock,
+			   timer->it.cpu.incr, &itp->it_interval);
+
+	if (timer->it.cpu.expires.sched == 0) {	/* Timer not armed at all.  */
+		itp->it_value.tv_sec = itp->it_value.tv_nsec = 0;
+		return;
+	}
+
+	if (unlikely(p == NULL)) {
+		/*
+		 * This task already died and the timer will never fire.
+		 * In this case, expires is actually the dead value.
+		 */
+	dead:
+		sample_to_timespec(timer->it_clock, timer->it.cpu.expires,
+				   &itp->it_value);
+		return;
+	}
+
+	/*
+	 * Sample the clock to take the difference with the expiry time.
+	 */
+	if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
+		cpu_clock_sample(timer->it_clock, p, &now);
+		clear_dead = p->exit_state;
+	} else {
+		read_lock(&tasklist_lock);
+		if (unlikely(p->signal == NULL)) {
+			/*
+			 * The process has been reaped.
+			 * We can't even collect a sample any more.
+			 * Call the timer disarmed, nothing else to do.
+			 */
+			put_task_struct(p);
+			timer->it.cpu.task = NULL;
+			timer->it.cpu.expires.sched = 0;
+			read_unlock(&tasklist_lock);
+			goto dead;
+		} else {
+			cpu_clock_sample_group(timer->it_clock, p, &now);
+			clear_dead = (unlikely(p->exit_state) &&
+				      thread_group_empty(p));
+		}
+		read_unlock(&tasklist_lock);
+	}
+
+	if ((timer->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) {
+		if (timer->it.cpu.incr.sched == 0 &&
+		    cpu_time_before(timer->it_clock,
+				    timer->it.cpu.expires, now)) {
+			/*
+			 * Do-nothing timer expired and has no reload,
+			 * so it's as if it was never set.
+			 */
+			timer->it.cpu.expires.sched = 0;
+			itp->it_value.tv_sec = itp->it_value.tv_nsec = 0;
+			return;
+		}
+		/*
+		 * Account for any expirations and reloads that should
+		 * have happened.
+		 */
+		bump_cpu_timer(timer, now);
+	}
+
+	if (unlikely(clear_dead)) {
+		/*
+		 * We've noticed that the thread is dead, but
+		 * not yet reaped.  Take this opportunity to
+		 * drop our task ref.
+		 */
+		clear_dead_task(timer, now);
+		goto dead;
+	}
+
+	if (cpu_time_before(timer->it_clock, now, timer->it.cpu.expires)) {
+		sample_to_timespec(timer->it_clock,
+				   cpu_time_sub(timer->it_clock,
+						timer->it.cpu.expires, now),
+				   &itp->it_value);
+	} else {
+		/*
+		 * The timer should have expired already, but the firing
+		 * hasn't taken place yet.  Say it's just about to expire.
+		 */
+		itp->it_value.tv_nsec = 1;
+		itp->it_value.tv_sec = 0;
+	}
+}
+
+/*
+ * Check for any per-thread CPU timers that have fired and move them off
+ * the tsk->cpu_timers[N] list onto the firing list.  Here we update the
+ * tsk->it_*_expires values to reflect the remaining thread CPU timers.
+ */
+static void check_thread_timers(struct task_struct *tsk,
+				struct list_head *firing)
+{
+	struct list_head *timers = tsk->cpu_timers;
+
+	tsk->it_prof_expires = cputime_zero;
+	while (!list_empty(timers)) {
+		struct cpu_timer_list *t = list_entry(timers->next,
+						      struct cpu_timer_list,
+						      entry);
+		if (cputime_lt(prof_ticks(tsk), t->expires.cpu)) {
+			tsk->it_prof_expires = t->expires.cpu;
+			break;
+		}
+		t->firing = 1;
+		list_move_tail(&t->entry, firing);
+	}
+
+	++timers;
+	tsk->it_virt_expires = cputime_zero;
+	while (!list_empty(timers)) {
+		struct cpu_timer_list *t = list_entry(timers->next,
+						      struct cpu_timer_list,
+						      entry);
+		if (cputime_lt(virt_ticks(tsk), t->expires.cpu)) {
+			tsk->it_virt_expires = t->expires.cpu;
+			break;
+		}
+		t->firing = 1;
+		list_move_tail(&t->entry, firing);
+	}
+
+	++timers;
+	tsk->it_sched_expires = 0;
+	while (!list_empty(timers)) {
+		struct cpu_timer_list *t = list_entry(timers->next,
+						      struct cpu_timer_list,
+						      entry);
+		if (tsk->sched_time < t->expires.sched) {
+			tsk->it_sched_expires = t->expires.sched;
+			break;
+		}
+		t->firing = 1;
+		list_move_tail(&t->entry, firing);
+	}
+}
+
+/*
+ * Check for any per-thread CPU timers that have fired and move them
+ * off the tsk->*_timers list onto the firing list.  Per-thread timers
+ * have already been taken off.
+ */
+static void check_process_timers(struct task_struct *tsk,
+				 struct list_head *firing)
+{
+	struct signal_struct *const sig = tsk->signal;
+	cputime_t utime, stime, ptime, virt_expires, prof_expires;
+	unsigned long long sched_time, sched_expires;
+	struct task_struct *t;
+	struct list_head *timers = sig->cpu_timers;
+
+	/*
+	 * Don't sample the current process CPU clocks if there are no timers.
+	 */
+	if (list_empty(&timers[CPUCLOCK_PROF]) &&
+	    cputime_eq(sig->it_prof_expires, cputime_zero) &&
+	    sig->rlim[RLIMIT_CPU].rlim_cur == RLIM_INFINITY &&
+	    list_empty(&timers[CPUCLOCK_VIRT]) &&
+	    cputime_eq(sig->it_virt_expires, cputime_zero) &&
+	    list_empty(&timers[CPUCLOCK_SCHED]))
+		return;
+
+	/*
+	 * Collect the current process totals.
+	 */
+	utime = sig->utime;
+	stime = sig->stime;
+	sched_time = sig->sched_time;
+	t = tsk;
+	do {
+		utime = cputime_add(utime, t->utime);
+		stime = cputime_add(stime, t->stime);
+		sched_time += t->sched_time;
+		t = next_thread(t);
+	} while (t != tsk);
+	ptime = cputime_add(utime, stime);
+
+	prof_expires = cputime_zero;
+	while (!list_empty(timers)) {
+		struct cpu_timer_list *t = list_entry(timers->next,
+						      struct cpu_timer_list,
+						      entry);
+		if (cputime_lt(ptime, t->expires.cpu)) {
+			prof_expires = t->expires.cpu;
+			break;
+		}
+		t->firing = 1;
+		list_move_tail(&t->entry, firing);
+	}
+
+	++timers;
+	virt_expires = cputime_zero;
+	while (!list_empty(timers)) {
+		struct cpu_timer_list *t = list_entry(timers->next,
+						      struct cpu_timer_list,
+						      entry);
+		if (cputime_lt(utime, t->expires.cpu)) {
+			virt_expires = t->expires.cpu;
+			break;
+		}
+		t->firing = 1;
+		list_move_tail(&t->entry, firing);
+	}
+
+	++timers;
+	sched_expires = 0;
+	while (!list_empty(timers)) {
+		struct cpu_timer_list *t = list_entry(timers->next,
+						      struct cpu_timer_list,
+						      entry);
+		if (sched_time < t->expires.sched) {
+			sched_expires = t->expires.sched;
+			break;
+		}
+		t->firing = 1;
+		list_move_tail(&t->entry, firing);
+	}
+
+	/*
+	 * Check for the special case process timers.
+	 */
+	if (!cputime_eq(sig->it_prof_expires, cputime_zero)) {
+		if (cputime_ge(ptime, sig->it_prof_expires)) {
+			/* ITIMER_PROF fires and reloads.  */
+			sig->it_prof_expires = sig->it_prof_incr;
+			if (!cputime_eq(sig->it_prof_expires, cputime_zero)) {
+				sig->it_prof_expires = cputime_add(
+					sig->it_prof_expires, ptime);
+			}
+			__group_send_sig_info(SIGPROF, SEND_SIG_PRIV, tsk);
+		}
+		if (!cputime_eq(sig->it_prof_expires, cputime_zero) &&
+		    (cputime_eq(prof_expires, cputime_zero) ||
+		     cputime_lt(sig->it_prof_expires, prof_expires))) {
+			prof_expires = sig->it_prof_expires;
+		}
+	}
+	if (!cputime_eq(sig->it_virt_expires, cputime_zero)) {
+		if (cputime_ge(utime, sig->it_virt_expires)) {
+			/* ITIMER_VIRTUAL fires and reloads.  */
+			sig->it_virt_expires = sig->it_virt_incr;
+			if (!cputime_eq(sig->it_virt_expires, cputime_zero)) {
+				sig->it_virt_expires = cputime_add(
+					sig->it_virt_expires, utime);
+			}
+			__group_send_sig_info(SIGVTALRM, SEND_SIG_PRIV, tsk);
+		}
+		if (!cputime_eq(sig->it_virt_expires, cputime_zero) &&
+		    (cputime_eq(virt_expires, cputime_zero) ||
+		     cputime_lt(sig->it_virt_expires, virt_expires))) {
+			virt_expires = sig->it_virt_expires;
+		}
+	}
+	if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) {
+		unsigned long psecs = cputime_to_secs(ptime);
+		cputime_t x;
+		if (psecs >= sig->rlim[RLIMIT_CPU].rlim_max) {
+			/*
+			 * At the hard limit, we just die.
+			 * No need to calculate anything else now.
+			 */
+			__group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
+			return;
+		}
+		if (psecs >= sig->rlim[RLIMIT_CPU].rlim_cur) {
+			/*
+			 * At the soft limit, send a SIGXCPU every second.
+			 */
+			__group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
+			if (sig->rlim[RLIMIT_CPU].rlim_cur
+			    < sig->rlim[RLIMIT_CPU].rlim_max) {
+				sig->rlim[RLIMIT_CPU].rlim_cur++;
+			}
+		}
+		x = secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur);
+		if (cputime_eq(prof_expires, cputime_zero) ||
+		    cputime_lt(x, prof_expires)) {
+			prof_expires = x;
+		}
+	}
+
+	if (!cputime_eq(prof_expires, cputime_zero) ||
+	    !cputime_eq(virt_expires, cputime_zero) ||
+	    sched_expires != 0) {
+		/*
+		 * Rebalance the threads' expiry times for the remaining
+		 * process CPU timers.
+		 */
+
+		cputime_t prof_left, virt_left, ticks;
+		unsigned long long sched_left, sched;
+		const unsigned int nthreads = atomic_read(&sig->live);
+
+		prof_left = cputime_sub(prof_expires, utime);
+		prof_left = cputime_sub(prof_left, stime);
+		prof_left = cputime_div(prof_left, nthreads);
+		virt_left = cputime_sub(virt_expires, utime);
+		virt_left = cputime_div(virt_left, nthreads);
+		if (sched_expires) {
+			sched_left = sched_expires - sched_time;
+			do_div(sched_left, nthreads);
+		} else {
+			sched_left = 0;
+		}
+		t = tsk;
+		do {
+			ticks = cputime_add(cputime_add(t->utime, t->stime),
+					    prof_left);
+			if (!cputime_eq(prof_expires, cputime_zero) &&
+			    (cputime_eq(t->it_prof_expires, cputime_zero) ||
+			     cputime_gt(t->it_prof_expires, ticks))) {
+				t->it_prof_expires = ticks;
+			}
+
+			ticks = cputime_add(t->utime, virt_left);
+			if (!cputime_eq(virt_expires, cputime_zero) &&
+			    (cputime_eq(t->it_virt_expires, cputime_zero) ||
+			     cputime_gt(t->it_virt_expires, ticks))) {
+				t->it_virt_expires = ticks;
+			}
+
+			sched = t->sched_time + sched_left;
+			if (sched_expires && (t->it_sched_expires == 0 ||
+					      t->it_sched_expires > sched)) {
+				t->it_sched_expires = sched;
+			}
+
+			do {
+				t = next_thread(t);
+			} while (unlikely(t->exit_state));
+		} while (t != tsk);
+	}
+}
+
+/*
+ * This is called from the signal code (via do_schedule_next_timer)
+ * when the last timer signal was delivered and we have to reload the timer.
+ */
+void posix_cpu_timer_schedule(struct k_itimer *timer)
+{
+	struct task_struct *p = timer->it.cpu.task;
+	union cpu_time_count now;
+
+	if (unlikely(p == NULL))
+		/*
+		 * The task was cleaned up already, no future firings.
+		 */
+		return;
+
+	/*
+	 * Fetch the current sample and update the timer's expiry time.
+	 */
+	if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
+		cpu_clock_sample(timer->it_clock, p, &now);
+		bump_cpu_timer(timer, now);
+		if (unlikely(p->exit_state)) {
+			clear_dead_task(timer, now);
+			return;
+		}
+		read_lock(&tasklist_lock); /* arm_timer needs it.  */
+	} else {
+		read_lock(&tasklist_lock);
+		if (unlikely(p->signal == NULL)) {
+			/*
+			 * The process has been reaped.
+			 * We can't even collect a sample any more.
+			 */
+			put_task_struct(p);
+			timer->it.cpu.task = p = NULL;
+			timer->it.cpu.expires.sched = 0;
+			read_unlock(&tasklist_lock);
+			return;
+		} else if (unlikely(p->exit_state) && thread_group_empty(p)) {
+			/*
+			 * We've noticed that the thread is dead, but
+			 * not yet reaped.  Take this opportunity to
+			 * drop our task ref.
+			 */
+			clear_dead_task(timer, now);
+			read_unlock(&tasklist_lock);
+			return;
+		}
+		cpu_clock_sample_group(timer->it_clock, p, &now);
+		bump_cpu_timer(timer, now);
+		/* Leave the tasklist_lock locked for the call below.  */
+	}
+
+	/*
+	 * Now re-arm for the new expiry time.
+	 */
+	arm_timer(timer, now);
+
+	read_unlock(&tasklist_lock);
+}
+
+/*
+ * This is called from the timer interrupt handler.  The irq handler has
+ * already updated our counts.  We need to check if any timers fire now.
+ * Interrupts are disabled.
+ */
+void run_posix_cpu_timers(struct task_struct *tsk)
+{
+	LIST_HEAD(firing);
+	struct k_itimer *timer, *next;
+
+	BUG_ON(!irqs_disabled());
+
+#define UNEXPIRED(clock) \
+		(cputime_eq(tsk->it_##clock##_expires, cputime_zero) || \
+		 cputime_lt(clock##_ticks(tsk), tsk->it_##clock##_expires))
+
+	if (UNEXPIRED(prof) && UNEXPIRED(virt) &&
+	    (tsk->it_sched_expires == 0 ||
+	     tsk->sched_time < tsk->it_sched_expires))
+		return;
+
+#undef	UNEXPIRED
+
+	BUG_ON(tsk->exit_state);
+
+	/*
+	 * Double-check with locks held.
+	 */
+	read_lock(&tasklist_lock);
+	spin_lock(&tsk->sighand->siglock);
+
+	/*
+	 * Here we take off tsk->cpu_timers[N] and tsk->signal->cpu_timers[N]
+	 * all the timers that are firing, and put them on the firing list.
+	 */
+	check_thread_timers(tsk, &firing);
+	check_process_timers(tsk, &firing);
+
+	/*
+	 * We must release these locks before taking any timer's lock.
+	 * There is a potential race with timer deletion here, as the
+	 * siglock now protects our private firing list.  We have set
+	 * the firing flag in each timer, so that a deletion attempt
+	 * that gets the timer lock before we do will give it up and
+	 * spin until we've taken care of that timer below.
+	 */
+	spin_unlock(&tsk->sighand->siglock);
+	read_unlock(&tasklist_lock);
+
+	/*
+	 * Now that all the timers on our list have the firing flag,
+	 * noone will touch their list entries but us.  We'll take
+	 * each timer's lock before clearing its firing flag, so no
+	 * timer call will interfere.
+	 */
+	list_for_each_entry_safe(timer, next, &firing, it.cpu.entry) {
+		int firing;
+		spin_lock(&timer->it_lock);
+		list_del_init(&timer->it.cpu.entry);
+		firing = timer->it.cpu.firing;
+		timer->it.cpu.firing = 0;
+		/*
+		 * The firing flag is -1 if we collided with a reset
+		 * of the timer, which already reported this
+		 * almost-firing as an overrun.  So don't generate an event.
+		 */
+		if (likely(firing >= 0)) {
+			cpu_timer_fire(timer);
+		}
+		spin_unlock(&timer->it_lock);
+	}
+}
+
+/*
+ * Set one of the process-wide special case CPU timers.
+ * The tasklist_lock and tsk->sighand->siglock must be held by the caller.
+ * The oldval argument is null for the RLIMIT_CPU timer, where *newval is
+ * absolute; non-null for ITIMER_*, where *newval is relative and we update
+ * it to be absolute, *oldval is absolute and we update it to be relative.
+ */
+void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
+			   cputime_t *newval, cputime_t *oldval)
+{
+	union cpu_time_count now;
+	struct list_head *head;
+
+	BUG_ON(clock_idx == CPUCLOCK_SCHED);
+	cpu_clock_sample_group_locked(clock_idx, tsk, &now);
+
+	if (oldval) {
+		if (!cputime_eq(*oldval, cputime_zero)) {
+			if (cputime_le(*oldval, now.cpu)) {
+				/* Just about to fire. */
+				*oldval = jiffies_to_cputime(1);
+			} else {
+				*oldval = cputime_sub(*oldval, now.cpu);
+			}
+		}
+
+		if (cputime_eq(*newval, cputime_zero))
+			return;
+		*newval = cputime_add(*newval, now.cpu);
+
+		/*
+		 * If the RLIMIT_CPU timer will expire before the
+		 * ITIMER_PROF timer, we have nothing else to do.
+		 */
+		if (tsk->signal->rlim[RLIMIT_CPU].rlim_cur
+		    < cputime_to_secs(*newval))
+			return;
+	}
+
+	/*
+	 * Check whether there are any process timers already set to fire
+	 * before this one.  If so, we don't have anything more to do.
+	 */
+	head = &tsk->signal->cpu_timers[clock_idx];
+	if (list_empty(head) ||
+	    cputime_ge(list_entry(head->next,
+				  struct cpu_timer_list, entry)->expires.cpu,
+		       *newval)) {
+		/*
+		 * Rejigger each thread's expiry time so that one will
+		 * notice before we hit the process-cumulative expiry time.
+		 */
+		union cpu_time_count expires = { .sched = 0 };
+		expires.cpu = *newval;
+		process_timer_rebalance(tsk, clock_idx, expires, now);
+	}
+}
+
+static long posix_cpu_clock_nanosleep_restart(struct restart_block *);
+
+int posix_cpu_nsleep(clockid_t which_clock, int flags,
+		     struct timespec *rqtp)
+{
+	struct restart_block *restart_block =
+	    &current_thread_info()->restart_block;
+	struct k_itimer timer;
+	int error;
+
+	/*
+	 * Diagnose required errors first.
+	 */
+	if (CPUCLOCK_PERTHREAD(which_clock) &&
+	    (CPUCLOCK_PID(which_clock) == 0 ||
+	     CPUCLOCK_PID(which_clock) == current->pid))
+		return -EINVAL;
+
+	/*
+	 * Set up a temporary timer and then wait for it to go off.
+	 */
+	memset(&timer, 0, sizeof timer);
+	spin_lock_init(&timer.it_lock);
+	timer.it_clock = which_clock;
+	timer.it_overrun = -1;
+	error = posix_cpu_timer_create(&timer);
+	timer.it_process = current;
+	if (!error) {
+		struct timespec __user *rmtp;
+		static struct itimerspec zero_it;
+		struct itimerspec it = { .it_value = *rqtp,
+					 .it_interval = {} };
+
+		spin_lock_irq(&timer.it_lock);
+		error = posix_cpu_timer_set(&timer, flags, &it, NULL);
+		if (error) {
+			spin_unlock_irq(&timer.it_lock);
+			return error;
+		}
+
+		while (!signal_pending(current)) {
+			if (timer.it.cpu.expires.sched == 0) {
+				/*
+				 * Our timer fired and was reset.
+				 */
+				spin_unlock_irq(&timer.it_lock);
+				return 0;
+			}
+
+			/*
+			 * Block until cpu_timer_fire (or a signal) wakes us.
+			 */
+			__set_current_state(TASK_INTERRUPTIBLE);
+			spin_unlock_irq(&timer.it_lock);
+			schedule();
+			spin_lock_irq(&timer.it_lock);
+		}
+
+		/*
+		 * We were interrupted by a signal.
+		 */
+		sample_to_timespec(which_clock, timer.it.cpu.expires, rqtp);
+		posix_cpu_timer_set(&timer, 0, &zero_it, &it);
+		spin_unlock_irq(&timer.it_lock);
+
+		if ((it.it_value.tv_sec | it.it_value.tv_nsec) == 0) {
+			/*
+			 * It actually did fire already.
+			 */
+			return 0;
+		}
+
+		/*
+		 * Report back to the user the time still remaining.
+		 */
+		rmtp = (struct timespec __user *) restart_block->arg1;
+		if (rmtp != NULL && !(flags & TIMER_ABSTIME) &&
+		    copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
+			return -EFAULT;
+
+		restart_block->fn = posix_cpu_clock_nanosleep_restart;
+		/* Caller already set restart_block->arg1 */
+		restart_block->arg0 = which_clock;
+		restart_block->arg2 = rqtp->tv_sec;
+		restart_block->arg3 = rqtp->tv_nsec;
+
+		error = -ERESTART_RESTARTBLOCK;
+	}
+
+	return error;
+}
+
+static long
+posix_cpu_clock_nanosleep_restart(struct restart_block *restart_block)
+{
+	clockid_t which_clock = restart_block->arg0;
+	struct timespec t = { .tv_sec = restart_block->arg2,
+			      .tv_nsec = restart_block->arg3 };
+	restart_block->fn = do_no_restart_syscall;
+	return posix_cpu_nsleep(which_clock, TIMER_ABSTIME, &t);
+}
+
+
+#define PROCESS_CLOCK	MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED)
+#define THREAD_CLOCK	MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED)
+
+static int process_cpu_clock_getres(clockid_t which_clock, struct timespec *tp)
+{
+	return posix_cpu_clock_getres(PROCESS_CLOCK, tp);
+}
+static int process_cpu_clock_get(clockid_t which_clock, struct timespec *tp)
+{
+	return posix_cpu_clock_get(PROCESS_CLOCK, tp);
+}
+static int process_cpu_timer_create(struct k_itimer *timer)
+{
+	timer->it_clock = PROCESS_CLOCK;
+	return posix_cpu_timer_create(timer);
+}
+static int process_cpu_nsleep(clockid_t which_clock, int flags,
+			      struct timespec *rqtp)
+{
+	return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp);
+}
+static int thread_cpu_clock_getres(clockid_t which_clock, struct timespec *tp)
+{
+	return posix_cpu_clock_getres(THREAD_CLOCK, tp);
+}
+static int thread_cpu_clock_get(clockid_t which_clock, struct timespec *tp)
+{
+	return posix_cpu_clock_get(THREAD_CLOCK, tp);
+}
+static int thread_cpu_timer_create(struct k_itimer *timer)
+{
+	timer->it_clock = THREAD_CLOCK;
+	return posix_cpu_timer_create(timer);
+}
+static int thread_cpu_nsleep(clockid_t which_clock, int flags,
+			      struct timespec *rqtp)
+{
+	return -EINVAL;
+}
+
+static __init int init_posix_cpu_timers(void)
+{
+	struct k_clock process = {
+		.clock_getres = process_cpu_clock_getres,
+		.clock_get = process_cpu_clock_get,
+		.clock_set = do_posix_clock_nosettime,
+		.timer_create = process_cpu_timer_create,
+		.nsleep = process_cpu_nsleep,
+	};
+	struct k_clock thread = {
+		.clock_getres = thread_cpu_clock_getres,
+		.clock_get = thread_cpu_clock_get,
+		.clock_set = do_posix_clock_nosettime,
+		.timer_create = thread_cpu_timer_create,
+		.nsleep = thread_cpu_nsleep,
+	};
+
+	register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process);
+	register_posix_clock(CLOCK_THREAD_CPUTIME_ID, &thread);
+
+	return 0;
+}
+__initcall(init_posix_cpu_timers);
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
new file mode 100644
index 000000000000..fd316c272260
--- /dev/null
+++ b/kernel/posix-timers.c
@@ -0,0 +1,1584 @@
+/*
+ * linux/kernel/posix_timers.c
+ *
+ *
+ * 2002-10-15  Posix Clocks & timers
+ *                           by George Anzinger george@mvista.com
+ *
+ *			     Copyright (C) 2002 2003 by MontaVista Software.
+ *
+ * 2004-06-01  Fix CLOCK_REALTIME clock/timer TIMER_ABSTIME bug.
+ *			     Copyright (C) 2004 Boris Hu
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * MontaVista Software | 1237 East Arques Avenue | Sunnyvale | CA 94085 | USA
+ */
+
+/* These are all the functions necessary to implement
+ * POSIX clocks & timers
+ */
+#include <linux/mm.h>
+#include <linux/smp_lock.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+#include <linux/time.h>
+
+#include <asm/uaccess.h>
+#include <asm/semaphore.h>
+#include <linux/list.h>
+#include <linux/init.h>
+#include <linux/compiler.h>
+#include <linux/idr.h>
+#include <linux/posix-timers.h>
+#include <linux/syscalls.h>
+#include <linux/wait.h>
+#include <linux/workqueue.h>
+#include <linux/module.h>
+
+#ifndef div_long_long_rem
+#include <asm/div64.h>
+
+#define div_long_long_rem(dividend,divisor,remainder) ({ \
+		       u64 result = dividend;		\
+		       *remainder = do_div(result,divisor); \
+		       result; })
+
+#endif
+#define CLOCK_REALTIME_RES TICK_NSEC  /* In nano seconds. */
+
+static inline u64  mpy_l_X_l_ll(unsigned long mpy1,unsigned long mpy2)
+{
+	return (u64)mpy1 * mpy2;
+}
+/*
+ * Management arrays for POSIX timers.	 Timers are kept in slab memory
+ * Timer ids are allocated by an external routine that keeps track of the
+ * id and the timer.  The external interface is:
+ *
+ * void *idr_find(struct idr *idp, int id);           to find timer_id <id>
+ * int idr_get_new(struct idr *idp, void *ptr);       to get a new id and
+ *                                                    related it to <ptr>
+ * void idr_remove(struct idr *idp, int id);          to release <id>
+ * void idr_init(struct idr *idp);                    to initialize <idp>
+ *                                                    which we supply.
+ * The idr_get_new *may* call slab for more memory so it must not be
+ * called under a spin lock.  Likewise idr_remore may release memory
+ * (but it may be ok to do this under a lock...).
+ * idr_find is just a memory look up and is quite fast.  A -1 return
+ * indicates that the requested id does not exist.
+ */
+
+/*
+ * Lets keep our timers in a slab cache :-)
+ */
+static kmem_cache_t *posix_timers_cache;
+static struct idr posix_timers_id;
+static DEFINE_SPINLOCK(idr_lock);
+
+/*
+ * Just because the timer is not in the timer list does NOT mean it is
+ * inactive.  It could be in the "fire" routine getting a new expire time.
+ */
+#define TIMER_INACTIVE 1
+
+#ifdef CONFIG_SMP
+# define timer_active(tmr) \
+		((tmr)->it.real.timer.entry.prev != (void *)TIMER_INACTIVE)
+# define set_timer_inactive(tmr) \
+		do { \
+			(tmr)->it.real.timer.entry.prev = (void *)TIMER_INACTIVE; \
+		} while (0)
+#else
+# define timer_active(tmr) BARFY	// error to use outside of SMP
+# define set_timer_inactive(tmr) do { } while (0)
+#endif
+/*
+ * we assume that the new SIGEV_THREAD_ID shares no bits with the other
+ * SIGEV values.  Here we put out an error if this assumption fails.
+ */
+#if SIGEV_THREAD_ID != (SIGEV_THREAD_ID & \
+                       ~(SIGEV_SIGNAL | SIGEV_NONE | SIGEV_THREAD))
+#error "SIGEV_THREAD_ID must not share bit with other SIGEV values!"
+#endif
+
+
+/*
+ * The timer ID is turned into a timer address by idr_find().
+ * Verifying a valid ID consists of:
+ *
+ * a) checking that idr_find() returns other than -1.
+ * b) checking that the timer id matches the one in the timer itself.
+ * c) that the timer owner is in the callers thread group.
+ */
+
+/*
+ * CLOCKs: The POSIX standard calls for a couple of clocks and allows us
+ *	    to implement others.  This structure defines the various
+ *	    clocks and allows the possibility of adding others.	 We
+ *	    provide an interface to add clocks to the table and expect
+ *	    the "arch" code to add at least one clock that is high
+ *	    resolution.	 Here we define the standard CLOCK_REALTIME as a
+ *	    1/HZ resolution clock.
+ *
+ * RESOLUTION: Clock resolution is used to round up timer and interval
+ *	    times, NOT to report clock times, which are reported with as
+ *	    much resolution as the system can muster.  In some cases this
+ *	    resolution may depend on the underlying clock hardware and
+ *	    may not be quantifiable until run time, and only then is the
+ *	    necessary code is written.	The standard says we should say
+ *	    something about this issue in the documentation...
+ *
+ * FUNCTIONS: The CLOCKs structure defines possible functions to handle
+ *	    various clock functions.  For clocks that use the standard
+ *	    system timer code these entries should be NULL.  This will
+ *	    allow dispatch without the overhead of indirect function
+ *	    calls.  CLOCKS that depend on other sources (e.g. WWV or GPS)
+ *	    must supply functions here, even if the function just returns
+ *	    ENOSYS.  The standard POSIX timer management code assumes the
+ *	    following: 1.) The k_itimer struct (sched.h) is used for the
+ *	    timer.  2.) The list, it_lock, it_clock, it_id and it_process
+ *	    fields are not modified by timer code.
+ *
+ *          At this time all functions EXCEPT clock_nanosleep can be
+ *          redirected by the CLOCKS structure.  Clock_nanosleep is in
+ *          there, but the code ignores it.
+ *
+ * Permissions: It is assumed that the clock_settime() function defined
+ *	    for each clock will take care of permission checks.	 Some
+ *	    clocks may be set able by any user (i.e. local process
+ *	    clocks) others not.	 Currently the only set able clock we
+ *	    have is CLOCK_REALTIME and its high res counter part, both of
+ *	    which we beg off on and pass to do_sys_settimeofday().
+ */
+
+static struct k_clock posix_clocks[MAX_CLOCKS];
+/*
+ * We only have one real clock that can be set so we need only one abs list,
+ * even if we should want to have several clocks with differing resolutions.
+ */
+static struct k_clock_abs abs_list = {.list = LIST_HEAD_INIT(abs_list.list),
+				      .lock = SPIN_LOCK_UNLOCKED};
+
+static void posix_timer_fn(unsigned long);
+static u64 do_posix_clock_monotonic_gettime_parts(
+	struct timespec *tp, struct timespec *mo);
+int do_posix_clock_monotonic_gettime(struct timespec *tp);
+static int do_posix_clock_monotonic_get(clockid_t, struct timespec *tp);
+
+static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags);
+
+static inline void unlock_timer(struct k_itimer *timr, unsigned long flags)
+{
+	spin_unlock_irqrestore(&timr->it_lock, flags);
+}
+
+/*
+ * Call the k_clock hook function if non-null, or the default function.
+ */
+#define CLOCK_DISPATCH(clock, call, arglist) \
+ 	((clock) < 0 ? posix_cpu_##call arglist : \
+ 	 (posix_clocks[clock].call != NULL \
+ 	  ? (*posix_clocks[clock].call) arglist : common_##call arglist))
+
+/*
+ * Default clock hook functions when the struct k_clock passed
+ * to register_posix_clock leaves a function pointer null.
+ *
+ * The function common_CALL is the default implementation for
+ * the function pointer CALL in struct k_clock.
+ */
+
+static inline int common_clock_getres(clockid_t which_clock,
+				      struct timespec *tp)
+{
+	tp->tv_sec = 0;
+	tp->tv_nsec = posix_clocks[which_clock].res;
+	return 0;
+}
+
+static inline int common_clock_get(clockid_t which_clock, struct timespec *tp)
+{
+	getnstimeofday(tp);
+	return 0;
+}
+
+static inline int common_clock_set(clockid_t which_clock, struct timespec *tp)
+{
+	return do_sys_settimeofday(tp, NULL);
+}
+
+static inline int common_timer_create(struct k_itimer *new_timer)
+{
+	INIT_LIST_HEAD(&new_timer->it.real.abs_timer_entry);
+	init_timer(&new_timer->it.real.timer);
+	new_timer->it.real.timer.data = (unsigned long) new_timer;
+	new_timer->it.real.timer.function = posix_timer_fn;
+	set_timer_inactive(new_timer);
+	return 0;
+}
+
+/*
+ * These ones are defined below.
+ */
+static int common_nsleep(clockid_t, int flags, struct timespec *t);
+static void common_timer_get(struct k_itimer *, struct itimerspec *);
+static int common_timer_set(struct k_itimer *, int,
+			    struct itimerspec *, struct itimerspec *);
+static int common_timer_del(struct k_itimer *timer);
+
+/*
+ * Return nonzero iff we know a priori this clockid_t value is bogus.
+ */
+static inline int invalid_clockid(clockid_t which_clock)
+{
+	if (which_clock < 0)	/* CPU clock, posix_cpu_* will check it */
+		return 0;
+	if ((unsigned) which_clock >= MAX_CLOCKS)
+		return 1;
+	if (posix_clocks[which_clock].clock_getres != NULL)
+		return 0;
+#ifndef CLOCK_DISPATCH_DIRECT
+	if (posix_clocks[which_clock].res != 0)
+		return 0;
+#endif
+	return 1;
+}
+
+
+/*
+ * Initialize everything, well, just everything in Posix clocks/timers ;)
+ */
+static __init int init_posix_timers(void)
+{
+	struct k_clock clock_realtime = {.res = CLOCK_REALTIME_RES,
+					 .abs_struct = &abs_list
+	};
+	struct k_clock clock_monotonic = {.res = CLOCK_REALTIME_RES,
+		.abs_struct = NULL,
+		.clock_get = do_posix_clock_monotonic_get,
+		.clock_set = do_posix_clock_nosettime
+	};
+
+	register_posix_clock(CLOCK_REALTIME, &clock_realtime);
+	register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic);
+
+	posix_timers_cache = kmem_cache_create("posix_timers_cache",
+					sizeof (struct k_itimer), 0, 0, NULL, NULL);
+	idr_init(&posix_timers_id);
+	return 0;
+}
+
+__initcall(init_posix_timers);
+
+static void tstojiffie(struct timespec *tp, int res, u64 *jiff)
+{
+	long sec = tp->tv_sec;
+	long nsec = tp->tv_nsec + res - 1;
+
+	if (nsec > NSEC_PER_SEC) {
+		sec++;
+		nsec -= NSEC_PER_SEC;
+	}
+
+	/*
+	 * The scaling constants are defined in <linux/time.h>
+	 * The difference between there and here is that we do the
+	 * res rounding and compute a 64-bit result (well so does that
+	 * but it then throws away the high bits).
+  	 */
+	*jiff =  (mpy_l_X_l_ll(sec, SEC_CONVERSION) +
+		  (mpy_l_X_l_ll(nsec, NSEC_CONVERSION) >> 
+		   (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC;
+}
+
+/*
+ * This function adjusts the timer as needed as a result of the clock
+ * being set.  It should only be called for absolute timers, and then
+ * under the abs_list lock.  It computes the time difference and sets
+ * the new jiffies value in the timer.  It also updates the timers
+ * reference wall_to_monotonic value.  It is complicated by the fact
+ * that tstojiffies() only handles positive times and it needs to work
+ * with both positive and negative times.  Also, for negative offsets,
+ * we need to defeat the res round up.
+ *
+ * Return is true if there is a new time, else false.
+ */
+static long add_clockset_delta(struct k_itimer *timr,
+			       struct timespec *new_wall_to)
+{
+	struct timespec delta;
+	int sign = 0;
+	u64 exp;
+
+	set_normalized_timespec(&delta,
+				new_wall_to->tv_sec -
+				timr->it.real.wall_to_prev.tv_sec,
+				new_wall_to->tv_nsec -
+				timr->it.real.wall_to_prev.tv_nsec);
+	if (likely(!(delta.tv_sec | delta.tv_nsec)))
+		return 0;
+	if (delta.tv_sec < 0) {
+		set_normalized_timespec(&delta,
+					-delta.tv_sec,
+					1 - delta.tv_nsec -
+					posix_clocks[timr->it_clock].res);
+		sign++;
+	}
+	tstojiffie(&delta, posix_clocks[timr->it_clock].res, &exp);
+	timr->it.real.wall_to_prev = *new_wall_to;
+	timr->it.real.timer.expires += (sign ? -exp : exp);
+	return 1;
+}
+
+static void remove_from_abslist(struct k_itimer *timr)
+{
+	if (!list_empty(&timr->it.real.abs_timer_entry)) {
+		spin_lock(&abs_list.lock);
+		list_del_init(&timr->it.real.abs_timer_entry);
+		spin_unlock(&abs_list.lock);
+	}
+}
+
+static void schedule_next_timer(struct k_itimer *timr)
+{
+	struct timespec new_wall_to;
+	struct now_struct now;
+	unsigned long seq;
+
+	/*
+	 * Set up the timer for the next interval (if there is one).
+	 * Note: this code uses the abs_timer_lock to protect
+	 * it.real.wall_to_prev and must hold it until exp is set, not exactly
+	 * obvious...
+
+	 * This function is used for CLOCK_REALTIME* and
+	 * CLOCK_MONOTONIC* timers.  If we ever want to handle other
+	 * CLOCKs, the calling code (do_schedule_next_timer) would need
+	 * to pull the "clock" info from the timer and dispatch the
+	 * "other" CLOCKs "next timer" code (which, I suppose should
+	 * also be added to the k_clock structure).
+	 */
+	if (!timr->it.real.incr)
+		return;
+
+	do {
+		seq = read_seqbegin(&xtime_lock);
+		new_wall_to =	wall_to_monotonic;
+		posix_get_now(&now);
+	} while (read_seqretry(&xtime_lock, seq));
+
+	if (!list_empty(&timr->it.real.abs_timer_entry)) {
+		spin_lock(&abs_list.lock);
+		add_clockset_delta(timr, &new_wall_to);
+
+		posix_bump_timer(timr, now);
+
+		spin_unlock(&abs_list.lock);
+	} else {
+		posix_bump_timer(timr, now);
+	}
+	timr->it_overrun_last = timr->it_overrun;
+	timr->it_overrun = -1;
+	++timr->it_requeue_pending;
+	add_timer(&timr->it.real.timer);
+}
+
+/*
+ * This function is exported for use by the signal deliver code.  It is
+ * called just prior to the info block being released and passes that
+ * block to us.  It's function is to update the overrun entry AND to
+ * restart the timer.  It should only be called if the timer is to be
+ * restarted (i.e. we have flagged this in the sys_private entry of the
+ * info block).
+ *
+ * To protect aginst the timer going away while the interrupt is queued,
+ * we require that the it_requeue_pending flag be set.
+ */
+void do_schedule_next_timer(struct siginfo *info)
+{
+	struct k_itimer *timr;
+	unsigned long flags;
+
+	timr = lock_timer(info->si_tid, &flags);
+
+	if (!timr || timr->it_requeue_pending != info->si_sys_private)
+		goto exit;
+
+	if (timr->it_clock < 0)	/* CPU clock */
+		posix_cpu_timer_schedule(timr);
+	else
+		schedule_next_timer(timr);
+	info->si_overrun = timr->it_overrun_last;
+exit:
+	if (timr)
+		unlock_timer(timr, flags);
+}
+
+int posix_timer_event(struct k_itimer *timr,int si_private)
+{
+	memset(&timr->sigq->info, 0, sizeof(siginfo_t));
+	timr->sigq->info.si_sys_private = si_private;
+	/*
+	 * Send signal to the process that owns this timer.
+
+	 * This code assumes that all the possible abs_lists share the
+	 * same lock (there is only one list at this time). If this is
+	 * not the case, the CLOCK info would need to be used to find
+	 * the proper abs list lock.
+	 */
+
+	timr->sigq->info.si_signo = timr->it_sigev_signo;
+	timr->sigq->info.si_errno = 0;
+	timr->sigq->info.si_code = SI_TIMER;
+	timr->sigq->info.si_tid = timr->it_id;
+	timr->sigq->info.si_value = timr->it_sigev_value;
+	if (timr->it_sigev_notify & SIGEV_THREAD_ID) {
+		if (unlikely(timr->it_process->flags & PF_EXITING)) {
+			timr->it_sigev_notify = SIGEV_SIGNAL;
+			put_task_struct(timr->it_process);
+			timr->it_process = timr->it_process->group_leader;
+			goto group;
+		}
+		return send_sigqueue(timr->it_sigev_signo, timr->sigq,
+			timr->it_process);
+	}
+	else {
+	group:
+		return send_group_sigqueue(timr->it_sigev_signo, timr->sigq,
+			timr->it_process);
+	}
+}
+EXPORT_SYMBOL_GPL(posix_timer_event);
+
+/*
+ * This function gets called when a POSIX.1b interval timer expires.  It
+ * is used as a callback from the kernel internal timer.  The
+ * run_timer_list code ALWAYS calls with interrupts on.
+
+ * This code is for CLOCK_REALTIME* and CLOCK_MONOTONIC* timers.
+ */
+static void posix_timer_fn(unsigned long __data)
+{
+	struct k_itimer *timr = (struct k_itimer *) __data;
+	unsigned long flags;
+	unsigned long seq;
+	struct timespec delta, new_wall_to;
+	u64 exp = 0;
+	int do_notify = 1;
+
+	spin_lock_irqsave(&timr->it_lock, flags);
+ 	set_timer_inactive(timr);
+	if (!list_empty(&timr->it.real.abs_timer_entry)) {
+		spin_lock(&abs_list.lock);
+		do {
+			seq = read_seqbegin(&xtime_lock);
+			new_wall_to =	wall_to_monotonic;
+		} while (read_seqretry(&xtime_lock, seq));
+		set_normalized_timespec(&delta,
+					new_wall_to.tv_sec -
+					timr->it.real.wall_to_prev.tv_sec,
+					new_wall_to.tv_nsec -
+					timr->it.real.wall_to_prev.tv_nsec);
+		if (likely((delta.tv_sec | delta.tv_nsec ) == 0)) {
+			/* do nothing, timer is on time */
+		} else if (delta.tv_sec < 0) {
+			/* do nothing, timer is already late */
+		} else {
+			/* timer is early due to a clock set */
+			tstojiffie(&delta,
+				   posix_clocks[timr->it_clock].res,
+				   &exp);
+			timr->it.real.wall_to_prev = new_wall_to;
+			timr->it.real.timer.expires += exp;
+			add_timer(&timr->it.real.timer);
+			do_notify = 0;
+		}
+		spin_unlock(&abs_list.lock);
+
+	}
+	if (do_notify)  {
+		int si_private=0;
+
+		if (timr->it.real.incr)
+			si_private = ++timr->it_requeue_pending;
+		else {
+			remove_from_abslist(timr);
+		}
+
+		if (posix_timer_event(timr, si_private))
+			/*
+			 * signal was not sent because of sig_ignor
+			 * we will not get a call back to restart it AND
+			 * it should be restarted.
+			 */
+			schedule_next_timer(timr);
+	}
+	unlock_timer(timr, flags); /* hold thru abs lock to keep irq off */
+}
+
+
+static inline struct task_struct * good_sigevent(sigevent_t * event)
+{
+	struct task_struct *rtn = current->group_leader;
+
+	if ((event->sigev_notify & SIGEV_THREAD_ID ) &&
+		(!(rtn = find_task_by_pid(event->sigev_notify_thread_id)) ||
+		 rtn->tgid != current->tgid ||
+		 (event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_SIGNAL))
+		return NULL;
+
+	if (((event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) &&
+	    ((event->sigev_signo <= 0) || (event->sigev_signo > SIGRTMAX)))
+		return NULL;
+
+	return rtn;
+}
+
+void register_posix_clock(clockid_t clock_id, struct k_clock *new_clock)
+{
+	if ((unsigned) clock_id >= MAX_CLOCKS) {
+		printk("POSIX clock register failed for clock_id %d\n",
+		       clock_id);
+		return;
+	}
+
+	posix_clocks[clock_id] = *new_clock;
+}
+EXPORT_SYMBOL_GPL(register_posix_clock);
+
+static struct k_itimer * alloc_posix_timer(void)
+{
+	struct k_itimer *tmr;
+	tmr = kmem_cache_alloc(posix_timers_cache, GFP_KERNEL);
+	if (!tmr)
+		return tmr;
+	memset(tmr, 0, sizeof (struct k_itimer));
+	if (unlikely(!(tmr->sigq = sigqueue_alloc()))) {
+		kmem_cache_free(posix_timers_cache, tmr);
+		tmr = NULL;
+	}
+	return tmr;
+}
+
+#define IT_ID_SET	1
+#define IT_ID_NOT_SET	0
+static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
+{
+	if (it_id_set) {
+		unsigned long flags;
+		spin_lock_irqsave(&idr_lock, flags);
+		idr_remove(&posix_timers_id, tmr->it_id);
+		spin_unlock_irqrestore(&idr_lock, flags);
+	}
+	sigqueue_free(tmr->sigq);
+	if (unlikely(tmr->it_process) &&
+	    tmr->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
+		put_task_struct(tmr->it_process);
+	kmem_cache_free(posix_timers_cache, tmr);
+}
+
+/* Create a POSIX.1b interval timer. */
+
+asmlinkage long
+sys_timer_create(clockid_t which_clock,
+		 struct sigevent __user *timer_event_spec,
+		 timer_t __user * created_timer_id)
+{
+	int error = 0;
+	struct k_itimer *new_timer = NULL;
+	int new_timer_id;
+	struct task_struct *process = NULL;
+	unsigned long flags;
+	sigevent_t event;
+	int it_id_set = IT_ID_NOT_SET;
+
+	if (invalid_clockid(which_clock))
+		return -EINVAL;
+
+	new_timer = alloc_posix_timer();
+	if (unlikely(!new_timer))
+		return -EAGAIN;
+
+	spin_lock_init(&new_timer->it_lock);
+ retry:
+	if (unlikely(!idr_pre_get(&posix_timers_id, GFP_KERNEL))) {
+		error = -EAGAIN;
+		goto out;
+	}
+	spin_lock_irq(&idr_lock);
+	error = idr_get_new(&posix_timers_id,
+			    (void *) new_timer,
+			    &new_timer_id);
+	spin_unlock_irq(&idr_lock);
+	if (error == -EAGAIN)
+		goto retry;
+	else if (error) {
+		/*
+		 * Wierd looking, but we return EAGAIN if the IDR is
+		 * full (proper POSIX return value for this)
+		 */
+		error = -EAGAIN;
+		goto out;
+	}
+
+	it_id_set = IT_ID_SET;
+	new_timer->it_id = (timer_t) new_timer_id;
+	new_timer->it_clock = which_clock;
+	new_timer->it_overrun = -1;
+	error = CLOCK_DISPATCH(which_clock, timer_create, (new_timer));
+	if (error)
+		goto out;
+
+	/*
+	 * return the timer_id now.  The next step is hard to
+	 * back out if there is an error.
+	 */
+	if (copy_to_user(created_timer_id,
+			 &new_timer_id, sizeof (new_timer_id))) {
+		error = -EFAULT;
+		goto out;
+	}
+	if (timer_event_spec) {
+		if (copy_from_user(&event, timer_event_spec, sizeof (event))) {
+			error = -EFAULT;
+			goto out;
+		}
+		new_timer->it_sigev_notify = event.sigev_notify;
+		new_timer->it_sigev_signo = event.sigev_signo;
+		new_timer->it_sigev_value = event.sigev_value;
+
+		read_lock(&tasklist_lock);
+		if ((process = good_sigevent(&event))) {
+			/*
+			 * We may be setting up this process for another
+			 * thread.  It may be exiting.  To catch this
+			 * case the we check the PF_EXITING flag.  If
+			 * the flag is not set, the siglock will catch
+			 * him before it is too late (in exit_itimers).
+			 *
+			 * The exec case is a bit more invloved but easy
+			 * to code.  If the process is in our thread
+			 * group (and it must be or we would not allow
+			 * it here) and is doing an exec, it will cause
+			 * us to be killed.  In this case it will wait
+			 * for us to die which means we can finish this
+			 * linkage with our last gasp. I.e. no code :)
+			 */
+			spin_lock_irqsave(&process->sighand->siglock, flags);
+			if (!(process->flags & PF_EXITING)) {
+				new_timer->it_process = process;
+				list_add(&new_timer->list,
+					 &process->signal->posix_timers);
+				spin_unlock_irqrestore(&process->sighand->siglock, flags);
+				if (new_timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
+					get_task_struct(process);
+			} else {
+				spin_unlock_irqrestore(&process->sighand->siglock, flags);
+				process = NULL;
+			}
+		}
+		read_unlock(&tasklist_lock);
+		if (!process) {
+			error = -EINVAL;
+			goto out;
+		}
+	} else {
+		new_timer->it_sigev_notify = SIGEV_SIGNAL;
+		new_timer->it_sigev_signo = SIGALRM;
+		new_timer->it_sigev_value.sival_int = new_timer->it_id;
+		process = current->group_leader;
+		spin_lock_irqsave(&process->sighand->siglock, flags);
+		new_timer->it_process = process;
+		list_add(&new_timer->list, &process->signal->posix_timers);
+		spin_unlock_irqrestore(&process->sighand->siglock, flags);
+	}
+
+ 	/*
+	 * In the case of the timer belonging to another task, after
+	 * the task is unlocked, the timer is owned by the other task
+	 * and may cease to exist at any time.  Don't use or modify
+	 * new_timer after the unlock call.
+	 */
+
+out:
+	if (error)
+		release_posix_timer(new_timer, it_id_set);
+
+	return error;
+}
+
+/*
+ * good_timespec
+ *
+ * This function checks the elements of a timespec structure.
+ *
+ * Arguments:
+ * ts	     : Pointer to the timespec structure to check
+ *
+ * Return value:
+ * If a NULL pointer was passed in, or the tv_nsec field was less than 0
+ * or greater than NSEC_PER_SEC, or the tv_sec field was less than 0,
+ * this function returns 0. Otherwise it returns 1.
+ */
+static int good_timespec(const struct timespec *ts)
+{
+	if ((!ts) || (ts->tv_sec < 0) ||
+			((unsigned) ts->tv_nsec >= NSEC_PER_SEC))
+		return 0;
+	return 1;
+}
+
+/*
+ * Locking issues: We need to protect the result of the id look up until
+ * we get the timer locked down so it is not deleted under us.  The
+ * removal is done under the idr spinlock so we use that here to bridge
+ * the find to the timer lock.  To avoid a dead lock, the timer id MUST
+ * be release with out holding the timer lock.
+ */
+static struct k_itimer * lock_timer(timer_t timer_id, unsigned long *flags)
+{
+	struct k_itimer *timr;
+	/*
+	 * Watch out here.  We do a irqsave on the idr_lock and pass the
+	 * flags part over to the timer lock.  Must not let interrupts in
+	 * while we are moving the lock.
+	 */
+
+	spin_lock_irqsave(&idr_lock, *flags);
+	timr = (struct k_itimer *) idr_find(&posix_timers_id, (int) timer_id);
+	if (timr) {
+		spin_lock(&timr->it_lock);
+		spin_unlock(&idr_lock);
+
+		if ((timr->it_id != timer_id) || !(timr->it_process) ||
+				timr->it_process->tgid != current->tgid) {
+			unlock_timer(timr, *flags);
+			timr = NULL;
+		}
+	} else
+		spin_unlock_irqrestore(&idr_lock, *flags);
+
+	return timr;
+}
+
+/*
+ * Get the time remaining on a POSIX.1b interval timer.  This function
+ * is ALWAYS called with spin_lock_irq on the timer, thus it must not
+ * mess with irq.
+ *
+ * We have a couple of messes to clean up here.  First there is the case
+ * of a timer that has a requeue pending.  These timers should appear to
+ * be in the timer list with an expiry as if we were to requeue them
+ * now.
+ *
+ * The second issue is the SIGEV_NONE timer which may be active but is
+ * not really ever put in the timer list (to save system resources).
+ * This timer may be expired, and if so, we will do it here.  Otherwise
+ * it is the same as a requeue pending timer WRT to what we should
+ * report.
+ */
+static void
+common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting)
+{
+	unsigned long expires;
+	struct now_struct now;
+
+	do
+		expires = timr->it.real.timer.expires;
+	while ((volatile long) (timr->it.real.timer.expires) != expires);
+
+	posix_get_now(&now);
+
+	if (expires &&
+	    ((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) &&
+	    !timr->it.real.incr &&
+	    posix_time_before(&timr->it.real.timer, &now))
+		timr->it.real.timer.expires = expires = 0;
+	if (expires) {
+		if (timr->it_requeue_pending & REQUEUE_PENDING ||
+		    (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) {
+			posix_bump_timer(timr, now);
+			expires = timr->it.real.timer.expires;
+		}
+		else
+			if (!timer_pending(&timr->it.real.timer))
+				expires = 0;
+		if (expires)
+			expires -= now.jiffies;
+	}
+	jiffies_to_timespec(expires, &cur_setting->it_value);
+	jiffies_to_timespec(timr->it.real.incr, &cur_setting->it_interval);
+
+	if (cur_setting->it_value.tv_sec < 0) {
+		cur_setting->it_value.tv_nsec = 1;
+		cur_setting->it_value.tv_sec = 0;
+	}
+}
+
+/* Get the time remaining on a POSIX.1b interval timer. */
+asmlinkage long
+sys_timer_gettime(timer_t timer_id, struct itimerspec __user *setting)
+{
+	struct k_itimer *timr;
+	struct itimerspec cur_setting;
+	unsigned long flags;
+
+	timr = lock_timer(timer_id, &flags);
+	if (!timr)
+		return -EINVAL;
+
+	CLOCK_DISPATCH(timr->it_clock, timer_get, (timr, &cur_setting));
+
+	unlock_timer(timr, flags);
+
+	if (copy_to_user(setting, &cur_setting, sizeof (cur_setting)))
+		return -EFAULT;
+
+	return 0;
+}
+/*
+ * Get the number of overruns of a POSIX.1b interval timer.  This is to
+ * be the overrun of the timer last delivered.  At the same time we are
+ * accumulating overruns on the next timer.  The overrun is frozen when
+ * the signal is delivered, either at the notify time (if the info block
+ * is not queued) or at the actual delivery time (as we are informed by
+ * the call back to do_schedule_next_timer().  So all we need to do is
+ * to pick up the frozen overrun.
+ */
+
+asmlinkage long
+sys_timer_getoverrun(timer_t timer_id)
+{
+	struct k_itimer *timr;
+	int overrun;
+	long flags;
+
+	timr = lock_timer(timer_id, &flags);
+	if (!timr)
+		return -EINVAL;
+
+	overrun = timr->it_overrun_last;
+	unlock_timer(timr, flags);
+
+	return overrun;
+}
+/*
+ * Adjust for absolute time
+ *
+ * If absolute time is given and it is not CLOCK_MONOTONIC, we need to
+ * adjust for the offset between the timer clock (CLOCK_MONOTONIC) and
+ * what ever clock he is using.
+ *
+ * If it is relative time, we need to add the current (CLOCK_MONOTONIC)
+ * time to it to get the proper time for the timer.
+ */
+static int adjust_abs_time(struct k_clock *clock, struct timespec *tp, 
+			   int abs, u64 *exp, struct timespec *wall_to)
+{
+	struct timespec now;
+	struct timespec oc = *tp;
+	u64 jiffies_64_f;
+	int rtn =0;
+
+	if (abs) {
+		/*
+		 * The mask pick up the 4 basic clocks 
+		 */
+		if (!((clock - &posix_clocks[0]) & ~CLOCKS_MASK)) {
+			jiffies_64_f = do_posix_clock_monotonic_gettime_parts(
+				&now,  wall_to);
+			/*
+			 * If we are doing a MONOTONIC clock
+			 */
+			if((clock - &posix_clocks[0]) & CLOCKS_MONO){
+				now.tv_sec += wall_to->tv_sec;
+				now.tv_nsec += wall_to->tv_nsec;
+			}
+		} else {
+			/*
+			 * Not one of the basic clocks
+			 */
+			clock->clock_get(clock - posix_clocks, &now);
+			jiffies_64_f = get_jiffies_64();
+		}
+		/*
+		 * Take away now to get delta
+		 */
+		oc.tv_sec -= now.tv_sec;
+		oc.tv_nsec -= now.tv_nsec;
+		/*
+		 * Normalize...
+		 */
+		while ((oc.tv_nsec - NSEC_PER_SEC) >= 0) {
+			oc.tv_nsec -= NSEC_PER_SEC;
+			oc.tv_sec++;
+		}
+		while ((oc.tv_nsec) < 0) {
+			oc.tv_nsec += NSEC_PER_SEC;
+			oc.tv_sec--;
+		}
+	}else{
+		jiffies_64_f = get_jiffies_64();
+	}
+	/*
+	 * Check if the requested time is prior to now (if so set now)
+	 */
+	if (oc.tv_sec < 0)
+		oc.tv_sec = oc.tv_nsec = 0;
+
+	if (oc.tv_sec | oc.tv_nsec)
+		set_normalized_timespec(&oc, oc.tv_sec,
+					oc.tv_nsec + clock->res);
+	tstojiffie(&oc, clock->res, exp);
+
+	/*
+	 * Check if the requested time is more than the timer code
+	 * can handle (if so we error out but return the value too).
+	 */
+	if (*exp > ((u64)MAX_JIFFY_OFFSET))
+			/*
+			 * This is a considered response, not exactly in
+			 * line with the standard (in fact it is silent on
+			 * possible overflows).  We assume such a large 
+			 * value is ALMOST always a programming error and
+			 * try not to compound it by setting a really dumb
+			 * value.
+			 */
+			rtn = -EINVAL;
+	/*
+	 * return the actual jiffies expire time, full 64 bits
+	 */
+	*exp += jiffies_64_f;
+	return rtn;
+}
+
+/* Set a POSIX.1b interval timer. */
+/* timr->it_lock is taken. */
+static inline int
+common_timer_set(struct k_itimer *timr, int flags,
+		 struct itimerspec *new_setting, struct itimerspec *old_setting)
+{
+	struct k_clock *clock = &posix_clocks[timr->it_clock];
+	u64 expire_64;
+
+	if (old_setting)
+		common_timer_get(timr, old_setting);
+
+	/* disable the timer */
+	timr->it.real.incr = 0;
+	/*
+	 * careful here.  If smp we could be in the "fire" routine which will
+	 * be spinning as we hold the lock.  But this is ONLY an SMP issue.
+	 */
+#ifdef CONFIG_SMP
+	if (timer_active(timr) && !del_timer(&timr->it.real.timer))
+		/*
+		 * It can only be active if on an other cpu.  Since
+		 * we have cleared the interval stuff above, it should
+		 * clear once we release the spin lock.  Of course once
+		 * we do that anything could happen, including the
+		 * complete melt down of the timer.  So return with
+		 * a "retry" exit status.
+		 */
+		return TIMER_RETRY;
+
+	set_timer_inactive(timr);
+#else
+	del_timer(&timr->it.real.timer);
+#endif
+	remove_from_abslist(timr);
+
+	timr->it_requeue_pending = (timr->it_requeue_pending + 2) & 
+		~REQUEUE_PENDING;
+	timr->it_overrun_last = 0;
+	timr->it_overrun = -1;
+	/*
+	 *switch off the timer when it_value is zero
+	 */
+	if (!new_setting->it_value.tv_sec && !new_setting->it_value.tv_nsec) {
+		timr->it.real.timer.expires = 0;
+		return 0;
+	}
+
+	if (adjust_abs_time(clock,
+			    &new_setting->it_value, flags & TIMER_ABSTIME, 
+			    &expire_64, &(timr->it.real.wall_to_prev))) {
+		return -EINVAL;
+	}
+	timr->it.real.timer.expires = (unsigned long)expire_64;
+	tstojiffie(&new_setting->it_interval, clock->res, &expire_64);
+	timr->it.real.incr = (unsigned long)expire_64;
+
+	/*
+	 * We do not even queue SIGEV_NONE timers!  But we do put them
+	 * in the abs list so we can do that right.
+	 */
+	if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE))
+		add_timer(&timr->it.real.timer);
+
+	if (flags & TIMER_ABSTIME && clock->abs_struct) {
+		spin_lock(&clock->abs_struct->lock);
+		list_add_tail(&(timr->it.real.abs_timer_entry),
+			      &(clock->abs_struct->list));
+		spin_unlock(&clock->abs_struct->lock);
+	}
+	return 0;
+}
+
+/* Set a POSIX.1b interval timer */
+asmlinkage long
+sys_timer_settime(timer_t timer_id, int flags,
+		  const struct itimerspec __user *new_setting,
+		  struct itimerspec __user *old_setting)
+{
+	struct k_itimer *timr;
+	struct itimerspec new_spec, old_spec;
+	int error = 0;
+	long flag;
+	struct itimerspec *rtn = old_setting ? &old_spec : NULL;
+
+	if (!new_setting)
+		return -EINVAL;
+
+	if (copy_from_user(&new_spec, new_setting, sizeof (new_spec)))
+		return -EFAULT;
+
+	if ((!good_timespec(&new_spec.it_interval)) ||
+	    (!good_timespec(&new_spec.it_value)))
+		return -EINVAL;
+retry:
+	timr = lock_timer(timer_id, &flag);
+	if (!timr)
+		return -EINVAL;
+
+	error = CLOCK_DISPATCH(timr->it_clock, timer_set,
+			       (timr, flags, &new_spec, rtn));
+
+	unlock_timer(timr, flag);
+	if (error == TIMER_RETRY) {
+		rtn = NULL;	// We already got the old time...
+		goto retry;
+	}
+
+	if (old_setting && !error && copy_to_user(old_setting,
+						  &old_spec, sizeof (old_spec)))
+		error = -EFAULT;
+
+	return error;
+}
+
+static inline int common_timer_del(struct k_itimer *timer)
+{
+	timer->it.real.incr = 0;
+#ifdef CONFIG_SMP
+	if (timer_active(timer) && !del_timer(&timer->it.real.timer))
+		/*
+		 * It can only be active if on an other cpu.  Since
+		 * we have cleared the interval stuff above, it should
+		 * clear once we release the spin lock.  Of course once
+		 * we do that anything could happen, including the
+		 * complete melt down of the timer.  So return with
+		 * a "retry" exit status.
+		 */
+		return TIMER_RETRY;
+#else
+	del_timer(&timer->it.real.timer);
+#endif
+	remove_from_abslist(timer);
+
+	return 0;
+}
+
+static inline int timer_delete_hook(struct k_itimer *timer)
+{
+	return CLOCK_DISPATCH(timer->it_clock, timer_del, (timer));
+}
+
+/* Delete a POSIX.1b interval timer. */
+asmlinkage long
+sys_timer_delete(timer_t timer_id)
+{
+	struct k_itimer *timer;
+	long flags;
+
+#ifdef CONFIG_SMP
+	int error;
+retry_delete:
+#endif
+	timer = lock_timer(timer_id, &flags);
+	if (!timer)
+		return -EINVAL;
+
+#ifdef CONFIG_SMP
+	error = timer_delete_hook(timer);
+
+	if (error == TIMER_RETRY) {
+		unlock_timer(timer, flags);
+		goto retry_delete;
+	}
+#else
+	timer_delete_hook(timer);
+#endif
+	spin_lock(&current->sighand->siglock);
+	list_del(&timer->list);
+	spin_unlock(&current->sighand->siglock);
+	/*
+	 * This keeps any tasks waiting on the spin lock from thinking
+	 * they got something (see the lock code above).
+	 */
+	if (timer->it_process) {
+		if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
+			put_task_struct(timer->it_process);
+		timer->it_process = NULL;
+	}
+	unlock_timer(timer, flags);
+	release_posix_timer(timer, IT_ID_SET);
+	return 0;
+}
+/*
+ * return timer owned by the process, used by exit_itimers
+ */
+static inline void itimer_delete(struct k_itimer *timer)
+{
+	unsigned long flags;
+
+#ifdef CONFIG_SMP
+	int error;
+retry_delete:
+#endif
+	spin_lock_irqsave(&timer->it_lock, flags);
+
+#ifdef CONFIG_SMP
+	error = timer_delete_hook(timer);
+
+	if (error == TIMER_RETRY) {
+		unlock_timer(timer, flags);
+		goto retry_delete;
+	}
+#else
+	timer_delete_hook(timer);
+#endif
+	list_del(&timer->list);
+	/*
+	 * This keeps any tasks waiting on the spin lock from thinking
+	 * they got something (see the lock code above).
+	 */
+	if (timer->it_process) {
+		if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
+			put_task_struct(timer->it_process);
+		timer->it_process = NULL;
+	}
+	unlock_timer(timer, flags);
+	release_posix_timer(timer, IT_ID_SET);
+}
+
+/*
+ * This is called by __exit_signal, only when there are no more
+ * references to the shared signal_struct.
+ */
+void exit_itimers(struct signal_struct *sig)
+{
+	struct k_itimer *tmr;
+
+	while (!list_empty(&sig->posix_timers)) {
+		tmr = list_entry(sig->posix_timers.next, struct k_itimer, list);
+		itimer_delete(tmr);
+	}
+}
+
+/*
+ * And now for the "clock" calls
+ *
+ * These functions are called both from timer functions (with the timer
+ * spin_lock_irq() held and from clock calls with no locking.	They must
+ * use the save flags versions of locks.
+ */
+
+/*
+ * We do ticks here to avoid the irq lock ( they take sooo long).
+ * The seqlock is great here.  Since we a reader, we don't really care
+ * if we are interrupted since we don't take lock that will stall us or
+ * any other cpu. Voila, no irq lock is needed.
+ *
+ */
+
+static u64 do_posix_clock_monotonic_gettime_parts(
+	struct timespec *tp, struct timespec *mo)
+{
+	u64 jiff;
+	unsigned int seq;
+
+	do {
+		seq = read_seqbegin(&xtime_lock);
+		getnstimeofday(tp);
+		*mo = wall_to_monotonic;
+		jiff = jiffies_64;
+
+	} while(read_seqretry(&xtime_lock, seq));
+
+	return jiff;
+}
+
+static int do_posix_clock_monotonic_get(clockid_t clock, struct timespec *tp)
+{
+	struct timespec wall_to_mono;
+
+	do_posix_clock_monotonic_gettime_parts(tp, &wall_to_mono);
+
+	tp->tv_sec += wall_to_mono.tv_sec;
+	tp->tv_nsec += wall_to_mono.tv_nsec;
+
+	if ((tp->tv_nsec - NSEC_PER_SEC) > 0) {
+		tp->tv_nsec -= NSEC_PER_SEC;
+		tp->tv_sec++;
+	}
+	return 0;
+}
+
+int do_posix_clock_monotonic_gettime(struct timespec *tp)
+{
+	return do_posix_clock_monotonic_get(CLOCK_MONOTONIC, tp);
+}
+
+int do_posix_clock_nosettime(clockid_t clockid, struct timespec *tp)
+{
+	return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(do_posix_clock_nosettime);
+
+int do_posix_clock_notimer_create(struct k_itimer *timer)
+{
+	return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(do_posix_clock_notimer_create);
+
+int do_posix_clock_nonanosleep(clockid_t clock, int flags, struct timespec *t)
+{
+#ifndef ENOTSUP
+	return -EOPNOTSUPP;	/* aka ENOTSUP in userland for POSIX */
+#else  /*  parisc does define it separately.  */
+	return -ENOTSUP;
+#endif
+}
+EXPORT_SYMBOL_GPL(do_posix_clock_nonanosleep);
+
+asmlinkage long
+sys_clock_settime(clockid_t which_clock, const struct timespec __user *tp)
+{
+	struct timespec new_tp;
+
+	if (invalid_clockid(which_clock))
+		return -EINVAL;
+	if (copy_from_user(&new_tp, tp, sizeof (*tp)))
+		return -EFAULT;
+
+	return CLOCK_DISPATCH(which_clock, clock_set, (which_clock, &new_tp));
+}
+
+asmlinkage long
+sys_clock_gettime(clockid_t which_clock, struct timespec __user *tp)
+{
+	struct timespec kernel_tp;
+	int error;
+
+	if (invalid_clockid(which_clock))
+		return -EINVAL;
+	error = CLOCK_DISPATCH(which_clock, clock_get,
+			       (which_clock, &kernel_tp));
+	if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp)))
+		error = -EFAULT;
+
+	return error;
+
+}
+
+asmlinkage long
+sys_clock_getres(clockid_t which_clock, struct timespec __user *tp)
+{
+	struct timespec rtn_tp;
+	int error;
+
+	if (invalid_clockid(which_clock))
+		return -EINVAL;
+
+	error = CLOCK_DISPATCH(which_clock, clock_getres,
+			       (which_clock, &rtn_tp));
+
+	if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp))) {
+		error = -EFAULT;
+	}
+
+	return error;
+}
+
+static void nanosleep_wake_up(unsigned long __data)
+{
+	struct task_struct *p = (struct task_struct *) __data;
+
+	wake_up_process(p);
+}
+
+/*
+ * The standard says that an absolute nanosleep call MUST wake up at
+ * the requested time in spite of clock settings.  Here is what we do:
+ * For each nanosleep call that needs it (only absolute and not on
+ * CLOCK_MONOTONIC* (as it can not be set)) we thread a little structure
+ * into the "nanosleep_abs_list".  All we need is the task_struct pointer.
+ * When ever the clock is set we just wake up all those tasks.	 The rest
+ * is done by the while loop in clock_nanosleep().
+ *
+ * On locking, clock_was_set() is called from update_wall_clock which
+ * holds (or has held for it) a write_lock_irq( xtime_lock) and is
+ * called from the timer bh code.  Thus we need the irq save locks.
+ *
+ * Also, on the call from update_wall_clock, that is done as part of a
+ * softirq thing.  We don't want to delay the system that much (possibly
+ * long list of timers to fix), so we defer that work to keventd.
+ */
+
+static DECLARE_WAIT_QUEUE_HEAD(nanosleep_abs_wqueue);
+static DECLARE_WORK(clock_was_set_work, (void(*)(void*))clock_was_set, NULL);
+
+static DECLARE_MUTEX(clock_was_set_lock);
+
+void clock_was_set(void)
+{
+	struct k_itimer *timr;
+	struct timespec new_wall_to;
+	LIST_HEAD(cws_list);
+	unsigned long seq;
+
+
+	if (unlikely(in_interrupt())) {
+		schedule_work(&clock_was_set_work);
+		return;
+	}
+	wake_up_all(&nanosleep_abs_wqueue);
+
+	/*
+	 * Check if there exist TIMER_ABSTIME timers to correct.
+	 *
+	 * Notes on locking: This code is run in task context with irq
+	 * on.  We CAN be interrupted!  All other usage of the abs list
+	 * lock is under the timer lock which holds the irq lock as
+	 * well.  We REALLY don't want to scan the whole list with the
+	 * interrupt system off, AND we would like a sequence lock on
+	 * this code as well.  Since we assume that the clock will not
+	 * be set often, it seems ok to take and release the irq lock
+	 * for each timer.  In fact add_timer will do this, so this is
+	 * not an issue.  So we know when we are done, we will move the
+	 * whole list to a new location.  Then as we process each entry,
+	 * we will move it to the actual list again.  This way, when our
+	 * copy is empty, we are done.  We are not all that concerned
+	 * about preemption so we will use a semaphore lock to protect
+	 * aginst reentry.  This way we will not stall another
+	 * processor.  It is possible that this may delay some timers
+	 * that should have expired, given the new clock, but even this
+	 * will be minimal as we will always update to the current time,
+	 * even if it was set by a task that is waiting for entry to
+	 * this code.  Timers that expire too early will be caught by
+	 * the expire code and restarted.
+
+	 * Absolute timers that repeat are left in the abs list while
+	 * waiting for the task to pick up the signal.  This means we
+	 * may find timers that are not in the "add_timer" list, but are
+	 * in the abs list.  We do the same thing for these, save
+	 * putting them back in the "add_timer" list.  (Note, these are
+	 * left in the abs list mainly to indicate that they are
+	 * ABSOLUTE timers, a fact that is used by the re-arm code, and
+	 * for which we have no other flag.)
+
+	 */
+
+	down(&clock_was_set_lock);
+	spin_lock_irq(&abs_list.lock);
+	list_splice_init(&abs_list.list, &cws_list);
+	spin_unlock_irq(&abs_list.lock);
+	do {
+		do {
+			seq = read_seqbegin(&xtime_lock);
+			new_wall_to =	wall_to_monotonic;
+		} while (read_seqretry(&xtime_lock, seq));
+
+		spin_lock_irq(&abs_list.lock);
+		if (list_empty(&cws_list)) {
+			spin_unlock_irq(&abs_list.lock);
+			break;
+		}
+		timr = list_entry(cws_list.next, struct k_itimer,
+				  it.real.abs_timer_entry);
+
+		list_del_init(&timr->it.real.abs_timer_entry);
+		if (add_clockset_delta(timr, &new_wall_to) &&
+		    del_timer(&timr->it.real.timer))  /* timer run yet? */
+			add_timer(&timr->it.real.timer);
+		list_add(&timr->it.real.abs_timer_entry, &abs_list.list);
+		spin_unlock_irq(&abs_list.lock);
+	} while (1);
+
+	up(&clock_was_set_lock);
+}
+
+long clock_nanosleep_restart(struct restart_block *restart_block);
+
+asmlinkage long
+sys_clock_nanosleep(clockid_t which_clock, int flags,
+		    const struct timespec __user *rqtp,
+		    struct timespec __user *rmtp)
+{
+	struct timespec t;
+	struct restart_block *restart_block =
+	    &(current_thread_info()->restart_block);
+	int ret;
+
+	if (invalid_clockid(which_clock))
+		return -EINVAL;
+
+	if (copy_from_user(&t, rqtp, sizeof (struct timespec)))
+		return -EFAULT;
+
+	if ((unsigned) t.tv_nsec >= NSEC_PER_SEC || t.tv_sec < 0)
+		return -EINVAL;
+
+	/*
+	 * Do this here as nsleep function does not have the real address.
+	 */
+	restart_block->arg1 = (unsigned long)rmtp;
+
+	ret = CLOCK_DISPATCH(which_clock, nsleep, (which_clock, flags, &t));
+
+	if ((ret == -ERESTART_RESTARTBLOCK) && rmtp &&
+					copy_to_user(rmtp, &t, sizeof (t)))
+		return -EFAULT;
+	return ret;
+}
+
+
+static int common_nsleep(clockid_t which_clock,
+			 int flags, struct timespec *tsave)
+{
+	struct timespec t, dum;
+	struct timer_list new_timer;
+	DECLARE_WAITQUEUE(abs_wqueue, current);
+	u64 rq_time = (u64)0;
+	s64 left;
+	int abs;
+	struct restart_block *restart_block =
+	    &current_thread_info()->restart_block;
+
+	abs_wqueue.flags = 0;
+	init_timer(&new_timer);
+	new_timer.expires = 0;
+	new_timer.data = (unsigned long) current;
+	new_timer.function = nanosleep_wake_up;
+	abs = flags & TIMER_ABSTIME;
+
+	if (restart_block->fn == clock_nanosleep_restart) {
+		/*
+		 * Interrupted by a non-delivered signal, pick up remaining
+		 * time and continue.  Remaining time is in arg2 & 3.
+		 */
+		restart_block->fn = do_no_restart_syscall;
+
+		rq_time = restart_block->arg3;
+		rq_time = (rq_time << 32) + restart_block->arg2;
+		if (!rq_time)
+			return -EINTR;
+		left = rq_time - get_jiffies_64();
+		if (left <= (s64)0)
+			return 0;	/* Already passed */
+	}
+
+	if (abs && (posix_clocks[which_clock].clock_get !=
+			    posix_clocks[CLOCK_MONOTONIC].clock_get))
+		add_wait_queue(&nanosleep_abs_wqueue, &abs_wqueue);
+
+	do {
+		t = *tsave;
+		if (abs || !rq_time) {
+			adjust_abs_time(&posix_clocks[which_clock], &t, abs,
+					&rq_time, &dum);
+		}
+
+		left = rq_time - get_jiffies_64();
+		if (left >= (s64)MAX_JIFFY_OFFSET)
+			left = (s64)MAX_JIFFY_OFFSET;
+		if (left < (s64)0)
+			break;
+
+		new_timer.expires = jiffies + left;
+		__set_current_state(TASK_INTERRUPTIBLE);
+		add_timer(&new_timer);
+
+		schedule();
+
+		del_timer_sync(&new_timer);
+		left = rq_time - get_jiffies_64();
+	} while (left > (s64)0 && !test_thread_flag(TIF_SIGPENDING));
+
+	if (abs_wqueue.task_list.next)
+		finish_wait(&nanosleep_abs_wqueue, &abs_wqueue);
+
+	if (left > (s64)0) {
+
+		/*
+		 * Always restart abs calls from scratch to pick up any
+		 * clock shifting that happened while we are away.
+		 */
+		if (abs)
+			return -ERESTARTNOHAND;
+
+		left *= TICK_NSEC;
+		tsave->tv_sec = div_long_long_rem(left, 
+						  NSEC_PER_SEC, 
+						  &tsave->tv_nsec);
+		/*
+		 * Restart works by saving the time remaing in 
+		 * arg2 & 3 (it is 64-bits of jiffies).  The other
+		 * info we need is the clock_id (saved in arg0). 
+		 * The sys_call interface needs the users 
+		 * timespec return address which _it_ saves in arg1.
+		 * Since we have cast the nanosleep call to a clock_nanosleep
+		 * both can be restarted with the same code.
+		 */
+		restart_block->fn = clock_nanosleep_restart;
+		restart_block->arg0 = which_clock;
+		/*
+		 * Caller sets arg1
+		 */
+		restart_block->arg2 = rq_time & 0xffffffffLL;
+		restart_block->arg3 = rq_time >> 32;
+
+		return -ERESTART_RESTARTBLOCK;
+	}
+
+	return 0;
+}
+/*
+ * This will restart clock_nanosleep.
+ */
+long
+clock_nanosleep_restart(struct restart_block *restart_block)
+{
+	struct timespec t;
+	int ret = common_nsleep(restart_block->arg0, 0, &t);
+
+	if ((ret == -ERESTART_RESTARTBLOCK) && restart_block->arg1 &&
+	    copy_to_user((struct timespec __user *)(restart_block->arg1), &t,
+			 sizeof (t)))
+		return -EFAULT;
+	return ret;
+}
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
new file mode 100644
index 000000000000..696387ffe49c
--- /dev/null
+++ b/kernel/power/Kconfig
@@ -0,0 +1,74 @@
+config PM
+	bool "Power Management support"
+	---help---
+	  "Power Management" means that parts of your computer are shut
+	  off or put into a power conserving "sleep" mode if they are not
+	  being used.  There are two competing standards for doing this: APM
+	  and ACPI.  If you want to use either one, say Y here and then also
+	  to the requisite support below.
+
+	  Power Management is most important for battery powered laptop
+	  computers; if you have a laptop, check out the Linux Laptop home
+	  page on the WWW at <http://www.linux-on-laptops.com/> or
+	  Tuxmobil - Linux on Mobile Computers at <http://www.tuxmobil.org/>
+	  and the Battery Powered Linux mini-HOWTO, available from
+	  <http://www.tldp.org/docs.html#howto>.
+
+	  Note that, even if you say N here, Linux on the x86 architecture
+	  will issue the hlt instruction if nothing is to be done, thereby
+	  sending the processor to sleep and saving power.
+
+config PM_DEBUG
+	bool "Power Management Debug Support"
+	depends on PM
+	---help---
+	This option enables verbose debugging support in the Power Management
+	code. This is helpful when debugging and reporting various PM bugs, 
+	like suspend support.
+
+config SOFTWARE_SUSPEND
+	bool "Software Suspend (EXPERIMENTAL)"
+	depends on EXPERIMENTAL && PM && SWAP
+	---help---
+	  Enable the possibility of suspending the machine.
+	  It doesn't need APM.
+	  You may suspend your machine by 'swsusp' or 'shutdown -z <time>' 
+	  (patch for sysvinit needed). 
+
+	  It creates an image which is saved in your active swap. Upon next
+	  boot, pass the 'resume=/dev/swappartition' argument to the kernel to
+	  have it detect the saved image, restore memory state from it, and
+	  continue to run as before. If you do not want the previous state to
+	  be reloaded, then use the 'noresume' kernel argument. However, note
+	  that your partitions will be fsck'd and you must re-mkswap your swap
+	  partitions. It does not work with swap files.
+
+	  Right now you may boot without resuming and then later resume but
+	  in meantime you cannot use those swap partitions/files which were
+	  involved in suspending. Also in this case there is a risk that buffers
+	  on disk won't match with saved ones.
+
+	  For more information take a look at <file:Documentation/power/swsusp.txt>.
+
+config PM_STD_PARTITION
+	string "Default resume partition"
+	depends on SOFTWARE_SUSPEND
+	default ""
+	---help---
+	  The default resume partition is the partition that the suspend-
+	  to-disk implementation will look for a suspended disk image. 
+
+	  The partition specified here will be different for almost every user. 
+	  It should be a valid swap partition (at least for now) that is turned
+	  on before suspending. 
+
+	  The partition specified can be overridden by specifying:
+
+		resume=/dev/<other device> 
+
+	  which will set the resume partition to the device specified. 
+
+	  Note there is currently not a way to specify which device to save the
+	  suspended image to. It will simply pick the first available swap 
+	  device.
+
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
new file mode 100644
index 000000000000..fbdc634135a7
--- /dev/null
+++ b/kernel/power/Makefile
@@ -0,0 +1,11 @@
+
+ifeq ($(CONFIG_PM_DEBUG),y)
+EXTRA_CFLAGS	+=	-DDEBUG
+endif
+
+swsusp-smp-$(CONFIG_SMP)	+= smp.o
+
+obj-y				:= main.o process.o console.o pm.o
+obj-$(CONFIG_SOFTWARE_SUSPEND)	+= swsusp.o $(swsusp-smp-y) disk.o
+
+obj-$(CONFIG_MAGIC_SYSRQ)	+= poweroff.o
diff --git a/kernel/power/console.c b/kernel/power/console.c
new file mode 100644
index 000000000000..7ff375e7c95f
--- /dev/null
+++ b/kernel/power/console.c
@@ -0,0 +1,58 @@
+/*
+ * drivers/power/process.c - Functions for saving/restoring console.
+ *
+ * Originally from swsusp.
+ */
+
+#include <linux/vt_kern.h>
+#include <linux/kbd_kern.h>
+#include <linux/console.h>
+#include "power.h"
+
+static int new_loglevel = 10;
+static int orig_loglevel;
+#ifdef SUSPEND_CONSOLE
+static int orig_fgconsole, orig_kmsg;
+#endif
+
+int pm_prepare_console(void)
+{
+	orig_loglevel = console_loglevel;
+	console_loglevel = new_loglevel;
+
+#ifdef SUSPEND_CONSOLE
+	acquire_console_sem();
+
+	orig_fgconsole = fg_console;
+
+	if (vc_allocate(SUSPEND_CONSOLE)) {
+	  /* we can't have a free VC for now. Too bad,
+	   * we don't want to mess the screen for now. */
+		release_console_sem();
+		return 1;
+	}
+
+	set_console(SUSPEND_CONSOLE);
+	release_console_sem();
+
+	if (vt_waitactive(SUSPEND_CONSOLE)) {
+		pr_debug("Suspend: Can't switch VCs.");
+		return 1;
+	}
+	orig_kmsg = kmsg_redirect;
+	kmsg_redirect = SUSPEND_CONSOLE;
+#endif
+	return 0;
+}
+
+void pm_restore_console(void)
+{
+	console_loglevel = orig_loglevel;
+#ifdef SUSPEND_CONSOLE
+	acquire_console_sem();
+	set_console(orig_fgconsole);
+	release_console_sem();
+	kmsg_redirect = orig_kmsg;
+#endif
+	return;
+}
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
new file mode 100644
index 000000000000..02b6764034dc
--- /dev/null
+++ b/kernel/power/disk.c
@@ -0,0 +1,431 @@
+/*
+ * kernel/power/disk.c - Suspend-to-disk support.
+ *
+ * Copyright (c) 2003 Patrick Mochel
+ * Copyright (c) 2003 Open Source Development Lab
+ * Copyright (c) 2004 Pavel Machek <pavel@suse.cz>
+ *
+ * This file is released under the GPLv2.
+ *
+ */
+
+#include <linux/suspend.h>
+#include <linux/syscalls.h>
+#include <linux/reboot.h>
+#include <linux/string.h>
+#include <linux/device.h>
+#include <linux/delay.h>
+#include <linux/fs.h>
+#include "power.h"
+
+
+extern suspend_disk_method_t pm_disk_mode;
+extern struct pm_ops * pm_ops;
+
+extern int swsusp_suspend(void);
+extern int swsusp_write(void);
+extern int swsusp_check(void);
+extern int swsusp_read(void);
+extern void swsusp_close(void);
+extern int swsusp_resume(void);
+extern int swsusp_free(void);
+
+
+static int noresume = 0;
+char resume_file[256] = CONFIG_PM_STD_PARTITION;
+dev_t swsusp_resume_device;
+
+/**
+ *	power_down - Shut machine down for hibernate.
+ *	@mode:		Suspend-to-disk mode
+ *
+ *	Use the platform driver, if configured so, and return gracefully if it
+ *	fails.
+ *	Otherwise, try to power off and reboot. If they fail, halt the machine,
+ *	there ain't no turning back.
+ */
+
+static void power_down(suspend_disk_method_t mode)
+{
+	unsigned long flags;
+	int error = 0;
+
+	local_irq_save(flags);
+	switch(mode) {
+	case PM_DISK_PLATFORM:
+ 		device_shutdown();
+		error = pm_ops->enter(PM_SUSPEND_DISK);
+		break;
+	case PM_DISK_SHUTDOWN:
+		printk("Powering off system\n");
+		device_shutdown();
+		machine_power_off();
+		break;
+	case PM_DISK_REBOOT:
+		device_shutdown();
+		machine_restart(NULL);
+		break;
+	}
+	machine_halt();
+	/* Valid image is on the disk, if we continue we risk serious data corruption
+	   after resume. */
+	printk(KERN_CRIT "Please power me down manually\n");
+	while(1);
+}
+
+
+static int in_suspend __nosavedata = 0;
+
+
+/**
+ *	free_some_memory -  Try to free as much memory as possible
+ *
+ *	... but do not OOM-kill anyone
+ *
+ *	Notice: all userland should be stopped at this point, or
+ *	livelock is possible.
+ */
+
+static void free_some_memory(void)
+{
+	unsigned int i = 0;
+	unsigned int tmp;
+	unsigned long pages = 0;
+	char *p = "-\\|/";
+
+	printk("Freeing memory...  ");
+	while ((tmp = shrink_all_memory(10000))) {
+		pages += tmp;
+		printk("\b%c", p[i]);
+		i++;
+		if (i > 3)
+			i = 0;
+	}
+	printk("\bdone (%li pages freed)\n", pages);
+}
+
+
+static inline void platform_finish(void)
+{
+	if (pm_disk_mode == PM_DISK_PLATFORM) {
+		if (pm_ops && pm_ops->finish)
+			pm_ops->finish(PM_SUSPEND_DISK);
+	}
+}
+
+static void finish(void)
+{
+	device_resume();
+	platform_finish();
+	enable_nonboot_cpus();
+	thaw_processes();
+	pm_restore_console();
+}
+
+
+static int prepare_processes(void)
+{
+	int error;
+
+	pm_prepare_console();
+
+	sys_sync();
+
+	if (freeze_processes()) {
+		error = -EBUSY;
+		return error;
+	}
+
+	if (pm_disk_mode == PM_DISK_PLATFORM) {
+		if (pm_ops && pm_ops->prepare) {
+			if ((error = pm_ops->prepare(PM_SUSPEND_DISK)))
+				return error;
+		}
+	}
+
+	/* Free memory before shutting down devices. */
+	free_some_memory();
+
+	return 0;
+}
+
+static void unprepare_processes(void)
+{
+	enable_nonboot_cpus();
+	thaw_processes();
+	pm_restore_console();
+}
+
+static int prepare_devices(void)
+{
+	int error;
+
+	disable_nonboot_cpus();
+	if ((error = device_suspend(PMSG_FREEZE))) {
+		printk("Some devices failed to suspend\n");
+		platform_finish();
+		enable_nonboot_cpus();
+		return error;
+	}
+
+	return 0;
+}
+
+/**
+ *	pm_suspend_disk - The granpappy of power management.
+ *
+ *	If we're going through the firmware, then get it over with quickly.
+ *
+ *	If not, then call swsusp to do its thing, then figure out how
+ *	to power down the system.
+ */
+
+int pm_suspend_disk(void)
+{
+	int error;
+
+	error = prepare_processes();
+	if (!error) {
+		error = prepare_devices();
+	}
+
+	if (error) {
+		unprepare_processes();
+		return error;
+	}
+
+	pr_debug("PM: Attempting to suspend to disk.\n");
+	if (pm_disk_mode == PM_DISK_FIRMWARE)
+		return pm_ops->enter(PM_SUSPEND_DISK);
+
+	pr_debug("PM: snapshotting memory.\n");
+	in_suspend = 1;
+	if ((error = swsusp_suspend()))
+		goto Done;
+
+	if (in_suspend) {
+		pr_debug("PM: writing image.\n");
+		error = swsusp_write();
+		if (!error)
+			power_down(pm_disk_mode);
+	} else
+		pr_debug("PM: Image restored successfully.\n");
+	swsusp_free();
+ Done:
+	finish();
+	return error;
+}
+
+
+/**
+ *	software_resume - Resume from a saved image.
+ *
+ *	Called as a late_initcall (so all devices are discovered and
+ *	initialized), we call swsusp to see if we have a saved image or not.
+ *	If so, we quiesce devices, the restore the saved image. We will
+ *	return above (in pm_suspend_disk() ) if everything goes well.
+ *	Otherwise, we fail gracefully and return to the normally
+ *	scheduled program.
+ *
+ */
+
+static int software_resume(void)
+{
+	int error;
+
+	if (noresume) {
+		/**
+		 * FIXME: If noresume is specified, we need to find the partition
+		 * and reset it back to normal swap space.
+		 */
+		return 0;
+	}
+
+	pr_debug("PM: Checking swsusp image.\n");
+
+	if ((error = swsusp_check()))
+		goto Done;
+
+	pr_debug("PM: Preparing processes for restore.\n");
+
+	if ((error = prepare_processes())) {
+		swsusp_close();
+		goto Cleanup;
+	}
+
+	pr_debug("PM: Reading swsusp image.\n");
+
+	if ((error = swsusp_read()))
+		goto Cleanup;
+
+	pr_debug("PM: Preparing devices for restore.\n");
+
+	if ((error = prepare_devices()))
+		goto Free;
+
+	mb();
+
+	pr_debug("PM: Restoring saved image.\n");
+	swsusp_resume();
+	pr_debug("PM: Restore failed, recovering.n");
+	finish();
+ Free:
+	swsusp_free();
+ Cleanup:
+	unprepare_processes();
+ Done:
+	pr_debug("PM: Resume from disk failed.\n");
+	return 0;
+}
+
+late_initcall(software_resume);
+
+
+static char * pm_disk_modes[] = {
+	[PM_DISK_FIRMWARE]	= "firmware",
+	[PM_DISK_PLATFORM]	= "platform",
+	[PM_DISK_SHUTDOWN]	= "shutdown",
+	[PM_DISK_REBOOT]	= "reboot",
+};
+
+/**
+ *	disk - Control suspend-to-disk mode
+ *
+ *	Suspend-to-disk can be handled in several ways. The greatest
+ *	distinction is who writes memory to disk - the firmware or the OS.
+ *	If the firmware does it, we assume that it also handles suspending
+ *	the system.
+ *	If the OS does it, then we have three options for putting the system
+ *	to sleep - using the platform driver (e.g. ACPI or other PM registers),
+ *	powering off the system or rebooting the system (for testing).
+ *
+ *	The system will support either 'firmware' or 'platform', and that is
+ *	known a priori (and encoded in pm_ops). But, the user may choose
+ *	'shutdown' or 'reboot' as alternatives.
+ *
+ *	show() will display what the mode is currently set to.
+ *	store() will accept one of
+ *
+ *	'firmware'
+ *	'platform'
+ *	'shutdown'
+ *	'reboot'
+ *
+ *	It will only change to 'firmware' or 'platform' if the system
+ *	supports it (as determined from pm_ops->pm_disk_mode).
+ */
+
+static ssize_t disk_show(struct subsystem * subsys, char * buf)
+{
+	return sprintf(buf, "%s\n", pm_disk_modes[pm_disk_mode]);
+}
+
+
+static ssize_t disk_store(struct subsystem * s, const char * buf, size_t n)
+{
+	int error = 0;
+	int i;
+	int len;
+	char *p;
+	suspend_disk_method_t mode = 0;
+
+	p = memchr(buf, '\n', n);
+	len = p ? p - buf : n;
+
+	down(&pm_sem);
+	for (i = PM_DISK_FIRMWARE; i < PM_DISK_MAX; i++) {
+		if (!strncmp(buf, pm_disk_modes[i], len)) {
+			mode = i;
+			break;
+		}
+	}
+	if (mode) {
+		if (mode == PM_DISK_SHUTDOWN || mode == PM_DISK_REBOOT)
+			pm_disk_mode = mode;
+		else {
+			if (pm_ops && pm_ops->enter &&
+			    (mode == pm_ops->pm_disk_mode))
+				pm_disk_mode = mode;
+			else
+				error = -EINVAL;
+		}
+	} else
+		error = -EINVAL;
+
+	pr_debug("PM: suspend-to-disk mode set to '%s'\n",
+		 pm_disk_modes[mode]);
+	up(&pm_sem);
+	return error ? error : n;
+}
+
+power_attr(disk);
+
+static ssize_t resume_show(struct subsystem * subsys, char *buf)
+{
+	return sprintf(buf,"%d:%d\n", MAJOR(swsusp_resume_device),
+		       MINOR(swsusp_resume_device));
+}
+
+static ssize_t resume_store(struct subsystem * subsys, const char * buf, size_t n)
+{
+	int len;
+	char *p;
+	unsigned int maj, min;
+	int error = -EINVAL;
+	dev_t res;
+
+	p = memchr(buf, '\n', n);
+	len = p ? p - buf : n;
+
+	if (sscanf(buf, "%u:%u", &maj, &min) == 2) {
+		res = MKDEV(maj,min);
+		if (maj == MAJOR(res) && min == MINOR(res)) {
+			swsusp_resume_device = res;
+			printk("Attempting manual resume\n");
+			noresume = 0;
+			software_resume();
+		}
+	}
+
+	return error >= 0 ? n : error;
+}
+
+power_attr(resume);
+
+static struct attribute * g[] = {
+	&disk_attr.attr,
+	&resume_attr.attr,
+	NULL,
+};
+
+
+static struct attribute_group attr_group = {
+	.attrs = g,
+};
+
+
+static int __init pm_disk_init(void)
+{
+	return sysfs_create_group(&power_subsys.kset.kobj,&attr_group);
+}
+
+core_initcall(pm_disk_init);
+
+
+static int __init resume_setup(char *str)
+{
+	if (noresume)
+		return 1;
+
+	strncpy( resume_file, str, 255 );
+	return 1;
+}
+
+static int __init noresume_setup(char *str)
+{
+	noresume = 1;
+	return 1;
+}
+
+__setup("noresume", noresume_setup);
+__setup("resume=", resume_setup);
diff --git a/kernel/power/main.c b/kernel/power/main.c
new file mode 100644
index 000000000000..7960ddf04a57
--- /dev/null
+++ b/kernel/power/main.c
@@ -0,0 +1,269 @@
+/*
+ * kernel/power/main.c - PM subsystem core functionality.
+ *
+ * Copyright (c) 2003 Patrick Mochel
+ * Copyright (c) 2003 Open Source Development Lab
+ * 
+ * This file is released under the GPLv2
+ *
+ */
+
+#include <linux/suspend.h>
+#include <linux/kobject.h>
+#include <linux/string.h>
+#include <linux/delay.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/pm.h>
+
+
+#include "power.h"
+
+DECLARE_MUTEX(pm_sem);
+
+struct pm_ops * pm_ops = NULL;
+suspend_disk_method_t pm_disk_mode = PM_DISK_SHUTDOWN;
+
+/**
+ *	pm_set_ops - Set the global power method table. 
+ *	@ops:	Pointer to ops structure.
+ */
+
+void pm_set_ops(struct pm_ops * ops)
+{
+	down(&pm_sem);
+	pm_ops = ops;
+	up(&pm_sem);
+}
+
+
+/**
+ *	suspend_prepare - Do prep work before entering low-power state.
+ *	@state:		State we're entering.
+ *
+ *	This is common code that is called for each state that we're 
+ *	entering. Allocate a console, stop all processes, then make sure
+ *	the platform can enter the requested state.
+ */
+
+static int suspend_prepare(suspend_state_t state)
+{
+	int error = 0;
+
+	if (!pm_ops || !pm_ops->enter)
+		return -EPERM;
+
+	pm_prepare_console();
+
+	if (freeze_processes()) {
+		error = -EAGAIN;
+		goto Thaw;
+	}
+
+	if (pm_ops->prepare) {
+		if ((error = pm_ops->prepare(state)))
+			goto Thaw;
+	}
+
+	if ((error = device_suspend(PMSG_SUSPEND))) {
+		printk(KERN_ERR "Some devices failed to suspend\n");
+		goto Finish;
+	}
+	return 0;
+ Finish:
+	if (pm_ops->finish)
+		pm_ops->finish(state);
+ Thaw:
+	thaw_processes();
+	pm_restore_console();
+	return error;
+}
+
+
+static int suspend_enter(suspend_state_t state)
+{
+	int error = 0;
+	unsigned long flags;
+
+	local_irq_save(flags);
+
+	if ((error = device_power_down(PMSG_SUSPEND))) {
+		printk(KERN_ERR "Some devices failed to power down\n");
+		goto Done;
+	}
+	error = pm_ops->enter(state);
+	device_power_up();
+ Done:
+	local_irq_restore(flags);
+	return error;
+}
+
+
+/**
+ *	suspend_finish - Do final work before exiting suspend sequence.
+ *	@state:		State we're coming out of.
+ *
+ *	Call platform code to clean up, restart processes, and free the 
+ *	console that we've allocated. This is not called for suspend-to-disk.
+ */
+
+static void suspend_finish(suspend_state_t state)
+{
+	device_resume();
+	if (pm_ops && pm_ops->finish)
+		pm_ops->finish(state);
+	thaw_processes();
+	pm_restore_console();
+}
+
+
+
+
+static char * pm_states[] = {
+	[PM_SUSPEND_STANDBY]	= "standby",
+	[PM_SUSPEND_MEM]	= "mem",
+	[PM_SUSPEND_DISK]	= "disk",
+	NULL,
+};
+
+
+/**
+ *	enter_state - Do common work of entering low-power state.
+ *	@state:		pm_state structure for state we're entering.
+ *
+ *	Make sure we're the only ones trying to enter a sleep state. Fail
+ *	if someone has beat us to it, since we don't want anything weird to
+ *	happen when we wake up.
+ *	Then, do the setup for suspend, enter the state, and cleaup (after
+ *	we've woken up).
+ */
+
+static int enter_state(suspend_state_t state)
+{
+	int error;
+
+	if (down_trylock(&pm_sem))
+		return -EBUSY;
+
+	if (state == PM_SUSPEND_DISK) {
+		error = pm_suspend_disk();
+		goto Unlock;
+	}
+
+	/* Suspend is hard to get right on SMP. */
+	if (num_online_cpus() != 1) {
+		error = -EPERM;
+		goto Unlock;
+	}
+
+	pr_debug("PM: Preparing system for suspend\n");
+	if ((error = suspend_prepare(state)))
+		goto Unlock;
+
+	pr_debug("PM: Entering state.\n");
+	error = suspend_enter(state);
+
+	pr_debug("PM: Finishing up.\n");
+	suspend_finish(state);
+ Unlock:
+	up(&pm_sem);
+	return error;
+}
+
+/*
+ * This is main interface to the outside world. It needs to be
+ * called from process context.
+ */
+int software_suspend(void)
+{
+	return enter_state(PM_SUSPEND_DISK);
+}
+
+
+/**
+ *	pm_suspend - Externally visible function for suspending system.
+ *	@state:		Enumarted value of state to enter.
+ *
+ *	Determine whether or not value is within range, get state 
+ *	structure, and enter (above).
+ */
+
+int pm_suspend(suspend_state_t state)
+{
+	if (state > PM_SUSPEND_ON && state < PM_SUSPEND_MAX)
+		return enter_state(state);
+	return -EINVAL;
+}
+
+
+
+decl_subsys(power,NULL,NULL);
+
+
+/**
+ *	state - control system power state.
+ *
+ *	show() returns what states are supported, which is hard-coded to
+ *	'standby' (Power-On Suspend), 'mem' (Suspend-to-RAM), and
+ *	'disk' (Suspend-to-Disk).
+ *
+ *	store() accepts one of those strings, translates it into the 
+ *	proper enumerated value, and initiates a suspend transition.
+ */
+
+static ssize_t state_show(struct subsystem * subsys, char * buf)
+{
+	int i;
+	char * s = buf;
+
+	for (i = 0; i < PM_SUSPEND_MAX; i++) {
+		if (pm_states[i])
+			s += sprintf(s,"%s ",pm_states[i]);
+	}
+	s += sprintf(s,"\n");
+	return (s - buf);
+}
+
+static ssize_t state_store(struct subsystem * subsys, const char * buf, size_t n)
+{
+	suspend_state_t state = PM_SUSPEND_STANDBY;
+	char ** s;
+	char *p;
+	int error;
+	int len;
+
+	p = memchr(buf, '\n', n);
+	len = p ? p - buf : n;
+
+	for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) {
+		if (*s && !strncmp(buf, *s, len))
+			break;
+	}
+	if (*s)
+		error = enter_state(state);
+	else
+		error = -EINVAL;
+	return error ? error : n;
+}
+
+power_attr(state);
+
+static struct attribute * g[] = {
+	&state_attr.attr,
+	NULL,
+};
+
+static struct attribute_group attr_group = {
+	.attrs = g,
+};
+
+
+static int __init pm_init(void)
+{
+	int error = subsystem_register(&power_subsys);
+	if (!error)
+		error = sysfs_create_group(&power_subsys.kset.kobj,&attr_group);
+	return error;
+}
+
+core_initcall(pm_init);
diff --git a/kernel/power/pm.c b/kernel/power/pm.c
new file mode 100644
index 000000000000..61deda04e39e
--- /dev/null
+++ b/kernel/power/pm.c
@@ -0,0 +1,265 @@
+/*
+ *  pm.c - Power management interface
+ *
+ *  Copyright (C) 2000 Andrew Henroid
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/pm.h>
+#include <linux/interrupt.h>
+
+int pm_active;
+
+/*
+ *	Locking notes:
+ *		pm_devs_lock can be a semaphore providing pm ops are not called
+ *	from an interrupt handler (already a bad idea so no change here). Each
+ *	change must be protected so that an unlink of an entry doesn't clash
+ *	with a pm send - which is permitted to sleep in the current architecture
+ *
+ *	Module unloads clashing with pm events now work out safely, the module 
+ *	unload path will block until the event has been sent. It may well block
+ *	until a resume but that will be fine.
+ */
+ 
+static DECLARE_MUTEX(pm_devs_lock);
+static LIST_HEAD(pm_devs);
+
+/**
+ *	pm_register - register a device with power management
+ *	@type: device type 
+ *	@id: device ID
+ *	@callback: callback function
+ *
+ *	Add a device to the list of devices that wish to be notified about
+ *	power management events. A &pm_dev structure is returned on success,
+ *	on failure the return is %NULL.
+ *
+ *      The callback function will be called in process context and
+ *      it may sleep.
+ */
+ 
+struct pm_dev *pm_register(pm_dev_t type,
+			   unsigned long id,
+			   pm_callback callback)
+{
+	struct pm_dev *dev = kmalloc(sizeof(struct pm_dev), GFP_KERNEL);
+	if (dev) {
+		memset(dev, 0, sizeof(*dev));
+		dev->type = type;
+		dev->id = id;
+		dev->callback = callback;
+
+		down(&pm_devs_lock);
+		list_add(&dev->entry, &pm_devs);
+		up(&pm_devs_lock);
+	}
+	return dev;
+}
+
+/**
+ *	pm_unregister -  unregister a device with power management
+ *	@dev: device to unregister
+ *
+ *	Remove a device from the power management notification lists. The
+ *	dev passed must be a handle previously returned by pm_register.
+ */
+ 
+void pm_unregister(struct pm_dev *dev)
+{
+	if (dev) {
+		down(&pm_devs_lock);
+		list_del(&dev->entry);
+		up(&pm_devs_lock);
+
+		kfree(dev);
+	}
+}
+
+static void __pm_unregister(struct pm_dev *dev)
+{
+	if (dev) {
+		list_del(&dev->entry);
+		kfree(dev);
+	}
+}
+
+/**
+ *	pm_unregister_all - unregister all devices with matching callback
+ *	@callback: callback function pointer
+ *
+ *	Unregister every device that would call the callback passed. This
+ *	is primarily meant as a helper function for loadable modules. It
+ *	enables a module to give up all its managed devices without keeping
+ *	its own private list.
+ */
+ 
+void pm_unregister_all(pm_callback callback)
+{
+	struct list_head *entry;
+
+	if (!callback)
+		return;
+
+	down(&pm_devs_lock);
+	entry = pm_devs.next;
+	while (entry != &pm_devs) {
+		struct pm_dev *dev = list_entry(entry, struct pm_dev, entry);
+		entry = entry->next;
+		if (dev->callback == callback)
+			__pm_unregister(dev);
+	}
+	up(&pm_devs_lock);
+}
+
+/**
+ *	pm_send - send request to a single device
+ *	@dev: device to send to
+ *	@rqst: power management request
+ *	@data: data for the callback
+ *
+ *	Issue a power management request to a given device. The 
+ *	%PM_SUSPEND and %PM_RESUME events are handled specially. The
+ *	data field must hold the intended next state. No call is made
+ *	if the state matches.
+ *
+ *	BUGS: what stops two power management requests occurring in parallel
+ *	and conflicting.
+ *
+ *	WARNING: Calling pm_send directly is not generally recommended, in
+ *	particular there is no locking against the pm_dev going away. The
+ *	caller must maintain all needed locking or have 'inside knowledge'
+ *	on the safety. Also remember that this function is not locked against
+ *	pm_unregister. This means that you must handle SMP races on callback
+ *	execution and unload yourself.
+ */
+ 
+static int pm_send(struct pm_dev *dev, pm_request_t rqst, void *data)
+{
+	int status = 0;
+	unsigned long prev_state, next_state;
+
+	if (in_interrupt())
+		BUG();
+
+	switch (rqst) {
+	case PM_SUSPEND:
+	case PM_RESUME:
+		prev_state = dev->state;
+		next_state = (unsigned long) data;
+		if (prev_state != next_state) {
+			if (dev->callback)
+				status = (*dev->callback)(dev, rqst, data);
+			if (!status) {
+				dev->state = next_state;
+				dev->prev_state = prev_state;
+			}
+		}
+		else {
+			dev->prev_state = prev_state;
+		}
+		break;
+	default:
+		if (dev->callback)
+			status = (*dev->callback)(dev, rqst, data);
+		break;
+	}
+	return status;
+}
+
+/*
+ * Undo incomplete request
+ */
+static void pm_undo_all(struct pm_dev *last)
+{
+	struct list_head *entry = last->entry.prev;
+	while (entry != &pm_devs) {
+		struct pm_dev *dev = list_entry(entry, struct pm_dev, entry);
+		if (dev->state != dev->prev_state) {
+			/* previous state was zero (running) resume or
+			 * previous state was non-zero (suspended) suspend
+			 */
+			pm_request_t undo = (dev->prev_state
+					     ? PM_SUSPEND:PM_RESUME);
+			pm_send(dev, undo, (void*) dev->prev_state);
+		}
+		entry = entry->prev;
+	}
+}
+
+/**
+ *	pm_send_all - send request to all managed devices
+ *	@rqst: power management request
+ *	@data: data for the callback
+ *
+ *	Issue a power management request to a all devices. The 
+ *	%PM_SUSPEND events are handled specially. Any device is 
+ *	permitted to fail a suspend by returning a non zero (error)
+ *	value from its callback function. If any device vetoes a 
+ *	suspend request then all other devices that have suspended 
+ *	during the processing of this request are restored to their
+ *	previous state.
+ *
+ *	WARNING:  This function takes the pm_devs_lock. The lock is not dropped until
+ *	the callbacks have completed. This prevents races against pm locking
+ *	functions, races against module unload pm_unregister code. It does
+ *	mean however that you must not issue pm_ functions within the callback
+ *	or you will deadlock and users will hate you.
+ *
+ *	Zero is returned on success. If a suspend fails then the status
+ *	from the device that vetoes the suspend is returned.
+ *
+ *	BUGS: what stops two power management requests occurring in parallel
+ *	and conflicting.
+ */
+ 
+int pm_send_all(pm_request_t rqst, void *data)
+{
+	struct list_head *entry;
+	
+	down(&pm_devs_lock);
+	entry = pm_devs.next;
+	while (entry != &pm_devs) {
+		struct pm_dev *dev = list_entry(entry, struct pm_dev, entry);
+		if (dev->callback) {
+			int status = pm_send(dev, rqst, data);
+			if (status) {
+				/* return devices to previous state on
+				 * failed suspend request
+				 */
+				if (rqst == PM_SUSPEND)
+					pm_undo_all(dev);
+				up(&pm_devs_lock);
+				return status;
+			}
+		}
+		entry = entry->next;
+	}
+	up(&pm_devs_lock);
+	return 0;
+}
+
+EXPORT_SYMBOL(pm_register);
+EXPORT_SYMBOL(pm_unregister);
+EXPORT_SYMBOL(pm_unregister_all);
+EXPORT_SYMBOL(pm_send_all);
+EXPORT_SYMBOL(pm_active);
+
+
diff --git a/kernel/power/power.h b/kernel/power/power.h
new file mode 100644
index 000000000000..cd6a3493cc0d
--- /dev/null
+++ b/kernel/power/power.h
@@ -0,0 +1,52 @@
+#include <linux/suspend.h>
+#include <linux/utsname.h>
+
+/* With SUSPEND_CONSOLE defined, it suspend looks *really* cool, but
+   we probably do not take enough locks for switching consoles, etc,
+   so bad things might happen.
+*/
+#if defined(CONFIG_VT) && defined(CONFIG_VT_CONSOLE)
+#define SUSPEND_CONSOLE	(MAX_NR_CONSOLES-1)
+#endif
+
+
+struct swsusp_info {
+	struct new_utsname	uts;
+	u32			version_code;
+	unsigned long		num_physpages;
+	int			cpus;
+	unsigned long		image_pages;
+	unsigned long		pagedir_pages;
+	suspend_pagedir_t	* suspend_pagedir;
+	swp_entry_t		pagedir[768];
+} __attribute__((aligned(PAGE_SIZE)));
+
+
+
+#ifdef CONFIG_SOFTWARE_SUSPEND
+extern int pm_suspend_disk(void);
+
+#else
+static inline int pm_suspend_disk(void)
+{
+	return -EPERM;
+}
+#endif
+extern struct semaphore pm_sem;
+#define power_attr(_name) \
+static struct subsys_attribute _name##_attr = {	\
+	.attr	= {				\
+		.name = __stringify(_name),	\
+		.mode = 0644,			\
+	},					\
+	.show	= _name##_show,			\
+	.store	= _name##_store,		\
+}
+
+extern struct subsystem power_subsys;
+
+extern int freeze_processes(void);
+extern void thaw_processes(void);
+
+extern int pm_prepare_console(void);
+extern void pm_restore_console(void);
diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c
new file mode 100644
index 000000000000..715081b2d829
--- /dev/null
+++ b/kernel/power/poweroff.c
@@ -0,0 +1,45 @@
+/*
+ * poweroff.c - sysrq handler to gracefully power down machine.
+ *
+ * This file is released under the GPL v2
+ */
+
+#include <linux/kernel.h>
+#include <linux/sysrq.h>
+#include <linux/init.h>
+#include <linux/pm.h>
+#include <linux/workqueue.h>
+
+/*
+ * When the user hits Sys-Rq o to power down the machine this is the
+ * callback we use.
+ */
+
+static void do_poweroff(void *dummy)
+{
+	if (pm_power_off)
+		pm_power_off();
+}
+
+static DECLARE_WORK(poweroff_work, do_poweroff, NULL);
+
+static void handle_poweroff(int key, struct pt_regs *pt_regs,
+				struct tty_struct *tty)
+{
+	schedule_work(&poweroff_work);
+}
+
+static struct sysrq_key_op	sysrq_poweroff_op = {
+	.handler        = handle_poweroff,
+	.help_msg       = "powerOff",
+	.action_msg     = "Power Off",
+ 	.enable_mask	= SYSRQ_ENABLE_BOOT,
+};
+
+static int pm_sysrq_init(void)
+{
+	register_sysrq_key('o', &sysrq_poweroff_op);
+	return 0;
+}
+
+subsys_initcall(pm_sysrq_init);
diff --git a/kernel/power/process.c b/kernel/power/process.c
new file mode 100644
index 000000000000..78d92dc6a1ed
--- /dev/null
+++ b/kernel/power/process.c
@@ -0,0 +1,121 @@
+/*
+ * drivers/power/process.c - Functions for starting/stopping processes on 
+ *                           suspend transitions.
+ *
+ * Originally from swsusp.
+ */
+
+
+#undef DEBUG
+
+#include <linux/smp_lock.h>
+#include <linux/interrupt.h>
+#include <linux/suspend.h>
+#include <linux/module.h>
+
+/* 
+ * Timeout for stopping processes
+ */
+#define TIMEOUT	(6 * HZ)
+
+
+static inline int freezeable(struct task_struct * p)
+{
+	if ((p == current) || 
+	    (p->flags & PF_NOFREEZE) ||
+	    (p->exit_state == EXIT_ZOMBIE) ||
+	    (p->exit_state == EXIT_DEAD) ||
+	    (p->state == TASK_STOPPED) ||
+	    (p->state == TASK_TRACED))
+		return 0;
+	return 1;
+}
+
+/* Refrigerator is place where frozen processes are stored :-). */
+void refrigerator(unsigned long flag)
+{
+	/* Hmm, should we be allowed to suspend when there are realtime
+	   processes around? */
+	long save;
+	save = current->state;
+	current->state = TASK_UNINTERRUPTIBLE;
+	pr_debug("%s entered refrigerator\n", current->comm);
+	printk("=");
+	current->flags &= ~PF_FREEZE;
+
+	spin_lock_irq(&current->sighand->siglock);
+	recalc_sigpending(); /* We sent fake signal, clean it up */
+	spin_unlock_irq(&current->sighand->siglock);
+
+	current->flags |= PF_FROZEN;
+	while (current->flags & PF_FROZEN)
+		schedule();
+	pr_debug("%s left refrigerator\n", current->comm);
+	current->state = save;
+}
+
+/* 0 = success, else # of processes that we failed to stop */
+int freeze_processes(void)
+{
+       int todo;
+       unsigned long start_time;
+	struct task_struct *g, *p;
+	
+	printk( "Stopping tasks: " );
+	start_time = jiffies;
+	do {
+		todo = 0;
+		read_lock(&tasklist_lock);
+		do_each_thread(g, p) {
+			unsigned long flags;
+			if (!freezeable(p))
+				continue;
+			if ((p->flags & PF_FROZEN) ||
+			    (p->state == TASK_TRACED) ||
+			    (p->state == TASK_STOPPED))
+				continue;
+
+			/* FIXME: smp problem here: we may not access other process' flags
+			   without locking */
+			p->flags |= PF_FREEZE;
+			spin_lock_irqsave(&p->sighand->siglock, flags);
+			signal_wake_up(p, 0);
+			spin_unlock_irqrestore(&p->sighand->siglock, flags);
+			todo++;
+		} while_each_thread(g, p);
+		read_unlock(&tasklist_lock);
+		yield();			/* Yield is okay here */
+		if (time_after(jiffies, start_time + TIMEOUT)) {
+			printk( "\n" );
+			printk(KERN_ERR " stopping tasks failed (%d tasks remaining)\n", todo );
+			return todo;
+		}
+	} while(todo);
+	
+	printk( "|\n" );
+	BUG_ON(in_atomic());
+	return 0;
+}
+
+void thaw_processes(void)
+{
+	struct task_struct *g, *p;
+
+	printk( "Restarting tasks..." );
+	read_lock(&tasklist_lock);
+	do_each_thread(g, p) {
+		if (!freezeable(p))
+			continue;
+		if (p->flags & PF_FROZEN) {
+			p->flags &= ~PF_FROZEN;
+			wake_up_process(p);
+		} else
+			printk(KERN_INFO " Strange, %s not stopped\n", p->comm );
+	} while_each_thread(g, p);
+
+	read_unlock(&tasklist_lock);
+	schedule();
+	printk( " done\n" );
+}
+
+EXPORT_SYMBOL(refrigerator);
diff --git a/kernel/power/smp.c b/kernel/power/smp.c
new file mode 100644
index 000000000000..7fa7f6e2b7fb
--- /dev/null
+++ b/kernel/power/smp.c
@@ -0,0 +1,85 @@
+/*
+ * drivers/power/smp.c - Functions for stopping other CPUs.
+ *
+ * Copyright 2004 Pavel Machek <pavel@suse.cz>
+ * Copyright (C) 2002-2003 Nigel Cunningham <ncunningham@clear.net.nz>
+ *
+ * This file is released under the GPLv2.
+ */
+
+#undef DEBUG
+
+#include <linux/smp_lock.h>
+#include <linux/interrupt.h>
+#include <linux/suspend.h>
+#include <linux/module.h>
+#include <asm/atomic.h>
+#include <asm/tlbflush.h>
+
+static atomic_t cpu_counter, freeze;
+
+
+static void smp_pause(void * data)
+{
+	struct saved_context ctxt;
+	__save_processor_state(&ctxt);
+	printk("Sleeping in:\n");
+	dump_stack();
+	atomic_inc(&cpu_counter);
+	while (atomic_read(&freeze)) {
+		/* FIXME: restore takes place at random piece inside this.
+		   This should probably be written in assembly, and
+		   preserve general-purpose registers, too
+
+		   What about stack? We may need to move to new stack here.
+
+		   This should better be ran with interrupts disabled.
+		 */
+		cpu_relax();
+		barrier();
+	}
+	atomic_dec(&cpu_counter);
+	__restore_processor_state(&ctxt);
+}
+
+static cpumask_t oldmask;
+
+void disable_nonboot_cpus(void)
+{
+	printk("Freezing CPUs (at %d)", smp_processor_id());
+	oldmask = current->cpus_allowed;
+	set_cpus_allowed(current, cpumask_of_cpu(0));
+	current->state = TASK_INTERRUPTIBLE;
+	schedule_timeout(HZ);
+	printk("...");
+	BUG_ON(smp_processor_id() != 0);
+
+	/* FIXME: for this to work, all the CPUs must be running
+	 * "idle" thread (or we deadlock). Is that guaranteed? */
+
+	atomic_set(&cpu_counter, 0);
+	atomic_set(&freeze, 1);
+	smp_call_function(smp_pause, NULL, 0, 0);
+	while (atomic_read(&cpu_counter) < (num_online_cpus() - 1)) {
+		cpu_relax();
+		barrier();
+	}
+	printk("ok\n");
+}
+
+void enable_nonboot_cpus(void)
+{
+	printk("Restarting CPUs");
+	atomic_set(&freeze, 0);
+	while (atomic_read(&cpu_counter)) {
+		cpu_relax();
+		barrier();
+	}
+	printk("...");
+	set_cpus_allowed(current, oldmask);
+	schedule();
+	printk("ok\n");
+
+}
+
+
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
new file mode 100644
index 000000000000..ae5bebc3b18f
--- /dev/null
+++ b/kernel/power/swsusp.c
@@ -0,0 +1,1433 @@
+/*
+ * linux/kernel/power/swsusp.c
+ *
+ * This file is to realize architecture-independent
+ * machine suspend feature using pretty near only high-level routines
+ *
+ * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu>
+ * Copyright (C) 1998,2001-2004 Pavel Machek <pavel@suse.cz>
+ *
+ * This file is released under the GPLv2.
+ *
+ * I'd like to thank the following people for their work:
+ * 
+ * Pavel Machek <pavel@ucw.cz>:
+ * Modifications, defectiveness pointing, being with me at the very beginning,
+ * suspend to swap space, stop all tasks. Port to 2.4.18-ac and 2.5.17.
+ *
+ * Steve Doddi <dirk@loth.demon.co.uk>: 
+ * Support the possibility of hardware state restoring.
+ *
+ * Raph <grey.havens@earthling.net>:
+ * Support for preserving states of network devices and virtual console
+ * (including X and svgatextmode)
+ *
+ * Kurt Garloff <garloff@suse.de>:
+ * Straightened the critical function in order to prevent compilers from
+ * playing tricks with local variables.
+ *
+ * Andreas Mohr <a.mohr@mailto.de>
+ *
+ * Alex Badea <vampire@go.ro>:
+ * Fixed runaway init
+ *
+ * More state savers are welcome. Especially for the scsi layer...
+ *
+ * For TODOs,FIXMEs also look in Documentation/power/swsusp.txt
+ */
+
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/suspend.h>
+#include <linux/smp_lock.h>
+#include <linux/file.h>
+#include <linux/utsname.h>
+#include <linux/version.h>
+#include <linux/delay.h>
+#include <linux/reboot.h>
+#include <linux/bitops.h>
+#include <linux/vt_kern.h>
+#include <linux/kbd_kern.h>
+#include <linux/keyboard.h>
+#include <linux/spinlock.h>
+#include <linux/genhd.h>
+#include <linux/kernel.h>
+#include <linux/major.h>
+#include <linux/swap.h>
+#include <linux/pm.h>
+#include <linux/device.h>
+#include <linux/buffer_head.h>
+#include <linux/swapops.h>
+#include <linux/bootmem.h>
+#include <linux/syscalls.h>
+#include <linux/console.h>
+#include <linux/highmem.h>
+#include <linux/bio.h>
+
+#include <asm/uaccess.h>
+#include <asm/mmu_context.h>
+#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
+#include <asm/io.h>
+
+#include "power.h"
+
+/* References to section boundaries */
+extern const void __nosave_begin, __nosave_end;
+
+/* Variables to be preserved over suspend */
+static int nr_copy_pages_check;
+
+extern char resume_file[];
+
+/* Local variables that should not be affected by save */
+unsigned int nr_copy_pages __nosavedata = 0;
+
+/* Suspend pagedir is allocated before final copy, therefore it
+   must be freed after resume 
+
+   Warning: this is evil. There are actually two pagedirs at time of
+   resume. One is "pagedir_save", which is empty frame allocated at
+   time of suspend, that must be freed. Second is "pagedir_nosave", 
+   allocated at time of resume, that travels through memory not to
+   collide with anything.
+
+   Warning: this is even more evil than it seems. Pagedirs this file
+   talks about are completely different from page directories used by
+   MMU hardware.
+ */
+suspend_pagedir_t *pagedir_nosave __nosavedata = NULL;
+static suspend_pagedir_t *pagedir_save;
+
+#define SWSUSP_SIG	"S1SUSPEND"
+
+static struct swsusp_header {
+	char reserved[PAGE_SIZE - 20 - sizeof(swp_entry_t)];
+	swp_entry_t swsusp_info;
+	char	orig_sig[10];
+	char	sig[10];
+} __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header;
+
+static struct swsusp_info swsusp_info;
+
+/*
+ * XXX: We try to keep some more pages free so that I/O operations succeed
+ * without paging. Might this be more?
+ */
+#define PAGES_FOR_IO	512
+
+/*
+ * Saving part...
+ */
+
+/* We memorize in swapfile_used what swap devices are used for suspension */
+#define SWAPFILE_UNUSED    0
+#define SWAPFILE_SUSPEND   1	/* This is the suspending device */
+#define SWAPFILE_IGNORED   2	/* Those are other swap devices ignored for suspension */
+
+static unsigned short swapfile_used[MAX_SWAPFILES];
+static unsigned short root_swap;
+
+static int mark_swapfiles(swp_entry_t prev)
+{
+	int error;
+
+	rw_swap_page_sync(READ, 
+			  swp_entry(root_swap, 0),
+			  virt_to_page((unsigned long)&swsusp_header));
+	if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) ||
+	    !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) {
+		memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10);
+		memcpy(swsusp_header.sig,SWSUSP_SIG, 10);
+		swsusp_header.swsusp_info = prev;
+		error = rw_swap_page_sync(WRITE, 
+					  swp_entry(root_swap, 0),
+					  virt_to_page((unsigned long)
+						       &swsusp_header));
+	} else {
+		pr_debug("swsusp: Partition is not swap space.\n");
+		error = -ENODEV;
+	}
+	return error;
+}
+
+/*
+ * Check whether the swap device is the specified resume
+ * device, irrespective of whether they are specified by
+ * identical names.
+ *
+ * (Thus, device inode aliasing is allowed.  You can say /dev/hda4
+ * instead of /dev/ide/host0/bus0/target0/lun0/part4 [if using devfs]
+ * and they'll be considered the same device.  This is *necessary* for
+ * devfs, since the resume code can only recognize the form /dev/hda4,
+ * but the suspend code would see the long name.)
+ */
+static int is_resume_device(const struct swap_info_struct *swap_info)
+{
+	struct file *file = swap_info->swap_file;
+	struct inode *inode = file->f_dentry->d_inode;
+
+	return S_ISBLK(inode->i_mode) &&
+		swsusp_resume_device == MKDEV(imajor(inode), iminor(inode));
+}
+
+static int swsusp_swap_check(void) /* This is called before saving image */
+{
+	int i, len;
+	
+	len=strlen(resume_file);
+	root_swap = 0xFFFF;
+	
+	swap_list_lock();
+	for(i=0; i<MAX_SWAPFILES; i++) {
+		if (swap_info[i].flags == 0) {
+			swapfile_used[i]=SWAPFILE_UNUSED;
+		} else {
+			if(!len) {
+	    			printk(KERN_WARNING "resume= option should be used to set suspend device" );
+				if(root_swap == 0xFFFF) {
+					swapfile_used[i] = SWAPFILE_SUSPEND;
+					root_swap = i;
+				} else
+					swapfile_used[i] = SWAPFILE_IGNORED;				  
+			} else {
+	  			/* we ignore all swap devices that are not the resume_file */
+				if (is_resume_device(&swap_info[i])) {
+					swapfile_used[i] = SWAPFILE_SUSPEND;
+					root_swap = i;
+				} else {
+				  	swapfile_used[i] = SWAPFILE_IGNORED;
+				}
+			}
+		}
+	}
+	swap_list_unlock();
+	return (root_swap != 0xffff) ? 0 : -ENODEV;
+}
+
+/**
+ * This is called after saving image so modification
+ * will be lost after resume... and that's what we want.
+ * we make the device unusable. A new call to
+ * lock_swapdevices can unlock the devices. 
+ */
+static void lock_swapdevices(void)
+{
+	int i;
+
+	swap_list_lock();
+	for(i = 0; i< MAX_SWAPFILES; i++)
+		if(swapfile_used[i] == SWAPFILE_IGNORED) {
+			swap_info[i].flags ^= 0xFF;
+		}
+	swap_list_unlock();
+}
+
+/**
+ *	write_swap_page - Write one page to a fresh swap location.
+ *	@addr:	Address we're writing.
+ *	@loc:	Place to store the entry we used.
+ *
+ *	Allocate a new swap entry and 'sync' it. Note we discard -EIO
+ *	errors. That is an artifact left over from swsusp. It did not 
+ *	check the return of rw_swap_page_sync() at all, since most pages
+ *	written back to swap would return -EIO.
+ *	This is a partial improvement, since we will at least return other
+ *	errors, though we need to eventually fix the damn code.
+ */
+static int write_page(unsigned long addr, swp_entry_t * loc)
+{
+	swp_entry_t entry;
+	int error = 0;
+
+	entry = get_swap_page();
+	if (swp_offset(entry) && 
+	    swapfile_used[swp_type(entry)] == SWAPFILE_SUSPEND) {
+		error = rw_swap_page_sync(WRITE, entry,
+					  virt_to_page(addr));
+		if (error == -EIO)
+			error = 0;
+		if (!error)
+			*loc = entry;
+	} else
+		error = -ENOSPC;
+	return error;
+}
+
+/**
+ *	data_free - Free the swap entries used by the saved image.
+ *
+ *	Walk the list of used swap entries and free each one. 
+ *	This is only used for cleanup when suspend fails.
+ */
+static void data_free(void)
+{
+	swp_entry_t entry;
+	int i;
+
+	for (i = 0; i < nr_copy_pages; i++) {
+		entry = (pagedir_nosave + i)->swap_address;
+		if (entry.val)
+			swap_free(entry);
+		else
+			break;
+		(pagedir_nosave + i)->swap_address = (swp_entry_t){0};
+	}
+}
+
+/**
+ *	data_write - Write saved image to swap.
+ *
+ *	Walk the list of pages in the image and sync each one to swap.
+ */
+static int data_write(void)
+{
+	int error = 0, i = 0;
+	unsigned int mod = nr_copy_pages / 100;
+	struct pbe *p;
+
+	if (!mod)
+		mod = 1;
+
+	printk( "Writing data to swap (%d pages)...     ", nr_copy_pages );
+	for_each_pbe(p, pagedir_nosave) {
+		if (!(i%mod))
+			printk( "\b\b\b\b%3d%%", i / mod );
+		if ((error = write_page(p->address, &(p->swap_address))))
+			return error;
+		i++;
+	}
+	printk("\b\b\b\bdone\n");
+	return error;
+}
+
+static void dump_info(void)
+{
+	pr_debug(" swsusp: Version: %u\n",swsusp_info.version_code);
+	pr_debug(" swsusp: Num Pages: %ld\n",swsusp_info.num_physpages);
+	pr_debug(" swsusp: UTS Sys: %s\n",swsusp_info.uts.sysname);
+	pr_debug(" swsusp: UTS Node: %s\n",swsusp_info.uts.nodename);
+	pr_debug(" swsusp: UTS Release: %s\n",swsusp_info.uts.release);
+	pr_debug(" swsusp: UTS Version: %s\n",swsusp_info.uts.version);
+	pr_debug(" swsusp: UTS Machine: %s\n",swsusp_info.uts.machine);
+	pr_debug(" swsusp: UTS Domain: %s\n",swsusp_info.uts.domainname);
+	pr_debug(" swsusp: CPUs: %d\n",swsusp_info.cpus);
+	pr_debug(" swsusp: Image: %ld Pages\n",swsusp_info.image_pages);
+	pr_debug(" swsusp: Pagedir: %ld Pages\n",swsusp_info.pagedir_pages);
+}
+
+static void init_header(void)
+{
+	memset(&swsusp_info, 0, sizeof(swsusp_info));
+	swsusp_info.version_code = LINUX_VERSION_CODE;
+	swsusp_info.num_physpages = num_physpages;
+	memcpy(&swsusp_info.uts, &system_utsname, sizeof(system_utsname));
+
+	swsusp_info.suspend_pagedir = pagedir_nosave;
+	swsusp_info.cpus = num_online_cpus();
+	swsusp_info.image_pages = nr_copy_pages;
+}
+
+static int close_swap(void)
+{
+	swp_entry_t entry;
+	int error;
+
+	dump_info();
+	error = write_page((unsigned long)&swsusp_info, &entry);
+	if (!error) { 
+		printk( "S" );
+		error = mark_swapfiles(entry);
+		printk( "|\n" );
+	}
+	return error;
+}
+
+/**
+ *	free_pagedir_entries - Free pages used by the page directory.
+ *
+ *	This is used during suspend for error recovery.
+ */
+
+static void free_pagedir_entries(void)
+{
+	int i;
+
+	for (i = 0; i < swsusp_info.pagedir_pages; i++)
+		swap_free(swsusp_info.pagedir[i]);
+}
+
+
+/**
+ *	write_pagedir - Write the array of pages holding the page directory.
+ *	@last:	Last swap entry we write (needed for header).
+ */
+
+static int write_pagedir(void)
+{
+	int error = 0;
+	unsigned n = 0;
+	struct pbe * pbe;
+
+	printk( "Writing pagedir...");
+	for_each_pb_page(pbe, pagedir_nosave) {
+		if ((error = write_page((unsigned long)pbe, &swsusp_info.pagedir[n++])))
+			return error;
+	}
+
+	swsusp_info.pagedir_pages = n;
+	printk("done (%u pages)\n", n);
+	return error;
+}
+
+/**
+ *	write_suspend_image - Write entire image and metadata.
+ *
+ */
+
+static int write_suspend_image(void)
+{
+	int error;
+
+	init_header();
+	if ((error = data_write()))
+		goto FreeData;
+
+	if ((error = write_pagedir()))
+		goto FreePagedir;
+
+	if ((error = close_swap()))
+		goto FreePagedir;
+ Done:
+	return error;
+ FreePagedir:
+	free_pagedir_entries();
+ FreeData:
+	data_free();
+	goto Done;
+}
+
+
+#ifdef CONFIG_HIGHMEM
+struct highmem_page {
+	char *data;
+	struct page *page;
+	struct highmem_page *next;
+};
+
+static struct highmem_page *highmem_copy;
+
+static int save_highmem_zone(struct zone *zone)
+{
+	unsigned long zone_pfn;
+	mark_free_pages(zone);
+	for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
+		struct page *page;
+		struct highmem_page *save;
+		void *kaddr;
+		unsigned long pfn = zone_pfn + zone->zone_start_pfn;
+
+		if (!(pfn%1000))
+			printk(".");
+		if (!pfn_valid(pfn))
+			continue;
+		page = pfn_to_page(pfn);
+		/*
+		 * This condition results from rvmalloc() sans vmalloc_32()
+		 * and architectural memory reservations. This should be
+		 * corrected eventually when the cases giving rise to this
+		 * are better understood.
+		 */
+		if (PageReserved(page)) {
+			printk("highmem reserved page?!\n");
+			continue;
+		}
+		BUG_ON(PageNosave(page));
+		if (PageNosaveFree(page))
+			continue;
+		save = kmalloc(sizeof(struct highmem_page), GFP_ATOMIC);
+		if (!save)
+			return -ENOMEM;
+		save->next = highmem_copy;
+		save->page = page;
+		save->data = (void *) get_zeroed_page(GFP_ATOMIC);
+		if (!save->data) {
+			kfree(save);
+			return -ENOMEM;
+		}
+		kaddr = kmap_atomic(page, KM_USER0);
+		memcpy(save->data, kaddr, PAGE_SIZE);
+		kunmap_atomic(kaddr, KM_USER0);
+		highmem_copy = save;
+	}
+	return 0;
+}
+#endif /* CONFIG_HIGHMEM */
+
+
+static int save_highmem(void)
+{
+#ifdef CONFIG_HIGHMEM
+	struct zone *zone;
+	int res = 0;
+
+	pr_debug("swsusp: Saving Highmem\n");
+	for_each_zone(zone) {
+		if (is_highmem(zone))
+			res = save_highmem_zone(zone);
+		if (res)
+			return res;
+	}
+#endif
+	return 0;
+}
+
+static int restore_highmem(void)
+{
+#ifdef CONFIG_HIGHMEM
+	printk("swsusp: Restoring Highmem\n");
+	while (highmem_copy) {
+		struct highmem_page *save = highmem_copy;
+		void *kaddr;
+		highmem_copy = save->next;
+
+		kaddr = kmap_atomic(save->page, KM_USER0);
+		memcpy(kaddr, save->data, PAGE_SIZE);
+		kunmap_atomic(kaddr, KM_USER0);
+		free_page((long) save->data);
+		kfree(save);
+	}
+#endif
+	return 0;
+}
+
+
+static int pfn_is_nosave(unsigned long pfn)
+{
+	unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT;
+	unsigned long nosave_end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT;
+	return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn);
+}
+
+/**
+ *	saveable - Determine whether a page should be cloned or not.
+ *	@pfn:	The page
+ *
+ *	We save a page if it's Reserved, and not in the range of pages
+ *	statically defined as 'unsaveable', or if it isn't reserved, and
+ *	isn't part of a free chunk of pages.
+ */
+
+static int saveable(struct zone * zone, unsigned long * zone_pfn)
+{
+	unsigned long pfn = *zone_pfn + zone->zone_start_pfn;
+	struct page * page;
+
+	if (!pfn_valid(pfn))
+		return 0;
+
+	page = pfn_to_page(pfn);
+	BUG_ON(PageReserved(page) && PageNosave(page));
+	if (PageNosave(page))
+		return 0;
+	if (PageReserved(page) && pfn_is_nosave(pfn)) {
+		pr_debug("[nosave pfn 0x%lx]", pfn);
+		return 0;
+	}
+	if (PageNosaveFree(page))
+		return 0;
+
+	return 1;
+}
+
+static void count_data_pages(void)
+{
+	struct zone *zone;
+	unsigned long zone_pfn;
+
+	nr_copy_pages = 0;
+
+	for_each_zone(zone) {
+		if (is_highmem(zone))
+			continue;
+		mark_free_pages(zone);
+		for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
+			nr_copy_pages += saveable(zone, &zone_pfn);
+	}
+}
+
+
+static void copy_data_pages(void)
+{
+	struct zone *zone;
+	unsigned long zone_pfn;
+	struct pbe * pbe = pagedir_nosave;
+	
+	pr_debug("copy_data_pages(): pages to copy: %d\n", nr_copy_pages);
+	for_each_zone(zone) {
+		if (is_highmem(zone))
+			continue;
+		mark_free_pages(zone);
+		for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
+			if (saveable(zone, &zone_pfn)) {
+				struct page * page;
+				page = pfn_to_page(zone_pfn + zone->zone_start_pfn);
+				BUG_ON(!pbe);
+				pbe->orig_address = (long) page_address(page);
+				/* copy_page is not usable for copying task structs. */
+				memcpy((void *)pbe->address, (void *)pbe->orig_address, PAGE_SIZE);
+				pbe = pbe->next;
+			}
+		}
+	}
+	BUG_ON(pbe);
+}
+
+
+/**
+ *	calc_nr - Determine the number of pages needed for a pbe list.
+ */
+
+static int calc_nr(int nr_copy)
+{
+	int extra = 0;
+	int mod = !!(nr_copy % PBES_PER_PAGE);
+	int diff = (nr_copy / PBES_PER_PAGE) + mod;
+
+	do {
+		extra += diff;
+		nr_copy += diff;
+		mod = !!(nr_copy % PBES_PER_PAGE);
+		diff = (nr_copy / PBES_PER_PAGE) + mod - extra;
+	} while (diff > 0);
+
+	return nr_copy;
+}
+
+/**
+ *	free_pagedir - free pages allocated with alloc_pagedir()
+ */
+
+static inline void free_pagedir(struct pbe *pblist)
+{
+	struct pbe *pbe;
+
+	while (pblist) {
+		pbe = (pblist + PB_PAGE_SKIP)->next;
+		free_page((unsigned long)pblist);
+		pblist = pbe;
+	}
+}
+
+/**
+ *	fill_pb_page - Create a list of PBEs on a given memory page
+ */
+
+static inline void fill_pb_page(struct pbe *pbpage)
+{
+	struct pbe *p;
+
+	p = pbpage;
+	pbpage += PB_PAGE_SKIP;
+	do
+		p->next = p + 1;
+	while (++p < pbpage);
+}
+
+/**
+ *	create_pbe_list - Create a list of PBEs on top of a given chain
+ *	of memory pages allocated with alloc_pagedir()
+ */
+
+static void create_pbe_list(struct pbe *pblist, unsigned nr_pages)
+{
+	struct pbe *pbpage, *p;
+	unsigned num = PBES_PER_PAGE;
+
+	for_each_pb_page (pbpage, pblist) {
+		if (num >= nr_pages)
+			break;
+
+		fill_pb_page(pbpage);
+		num += PBES_PER_PAGE;
+	}
+	if (pbpage) {
+		for (num -= PBES_PER_PAGE - 1, p = pbpage; num < nr_pages; p++, num++)
+			p->next = p + 1;
+		p->next = NULL;
+	}
+	pr_debug("create_pbe_list(): initialized %d PBEs\n", num);
+}
+
+/**
+ *	alloc_pagedir - Allocate the page directory.
+ *
+ *	First, determine exactly how many pages we need and
+ *	allocate them.
+ *
+ *	We arrange the pages in a chain: each page is an array of PBES_PER_PAGE
+ *	struct pbe elements (pbes) and the last element in the page points
+ *	to the next page.
+ *
+ *	On each page we set up a list of struct_pbe elements.
+ */
+
+static struct pbe * alloc_pagedir(unsigned nr_pages)
+{
+	unsigned num;
+	struct pbe *pblist, *pbe;
+
+	if (!nr_pages)
+		return NULL;
+
+	pr_debug("alloc_pagedir(): nr_pages = %d\n", nr_pages);
+	pblist = (struct pbe *)get_zeroed_page(GFP_ATOMIC | __GFP_COLD);
+	for (pbe = pblist, num = PBES_PER_PAGE; pbe && num < nr_pages;
+        		pbe = pbe->next, num += PBES_PER_PAGE) {
+		pbe += PB_PAGE_SKIP;
+		pbe->next = (struct pbe *)get_zeroed_page(GFP_ATOMIC | __GFP_COLD);
+	}
+	if (!pbe) { /* get_zeroed_page() failed */
+		free_pagedir(pblist);
+		pblist = NULL;
+        }
+	return pblist;
+}
+
+/**
+ *	free_image_pages - Free pages allocated for snapshot
+ */
+
+static void free_image_pages(void)
+{
+	struct pbe * p;
+
+	for_each_pbe(p, pagedir_save) {
+		if (p->address) {
+			ClearPageNosave(virt_to_page(p->address));
+			free_page(p->address);
+			p->address = 0;
+		}
+	}
+}
+
+/**
+ *	alloc_image_pages - Allocate pages for the snapshot.
+ */
+
+static int alloc_image_pages(void)
+{
+	struct pbe * p;
+
+	for_each_pbe(p, pagedir_save) {
+		p->address = get_zeroed_page(GFP_ATOMIC | __GFP_COLD);
+		if (!p->address)
+			return -ENOMEM;
+		SetPageNosave(virt_to_page(p->address));
+	}
+	return 0;
+}
+
+void swsusp_free(void)
+{
+	BUG_ON(PageNosave(virt_to_page(pagedir_save)));
+	BUG_ON(PageNosaveFree(virt_to_page(pagedir_save)));
+	free_image_pages();
+	free_pagedir(pagedir_save);
+}
+
+
+/**
+ *	enough_free_mem - Make sure we enough free memory to snapshot.
+ *
+ *	Returns TRUE or FALSE after checking the number of available 
+ *	free pages.
+ */
+
+static int enough_free_mem(void)
+{
+	if (nr_free_pages() < (nr_copy_pages + PAGES_FOR_IO)) {
+		pr_debug("swsusp: Not enough free pages: Have %d\n",
+			 nr_free_pages());
+		return 0;
+	}
+	return 1;
+}
+
+
+/**
+ *	enough_swap - Make sure we have enough swap to save the image.
+ *
+ *	Returns TRUE or FALSE after checking the total amount of swap 
+ *	space avaiable.
+ *
+ *	FIXME: si_swapinfo(&i) returns all swap devices information.
+ *	We should only consider resume_device. 
+ */
+
+static int enough_swap(void)
+{
+	struct sysinfo i;
+
+	si_swapinfo(&i);
+	if (i.freeswap < (nr_copy_pages + PAGES_FOR_IO))  {
+		pr_debug("swsusp: Not enough swap. Need %ld\n",i.freeswap);
+		return 0;
+	}
+	return 1;
+}
+
+static int swsusp_alloc(void)
+{
+	int error;
+
+	pr_debug("suspend: (pages needed: %d + %d free: %d)\n",
+		 nr_copy_pages, PAGES_FOR_IO, nr_free_pages());
+
+	pagedir_nosave = NULL;
+	if (!enough_free_mem())
+		return -ENOMEM;
+
+	if (!enough_swap())
+		return -ENOSPC;
+
+	nr_copy_pages = calc_nr(nr_copy_pages);
+
+	if (!(pagedir_save = alloc_pagedir(nr_copy_pages))) {
+		printk(KERN_ERR "suspend: Allocating pagedir failed.\n");
+		return -ENOMEM;
+	}
+	create_pbe_list(pagedir_save, nr_copy_pages);
+	pagedir_nosave = pagedir_save;
+	if ((error = alloc_image_pages())) {
+		printk(KERN_ERR "suspend: Allocating image pages failed.\n");
+		swsusp_free();
+		return error;
+	}
+
+	nr_copy_pages_check = nr_copy_pages;
+	return 0;
+}
+
+static int suspend_prepare_image(void)
+{
+	int error;
+
+	pr_debug("swsusp: critical section: \n");
+	if (save_highmem()) {
+		printk(KERN_CRIT "Suspend machine: Not enough free pages for highmem\n");
+		restore_highmem();
+		return -ENOMEM;
+	}
+
+	drain_local_pages();
+	count_data_pages();
+	printk("swsusp: Need to copy %u pages\n", nr_copy_pages);
+
+	error = swsusp_alloc();
+	if (error)
+		return error;
+	
+	/* During allocating of suspend pagedir, new cold pages may appear. 
+	 * Kill them.
+	 */
+	drain_local_pages();
+	copy_data_pages();
+
+	/*
+	 * End of critical section. From now on, we can write to memory,
+	 * but we should not touch disk. This specially means we must _not_
+	 * touch swap space! Except we must write out our image of course.
+	 */
+
+	printk("swsusp: critical section/: done (%d pages copied)\n", nr_copy_pages );
+	return 0;
+}
+
+
+/* It is important _NOT_ to umount filesystems at this point. We want
+ * them synced (in case something goes wrong) but we DO not want to mark
+ * filesystem clean: it is not. (And it does not matter, if we resume
+ * correctly, we'll mark system clean, anyway.)
+ */
+int swsusp_write(void)
+{
+	int error;
+	device_resume();
+	lock_swapdevices();
+	error = write_suspend_image();
+	/* This will unlock ignored swap devices since writing is finished */
+	lock_swapdevices();
+	return error;
+
+}
+
+
+extern asmlinkage int swsusp_arch_suspend(void);
+extern asmlinkage int swsusp_arch_resume(void);
+
+
+asmlinkage int swsusp_save(void)
+{
+	int error = 0;
+
+	if ((error = swsusp_swap_check())) {
+		printk(KERN_ERR "swsusp: FATAL: cannot find swap device, try "
+				"swapon -a!\n");
+		return error;
+	}
+	return suspend_prepare_image();
+}
+
+int swsusp_suspend(void)
+{
+	int error;
+	if ((error = arch_prepare_suspend()))
+		return error;
+	local_irq_disable();
+	/* At this point, device_suspend() has been called, but *not*
+	 * device_power_down(). We *must* device_power_down() now.
+	 * Otherwise, drivers for some devices (e.g. interrupt controllers)
+	 * become desynchronized with the actual state of the hardware
+	 * at resume time, and evil weirdness ensues.
+	 */
+	if ((error = device_power_down(PMSG_FREEZE))) {
+		printk(KERN_ERR "Some devices failed to power down, aborting suspend\n");
+		local_irq_enable();
+		swsusp_free();
+		return error;
+	}
+	save_processor_state();
+	if ((error = swsusp_arch_suspend()))
+		swsusp_free();
+	/* Restore control flow magically appears here */
+	restore_processor_state();
+	BUG_ON (nr_copy_pages_check != nr_copy_pages);
+	restore_highmem();
+	device_power_up();
+	local_irq_enable();
+	return error;
+}
+
+int swsusp_resume(void)
+{
+	int error;
+	local_irq_disable();
+	if (device_power_down(PMSG_FREEZE))
+		printk(KERN_ERR "Some devices failed to power down, very bad\n");
+	/* We'll ignore saved state, but this gets preempt count (etc) right */
+	save_processor_state();
+	error = swsusp_arch_resume();
+	/* Code below is only ever reached in case of failure. Otherwise
+	 * execution continues at place where swsusp_arch_suspend was called
+         */
+	BUG_ON(!error);
+	restore_processor_state();
+	restore_highmem();
+	device_power_up();
+	local_irq_enable();
+	return error;
+}
+
+/* More restore stuff */
+
+/*
+ * Returns true if given address/order collides with any orig_address 
+ */
+static int does_collide_order(unsigned long addr, int order)
+{
+	int i;
+	
+	for (i=0; i < (1<<order); i++)
+		if (!PageNosaveFree(virt_to_page(addr + i * PAGE_SIZE)))
+			return 1;
+	return 0;
+}
+
+/**
+ *	On resume, for storing the PBE list and the image,
+ *	we can only use memory pages that do not conflict with the pages
+ *	which had been used before suspend.
+ *
+ *	We don't know which pages are usable until we allocate them.
+ *
+ *	Allocated but unusable (ie eaten) memory pages are linked together
+ *	to create a list, so that we can free them easily
+ *
+ *	We could have used a type other than (void *)
+ *	for this purpose, but ...
+ */
+static void **eaten_memory = NULL;
+
+static inline void eat_page(void *page)
+{
+	void **c;
+
+	c = eaten_memory;
+	eaten_memory = page;
+	*eaten_memory = c;
+}
+
+static unsigned long get_usable_page(unsigned gfp_mask)
+{
+	unsigned long m;
+
+	m = get_zeroed_page(gfp_mask);
+	while (does_collide_order(m, 0)) {
+		eat_page((void *)m);
+		m = get_zeroed_page(gfp_mask);
+		if (!m)
+			break;
+	}
+	return m;
+}
+
+static void free_eaten_memory(void)
+{
+	unsigned long m;
+	void **c;
+	int i = 0;
+
+	c = eaten_memory;
+	while (c) {
+		m = (unsigned long)c;
+		c = *c;
+		free_page(m);
+		i++;
+	}
+	eaten_memory = NULL;
+	pr_debug("swsusp: %d unused pages freed\n", i);
+}
+
+/**
+ *	check_pagedir - We ensure here that pages that the PBEs point to
+ *	won't collide with pages where we're going to restore from the loaded
+ *	pages later
+ */
+
+static int check_pagedir(struct pbe *pblist)
+{
+	struct pbe *p;
+
+	/* This is necessary, so that we can free allocated pages
+	 * in case of failure
+	 */
+	for_each_pbe (p, pblist)
+		p->address = 0UL;
+
+	for_each_pbe (p, pblist) {
+		p->address = get_usable_page(GFP_ATOMIC);
+		if (!p->address)
+			return -ENOMEM;
+	}
+	return 0;
+}
+
+/**
+ *	swsusp_pagedir_relocate - It is possible, that some memory pages
+ *	occupied by the list of PBEs collide with pages where we're going to
+ *	restore from the loaded pages later.  We relocate them here.
+ */
+
+static struct pbe * swsusp_pagedir_relocate(struct pbe *pblist)
+{
+	struct zone *zone;
+	unsigned long zone_pfn;
+	struct pbe *pbpage, *tail, *p;
+	void *m;
+	int rel = 0, error = 0;
+
+	if (!pblist) /* a sanity check */
+		return NULL;
+
+	pr_debug("swsusp: Relocating pagedir (%lu pages to check)\n",
+			swsusp_info.pagedir_pages);
+
+	/* Set page flags */
+
+	for_each_zone(zone) {
+        	for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
+                	SetPageNosaveFree(pfn_to_page(zone_pfn +
+					zone->zone_start_pfn));
+	}
+
+	/* Clear orig addresses */
+
+	for_each_pbe (p, pblist)
+		ClearPageNosaveFree(virt_to_page(p->orig_address));
+
+	tail = pblist + PB_PAGE_SKIP;
+
+	/* Relocate colliding pages */
+
+	for_each_pb_page (pbpage, pblist) {
+		if (does_collide_order((unsigned long)pbpage, 0)) {
+			m = (void *)get_usable_page(GFP_ATOMIC | __GFP_COLD);
+			if (!m) {
+				error = -ENOMEM;
+				break;
+			}
+			memcpy(m, (void *)pbpage, PAGE_SIZE);
+			if (pbpage == pblist)
+				pblist = (struct pbe *)m;
+			else
+				tail->next = (struct pbe *)m;
+
+			eat_page((void *)pbpage);
+			pbpage = (struct pbe *)m;
+
+			/* We have to link the PBEs again */
+
+			for (p = pbpage; p < pbpage + PB_PAGE_SKIP; p++)
+				if (p->next) /* needed to save the end */
+					p->next = p + 1;
+
+			rel++;
+		}
+		tail = pbpage + PB_PAGE_SKIP;
+	}
+
+	if (error) {
+		printk("\nswsusp: Out of memory\n\n");
+		free_pagedir(pblist);
+		free_eaten_memory();
+		pblist = NULL;
+	}
+	else
+		printk("swsusp: Relocated %d pages\n", rel);
+
+	return pblist;
+}
+
+/**
+ *	Using bio to read from swap.
+ *	This code requires a bit more work than just using buffer heads
+ *	but, it is the recommended way for 2.5/2.6.
+ *	The following are to signal the beginning and end of I/O. Bios
+ *	finish asynchronously, while we want them to happen synchronously.
+ *	A simple atomic_t, and a wait loop take care of this problem.
+ */
+
+static atomic_t io_done = ATOMIC_INIT(0);
+
+static int end_io(struct bio * bio, unsigned int num, int err)
+{
+	if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
+		panic("I/O error reading memory image");
+	atomic_set(&io_done, 0);
+	return 0;
+}
+
+static struct block_device * resume_bdev;
+
+/**
+ *	submit - submit BIO request.
+ *	@rw:	READ or WRITE.
+ *	@off	physical offset of page.
+ *	@page:	page we're reading or writing.
+ *
+ *	Straight from the textbook - allocate and initialize the bio.
+ *	If we're writing, make sure the page is marked as dirty.
+ *	Then submit it and wait.
+ */
+
+static int submit(int rw, pgoff_t page_off, void * page)
+{
+	int error = 0;
+	struct bio * bio;
+
+	bio = bio_alloc(GFP_ATOMIC, 1);
+	if (!bio)
+		return -ENOMEM;
+	bio->bi_sector = page_off * (PAGE_SIZE >> 9);
+	bio_get(bio);
+	bio->bi_bdev = resume_bdev;
+	bio->bi_end_io = end_io;
+
+	if (bio_add_page(bio, virt_to_page(page), PAGE_SIZE, 0) < PAGE_SIZE) {
+		printk("swsusp: ERROR: adding page to bio at %ld\n",page_off);
+		error = -EFAULT;
+		goto Done;
+	}
+
+	if (rw == WRITE)
+		bio_set_pages_dirty(bio);
+
+	atomic_set(&io_done, 1);
+	submit_bio(rw | (1 << BIO_RW_SYNC), bio);
+	while (atomic_read(&io_done))
+		yield();
+
+ Done:
+	bio_put(bio);
+	return error;
+}
+
+static int bio_read_page(pgoff_t page_off, void * page)
+{
+	return submit(READ, page_off, page);
+}
+
+static int bio_write_page(pgoff_t page_off, void * page)
+{
+	return submit(WRITE, page_off, page);
+}
+
+/*
+ * Sanity check if this image makes sense with this kernel/swap context
+ * I really don't think that it's foolproof but more than nothing..
+ */
+
+static const char * sanity_check(void)
+{
+	dump_info();
+	if(swsusp_info.version_code != LINUX_VERSION_CODE)
+		return "kernel version";
+	if(swsusp_info.num_physpages != num_physpages)
+		return "memory size";
+	if (strcmp(swsusp_info.uts.sysname,system_utsname.sysname))
+		return "system type";
+	if (strcmp(swsusp_info.uts.release,system_utsname.release))
+		return "kernel release";
+	if (strcmp(swsusp_info.uts.version,system_utsname.version))
+		return "version";
+	if (strcmp(swsusp_info.uts.machine,system_utsname.machine))
+		return "machine";
+	if(swsusp_info.cpus != num_online_cpus())
+		return "number of cpus";
+	return NULL;
+}
+
+
+static int check_header(void)
+{
+	const char * reason = NULL;
+	int error;
+
+	if ((error = bio_read_page(swp_offset(swsusp_header.swsusp_info), &swsusp_info)))
+		return error;
+
+ 	/* Is this same machine? */
+	if ((reason = sanity_check())) {
+		printk(KERN_ERR "swsusp: Resume mismatch: %s\n",reason);
+		return -EPERM;
+	}
+	nr_copy_pages = swsusp_info.image_pages;
+	return error;
+}
+
+static int check_sig(void)
+{
+	int error;
+
+	memset(&swsusp_header, 0, sizeof(swsusp_header));
+	if ((error = bio_read_page(0, &swsusp_header)))
+		return error;
+	if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) {
+		memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10);
+
+		/*
+		 * Reset swap signature now.
+		 */
+		error = bio_write_page(0, &swsusp_header);
+	} else { 
+		printk(KERN_ERR "swsusp: Suspend partition has wrong signature?\n");
+		return -EINVAL;
+	}
+	if (!error)
+		pr_debug("swsusp: Signature found, resuming\n");
+	return error;
+}
+
+/**
+ *	data_read - Read image pages from swap.
+ *
+ *	You do not need to check for overlaps, check_pagedir()
+ *	already did that.
+ */
+
+static int data_read(struct pbe *pblist)
+{
+	struct pbe * p;
+	int error = 0;
+	int i = 0;
+	int mod = swsusp_info.image_pages / 100;
+
+	if (!mod)
+		mod = 1;
+
+	printk("swsusp: Reading image data (%lu pages):     ",
+			swsusp_info.image_pages);
+
+	for_each_pbe (p, pblist) {
+		if (!(i % mod))
+			printk("\b\b\b\b%3d%%", i / mod);
+
+		error = bio_read_page(swp_offset(p->swap_address),
+				  (void *)p->address);
+		if (error)
+			return error;
+
+		i++;
+	}
+	printk("\b\b\b\bdone\n");
+	return error;
+}
+
+extern dev_t name_to_dev_t(const char *line);
+
+/**
+ *	read_pagedir - Read page backup list pages from swap
+ */
+
+static int read_pagedir(struct pbe *pblist)
+{
+	struct pbe *pbpage, *p;
+	unsigned i = 0;
+	int error;
+
+	if (!pblist)
+		return -EFAULT;
+
+	printk("swsusp: Reading pagedir (%lu pages)\n",
+			swsusp_info.pagedir_pages);
+
+	for_each_pb_page (pbpage, pblist) {
+		unsigned long offset = swp_offset(swsusp_info.pagedir[i++]);
+
+		error = -EFAULT;
+		if (offset) {
+			p = (pbpage + PB_PAGE_SKIP)->next;
+			error = bio_read_page(offset, (void *)pbpage);
+			(pbpage + PB_PAGE_SKIP)->next = p;
+		}
+		if (error)
+			break;
+	}
+
+	if (error)
+		free_page((unsigned long)pblist);
+
+	BUG_ON(i != swsusp_info.pagedir_pages);
+
+	return error;
+}
+
+
+static int check_suspend_image(void)
+{
+	int error = 0;
+
+	if ((error = check_sig()))
+		return error;
+
+	if ((error = check_header()))
+		return error;
+
+	return 0;
+}
+
+static int read_suspend_image(void)
+{
+	int error = 0;
+	struct pbe *p;
+
+	if (!(p = alloc_pagedir(nr_copy_pages)))
+		return -ENOMEM;
+
+	if ((error = read_pagedir(p)))
+		return error;
+
+	create_pbe_list(p, nr_copy_pages);
+
+	if (!(pagedir_nosave = swsusp_pagedir_relocate(p)))
+		return -ENOMEM;
+
+	/* Allocate memory for the image and read the data from swap */
+
+	error = check_pagedir(pagedir_nosave);
+	free_eaten_memory();
+	if (!error)
+		error = data_read(pagedir_nosave);
+
+	if (error) { /* We fail cleanly */
+		for_each_pbe (p, pagedir_nosave)
+			if (p->address) {
+				free_page(p->address);
+				p->address = 0UL;
+			}
+		free_pagedir(pagedir_nosave);
+	}
+	return error;
+}
+
+/**
+ *      swsusp_check - Check for saved image in swap
+ */
+
+int swsusp_check(void)
+{
+	int error;
+
+	if (!swsusp_resume_device) {
+		if (!strlen(resume_file))
+			return -ENOENT;
+		swsusp_resume_device = name_to_dev_t(resume_file);
+		pr_debug("swsusp: Resume From Partition %s\n", resume_file);
+	} else {
+		pr_debug("swsusp: Resume From Partition %d:%d\n",
+			 MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device));
+	}
+
+	resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ);
+	if (!IS_ERR(resume_bdev)) {
+		set_blocksize(resume_bdev, PAGE_SIZE);
+		error = check_suspend_image();
+		if (error)
+		    blkdev_put(resume_bdev);
+	} else
+		error = PTR_ERR(resume_bdev);
+
+	if (!error)
+		pr_debug("swsusp: resume file found\n");
+	else
+		pr_debug("swsusp: Error %d check for resume file\n", error);
+	return error;
+}
+
+/**
+ *	swsusp_read - Read saved image from swap.
+ */
+
+int swsusp_read(void)
+{
+	int error;
+
+	if (IS_ERR(resume_bdev)) {
+		pr_debug("swsusp: block device not initialised\n");
+		return PTR_ERR(resume_bdev);
+	}
+
+	error = read_suspend_image();
+	blkdev_put(resume_bdev);
+
+	if (!error)
+		pr_debug("swsusp: Reading resume file was successful\n");
+	else
+		pr_debug("swsusp: Error %d resuming\n", error);
+	return error;
+}
+
+/**
+ *	swsusp_close - close swap device.
+ */
+
+void swsusp_close(void)
+{
+	if (IS_ERR(resume_bdev)) {
+		pr_debug("swsusp: block device not initialised\n");
+		return;
+	}
+
+	blkdev_put(resume_bdev);
+}
diff --git a/kernel/printk.c b/kernel/printk.c
new file mode 100644
index 000000000000..1498689548d1
--- /dev/null
+++ b/kernel/printk.c
@@ -0,0 +1,996 @@
+/*
+ *  linux/kernel/printk.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ * Modified to make sys_syslog() more flexible: added commands to
+ * return the last 4k of kernel messages, regardless of whether
+ * they've been read or not.  Added option to suppress kernel printk's
+ * to the console.  Added hook for sending the console messages
+ * elsewhere, in preparation for a serial line console (someday).
+ * Ted Ts'o, 2/11/93.
+ * Modified for sysctl support, 1/8/97, Chris Horn.
+ * Fixed SMP synchronization, 08/08/99, Manfred Spraul 
+ *     manfreds@colorfullife.com
+ * Rewrote bits to get rid of console_lock
+ *	01Mar01 Andrew Morton <andrewm@uow.edu.au>
+ */
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/tty.h>
+#include <linux/tty_driver.h>
+#include <linux/smp_lock.h>
+#include <linux/console.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/interrupt.h>			/* For in_interrupt() */
+#include <linux/config.h>
+#include <linux/delay.h>
+#include <linux/smp.h>
+#include <linux/security.h>
+#include <linux/bootmem.h>
+#include <linux/syscalls.h>
+
+#include <asm/uaccess.h>
+
+#define __LOG_BUF_LEN	(1 << CONFIG_LOG_BUF_SHIFT)
+
+/* printk's without a loglevel use this.. */
+#define DEFAULT_MESSAGE_LOGLEVEL 4 /* KERN_WARNING */
+
+/* We show everything that is MORE important than this.. */
+#define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */
+#define DEFAULT_CONSOLE_LOGLEVEL 7 /* anything MORE serious than KERN_DEBUG */
+
+DECLARE_WAIT_QUEUE_HEAD(log_wait);
+
+int console_printk[4] = {
+	DEFAULT_CONSOLE_LOGLEVEL,	/* console_loglevel */
+	DEFAULT_MESSAGE_LOGLEVEL,	/* default_message_loglevel */
+	MINIMUM_CONSOLE_LOGLEVEL,	/* minimum_console_loglevel */
+	DEFAULT_CONSOLE_LOGLEVEL,	/* default_console_loglevel */
+};
+
+EXPORT_SYMBOL(console_printk);
+
+/*
+ * Low lever drivers may need that to know if they can schedule in
+ * their unblank() callback or not. So let's export it.
+ */
+int oops_in_progress;
+EXPORT_SYMBOL(oops_in_progress);
+
+/*
+ * console_sem protects the console_drivers list, and also
+ * provides serialisation for access to the entire console
+ * driver system.
+ */
+static DECLARE_MUTEX(console_sem);
+struct console *console_drivers;
+/*
+ * This is used for debugging the mess that is the VT code by
+ * keeping track if we have the console semaphore held. It's
+ * definitely not the perfect debug tool (we don't know if _WE_
+ * hold it are racing, but it helps tracking those weird code
+ * path in the console code where we end up in places I want
+ * locked without the console sempahore held
+ */
+static int console_locked;
+
+/*
+ * logbuf_lock protects log_buf, log_start, log_end, con_start and logged_chars
+ * It is also used in interesting ways to provide interlocking in
+ * release_console_sem().
+ */
+static DEFINE_SPINLOCK(logbuf_lock);
+
+static char __log_buf[__LOG_BUF_LEN];
+static char *log_buf = __log_buf;
+static int log_buf_len = __LOG_BUF_LEN;
+
+#define LOG_BUF_MASK	(log_buf_len-1)
+#define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK])
+
+/*
+ * The indices into log_buf are not constrained to log_buf_len - they
+ * must be masked before subscripting
+ */
+static unsigned long log_start;	/* Index into log_buf: next char to be read by syslog() */
+static unsigned long con_start;	/* Index into log_buf: next char to be sent to consoles */
+static unsigned long log_end;	/* Index into log_buf: most-recently-written-char + 1 */
+static unsigned long logged_chars; /* Number of chars produced since last read+clear operation */
+
+/*
+ *	Array of consoles built from command line options (console=)
+ */
+struct console_cmdline
+{
+	char	name[8];			/* Name of the driver	    */
+	int	index;				/* Minor dev. to use	    */
+	char	*options;			/* Options for the driver   */
+};
+
+#define MAX_CMDLINECONSOLES 8
+
+static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES];
+static int selected_console = -1;
+static int preferred_console = -1;
+
+/* Flag: console code may call schedule() */
+static int console_may_schedule;
+
+/*
+ *	Setup a list of consoles. Called from init/main.c
+ */
+static int __init console_setup(char *str)
+{
+	char name[sizeof(console_cmdline[0].name)];
+	char *s, *options;
+	int idx;
+
+	/*
+	 *	Decode str into name, index, options.
+	 */
+	if (str[0] >= '0' && str[0] <= '9') {
+		strcpy(name, "ttyS");
+		strncpy(name + 4, str, sizeof(name) - 5);
+	} else
+		strncpy(name, str, sizeof(name) - 1);
+	name[sizeof(name) - 1] = 0;
+	if ((options = strchr(str, ',')) != NULL)
+		*(options++) = 0;
+#ifdef __sparc__
+	if (!strcmp(str, "ttya"))
+		strcpy(name, "ttyS0");
+	if (!strcmp(str, "ttyb"))
+		strcpy(name, "ttyS1");
+#endif
+	for(s = name; *s; s++)
+		if ((*s >= '0' && *s <= '9') || *s == ',')
+			break;
+	idx = simple_strtoul(s, NULL, 10);
+	*s = 0;
+
+	add_preferred_console(name, idx, options);
+	return 1;
+}
+
+__setup("console=", console_setup);
+
+/**
+ * add_preferred_console - add a device to the list of preferred consoles.
+ *
+ * The last preferred console added will be used for kernel messages
+ * and stdin/out/err for init.  Normally this is used by console_setup
+ * above to handle user-supplied console arguments; however it can also
+ * be used by arch-specific code either to override the user or more
+ * commonly to provide a default console (ie from PROM variables) when
+ * the user has not supplied one.
+ */
+int __init add_preferred_console(char *name, int idx, char *options)
+{
+	struct console_cmdline *c;
+	int i;
+
+	/*
+	 *	See if this tty is not yet registered, and
+	 *	if we have a slot free.
+	 */
+	for(i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++)
+		if (strcmp(console_cmdline[i].name, name) == 0 &&
+			  console_cmdline[i].index == idx) {
+				selected_console = i;
+				return 0;
+		}
+	if (i == MAX_CMDLINECONSOLES)
+		return -E2BIG;
+	selected_console = i;
+	c = &console_cmdline[i];
+	memcpy(c->name, name, sizeof(c->name));
+	c->name[sizeof(c->name) - 1] = 0;
+	c->options = options;
+	c->index = idx;
+	return 0;
+}
+
+static int __init log_buf_len_setup(char *str)
+{
+	unsigned long size = memparse(str, &str);
+	unsigned long flags;
+
+	if (size)
+		size = roundup_pow_of_two(size);
+	if (size > log_buf_len) {
+		unsigned long start, dest_idx, offset;
+		char * new_log_buf;
+
+		new_log_buf = alloc_bootmem(size);
+		if (!new_log_buf) {
+			printk("log_buf_len: allocation failed\n");
+			goto out;
+		}
+
+		spin_lock_irqsave(&logbuf_lock, flags);
+		log_buf_len = size;
+		log_buf = new_log_buf;
+
+		offset = start = min(con_start, log_start);
+		dest_idx = 0;
+		while (start != log_end) {
+			log_buf[dest_idx] = __log_buf[start & (__LOG_BUF_LEN - 1)];
+			start++;
+			dest_idx++;
+		}
+		log_start -= offset;
+		con_start -= offset;
+		log_end -= offset;
+		spin_unlock_irqrestore(&logbuf_lock, flags);
+
+		printk("log_buf_len: %d\n", log_buf_len);
+	}
+out:
+
+	return 1;
+}
+
+__setup("log_buf_len=", log_buf_len_setup);
+
+/*
+ * Commands to do_syslog:
+ *
+ * 	0 -- Close the log.  Currently a NOP.
+ * 	1 -- Open the log. Currently a NOP.
+ * 	2 -- Read from the log.
+ * 	3 -- Read all messages remaining in the ring buffer.
+ * 	4 -- Read and clear all messages remaining in the ring buffer
+ * 	5 -- Clear ring buffer.
+ * 	6 -- Disable printk's to console
+ * 	7 -- Enable printk's to console
+ *	8 -- Set level of messages printed to console
+ *	9 -- Return number of unread characters in the log buffer
+ *     10 -- Return size of the log buffer
+ */
+int do_syslog(int type, char __user * buf, int len)
+{
+	unsigned long i, j, limit, count;
+	int do_clear = 0;
+	char c;
+	int error = 0;
+
+	error = security_syslog(type);
+	if (error)
+		return error;
+
+	switch (type) {
+	case 0:		/* Close log */
+		break;
+	case 1:		/* Open log */
+		break;
+	case 2:		/* Read from log */
+		error = -EINVAL;
+		if (!buf || len < 0)
+			goto out;
+		error = 0;
+		if (!len)
+			goto out;
+		if (!access_ok(VERIFY_WRITE, buf, len)) {
+			error = -EFAULT;
+			goto out;
+		}
+		error = wait_event_interruptible(log_wait, (log_start - log_end));
+		if (error)
+			goto out;
+		i = 0;
+		spin_lock_irq(&logbuf_lock);
+		while (!error && (log_start != log_end) && i < len) {
+			c = LOG_BUF(log_start);
+			log_start++;
+			spin_unlock_irq(&logbuf_lock);
+			error = __put_user(c,buf);
+			buf++;
+			i++;
+			cond_resched();
+			spin_lock_irq(&logbuf_lock);
+		}
+		spin_unlock_irq(&logbuf_lock);
+		if (!error)
+			error = i;
+		break;
+	case 4:		/* Read/clear last kernel messages */
+		do_clear = 1; 
+		/* FALL THRU */
+	case 3:		/* Read last kernel messages */
+		error = -EINVAL;
+		if (!buf || len < 0)
+			goto out;
+		error = 0;
+		if (!len)
+			goto out;
+		if (!access_ok(VERIFY_WRITE, buf, len)) {
+			error = -EFAULT;
+			goto out;
+		}
+		count = len;
+		if (count > log_buf_len)
+			count = log_buf_len;
+		spin_lock_irq(&logbuf_lock);
+		if (count > logged_chars)
+			count = logged_chars;
+		if (do_clear)
+			logged_chars = 0;
+		limit = log_end;
+		/*
+		 * __put_user() could sleep, and while we sleep
+		 * printk() could overwrite the messages 
+		 * we try to copy to user space. Therefore
+		 * the messages are copied in reverse. <manfreds>
+		 */
+		for(i = 0; i < count && !error; i++) {
+			j = limit-1-i;
+			if (j + log_buf_len < log_end)
+				break;
+			c = LOG_BUF(j);
+			spin_unlock_irq(&logbuf_lock);
+			error = __put_user(c,&buf[count-1-i]);
+			cond_resched();
+			spin_lock_irq(&logbuf_lock);
+		}
+		spin_unlock_irq(&logbuf_lock);
+		if (error)
+			break;
+		error = i;
+		if(i != count) {
+			int offset = count-error;
+			/* buffer overflow during copy, correct user buffer. */
+			for(i=0;i<error;i++) {
+				if (__get_user(c,&buf[i+offset]) ||
+				    __put_user(c,&buf[i])) {
+					error = -EFAULT;
+					break;
+				}
+				cond_resched();
+			}
+		}
+		break;
+	case 5:		/* Clear ring buffer */
+		logged_chars = 0;
+		break;
+	case 6:		/* Disable logging to console */
+		console_loglevel = minimum_console_loglevel;
+		break;
+	case 7:		/* Enable logging to console */
+		console_loglevel = default_console_loglevel;
+		break;
+	case 8:		/* Set level of messages printed to console */
+		error = -EINVAL;
+		if (len < 1 || len > 8)
+			goto out;
+		if (len < minimum_console_loglevel)
+			len = minimum_console_loglevel;
+		console_loglevel = len;
+		error = 0;
+		break;
+	case 9:		/* Number of chars in the log buffer */
+		error = log_end - log_start;
+		break;
+	case 10:	/* Size of the log buffer */
+		error = log_buf_len;
+		break;
+	default:
+		error = -EINVAL;
+		break;
+	}
+out:
+	return error;
+}
+
+asmlinkage long sys_syslog(int type, char __user * buf, int len)
+{
+	return do_syslog(type, buf, len);
+}
+
+/*
+ * Call the console drivers on a range of log_buf
+ */
+static void __call_console_drivers(unsigned long start, unsigned long end)
+{
+	struct console *con;
+
+	for (con = console_drivers; con; con = con->next) {
+		if ((con->flags & CON_ENABLED) && con->write)
+			con->write(con, &LOG_BUF(start), end - start);
+	}
+}
+
+/*
+ * Write out chars from start to end - 1 inclusive
+ */
+static void _call_console_drivers(unsigned long start,
+				unsigned long end, int msg_log_level)
+{
+	if (msg_log_level < console_loglevel &&
+			console_drivers && start != end) {
+		if ((start & LOG_BUF_MASK) > (end & LOG_BUF_MASK)) {
+			/* wrapped write */
+			__call_console_drivers(start & LOG_BUF_MASK,
+						log_buf_len);
+			__call_console_drivers(0, end & LOG_BUF_MASK);
+		} else {
+			__call_console_drivers(start, end);
+		}
+	}
+}
+
+/*
+ * Call the console drivers, asking them to write out
+ * log_buf[start] to log_buf[end - 1].
+ * The console_sem must be held.
+ */
+static void call_console_drivers(unsigned long start, unsigned long end)
+{
+	unsigned long cur_index, start_print;
+	static int msg_level = -1;
+
+	if (((long)(start - end)) > 0)
+		BUG();
+
+	cur_index = start;
+	start_print = start;
+	while (cur_index != end) {
+		if (	msg_level < 0 &&
+			((end - cur_index) > 2) &&
+			LOG_BUF(cur_index + 0) == '<' &&
+			LOG_BUF(cur_index + 1) >= '0' &&
+			LOG_BUF(cur_index + 1) <= '7' &&
+			LOG_BUF(cur_index + 2) == '>')
+		{
+			msg_level = LOG_BUF(cur_index + 1) - '0';
+			cur_index += 3;
+			start_print = cur_index;
+		}
+		while (cur_index != end) {
+			char c = LOG_BUF(cur_index);
+			cur_index++;
+
+			if (c == '\n') {
+				if (msg_level < 0) {
+					/*
+					 * printk() has already given us loglevel tags in
+					 * the buffer.  This code is here in case the
+					 * log buffer has wrapped right round and scribbled
+					 * on those tags
+					 */
+					msg_level = default_message_loglevel;
+				}
+				_call_console_drivers(start_print, cur_index, msg_level);
+				msg_level = -1;
+				start_print = cur_index;
+				break;
+			}
+		}
+	}
+	_call_console_drivers(start_print, end, msg_level);
+}
+
+static void emit_log_char(char c)
+{
+	LOG_BUF(log_end) = c;
+	log_end++;
+	if (log_end - log_start > log_buf_len)
+		log_start = log_end - log_buf_len;
+	if (log_end - con_start > log_buf_len)
+		con_start = log_end - log_buf_len;
+	if (logged_chars < log_buf_len)
+		logged_chars++;
+}
+
+/*
+ * Zap console related locks when oopsing. Only zap at most once
+ * every 10 seconds, to leave time for slow consoles to print a
+ * full oops.
+ */
+static void zap_locks(void)
+{
+	static unsigned long oops_timestamp;
+
+	if (time_after_eq(jiffies, oops_timestamp) &&
+			!time_after(jiffies, oops_timestamp + 30*HZ))
+		return;
+
+	oops_timestamp = jiffies;
+
+	/* If a crash is occurring, make sure we can't deadlock */
+	spin_lock_init(&logbuf_lock);
+	/* And make sure that we print immediately */
+	init_MUTEX(&console_sem);
+}
+
+#if defined(CONFIG_PRINTK_TIME)
+static int printk_time = 1;
+#else
+static int printk_time = 0;
+#endif
+
+static int __init printk_time_setup(char *str)
+{
+	if (*str)
+		return 0;
+	printk_time = 1;
+	return 1;
+}
+
+__setup("time", printk_time_setup);
+
+/*
+ * This is printk.  It can be called from any context.  We want it to work.
+ * 
+ * We try to grab the console_sem.  If we succeed, it's easy - we log the output and
+ * call the console drivers.  If we fail to get the semaphore we place the output
+ * into the log buffer and return.  The current holder of the console_sem will
+ * notice the new output in release_console_sem() and will send it to the
+ * consoles before releasing the semaphore.
+ *
+ * One effect of this deferred printing is that code which calls printk() and
+ * then changes console_loglevel may break. This is because console_loglevel
+ * is inspected when the actual printing occurs.
+ */
+asmlinkage int printk(const char *fmt, ...)
+{
+	va_list args;
+	int r;
+
+	va_start(args, fmt);
+	r = vprintk(fmt, args);
+	va_end(args);
+
+	return r;
+}
+
+asmlinkage int vprintk(const char *fmt, va_list args)
+{
+	unsigned long flags;
+	int printed_len;
+	char *p;
+	static char printk_buf[1024];
+	static int log_level_unknown = 1;
+
+	if (unlikely(oops_in_progress))
+		zap_locks();
+
+	/* This stops the holder of console_sem just where we want him */
+	spin_lock_irqsave(&logbuf_lock, flags);
+
+	/* Emit the output into the temporary buffer */
+	printed_len = vscnprintf(printk_buf, sizeof(printk_buf), fmt, args);
+
+	/*
+	 * Copy the output into log_buf.  If the caller didn't provide
+	 * appropriate log level tags, we insert them here
+	 */
+	for (p = printk_buf; *p; p++) {
+		if (log_level_unknown) {
+                        /* log_level_unknown signals the start of a new line */
+			if (printk_time) {
+				int loglev_char;
+				char tbuf[50], *tp;
+				unsigned tlen;
+				unsigned long long t;
+				unsigned long nanosec_rem;
+
+				/*
+				 * force the log level token to be
+				 * before the time output.
+				 */
+				if (p[0] == '<' && p[1] >='0' &&
+				   p[1] <= '7' && p[2] == '>') {
+					loglev_char = p[1];
+					p += 3;
+					printed_len += 3;
+				} else {
+					loglev_char = default_message_loglevel
+						+ '0';
+				}
+				t = sched_clock();
+				nanosec_rem = do_div(t, 1000000000);
+				tlen = sprintf(tbuf,
+						"<%c>[%5lu.%06lu] ",
+						loglev_char,
+						(unsigned long)t,
+						nanosec_rem/1000);
+
+				for (tp = tbuf; tp < tbuf + tlen; tp++)
+					emit_log_char(*tp);
+				printed_len += tlen - 3;
+			} else {
+				if (p[0] != '<' || p[1] < '0' ||
+				   p[1] > '7' || p[2] != '>') {
+					emit_log_char('<');
+					emit_log_char(default_message_loglevel
+						+ '0');
+					emit_log_char('>');
+				}
+				printed_len += 3;
+			}
+			log_level_unknown = 0;
+			if (!*p)
+				break;
+		}
+		emit_log_char(*p);
+		if (*p == '\n')
+			log_level_unknown = 1;
+	}
+
+	if (!cpu_online(smp_processor_id()) &&
+	    system_state != SYSTEM_RUNNING) {
+		/*
+		 * Some console drivers may assume that per-cpu resources have
+		 * been allocated.  So don't allow them to be called by this
+		 * CPU until it is officially up.  We shouldn't be calling into
+		 * random console drivers on a CPU which doesn't exist yet..
+		 */
+		spin_unlock_irqrestore(&logbuf_lock, flags);
+		goto out;
+	}
+	if (!down_trylock(&console_sem)) {
+		console_locked = 1;
+		/*
+		 * We own the drivers.  We can drop the spinlock and let
+		 * release_console_sem() print the text
+		 */
+		spin_unlock_irqrestore(&logbuf_lock, flags);
+		console_may_schedule = 0;
+		release_console_sem();
+	} else {
+		/*
+		 * Someone else owns the drivers.  We drop the spinlock, which
+		 * allows the semaphore holder to proceed and to call the
+		 * console drivers with the output which we just produced.
+		 */
+		spin_unlock_irqrestore(&logbuf_lock, flags);
+	}
+out:
+	return printed_len;
+}
+EXPORT_SYMBOL(printk);
+EXPORT_SYMBOL(vprintk);
+
+/**
+ * acquire_console_sem - lock the console system for exclusive use.
+ *
+ * Acquires a semaphore which guarantees that the caller has
+ * exclusive access to the console system and the console_drivers list.
+ *
+ * Can sleep, returns nothing.
+ */
+void acquire_console_sem(void)
+{
+	if (in_interrupt())
+		BUG();
+	down(&console_sem);
+	console_locked = 1;
+	console_may_schedule = 1;
+}
+EXPORT_SYMBOL(acquire_console_sem);
+
+int try_acquire_console_sem(void)
+{
+	if (down_trylock(&console_sem))
+		return -1;
+	console_locked = 1;
+	console_may_schedule = 0;
+	return 0;
+}
+EXPORT_SYMBOL(try_acquire_console_sem);
+
+int is_console_locked(void)
+{
+	return console_locked;
+}
+EXPORT_SYMBOL(is_console_locked);
+
+/**
+ * release_console_sem - unlock the console system
+ *
+ * Releases the semaphore which the caller holds on the console system
+ * and the console driver list.
+ *
+ * While the semaphore was held, console output may have been buffered
+ * by printk().  If this is the case, release_console_sem() emits
+ * the output prior to releasing the semaphore.
+ *
+ * If there is output waiting for klogd, we wake it up.
+ *
+ * release_console_sem() may be called from any context.
+ */
+void release_console_sem(void)
+{
+	unsigned long flags;
+	unsigned long _con_start, _log_end;
+	unsigned long wake_klogd = 0;
+
+	for ( ; ; ) {
+		spin_lock_irqsave(&logbuf_lock, flags);
+		wake_klogd |= log_start - log_end;
+		if (con_start == log_end)
+			break;			/* Nothing to print */
+		_con_start = con_start;
+		_log_end = log_end;
+		con_start = log_end;		/* Flush */
+		spin_unlock(&logbuf_lock);
+		call_console_drivers(_con_start, _log_end);
+		local_irq_restore(flags);
+	}
+	console_locked = 0;
+	console_may_schedule = 0;
+	up(&console_sem);
+	spin_unlock_irqrestore(&logbuf_lock, flags);
+	if (wake_klogd && !oops_in_progress && waitqueue_active(&log_wait))
+		wake_up_interruptible(&log_wait);
+}
+EXPORT_SYMBOL(release_console_sem);
+
+/** console_conditional_schedule - yield the CPU if required
+ *
+ * If the console code is currently allowed to sleep, and
+ * if this CPU should yield the CPU to another task, do
+ * so here.
+ *
+ * Must be called within acquire_console_sem().
+ */
+void __sched console_conditional_schedule(void)
+{
+	if (console_may_schedule)
+		cond_resched();
+}
+EXPORT_SYMBOL(console_conditional_schedule);
+
+void console_print(const char *s)
+{
+	printk(KERN_EMERG "%s", s);
+}
+EXPORT_SYMBOL(console_print);
+
+void console_unblank(void)
+{
+	struct console *c;
+
+	/*
+	 * console_unblank can no longer be called in interrupt context unless
+	 * oops_in_progress is set to 1..
+	 */
+	if (oops_in_progress) {
+		if (down_trylock(&console_sem) != 0)
+			return;
+	} else
+		acquire_console_sem();
+
+	console_locked = 1;
+	console_may_schedule = 0;
+	for (c = console_drivers; c != NULL; c = c->next)
+		if ((c->flags & CON_ENABLED) && c->unblank)
+			c->unblank();
+	release_console_sem();
+}
+EXPORT_SYMBOL(console_unblank);
+
+/*
+ * Return the console tty driver structure and its associated index
+ */
+struct tty_driver *console_device(int *index)
+{
+	struct console *c;
+	struct tty_driver *driver = NULL;
+
+	acquire_console_sem();
+	for (c = console_drivers; c != NULL; c = c->next) {
+		if (!c->device)
+			continue;
+		driver = c->device(c, index);
+		if (driver)
+			break;
+	}
+	release_console_sem();
+	return driver;
+}
+
+/*
+ * Prevent further output on the passed console device so that (for example)
+ * serial drivers can disable console output before suspending a port, and can
+ * re-enable output afterwards.
+ */
+void console_stop(struct console *console)
+{
+	acquire_console_sem();
+	console->flags &= ~CON_ENABLED;
+	release_console_sem();
+}
+EXPORT_SYMBOL(console_stop);
+
+void console_start(struct console *console)
+{
+	acquire_console_sem();
+	console->flags |= CON_ENABLED;
+	release_console_sem();
+}
+EXPORT_SYMBOL(console_start);
+
+/*
+ * The console driver calls this routine during kernel initialization
+ * to register the console printing procedure with printk() and to
+ * print any messages that were printed by the kernel before the
+ * console driver was initialized.
+ */
+void register_console(struct console * console)
+{
+	int     i;
+	unsigned long flags;
+
+	if (preferred_console < 0)
+		preferred_console = selected_console;
+
+	/*
+	 *	See if we want to use this console driver. If we
+	 *	didn't select a console we take the first one
+	 *	that registers here.
+	 */
+	if (preferred_console < 0) {
+		if (console->index < 0)
+			console->index = 0;
+		if (console->setup == NULL ||
+		    console->setup(console, NULL) == 0) {
+			console->flags |= CON_ENABLED | CON_CONSDEV;
+			preferred_console = 0;
+		}
+	}
+
+	/*
+	 *	See if this console matches one we selected on
+	 *	the command line.
+	 */
+	for(i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++) {
+		if (strcmp(console_cmdline[i].name, console->name) != 0)
+			continue;
+		if (console->index >= 0 &&
+		    console->index != console_cmdline[i].index)
+			continue;
+		if (console->index < 0)
+			console->index = console_cmdline[i].index;
+		if (console->setup &&
+		    console->setup(console, console_cmdline[i].options) != 0)
+			break;
+		console->flags |= CON_ENABLED;
+		console->index = console_cmdline[i].index;
+		if (i == preferred_console)
+			console->flags |= CON_CONSDEV;
+		break;
+	}
+
+	if (!(console->flags & CON_ENABLED))
+		return;
+
+	if (console_drivers && (console_drivers->flags & CON_BOOT)) {
+		unregister_console(console_drivers);
+		console->flags &= ~CON_PRINTBUFFER;
+	}
+
+	/*
+	 *	Put this console in the list - keep the
+	 *	preferred driver at the head of the list.
+	 */
+	acquire_console_sem();
+	if ((console->flags & CON_CONSDEV) || console_drivers == NULL) {
+		console->next = console_drivers;
+		console_drivers = console;
+	} else {
+		console->next = console_drivers->next;
+		console_drivers->next = console;
+	}
+	if (console->flags & CON_PRINTBUFFER) {
+		/*
+		 * release_console_sem() will print out the buffered messages
+		 * for us.
+		 */
+		spin_lock_irqsave(&logbuf_lock, flags);
+		con_start = log_start;
+		spin_unlock_irqrestore(&logbuf_lock, flags);
+	}
+	release_console_sem();
+}
+EXPORT_SYMBOL(register_console);
+
+int unregister_console(struct console * console)
+{
+        struct console *a,*b;
+	int res = 1;
+
+	acquire_console_sem();
+	if (console_drivers == console) {
+		console_drivers=console->next;
+		res = 0;
+	} else {
+		for (a=console_drivers->next, b=console_drivers ;
+		     a; b=a, a=b->next) {
+			if (a == console) {
+				b->next = a->next;
+				res = 0;
+				break;
+			}  
+		}
+	}
+	
+	/* If last console is removed, we re-enable picking the first
+	 * one that gets registered. Without that, pmac early boot console
+	 * would prevent fbcon from taking over.
+	 */
+	if (console_drivers == NULL)
+		preferred_console = selected_console;
+		
+
+	release_console_sem();
+	return res;
+}
+EXPORT_SYMBOL(unregister_console);
+	
+/**
+ * tty_write_message - write a message to a certain tty, not just the console.
+ *
+ * This is used for messages that need to be redirected to a specific tty.
+ * We don't put it into the syslog queue right now maybe in the future if
+ * really needed.
+ */
+void tty_write_message(struct tty_struct *tty, char *msg)
+{
+	if (tty && tty->driver->write)
+		tty->driver->write(tty, msg, strlen(msg));
+	return;
+}
+
+/*
+ * printk rate limiting, lifted from the networking subsystem.
+ *
+ * This enforces a rate limit: not more than one kernel message
+ * every printk_ratelimit_jiffies to make a denial-of-service
+ * attack impossible.
+ */
+int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst)
+{
+	static DEFINE_SPINLOCK(ratelimit_lock);
+	static unsigned long toks = 10*5*HZ;
+	static unsigned long last_msg;
+	static int missed;
+	unsigned long flags;
+	unsigned long now = jiffies;
+
+	spin_lock_irqsave(&ratelimit_lock, flags);
+	toks += now - last_msg;
+	last_msg = now;
+	if (toks > (ratelimit_burst * ratelimit_jiffies))
+		toks = ratelimit_burst * ratelimit_jiffies;
+	if (toks >= ratelimit_jiffies) {
+		int lost = missed;
+		missed = 0;
+		toks -= ratelimit_jiffies;
+		spin_unlock_irqrestore(&ratelimit_lock, flags);
+		if (lost)
+			printk(KERN_WARNING "printk: %d messages suppressed.\n", lost);
+		return 1;
+	}
+	missed++;
+	spin_unlock_irqrestore(&ratelimit_lock, flags);
+	return 0;
+}
+EXPORT_SYMBOL(__printk_ratelimit);
+
+/* minimum time in jiffies between messages */
+int printk_ratelimit_jiffies = 5*HZ;
+
+/* number of messages we send before ratelimiting */
+int printk_ratelimit_burst = 10;
+
+int printk_ratelimit(void)
+{
+	return __printk_ratelimit(printk_ratelimit_jiffies,
+				printk_ratelimit_burst);
+}
+EXPORT_SYMBOL(printk_ratelimit);
diff --git a/kernel/profile.c b/kernel/profile.c
new file mode 100644
index 000000000000..a38fa70075fe
--- /dev/null
+++ b/kernel/profile.c
@@ -0,0 +1,563 @@
+/*
+ *  linux/kernel/profile.c
+ *  Simple profiling. Manages a direct-mapped profile hit count buffer,
+ *  with configurable resolution, support for restricting the cpus on
+ *  which profiling is done, and switching between cpu time and
+ *  schedule() calls via kernel command line parameters passed at boot.
+ *
+ *  Scheduler profiling support, Arjan van de Ven and Ingo Molnar,
+ *	Red Hat, July 2004
+ *  Consolidation of architecture support code for profiling,
+ *	William Irwin, Oracle, July 2004
+ *  Amortized hit count accounting via per-cpu open-addressed hashtables
+ *	to resolve timer interrupt livelocks, William Irwin, Oracle, 2004
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/profile.h>
+#include <linux/bootmem.h>
+#include <linux/notifier.h>
+#include <linux/mm.h>
+#include <linux/cpumask.h>
+#include <linux/cpu.h>
+#include <linux/profile.h>
+#include <linux/highmem.h>
+#include <asm/sections.h>
+#include <asm/semaphore.h>
+
+struct profile_hit {
+	u32 pc, hits;
+};
+#define PROFILE_GRPSHIFT	3
+#define PROFILE_GRPSZ		(1 << PROFILE_GRPSHIFT)
+#define NR_PROFILE_HIT		(PAGE_SIZE/sizeof(struct profile_hit))
+#define NR_PROFILE_GRP		(NR_PROFILE_HIT/PROFILE_GRPSZ)
+
+/* Oprofile timer tick hook */
+int (*timer_hook)(struct pt_regs *);
+
+static atomic_t *prof_buffer;
+static unsigned long prof_len, prof_shift;
+static int prof_on;
+static cpumask_t prof_cpu_mask = CPU_MASK_ALL;
+#ifdef CONFIG_SMP
+static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits);
+static DEFINE_PER_CPU(int, cpu_profile_flip);
+static DECLARE_MUTEX(profile_flip_mutex);
+#endif /* CONFIG_SMP */
+
+static int __init profile_setup(char * str)
+{
+	int par;
+
+	if (!strncmp(str, "schedule", 8)) {
+		prof_on = SCHED_PROFILING;
+		printk(KERN_INFO "kernel schedule profiling enabled\n");
+		if (str[7] == ',')
+			str += 8;
+	}
+	if (get_option(&str,&par)) {
+		prof_shift = par;
+		prof_on = CPU_PROFILING;
+		printk(KERN_INFO "kernel profiling enabled (shift: %ld)\n",
+			prof_shift);
+	}
+	return 1;
+}
+__setup("profile=", profile_setup);
+
+
+void __init profile_init(void)
+{
+	if (!prof_on) 
+		return;
+ 
+	/* only text is profiled */
+	prof_len = (_etext - _stext) >> prof_shift;
+	prof_buffer = alloc_bootmem(prof_len*sizeof(atomic_t));
+}
+
+/* Profile event notifications */
+ 
+#ifdef CONFIG_PROFILING
+ 
+static DECLARE_RWSEM(profile_rwsem);
+static DEFINE_RWLOCK(handoff_lock);
+static struct notifier_block * task_exit_notifier;
+static struct notifier_block * task_free_notifier;
+static struct notifier_block * munmap_notifier;
+ 
+void profile_task_exit(struct task_struct * task)
+{
+	down_read(&profile_rwsem);
+	notifier_call_chain(&task_exit_notifier, 0, task);
+	up_read(&profile_rwsem);
+}
+ 
+int profile_handoff_task(struct task_struct * task)
+{
+	int ret;
+	read_lock(&handoff_lock);
+	ret = notifier_call_chain(&task_free_notifier, 0, task);
+	read_unlock(&handoff_lock);
+	return (ret == NOTIFY_OK) ? 1 : 0;
+}
+
+void profile_munmap(unsigned long addr)
+{
+	down_read(&profile_rwsem);
+	notifier_call_chain(&munmap_notifier, 0, (void *)addr);
+	up_read(&profile_rwsem);
+}
+
+int task_handoff_register(struct notifier_block * n)
+{
+	int err = -EINVAL;
+
+	write_lock(&handoff_lock);
+	err = notifier_chain_register(&task_free_notifier, n);
+	write_unlock(&handoff_lock);
+	return err;
+}
+
+int task_handoff_unregister(struct notifier_block * n)
+{
+	int err = -EINVAL;
+
+	write_lock(&handoff_lock);
+	err = notifier_chain_unregister(&task_free_notifier, n);
+	write_unlock(&handoff_lock);
+	return err;
+}
+
+int profile_event_register(enum profile_type type, struct notifier_block * n)
+{
+	int err = -EINVAL;
+ 
+	down_write(&profile_rwsem);
+ 
+	switch (type) {
+		case PROFILE_TASK_EXIT:
+			err = notifier_chain_register(&task_exit_notifier, n);
+			break;
+		case PROFILE_MUNMAP:
+			err = notifier_chain_register(&munmap_notifier, n);
+			break;
+	}
+ 
+	up_write(&profile_rwsem);
+ 
+	return err;
+}
+
+ 
+int profile_event_unregister(enum profile_type type, struct notifier_block * n)
+{
+	int err = -EINVAL;
+ 
+	down_write(&profile_rwsem);
+ 
+	switch (type) {
+		case PROFILE_TASK_EXIT:
+			err = notifier_chain_unregister(&task_exit_notifier, n);
+			break;
+		case PROFILE_MUNMAP:
+			err = notifier_chain_unregister(&munmap_notifier, n);
+			break;
+	}
+
+	up_write(&profile_rwsem);
+	return err;
+}
+
+int register_timer_hook(int (*hook)(struct pt_regs *))
+{
+	if (timer_hook)
+		return -EBUSY;
+	timer_hook = hook;
+	return 0;
+}
+
+void unregister_timer_hook(int (*hook)(struct pt_regs *))
+{
+	WARN_ON(hook != timer_hook);
+	timer_hook = NULL;
+	/* make sure all CPUs see the NULL hook */
+	synchronize_kernel();
+}
+
+EXPORT_SYMBOL_GPL(register_timer_hook);
+EXPORT_SYMBOL_GPL(unregister_timer_hook);
+EXPORT_SYMBOL_GPL(task_handoff_register);
+EXPORT_SYMBOL_GPL(task_handoff_unregister);
+
+#endif /* CONFIG_PROFILING */
+
+EXPORT_SYMBOL_GPL(profile_event_register);
+EXPORT_SYMBOL_GPL(profile_event_unregister);
+
+#ifdef CONFIG_SMP
+/*
+ * Each cpu has a pair of open-addressed hashtables for pending
+ * profile hits. read_profile() IPI's all cpus to request them
+ * to flip buffers and flushes their contents to prof_buffer itself.
+ * Flip requests are serialized by the profile_flip_mutex. The sole
+ * use of having a second hashtable is for avoiding cacheline
+ * contention that would otherwise happen during flushes of pending
+ * profile hits required for the accuracy of reported profile hits
+ * and so resurrect the interrupt livelock issue.
+ *
+ * The open-addressed hashtables are indexed by profile buffer slot
+ * and hold the number of pending hits to that profile buffer slot on
+ * a cpu in an entry. When the hashtable overflows, all pending hits
+ * are accounted to their corresponding profile buffer slots with
+ * atomic_add() and the hashtable emptied. As numerous pending hits
+ * may be accounted to a profile buffer slot in a hashtable entry,
+ * this amortizes a number of atomic profile buffer increments likely
+ * to be far larger than the number of entries in the hashtable,
+ * particularly given that the number of distinct profile buffer
+ * positions to which hits are accounted during short intervals (e.g.
+ * several seconds) is usually very small. Exclusion from buffer
+ * flipping is provided by interrupt disablement (note that for
+ * SCHED_PROFILING profile_hit() may be called from process context).
+ * The hash function is meant to be lightweight as opposed to strong,
+ * and was vaguely inspired by ppc64 firmware-supported inverted
+ * pagetable hash functions, but uses a full hashtable full of finite
+ * collision chains, not just pairs of them.
+ *
+ * -- wli
+ */
+static void __profile_flip_buffers(void *unused)
+{
+	int cpu = smp_processor_id();
+
+	per_cpu(cpu_profile_flip, cpu) = !per_cpu(cpu_profile_flip, cpu);
+}
+
+static void profile_flip_buffers(void)
+{
+	int i, j, cpu;
+
+	down(&profile_flip_mutex);
+	j = per_cpu(cpu_profile_flip, get_cpu());
+	put_cpu();
+	on_each_cpu(__profile_flip_buffers, NULL, 0, 1);
+	for_each_online_cpu(cpu) {
+		struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[j];
+		for (i = 0; i < NR_PROFILE_HIT; ++i) {
+			if (!hits[i].hits) {
+				if (hits[i].pc)
+					hits[i].pc = 0;
+				continue;
+			}
+			atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]);
+			hits[i].hits = hits[i].pc = 0;
+		}
+	}
+	up(&profile_flip_mutex);
+}
+
+static void profile_discard_flip_buffers(void)
+{
+	int i, cpu;
+
+	down(&profile_flip_mutex);
+	i = per_cpu(cpu_profile_flip, get_cpu());
+	put_cpu();
+	on_each_cpu(__profile_flip_buffers, NULL, 0, 1);
+	for_each_online_cpu(cpu) {
+		struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[i];
+		memset(hits, 0, NR_PROFILE_HIT*sizeof(struct profile_hit));
+	}
+	up(&profile_flip_mutex);
+}
+
+void profile_hit(int type, void *__pc)
+{
+	unsigned long primary, secondary, flags, pc = (unsigned long)__pc;
+	int i, j, cpu;
+	struct profile_hit *hits;
+
+	if (prof_on != type || !prof_buffer)
+		return;
+	pc = min((pc - (unsigned long)_stext) >> prof_shift, prof_len - 1);
+	i = primary = (pc & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT;
+	secondary = (~(pc << 1) & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT;
+	cpu = get_cpu();
+	hits = per_cpu(cpu_profile_hits, cpu)[per_cpu(cpu_profile_flip, cpu)];
+	if (!hits) {
+		put_cpu();
+		return;
+	}
+	local_irq_save(flags);
+	do {
+		for (j = 0; j < PROFILE_GRPSZ; ++j) {
+			if (hits[i + j].pc == pc) {
+				hits[i + j].hits++;
+				goto out;
+			} else if (!hits[i + j].hits) {
+				hits[i + j].pc = pc;
+				hits[i + j].hits = 1;
+				goto out;
+			}
+		}
+		i = (i + secondary) & (NR_PROFILE_HIT - 1);
+	} while (i != primary);
+	atomic_inc(&prof_buffer[pc]);
+	for (i = 0; i < NR_PROFILE_HIT; ++i) {
+		atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]);
+		hits[i].pc = hits[i].hits = 0;
+	}
+out:
+	local_irq_restore(flags);
+	put_cpu();
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static int __devinit profile_cpu_callback(struct notifier_block *info,
+					unsigned long action, void *__cpu)
+{
+	int node, cpu = (unsigned long)__cpu;
+	struct page *page;
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+		node = cpu_to_node(cpu);
+		per_cpu(cpu_profile_flip, cpu) = 0;
+		if (!per_cpu(cpu_profile_hits, cpu)[1]) {
+			page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
+			if (!page)
+				return NOTIFY_BAD;
+			per_cpu(cpu_profile_hits, cpu)[1] = page_address(page);
+		}
+		if (!per_cpu(cpu_profile_hits, cpu)[0]) {
+			page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
+			if (!page)
+				goto out_free;
+			per_cpu(cpu_profile_hits, cpu)[0] = page_address(page);
+		}
+		break;
+	out_free:
+		page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]);
+		per_cpu(cpu_profile_hits, cpu)[1] = NULL;
+		__free_page(page);
+		return NOTIFY_BAD;
+	case CPU_ONLINE:
+		cpu_set(cpu, prof_cpu_mask);
+		break;
+	case CPU_UP_CANCELED:
+	case CPU_DEAD:
+		cpu_clear(cpu, prof_cpu_mask);
+		if (per_cpu(cpu_profile_hits, cpu)[0]) {
+			page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]);
+			per_cpu(cpu_profile_hits, cpu)[0] = NULL;
+			__free_page(page);
+		}
+		if (per_cpu(cpu_profile_hits, cpu)[1]) {
+			page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]);
+			per_cpu(cpu_profile_hits, cpu)[1] = NULL;
+			__free_page(page);
+		}
+		break;
+	}
+	return NOTIFY_OK;
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+#else /* !CONFIG_SMP */
+#define profile_flip_buffers()		do { } while (0)
+#define profile_discard_flip_buffers()	do { } while (0)
+
+void profile_hit(int type, void *__pc)
+{
+	unsigned long pc;
+
+	if (prof_on != type || !prof_buffer)
+		return;
+	pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift;
+	atomic_inc(&prof_buffer[min(pc, prof_len - 1)]);
+}
+#endif /* !CONFIG_SMP */
+
+void profile_tick(int type, struct pt_regs *regs)
+{
+	if (type == CPU_PROFILING && timer_hook)
+		timer_hook(regs);
+	if (!user_mode(regs) && cpu_isset(smp_processor_id(), prof_cpu_mask))
+		profile_hit(type, (void *)profile_pc(regs));
+}
+
+#ifdef CONFIG_PROC_FS
+#include <linux/proc_fs.h>
+#include <asm/uaccess.h>
+#include <asm/ptrace.h>
+
+static int prof_cpu_mask_read_proc (char *page, char **start, off_t off,
+			int count, int *eof, void *data)
+{
+	int len = cpumask_scnprintf(page, count, *(cpumask_t *)data);
+	if (count - len < 2)
+		return -EINVAL;
+	len += sprintf(page + len, "\n");
+	return len;
+}
+
+static int prof_cpu_mask_write_proc (struct file *file, const char __user *buffer,
+					unsigned long count, void *data)
+{
+	cpumask_t *mask = (cpumask_t *)data;
+	unsigned long full_count = count, err;
+	cpumask_t new_value;
+
+	err = cpumask_parse(buffer, count, new_value);
+	if (err)
+		return err;
+
+	*mask = new_value;
+	return full_count;
+}
+
+void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir)
+{
+	struct proc_dir_entry *entry;
+
+	/* create /proc/irq/prof_cpu_mask */
+	if (!(entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir)))
+		return;
+	entry->nlink = 1;
+	entry->data = (void *)&prof_cpu_mask;
+	entry->read_proc = prof_cpu_mask_read_proc;
+	entry->write_proc = prof_cpu_mask_write_proc;
+}
+
+/*
+ * This function accesses profiling information. The returned data is
+ * binary: the sampling step and the actual contents of the profile
+ * buffer. Use of the program readprofile is recommended in order to
+ * get meaningful info out of these data.
+ */
+static ssize_t
+read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos)
+{
+	unsigned long p = *ppos;
+	ssize_t read;
+	char * pnt;
+	unsigned int sample_step = 1 << prof_shift;
+
+	profile_flip_buffers();
+	if (p >= (prof_len+1)*sizeof(unsigned int))
+		return 0;
+	if (count > (prof_len+1)*sizeof(unsigned int) - p)
+		count = (prof_len+1)*sizeof(unsigned int) - p;
+	read = 0;
+
+	while (p < sizeof(unsigned int) && count > 0) {
+		put_user(*((char *)(&sample_step)+p),buf);
+		buf++; p++; count--; read++;
+	}
+	pnt = (char *)prof_buffer + p - sizeof(atomic_t);
+	if (copy_to_user(buf,(void *)pnt,count))
+		return -EFAULT;
+	read += count;
+	*ppos += read;
+	return read;
+}
+
+/*
+ * Writing to /proc/profile resets the counters
+ *
+ * Writing a 'profiling multiplier' value into it also re-sets the profiling
+ * interrupt frequency, on architectures that support this.
+ */
+static ssize_t write_profile(struct file *file, const char __user *buf,
+			     size_t count, loff_t *ppos)
+{
+#ifdef CONFIG_SMP
+	extern int setup_profiling_timer (unsigned int multiplier);
+
+	if (count == sizeof(int)) {
+		unsigned int multiplier;
+
+		if (copy_from_user(&multiplier, buf, sizeof(int)))
+			return -EFAULT;
+
+		if (setup_profiling_timer(multiplier))
+			return -EINVAL;
+	}
+#endif
+	profile_discard_flip_buffers();
+	memset(prof_buffer, 0, prof_len * sizeof(atomic_t));
+	return count;
+}
+
+static struct file_operations proc_profile_operations = {
+	.read		= read_profile,
+	.write		= write_profile,
+};
+
+#ifdef CONFIG_SMP
+static void __init profile_nop(void *unused)
+{
+}
+
+static int __init create_hash_tables(void)
+{
+	int cpu;
+
+	for_each_online_cpu(cpu) {
+		int node = cpu_to_node(cpu);
+		struct page *page;
+
+		page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
+		if (!page)
+			goto out_cleanup;
+		per_cpu(cpu_profile_hits, cpu)[1]
+				= (struct profile_hit *)page_address(page);
+		page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
+		if (!page)
+			goto out_cleanup;
+		per_cpu(cpu_profile_hits, cpu)[0]
+				= (struct profile_hit *)page_address(page);
+	}
+	return 0;
+out_cleanup:
+	prof_on = 0;
+	mb();
+	on_each_cpu(profile_nop, NULL, 0, 1);
+	for_each_online_cpu(cpu) {
+		struct page *page;
+
+		if (per_cpu(cpu_profile_hits, cpu)[0]) {
+			page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]);
+			per_cpu(cpu_profile_hits, cpu)[0] = NULL;
+			__free_page(page);
+		}
+		if (per_cpu(cpu_profile_hits, cpu)[1]) {
+			page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]);
+			per_cpu(cpu_profile_hits, cpu)[1] = NULL;
+			__free_page(page);
+		}
+	}
+	return -1;
+}
+#else
+#define create_hash_tables()			({ 0; })
+#endif
+
+static int __init create_proc_profile(void)
+{
+	struct proc_dir_entry *entry;
+
+	if (!prof_on)
+		return 0;
+	if (create_hash_tables())
+		return -1;
+	if (!(entry = create_proc_entry("profile", S_IWUSR | S_IRUGO, NULL)))
+		return 0;
+	entry->proc_fops = &proc_profile_operations;
+	entry->size = (1+prof_len) * sizeof(atomic_t);
+	hotcpu_notifier(profile_cpu_callback, 0);
+	return 0;
+}
+module_init(create_proc_profile);
+#endif /* CONFIG_PROC_FS */
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
new file mode 100644
index 000000000000..88b306c4e841
--- /dev/null
+++ b/kernel/ptrace.c
@@ -0,0 +1,389 @@
+/*
+ * linux/kernel/ptrace.c
+ *
+ * (C) Copyright 1999 Linus Torvalds
+ *
+ * Common interfaces for "ptrace()" which we do not want
+ * to continually duplicate across every architecture.
+ */
+
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/smp_lock.h>
+#include <linux/ptrace.h>
+#include <linux/security.h>
+
+#include <asm/pgtable.h>
+#include <asm/uaccess.h>
+
+/*
+ * ptrace a task: make the debugger its new parent and
+ * move it to the ptrace list.
+ *
+ * Must be called with the tasklist lock write-held.
+ */
+void __ptrace_link(task_t *child, task_t *new_parent)
+{
+	if (!list_empty(&child->ptrace_list))
+		BUG();
+	if (child->parent == new_parent)
+		return;
+	list_add(&child->ptrace_list, &child->parent->ptrace_children);
+	REMOVE_LINKS(child);
+	child->parent = new_parent;
+	SET_LINKS(child);
+}
+ 
+/*
+ * Turn a tracing stop into a normal stop now, since with no tracer there
+ * would be no way to wake it up with SIGCONT or SIGKILL.  If there was a
+ * signal sent that would resume the child, but didn't because it was in
+ * TASK_TRACED, resume it now.
+ * Requires that irqs be disabled.
+ */
+void ptrace_untrace(task_t *child)
+{
+	spin_lock(&child->sighand->siglock);
+	if (child->state == TASK_TRACED) {
+		if (child->signal->flags & SIGNAL_STOP_STOPPED) {
+			child->state = TASK_STOPPED;
+		} else {
+			signal_wake_up(child, 1);
+		}
+	}
+	spin_unlock(&child->sighand->siglock);
+}
+
+/*
+ * unptrace a task: move it back to its original parent and
+ * remove it from the ptrace list.
+ *
+ * Must be called with the tasklist lock write-held.
+ */
+void __ptrace_unlink(task_t *child)
+{
+	if (!child->ptrace)
+		BUG();
+	child->ptrace = 0;
+	if (!list_empty(&child->ptrace_list)) {
+		list_del_init(&child->ptrace_list);
+		REMOVE_LINKS(child);
+		child->parent = child->real_parent;
+		SET_LINKS(child);
+	}
+
+	if (child->state == TASK_TRACED)
+		ptrace_untrace(child);
+}
+
+/*
+ * Check that we have indeed attached to the thing..
+ */
+int ptrace_check_attach(struct task_struct *child, int kill)
+{
+	int ret = -ESRCH;
+
+	/*
+	 * We take the read lock around doing both checks to close a
+	 * possible race where someone else was tracing our child and
+	 * detached between these two checks.  After this locked check,
+	 * we are sure that this is our traced child and that can only
+	 * be changed by us so it's not changing right after this.
+	 */
+	read_lock(&tasklist_lock);
+	if ((child->ptrace & PT_PTRACED) && child->parent == current &&
+	    (!(child->ptrace & PT_ATTACHED) || child->real_parent != current)
+	    && child->signal != NULL) {
+		ret = 0;
+		spin_lock_irq(&child->sighand->siglock);
+		if (child->state == TASK_STOPPED) {
+			child->state = TASK_TRACED;
+		} else if (child->state != TASK_TRACED && !kill) {
+			ret = -ESRCH;
+		}
+		spin_unlock_irq(&child->sighand->siglock);
+	}
+	read_unlock(&tasklist_lock);
+
+	if (!ret && !kill) {
+		wait_task_inactive(child);
+	}
+
+	/* All systems go.. */
+	return ret;
+}
+
+int ptrace_attach(struct task_struct *task)
+{
+	int retval;
+	task_lock(task);
+	retval = -EPERM;
+	if (task->pid <= 1)
+		goto bad;
+	if (task == current)
+		goto bad;
+	if (!task->mm)
+		goto bad;
+	if(((current->uid != task->euid) ||
+	    (current->uid != task->suid) ||
+	    (current->uid != task->uid) ||
+ 	    (current->gid != task->egid) ||
+ 	    (current->gid != task->sgid) ||
+ 	    (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE))
+		goto bad;
+	rmb();
+	if (!task->mm->dumpable && !capable(CAP_SYS_PTRACE))
+		goto bad;
+	/* the same process cannot be attached many times */
+	if (task->ptrace & PT_PTRACED)
+		goto bad;
+	retval = security_ptrace(current, task);
+	if (retval)
+		goto bad;
+
+	/* Go */
+	task->ptrace |= PT_PTRACED | ((task->real_parent != current)
+				      ? PT_ATTACHED : 0);
+	if (capable(CAP_SYS_PTRACE))
+		task->ptrace |= PT_PTRACE_CAP;
+	task_unlock(task);
+
+	write_lock_irq(&tasklist_lock);
+	__ptrace_link(task, current);
+	write_unlock_irq(&tasklist_lock);
+
+	force_sig_specific(SIGSTOP, task);
+	return 0;
+
+bad:
+	task_unlock(task);
+	return retval;
+}
+
+int ptrace_detach(struct task_struct *child, unsigned int data)
+{
+	if ((unsigned long) data > _NSIG)
+		return	-EIO;
+
+	/* Architecture-specific hardware disable .. */
+	ptrace_disable(child);
+
+	/* .. re-parent .. */
+	child->exit_code = data;
+
+	write_lock_irq(&tasklist_lock);
+	__ptrace_unlink(child);
+	/* .. and wake it up. */
+	if (child->exit_state != EXIT_ZOMBIE)
+		wake_up_process(child);
+	write_unlock_irq(&tasklist_lock);
+
+	return 0;
+}
+
+/*
+ * Access another process' address space.
+ * Source/target buffer must be kernel space, 
+ * Do not walk the page table directly, use get_user_pages
+ */
+
+int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
+{
+	struct mm_struct *mm;
+	struct vm_area_struct *vma;
+	struct page *page;
+	void *old_buf = buf;
+
+	mm = get_task_mm(tsk);
+	if (!mm)
+		return 0;
+
+	down_read(&mm->mmap_sem);
+	/* ignore errors, just check how much was sucessfully transfered */
+	while (len) {
+		int bytes, ret, offset;
+		void *maddr;
+
+		ret = get_user_pages(tsk, mm, addr, 1,
+				write, 1, &page, &vma);
+		if (ret <= 0)
+			break;
+
+		bytes = len;
+		offset = addr & (PAGE_SIZE-1);
+		if (bytes > PAGE_SIZE-offset)
+			bytes = PAGE_SIZE-offset;
+
+		maddr = kmap(page);
+		if (write) {
+			copy_to_user_page(vma, page, addr,
+					  maddr + offset, buf, bytes);
+			set_page_dirty_lock(page);
+		} else {
+			copy_from_user_page(vma, page, addr,
+					    buf, maddr + offset, bytes);
+		}
+		kunmap(page);
+		page_cache_release(page);
+		len -= bytes;
+		buf += bytes;
+		addr += bytes;
+	}
+	up_read(&mm->mmap_sem);
+	mmput(mm);
+	
+	return buf - old_buf;
+}
+
+int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len)
+{
+	int copied = 0;
+
+	while (len > 0) {
+		char buf[128];
+		int this_len, retval;
+
+		this_len = (len > sizeof(buf)) ? sizeof(buf) : len;
+		retval = access_process_vm(tsk, src, buf, this_len, 0);
+		if (!retval) {
+			if (copied)
+				break;
+			return -EIO;
+		}
+		if (copy_to_user(dst, buf, retval))
+			return -EFAULT;
+		copied += retval;
+		src += retval;
+		dst += retval;
+		len -= retval;			
+	}
+	return copied;
+}
+
+int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long dst, int len)
+{
+	int copied = 0;
+
+	while (len > 0) {
+		char buf[128];
+		int this_len, retval;
+
+		this_len = (len > sizeof(buf)) ? sizeof(buf) : len;
+		if (copy_from_user(buf, src, this_len))
+			return -EFAULT;
+		retval = access_process_vm(tsk, dst, buf, this_len, 1);
+		if (!retval) {
+			if (copied)
+				break;
+			return -EIO;
+		}
+		copied += retval;
+		src += retval;
+		dst += retval;
+		len -= retval;			
+	}
+	return copied;
+}
+
+static int ptrace_setoptions(struct task_struct *child, long data)
+{
+	child->ptrace &= ~PT_TRACE_MASK;
+
+	if (data & PTRACE_O_TRACESYSGOOD)
+		child->ptrace |= PT_TRACESYSGOOD;
+
+	if (data & PTRACE_O_TRACEFORK)
+		child->ptrace |= PT_TRACE_FORK;
+
+	if (data & PTRACE_O_TRACEVFORK)
+		child->ptrace |= PT_TRACE_VFORK;
+
+	if (data & PTRACE_O_TRACECLONE)
+		child->ptrace |= PT_TRACE_CLONE;
+
+	if (data & PTRACE_O_TRACEEXEC)
+		child->ptrace |= PT_TRACE_EXEC;
+
+	if (data & PTRACE_O_TRACEVFORKDONE)
+		child->ptrace |= PT_TRACE_VFORK_DONE;
+
+	if (data & PTRACE_O_TRACEEXIT)
+		child->ptrace |= PT_TRACE_EXIT;
+
+	return (data & ~PTRACE_O_MASK) ? -EINVAL : 0;
+}
+
+static int ptrace_getsiginfo(struct task_struct *child, siginfo_t __user * data)
+{
+	siginfo_t lastinfo;
+	int error = -ESRCH;
+
+	read_lock(&tasklist_lock);
+	if (likely(child->sighand != NULL)) {
+		error = -EINVAL;
+		spin_lock_irq(&child->sighand->siglock);
+		if (likely(child->last_siginfo != NULL)) {
+			lastinfo = *child->last_siginfo;
+			error = 0;
+		}
+		spin_unlock_irq(&child->sighand->siglock);
+	}
+	read_unlock(&tasklist_lock);
+	if (!error)
+		return copy_siginfo_to_user(data, &lastinfo);
+	return error;
+}
+
+static int ptrace_setsiginfo(struct task_struct *child, siginfo_t __user * data)
+{
+	siginfo_t newinfo;
+	int error = -ESRCH;
+
+	if (copy_from_user(&newinfo, data, sizeof (siginfo_t)))
+		return -EFAULT;
+
+	read_lock(&tasklist_lock);
+	if (likely(child->sighand != NULL)) {
+		error = -EINVAL;
+		spin_lock_irq(&child->sighand->siglock);
+		if (likely(child->last_siginfo != NULL)) {
+			*child->last_siginfo = newinfo;
+			error = 0;
+		}
+		spin_unlock_irq(&child->sighand->siglock);
+	}
+	read_unlock(&tasklist_lock);
+	return error;
+}
+
+int ptrace_request(struct task_struct *child, long request,
+		   long addr, long data)
+{
+	int ret = -EIO;
+
+	switch (request) {
+#ifdef PTRACE_OLDSETOPTIONS
+	case PTRACE_OLDSETOPTIONS:
+#endif
+	case PTRACE_SETOPTIONS:
+		ret = ptrace_setoptions(child, data);
+		break;
+	case PTRACE_GETEVENTMSG:
+		ret = put_user(child->ptrace_message, (unsigned long __user *) data);
+		break;
+	case PTRACE_GETSIGINFO:
+		ret = ptrace_getsiginfo(child, (siginfo_t __user *) data);
+		break;
+	case PTRACE_SETSIGINFO:
+		ret = ptrace_setsiginfo(child, (siginfo_t __user *) data);
+		break;
+	default:
+		break;
+	}
+
+	return ret;
+}
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
new file mode 100644
index 000000000000..d00eded75d71
--- /dev/null
+++ b/kernel/rcupdate.c
@@ -0,0 +1,470 @@
+/*
+ * Read-Copy Update mechanism for mutual exclusion
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2001
+ *
+ * Authors: Dipankar Sarma <dipankar@in.ibm.com>
+ *	    Manfred Spraul <manfred@colorfullife.com>
+ * 
+ * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
+ * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
+ * Papers:
+ * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
+ * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
+ *
+ * For detailed explanation of Read-Copy Update mechanism see -
+ * 		http://lse.sourceforge.net/locking/rcupdate.html
+ *
+ */
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/interrupt.h>
+#include <linux/sched.h>
+#include <asm/atomic.h>
+#include <linux/bitops.h>
+#include <linux/module.h>
+#include <linux/completion.h>
+#include <linux/moduleparam.h>
+#include <linux/percpu.h>
+#include <linux/notifier.h>
+#include <linux/rcupdate.h>
+#include <linux/cpu.h>
+
+/* Definition for rcupdate control block. */
+struct rcu_ctrlblk rcu_ctrlblk = 
+	{ .cur = -300, .completed = -300 };
+struct rcu_ctrlblk rcu_bh_ctrlblk =
+	{ .cur = -300, .completed = -300 };
+
+/* Bookkeeping of the progress of the grace period */
+struct rcu_state {
+	spinlock_t	lock; /* Guard this struct and writes to rcu_ctrlblk */
+	cpumask_t	cpumask; /* CPUs that need to switch in order    */
+	                              /* for current batch to proceed.        */
+};
+
+static struct rcu_state rcu_state ____cacheline_maxaligned_in_smp =
+	  {.lock = SPIN_LOCK_UNLOCKED, .cpumask = CPU_MASK_NONE };
+static struct rcu_state rcu_bh_state ____cacheline_maxaligned_in_smp =
+	  {.lock = SPIN_LOCK_UNLOCKED, .cpumask = CPU_MASK_NONE };
+
+DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L };
+DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L };
+
+/* Fake initialization required by compiler */
+static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL};
+static int maxbatch = 10;
+
+/**
+ * call_rcu - Queue an RCU callback for invocation after a grace period.
+ * @head: structure to be used for queueing the RCU updates.
+ * @func: actual update function to be invoked after the grace period
+ *
+ * The update function will be invoked some time after a full grace
+ * period elapses, in other words after all currently executing RCU
+ * read-side critical sections have completed.  RCU read-side critical
+ * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
+ * and may be nested.
+ */
+void fastcall call_rcu(struct rcu_head *head,
+				void (*func)(struct rcu_head *rcu))
+{
+	unsigned long flags;
+	struct rcu_data *rdp;
+
+	head->func = func;
+	head->next = NULL;
+	local_irq_save(flags);
+	rdp = &__get_cpu_var(rcu_data);
+	*rdp->nxttail = head;
+	rdp->nxttail = &head->next;
+	local_irq_restore(flags);
+}
+
+/**
+ * call_rcu_bh - Queue an RCU for invocation after a quicker grace period.
+ * @head: structure to be used for queueing the RCU updates.
+ * @func: actual update function to be invoked after the grace period
+ *
+ * The update function will be invoked some time after a full grace
+ * period elapses, in other words after all currently executing RCU
+ * read-side critical sections have completed. call_rcu_bh() assumes
+ * that the read-side critical sections end on completion of a softirq
+ * handler. This means that read-side critical sections in process
+ * context must not be interrupted by softirqs. This interface is to be
+ * used when most of the read-side critical sections are in softirq context.
+ * RCU read-side critical sections are delimited by rcu_read_lock() and
+ * rcu_read_unlock(), * if in interrupt context or rcu_read_lock_bh()
+ * and rcu_read_unlock_bh(), if in process context. These may be nested.
+ */
+void fastcall call_rcu_bh(struct rcu_head *head,
+				void (*func)(struct rcu_head *rcu))
+{
+	unsigned long flags;
+	struct rcu_data *rdp;
+
+	head->func = func;
+	head->next = NULL;
+	local_irq_save(flags);
+	rdp = &__get_cpu_var(rcu_bh_data);
+	*rdp->nxttail = head;
+	rdp->nxttail = &head->next;
+	local_irq_restore(flags);
+}
+
+/*
+ * Invoke the completed RCU callbacks. They are expected to be in
+ * a per-cpu list.
+ */
+static void rcu_do_batch(struct rcu_data *rdp)
+{
+	struct rcu_head *next, *list;
+	int count = 0;
+
+	list = rdp->donelist;
+	while (list) {
+		next = rdp->donelist = list->next;
+		list->func(list);
+		list = next;
+		if (++count >= maxbatch)
+			break;
+	}
+	if (!rdp->donelist)
+		rdp->donetail = &rdp->donelist;
+	else
+		tasklet_schedule(&per_cpu(rcu_tasklet, rdp->cpu));
+}
+
+/*
+ * Grace period handling:
+ * The grace period handling consists out of two steps:
+ * - A new grace period is started.
+ *   This is done by rcu_start_batch. The start is not broadcasted to
+ *   all cpus, they must pick this up by comparing rcp->cur with
+ *   rdp->quiescbatch. All cpus are recorded  in the
+ *   rcu_state.cpumask bitmap.
+ * - All cpus must go through a quiescent state.
+ *   Since the start of the grace period is not broadcasted, at least two
+ *   calls to rcu_check_quiescent_state are required:
+ *   The first call just notices that a new grace period is running. The
+ *   following calls check if there was a quiescent state since the beginning
+ *   of the grace period. If so, it updates rcu_state.cpumask. If
+ *   the bitmap is empty, then the grace period is completed.
+ *   rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace
+ *   period (if necessary).
+ */
+/*
+ * Register a new batch of callbacks, and start it up if there is currently no
+ * active batch and the batch to be registered has not already occurred.
+ * Caller must hold rcu_state.lock.
+ */
+static void rcu_start_batch(struct rcu_ctrlblk *rcp, struct rcu_state *rsp,
+				int next_pending)
+{
+	if (next_pending)
+		rcp->next_pending = 1;
+
+	if (rcp->next_pending &&
+			rcp->completed == rcp->cur) {
+		/* Can't change, since spin lock held. */
+		cpus_andnot(rsp->cpumask, cpu_online_map, nohz_cpu_mask);
+
+		rcp->next_pending = 0;
+		/* next_pending == 0 must be visible in __rcu_process_callbacks()
+		 * before it can see new value of cur.
+		 */
+		smp_wmb();
+		rcp->cur++;
+	}
+}
+
+/*
+ * cpu went through a quiescent state since the beginning of the grace period.
+ * Clear it from the cpu mask and complete the grace period if it was the last
+ * cpu. Start another grace period if someone has further entries pending
+ */
+static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp, struct rcu_state *rsp)
+{
+	cpu_clear(cpu, rsp->cpumask);
+	if (cpus_empty(rsp->cpumask)) {
+		/* batch completed ! */
+		rcp->completed = rcp->cur;
+		rcu_start_batch(rcp, rsp, 0);
+	}
+}
+
+/*
+ * Check if the cpu has gone through a quiescent state (say context
+ * switch). If so and if it already hasn't done so in this RCU
+ * quiescent cycle, then indicate that it has done so.
+ */
+static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
+			struct rcu_state *rsp, struct rcu_data *rdp)
+{
+	if (rdp->quiescbatch != rcp->cur) {
+		/* start new grace period: */
+		rdp->qs_pending = 1;
+		rdp->passed_quiesc = 0;
+		rdp->quiescbatch = rcp->cur;
+		return;
+	}
+
+	/* Grace period already completed for this cpu?
+	 * qs_pending is checked instead of the actual bitmap to avoid
+	 * cacheline trashing.
+	 */
+	if (!rdp->qs_pending)
+		return;
+
+	/* 
+	 * Was there a quiescent state since the beginning of the grace
+	 * period? If no, then exit and wait for the next call.
+	 */
+	if (!rdp->passed_quiesc)
+		return;
+	rdp->qs_pending = 0;
+
+	spin_lock(&rsp->lock);
+	/*
+	 * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync
+	 * during cpu startup. Ignore the quiescent state.
+	 */
+	if (likely(rdp->quiescbatch == rcp->cur))
+		cpu_quiet(rdp->cpu, rcp, rsp);
+
+	spin_unlock(&rsp->lock);
+}
+
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+/* warning! helper for rcu_offline_cpu. do not use elsewhere without reviewing
+ * locking requirements, the list it's pulling from has to belong to a cpu
+ * which is dead and hence not processing interrupts.
+ */
+static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list,
+				struct rcu_head **tail)
+{
+	local_irq_disable();
+	*this_rdp->nxttail = list;
+	if (list)
+		this_rdp->nxttail = tail;
+	local_irq_enable();
+}
+
+static void __rcu_offline_cpu(struct rcu_data *this_rdp,
+	struct rcu_ctrlblk *rcp, struct rcu_state *rsp, struct rcu_data *rdp)
+{
+	/* if the cpu going offline owns the grace period
+	 * we can block indefinitely waiting for it, so flush
+	 * it here
+	 */
+	spin_lock_bh(&rsp->lock);
+	if (rcp->cur != rcp->completed)
+		cpu_quiet(rdp->cpu, rcp, rsp);
+	spin_unlock_bh(&rsp->lock);
+	rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail);
+	rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail);
+
+}
+static void rcu_offline_cpu(int cpu)
+{
+	struct rcu_data *this_rdp = &get_cpu_var(rcu_data);
+	struct rcu_data *this_bh_rdp = &get_cpu_var(rcu_bh_data);
+
+	__rcu_offline_cpu(this_rdp, &rcu_ctrlblk, &rcu_state,
+					&per_cpu(rcu_data, cpu));
+	__rcu_offline_cpu(this_bh_rdp, &rcu_bh_ctrlblk, &rcu_bh_state,
+					&per_cpu(rcu_bh_data, cpu));
+	put_cpu_var(rcu_data);
+	put_cpu_var(rcu_bh_data);
+	tasklet_kill_immediate(&per_cpu(rcu_tasklet, cpu), cpu);
+}
+
+#else
+
+static void rcu_offline_cpu(int cpu)
+{
+}
+
+#endif
+
+/*
+ * This does the RCU processing work from tasklet context. 
+ */
+static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
+			struct rcu_state *rsp, struct rcu_data *rdp)
+{
+	if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) {
+		*rdp->donetail = rdp->curlist;
+		rdp->donetail = rdp->curtail;
+		rdp->curlist = NULL;
+		rdp->curtail = &rdp->curlist;
+	}
+
+	local_irq_disable();
+	if (rdp->nxtlist && !rdp->curlist) {
+		rdp->curlist = rdp->nxtlist;
+		rdp->curtail = rdp->nxttail;
+		rdp->nxtlist = NULL;
+		rdp->nxttail = &rdp->nxtlist;
+		local_irq_enable();
+
+		/*
+		 * start the next batch of callbacks
+		 */
+
+		/* determine batch number */
+		rdp->batch = rcp->cur + 1;
+		/* see the comment and corresponding wmb() in
+		 * the rcu_start_batch()
+		 */
+		smp_rmb();
+
+		if (!rcp->next_pending) {
+			/* and start it/schedule start if it's a new batch */
+			spin_lock(&rsp->lock);
+			rcu_start_batch(rcp, rsp, 1);
+			spin_unlock(&rsp->lock);
+		}
+	} else {
+		local_irq_enable();
+	}
+	rcu_check_quiescent_state(rcp, rsp, rdp);
+	if (rdp->donelist)
+		rcu_do_batch(rdp);
+}
+
+static void rcu_process_callbacks(unsigned long unused)
+{
+	__rcu_process_callbacks(&rcu_ctrlblk, &rcu_state,
+				&__get_cpu_var(rcu_data));
+	__rcu_process_callbacks(&rcu_bh_ctrlblk, &rcu_bh_state,
+				&__get_cpu_var(rcu_bh_data));
+}
+
+void rcu_check_callbacks(int cpu, int user)
+{
+	if (user || 
+	    (idle_cpu(cpu) && !in_softirq() && 
+				hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
+		rcu_qsctr_inc(cpu);
+		rcu_bh_qsctr_inc(cpu);
+	} else if (!in_softirq())
+		rcu_bh_qsctr_inc(cpu);
+	tasklet_schedule(&per_cpu(rcu_tasklet, cpu));
+}
+
+static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp,
+						struct rcu_data *rdp)
+{
+	memset(rdp, 0, sizeof(*rdp));
+	rdp->curtail = &rdp->curlist;
+	rdp->nxttail = &rdp->nxtlist;
+	rdp->donetail = &rdp->donelist;
+	rdp->quiescbatch = rcp->completed;
+	rdp->qs_pending = 0;
+	rdp->cpu = cpu;
+}
+
+static void __devinit rcu_online_cpu(int cpu)
+{
+	struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
+	struct rcu_data *bh_rdp = &per_cpu(rcu_bh_data, cpu);
+
+	rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp);
+	rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, bh_rdp);
+	tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL);
+}
+
+static int __devinit rcu_cpu_notify(struct notifier_block *self, 
+				unsigned long action, void *hcpu)
+{
+	long cpu = (long)hcpu;
+	switch (action) {
+	case CPU_UP_PREPARE:
+		rcu_online_cpu(cpu);
+		break;
+	case CPU_DEAD:
+		rcu_offline_cpu(cpu);
+		break;
+	default:
+		break;
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block __devinitdata rcu_nb = {
+	.notifier_call	= rcu_cpu_notify,
+};
+
+/*
+ * Initializes rcu mechanism.  Assumed to be called early.
+ * That is before local timer(SMP) or jiffie timer (uniproc) is setup.
+ * Note that rcu_qsctr and friends are implicitly
+ * initialized due to the choice of ``0'' for RCU_CTR_INVALID.
+ */
+void __init rcu_init(void)
+{
+	rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE,
+			(void *)(long)smp_processor_id());
+	/* Register notifier for non-boot CPUs */
+	register_cpu_notifier(&rcu_nb);
+}
+
+struct rcu_synchronize {
+	struct rcu_head head;
+	struct completion completion;
+};
+
+/* Because of FASTCALL declaration of complete, we use this wrapper */
+static void wakeme_after_rcu(struct rcu_head  *head)
+{
+	struct rcu_synchronize *rcu;
+
+	rcu = container_of(head, struct rcu_synchronize, head);
+	complete(&rcu->completion);
+}
+
+/**
+ * synchronize_kernel - wait until a grace period has elapsed.
+ *
+ * Control will return to the caller some time after a full grace
+ * period has elapsed, in other words after all currently executing RCU
+ * read-side critical sections have completed.  RCU read-side critical
+ * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
+ * and may be nested.
+ */
+void synchronize_kernel(void)
+{
+	struct rcu_synchronize rcu;
+
+	init_completion(&rcu.completion);
+	/* Will wake me after RCU finished */
+	call_rcu(&rcu.head, wakeme_after_rcu);
+
+	/* Wait for it */
+	wait_for_completion(&rcu.completion);
+}
+
+module_param(maxbatch, int, 0);
+EXPORT_SYMBOL_GPL(call_rcu);
+EXPORT_SYMBOL_GPL(call_rcu_bh);
+EXPORT_SYMBOL_GPL(synchronize_kernel);
diff --git a/kernel/resource.c b/kernel/resource.c
new file mode 100644
index 000000000000..35c99ac02c7c
--- /dev/null
+++ b/kernel/resource.c
@@ -0,0 +1,551 @@
+/*
+ *	linux/kernel/resource.c
+ *
+ * Copyright (C) 1999	Linus Torvalds
+ * Copyright (C) 1999	Martin Mares <mj@ucw.cz>
+ *
+ * Arbitrary resource management.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/errno.h>
+#include <linux/ioport.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/fs.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <asm/io.h>
+
+
+struct resource ioport_resource = {
+	.name	= "PCI IO",
+	.start	= 0x0000,
+	.end	= IO_SPACE_LIMIT,
+	.flags	= IORESOURCE_IO,
+};
+
+EXPORT_SYMBOL(ioport_resource);
+
+struct resource iomem_resource = {
+	.name	= "PCI mem",
+	.start	= 0UL,
+	.end	= ~0UL,
+	.flags	= IORESOURCE_MEM,
+};
+
+EXPORT_SYMBOL(iomem_resource);
+
+static DEFINE_RWLOCK(resource_lock);
+
+#ifdef CONFIG_PROC_FS
+
+enum { MAX_IORES_LEVEL = 5 };
+
+static void *r_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct resource *p = v;
+	(*pos)++;
+	if (p->child)
+		return p->child;
+	while (!p->sibling && p->parent)
+		p = p->parent;
+	return p->sibling;
+}
+
+static void *r_start(struct seq_file *m, loff_t *pos)
+	__acquires(resource_lock)
+{
+	struct resource *p = m->private;
+	loff_t l = 0;
+	read_lock(&resource_lock);
+	for (p = p->child; p && l < *pos; p = r_next(m, p, &l))
+		;
+	return p;
+}
+
+static void r_stop(struct seq_file *m, void *v)
+	__releases(resource_lock)
+{
+	read_unlock(&resource_lock);
+}
+
+static int r_show(struct seq_file *m, void *v)
+{
+	struct resource *root = m->private;
+	struct resource *r = v, *p;
+	int width = root->end < 0x10000 ? 4 : 8;
+	int depth;
+
+	for (depth = 0, p = r; depth < MAX_IORES_LEVEL; depth++, p = p->parent)
+		if (p->parent == root)
+			break;
+	seq_printf(m, "%*s%0*lx-%0*lx : %s\n",
+			depth * 2, "",
+			width, r->start,
+			width, r->end,
+			r->name ? r->name : "<BAD>");
+	return 0;
+}
+
+static struct seq_operations resource_op = {
+	.start	= r_start,
+	.next	= r_next,
+	.stop	= r_stop,
+	.show	= r_show,
+};
+
+static int ioports_open(struct inode *inode, struct file *file)
+{
+	int res = seq_open(file, &resource_op);
+	if (!res) {
+		struct seq_file *m = file->private_data;
+		m->private = &ioport_resource;
+	}
+	return res;
+}
+
+static int iomem_open(struct inode *inode, struct file *file)
+{
+	int res = seq_open(file, &resource_op);
+	if (!res) {
+		struct seq_file *m = file->private_data;
+		m->private = &iomem_resource;
+	}
+	return res;
+}
+
+static struct file_operations proc_ioports_operations = {
+	.open		= ioports_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
+static struct file_operations proc_iomem_operations = {
+	.open		= iomem_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
+static int __init ioresources_init(void)
+{
+	struct proc_dir_entry *entry;
+
+	entry = create_proc_entry("ioports", 0, NULL);
+	if (entry)
+		entry->proc_fops = &proc_ioports_operations;
+	entry = create_proc_entry("iomem", 0, NULL);
+	if (entry)
+		entry->proc_fops = &proc_iomem_operations;
+	return 0;
+}
+__initcall(ioresources_init);
+
+#endif /* CONFIG_PROC_FS */
+
+/* Return the conflict entry if you can't request it */
+static struct resource * __request_resource(struct resource *root, struct resource *new)
+{
+	unsigned long start = new->start;
+	unsigned long end = new->end;
+	struct resource *tmp, **p;
+
+	if (end < start)
+		return root;
+	if (start < root->start)
+		return root;
+	if (end > root->end)
+		return root;
+	p = &root->child;
+	for (;;) {
+		tmp = *p;
+		if (!tmp || tmp->start > end) {
+			new->sibling = tmp;
+			*p = new;
+			new->parent = root;
+			return NULL;
+		}
+		p = &tmp->sibling;
+		if (tmp->end < start)
+			continue;
+		return tmp;
+	}
+}
+
+static int __release_resource(struct resource *old)
+{
+	struct resource *tmp, **p;
+
+	p = &old->parent->child;
+	for (;;) {
+		tmp = *p;
+		if (!tmp)
+			break;
+		if (tmp == old) {
+			*p = tmp->sibling;
+			old->parent = NULL;
+			return 0;
+		}
+		p = &tmp->sibling;
+	}
+	return -EINVAL;
+}
+
+int request_resource(struct resource *root, struct resource *new)
+{
+	struct resource *conflict;
+
+	write_lock(&resource_lock);
+	conflict = __request_resource(root, new);
+	write_unlock(&resource_lock);
+	return conflict ? -EBUSY : 0;
+}
+
+EXPORT_SYMBOL(request_resource);
+
+struct resource *____request_resource(struct resource *root, struct resource *new)
+{
+	struct resource *conflict;
+
+	write_lock(&resource_lock);
+	conflict = __request_resource(root, new);
+	write_unlock(&resource_lock);
+	return conflict;
+}
+
+EXPORT_SYMBOL(____request_resource);
+
+int release_resource(struct resource *old)
+{
+	int retval;
+
+	write_lock(&resource_lock);
+	retval = __release_resource(old);
+	write_unlock(&resource_lock);
+	return retval;
+}
+
+EXPORT_SYMBOL(release_resource);
+
+/*
+ * Find empty slot in the resource tree given range and alignment.
+ */
+static int find_resource(struct resource *root, struct resource *new,
+			 unsigned long size,
+			 unsigned long min, unsigned long max,
+			 unsigned long align,
+			 void (*alignf)(void *, struct resource *,
+					unsigned long, unsigned long),
+			 void *alignf_data)
+{
+	struct resource *this = root->child;
+
+	new->start = root->start;
+	/*
+	 * Skip past an allocated resource that starts at 0, since the assignment
+	 * of this->start - 1 to new->end below would cause an underflow.
+	 */
+	if (this && this->start == 0) {
+		new->start = this->end + 1;
+		this = this->sibling;
+	}
+	for(;;) {
+		if (this)
+			new->end = this->start - 1;
+		else
+			new->end = root->end;
+		if (new->start < min)
+			new->start = min;
+		if (new->end > max)
+			new->end = max;
+		new->start = (new->start + align - 1) & ~(align - 1);
+		if (alignf)
+			alignf(alignf_data, new, size, align);
+		if (new->start < new->end && new->end - new->start + 1 >= size) {
+			new->end = new->start + size - 1;
+			return 0;
+		}
+		if (!this)
+			break;
+		new->start = this->end + 1;
+		this = this->sibling;
+	}
+	return -EBUSY;
+}
+
+/*
+ * Allocate empty slot in the resource tree given range and alignment.
+ */
+int allocate_resource(struct resource *root, struct resource *new,
+		      unsigned long size,
+		      unsigned long min, unsigned long max,
+		      unsigned long align,
+		      void (*alignf)(void *, struct resource *,
+				     unsigned long, unsigned long),
+		      void *alignf_data)
+{
+	int err;
+
+	write_lock(&resource_lock);
+	err = find_resource(root, new, size, min, max, align, alignf, alignf_data);
+	if (err >= 0 && __request_resource(root, new))
+		err = -EBUSY;
+	write_unlock(&resource_lock);
+	return err;
+}
+
+EXPORT_SYMBOL(allocate_resource);
+
+/**
+ * insert_resource - Inserts a resource in the resource tree
+ * @parent: parent of the new resource
+ * @new: new resource to insert
+ *
+ * Returns 0 on success, -EBUSY if the resource can't be inserted.
+ *
+ * This function is equivalent of request_resource when no conflict
+ * happens. If a conflict happens, and the conflicting resources
+ * entirely fit within the range of the new resource, then the new
+ * resource is inserted and the conflicting resources become childs of
+ * the new resource.  Otherwise the new resource becomes the child of
+ * the conflicting resource
+ */
+int insert_resource(struct resource *parent, struct resource *new)
+{
+	int result;
+	struct resource *first, *next;
+
+	write_lock(&resource_lock);
+ begin:
+ 	result = 0;
+	first = __request_resource(parent, new);
+	if (!first)
+		goto out;
+
+	result = -EBUSY;
+	if (first == parent)
+		goto out;
+
+	/* Resource fully contained by the clashing resource? Recurse into it */
+	if (first->start <= new->start && first->end >= new->end) {
+		parent = first;
+		goto begin;
+	}
+
+	for (next = first; ; next = next->sibling) {
+		/* Partial overlap? Bad, and unfixable */
+		if (next->start < new->start || next->end > new->end)
+			goto out;
+		if (!next->sibling)
+			break;
+		if (next->sibling->start > new->end)
+			break;
+	}
+
+	result = 0;
+
+	new->parent = parent;
+	new->sibling = next->sibling;
+	new->child = first;
+
+	next->sibling = NULL;
+	for (next = first; next; next = next->sibling)
+		next->parent = new;
+
+	if (parent->child == first) {
+		parent->child = new;
+	} else {
+		next = parent->child;
+		while (next->sibling != first)
+			next = next->sibling;
+		next->sibling = new;
+	}
+
+ out:
+	write_unlock(&resource_lock);
+	return result;
+}
+
+EXPORT_SYMBOL(insert_resource);
+
+/*
+ * Given an existing resource, change its start and size to match the
+ * arguments.  Returns -EBUSY if it can't fit.  Existing children of
+ * the resource are assumed to be immutable.
+ */
+int adjust_resource(struct resource *res, unsigned long start, unsigned long size)
+{
+	struct resource *tmp, *parent = res->parent;
+	unsigned long end = start + size - 1;
+	int result = -EBUSY;
+
+	write_lock(&resource_lock);
+
+	if ((start < parent->start) || (end > parent->end))
+		goto out;
+
+	for (tmp = res->child; tmp; tmp = tmp->sibling) {
+		if ((tmp->start < start) || (tmp->end > end))
+			goto out;
+	}
+
+	if (res->sibling && (res->sibling->start <= end))
+		goto out;
+
+	tmp = parent->child;
+	if (tmp != res) {
+		while (tmp->sibling != res)
+			tmp = tmp->sibling;
+		if (start <= tmp->end)
+			goto out;
+	}
+
+	res->start = start;
+	res->end = end;
+	result = 0;
+
+ out:
+	write_unlock(&resource_lock);
+	return result;
+}
+
+EXPORT_SYMBOL(adjust_resource);
+
+/*
+ * This is compatibility stuff for IO resources.
+ *
+ * Note how this, unlike the above, knows about
+ * the IO flag meanings (busy etc).
+ *
+ * Request-region creates a new busy region.
+ *
+ * Check-region returns non-zero if the area is already busy
+ *
+ * Release-region releases a matching busy region.
+ */
+struct resource * __request_region(struct resource *parent, unsigned long start, unsigned long n, const char *name)
+{
+	struct resource *res = kmalloc(sizeof(*res), GFP_KERNEL);
+
+	if (res) {
+		memset(res, 0, sizeof(*res));
+		res->name = name;
+		res->start = start;
+		res->end = start + n - 1;
+		res->flags = IORESOURCE_BUSY;
+
+		write_lock(&resource_lock);
+
+		for (;;) {
+			struct resource *conflict;
+
+			conflict = __request_resource(parent, res);
+			if (!conflict)
+				break;
+			if (conflict != parent) {
+				parent = conflict;
+				if (!(conflict->flags & IORESOURCE_BUSY))
+					continue;
+			}
+
+			/* Uhhuh, that didn't work out.. */
+			kfree(res);
+			res = NULL;
+			break;
+		}
+		write_unlock(&resource_lock);
+	}
+	return res;
+}
+
+EXPORT_SYMBOL(__request_region);
+
+int __deprecated __check_region(struct resource *parent, unsigned long start, unsigned long n)
+{
+	struct resource * res;
+
+	res = __request_region(parent, start, n, "check-region");
+	if (!res)
+		return -EBUSY;
+
+	release_resource(res);
+	kfree(res);
+	return 0;
+}
+
+EXPORT_SYMBOL(__check_region);
+
+void __release_region(struct resource *parent, unsigned long start, unsigned long n)
+{
+	struct resource **p;
+	unsigned long end;
+
+	p = &parent->child;
+	end = start + n - 1;
+
+	write_lock(&resource_lock);
+
+	for (;;) {
+		struct resource *res = *p;
+
+		if (!res)
+			break;
+		if (res->start <= start && res->end >= end) {
+			if (!(res->flags & IORESOURCE_BUSY)) {
+				p = &res->child;
+				continue;
+			}
+			if (res->start != start || res->end != end)
+				break;
+			*p = res->sibling;
+			write_unlock(&resource_lock);
+			kfree(res);
+			return;
+		}
+		p = &res->sibling;
+	}
+
+	write_unlock(&resource_lock);
+
+	printk(KERN_WARNING "Trying to free nonexistent resource <%08lx-%08lx>\n", start, end);
+}
+
+EXPORT_SYMBOL(__release_region);
+
+/*
+ * Called from init/main.c to reserve IO ports.
+ */
+#define MAXRESERVE 4
+static int __init reserve_setup(char *str)
+{
+	static int reserved;
+	static struct resource reserve[MAXRESERVE];
+
+	for (;;) {
+		int io_start, io_num;
+		int x = reserved;
+
+		if (get_option (&str, &io_start) != 2)
+			break;
+		if (get_option (&str, &io_num)   == 0)
+			break;
+		if (x < MAXRESERVE) {
+			struct resource *res = reserve + x;
+			res->name = "reserved";
+			res->start = io_start;
+			res->end = io_start + io_num - 1;
+			res->flags = IORESOURCE_BUSY;
+			res->child = NULL;
+			if (request_resource(res->start >= 0x10000 ? &iomem_resource : &ioport_resource, res) == 0)
+				reserved = x+1;
+		}
+	}
+	return 1;
+}
+
+__setup("reserve=", reserve_setup);
diff --git a/kernel/sched.c b/kernel/sched.c
new file mode 100644
index 000000000000..f69c4a5361e3
--- /dev/null
+++ b/kernel/sched.c
@@ -0,0 +1,5004 @@
+/*
+ *  kernel/sched.c
+ *
+ *  Kernel scheduler and related syscalls
+ *
+ *  Copyright (C) 1991-2002  Linus Torvalds
+ *
+ *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
+ *		make semaphores SMP safe
+ *  1998-11-19	Implemented schedule_timeout() and related stuff
+ *		by Andrea Arcangeli
+ *  2002-01-04	New ultra-scalable O(1) scheduler by Ingo Molnar:
+ *		hybrid priority-list and round-robin design with
+ *		an array-switch method of distributing timeslices
+ *		and per-CPU runqueues.  Cleanups and useful suggestions
+ *		by Davide Libenzi, preemptible kernel bits by Robert Love.
+ *  2003-09-03	Interactivity tuning by Con Kolivas.
+ *  2004-04-02	Scheduler domains code by Nick Piggin
+ */
+
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/nmi.h>
+#include <linux/init.h>
+#include <asm/uaccess.h>
+#include <linux/highmem.h>
+#include <linux/smp_lock.h>
+#include <asm/mmu_context.h>
+#include <linux/interrupt.h>
+#include <linux/completion.h>
+#include <linux/kernel_stat.h>
+#include <linux/security.h>
+#include <linux/notifier.h>
+#include <linux/profile.h>
+#include <linux/suspend.h>
+#include <linux/blkdev.h>
+#include <linux/delay.h>
+#include <linux/smp.h>
+#include <linux/threads.h>
+#include <linux/timer.h>
+#include <linux/rcupdate.h>
+#include <linux/cpu.h>
+#include <linux/cpuset.h>
+#include <linux/percpu.h>
+#include <linux/kthread.h>
+#include <linux/seq_file.h>
+#include <linux/syscalls.h>
+#include <linux/times.h>
+#include <linux/acct.h>
+#include <asm/tlb.h>
+
+#include <asm/unistd.h>
+
+/*
+ * Convert user-nice values [ -20 ... 0 ... 19 ]
+ * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
+ * and back.
+ */
+#define NICE_TO_PRIO(nice)	(MAX_RT_PRIO + (nice) + 20)
+#define PRIO_TO_NICE(prio)	((prio) - MAX_RT_PRIO - 20)
+#define TASK_NICE(p)		PRIO_TO_NICE((p)->static_prio)
+
+/*
+ * 'User priority' is the nice value converted to something we
+ * can work with better when scaling various scheduler parameters,
+ * it's a [ 0 ... 39 ] range.
+ */
+#define USER_PRIO(p)		((p)-MAX_RT_PRIO)
+#define TASK_USER_PRIO(p)	USER_PRIO((p)->static_prio)
+#define MAX_USER_PRIO		(USER_PRIO(MAX_PRIO))
+
+/*
+ * Some helpers for converting nanosecond timing to jiffy resolution
+ */
+#define NS_TO_JIFFIES(TIME)	((TIME) / (1000000000 / HZ))
+#define JIFFIES_TO_NS(TIME)	((TIME) * (1000000000 / HZ))
+
+/*
+ * These are the 'tuning knobs' of the scheduler:
+ *
+ * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger),
+ * default timeslice is 100 msecs, maximum timeslice is 800 msecs.
+ * Timeslices get refilled after they expire.
+ */
+#define MIN_TIMESLICE		max(5 * HZ / 1000, 1)
+#define DEF_TIMESLICE		(100 * HZ / 1000)
+#define ON_RUNQUEUE_WEIGHT	 30
+#define CHILD_PENALTY		 95
+#define PARENT_PENALTY		100
+#define EXIT_WEIGHT		  3
+#define PRIO_BONUS_RATIO	 25
+#define MAX_BONUS		(MAX_USER_PRIO * PRIO_BONUS_RATIO / 100)
+#define INTERACTIVE_DELTA	  2
+#define MAX_SLEEP_AVG		(DEF_TIMESLICE * MAX_BONUS)
+#define STARVATION_LIMIT	(MAX_SLEEP_AVG)
+#define NS_MAX_SLEEP_AVG	(JIFFIES_TO_NS(MAX_SLEEP_AVG))
+
+/*
+ * If a task is 'interactive' then we reinsert it in the active
+ * array after it has expired its current timeslice. (it will not
+ * continue to run immediately, it will still roundrobin with
+ * other interactive tasks.)
+ *
+ * This part scales the interactivity limit depending on niceness.
+ *
+ * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
+ * Here are a few examples of different nice levels:
+ *
+ *  TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
+ *  TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
+ *  TASK_INTERACTIVE(  0): [1,1,1,1,0,0,0,0,0,0,0]
+ *  TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
+ *  TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
+ *
+ * (the X axis represents the possible -5 ... 0 ... +5 dynamic
+ *  priority range a task can explore, a value of '1' means the
+ *  task is rated interactive.)
+ *
+ * Ie. nice +19 tasks can never get 'interactive' enough to be
+ * reinserted into the active array. And only heavily CPU-hog nice -20
+ * tasks will be expired. Default nice 0 tasks are somewhere between,
+ * it takes some effort for them to get interactive, but it's not
+ * too hard.
+ */
+
+#define CURRENT_BONUS(p) \
+	(NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \
+		MAX_SLEEP_AVG)
+
+#define GRANULARITY	(10 * HZ / 1000 ? : 1)
+
+#ifdef CONFIG_SMP
+#define TIMESLICE_GRANULARITY(p)	(GRANULARITY * \
+		(1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \
+			num_online_cpus())
+#else
+#define TIMESLICE_GRANULARITY(p)	(GRANULARITY * \
+		(1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)))
+#endif
+
+#define SCALE(v1,v1_max,v2_max) \
+	(v1) * (v2_max) / (v1_max)
+
+#define DELTA(p) \
+	(SCALE(TASK_NICE(p), 40, MAX_BONUS) + INTERACTIVE_DELTA)
+
+#define TASK_INTERACTIVE(p) \
+	((p)->prio <= (p)->static_prio - DELTA(p))
+
+#define INTERACTIVE_SLEEP(p) \
+	(JIFFIES_TO_NS(MAX_SLEEP_AVG * \
+		(MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))
+
+#define TASK_PREEMPTS_CURR(p, rq) \
+	((p)->prio < (rq)->curr->prio)
+
+/*
+ * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
+ * to time slice values: [800ms ... 100ms ... 5ms]
+ *
+ * The higher a thread's priority, the bigger timeslices
+ * it gets during one round of execution. But even the lowest
+ * priority thread gets MIN_TIMESLICE worth of execution time.
+ */
+
+#define SCALE_PRIO(x, prio) \
+	max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE)
+
+static inline unsigned int task_timeslice(task_t *p)
+{
+	if (p->static_prio < NICE_TO_PRIO(0))
+		return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio);
+	else
+		return SCALE_PRIO(DEF_TIMESLICE, p->static_prio);
+}
+#define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran)	\
+				< (long long) (sd)->cache_hot_time)
+
+/*
+ * These are the runqueue data structures:
+ */
+
+#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long))
+
+typedef struct runqueue runqueue_t;
+
+struct prio_array {
+	unsigned int nr_active;
+	unsigned long bitmap[BITMAP_SIZE];
+	struct list_head queue[MAX_PRIO];
+};
+
+/*
+ * This is the main, per-CPU runqueue data structure.
+ *
+ * Locking rule: those places that want to lock multiple runqueues
+ * (such as the load balancing or the thread migration code), lock
+ * acquire operations must be ordered by ascending &runqueue.
+ */
+struct runqueue {
+	spinlock_t lock;
+
+	/*
+	 * nr_running and cpu_load should be in the same cacheline because
+	 * remote CPUs use both these fields when doing load calculation.
+	 */
+	unsigned long nr_running;
+#ifdef CONFIG_SMP
+	unsigned long cpu_load;
+#endif
+	unsigned long long nr_switches;
+
+	/*
+	 * This is part of a global counter where only the total sum
+	 * over all CPUs matters. A task can increase this counter on
+	 * one CPU and if it got migrated afterwards it may decrease
+	 * it on another CPU. Always updated under the runqueue lock:
+	 */
+	unsigned long nr_uninterruptible;
+
+	unsigned long expired_timestamp;
+	unsigned long long timestamp_last_tick;
+	task_t *curr, *idle;
+	struct mm_struct *prev_mm;
+	prio_array_t *active, *expired, arrays[2];
+	int best_expired_prio;
+	atomic_t nr_iowait;
+
+#ifdef CONFIG_SMP
+	struct sched_domain *sd;
+
+	/* For active balancing */
+	int active_balance;
+	int push_cpu;
+
+	task_t *migration_thread;
+	struct list_head migration_queue;
+#endif
+
+#ifdef CONFIG_SCHEDSTATS
+	/* latency stats */
+	struct sched_info rq_sched_info;
+
+	/* sys_sched_yield() stats */
+	unsigned long yld_exp_empty;
+	unsigned long yld_act_empty;
+	unsigned long yld_both_empty;
+	unsigned long yld_cnt;
+
+	/* schedule() stats */
+	unsigned long sched_switch;
+	unsigned long sched_cnt;
+	unsigned long sched_goidle;
+
+	/* try_to_wake_up() stats */
+	unsigned long ttwu_cnt;
+	unsigned long ttwu_local;
+#endif
+};
+
+static DEFINE_PER_CPU(struct runqueue, runqueues);
+
+#define for_each_domain(cpu, domain) \
+	for (domain = cpu_rq(cpu)->sd; domain; domain = domain->parent)
+
+#define cpu_rq(cpu)		(&per_cpu(runqueues, (cpu)))
+#define this_rq()		(&__get_cpu_var(runqueues))
+#define task_rq(p)		cpu_rq(task_cpu(p))
+#define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
+
+/*
+ * Default context-switch locking:
+ */
+#ifndef prepare_arch_switch
+# define prepare_arch_switch(rq, next)	do { } while (0)
+# define finish_arch_switch(rq, next)	spin_unlock_irq(&(rq)->lock)
+# define task_running(rq, p)		((rq)->curr == (p))
+#endif
+
+/*
+ * task_rq_lock - lock the runqueue a given task resides on and disable
+ * interrupts.  Note the ordering: we can safely lookup the task_rq without
+ * explicitly disabling preemption.
+ */
+static inline runqueue_t *task_rq_lock(task_t *p, unsigned long *flags)
+	__acquires(rq->lock)
+{
+	struct runqueue *rq;
+
+repeat_lock_task:
+	local_irq_save(*flags);
+	rq = task_rq(p);
+	spin_lock(&rq->lock);
+	if (unlikely(rq != task_rq(p))) {
+		spin_unlock_irqrestore(&rq->lock, *flags);
+		goto repeat_lock_task;
+	}
+	return rq;
+}
+
+static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags)
+	__releases(rq->lock)
+{
+	spin_unlock_irqrestore(&rq->lock, *flags);
+}
+
+#ifdef CONFIG_SCHEDSTATS
+/*
+ * bump this up when changing the output format or the meaning of an existing
+ * format, so that tools can adapt (or abort)
+ */
+#define SCHEDSTAT_VERSION 11
+
+static int show_schedstat(struct seq_file *seq, void *v)
+{
+	int cpu;
+
+	seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
+	seq_printf(seq, "timestamp %lu\n", jiffies);
+	for_each_online_cpu(cpu) {
+		runqueue_t *rq = cpu_rq(cpu);
+#ifdef CONFIG_SMP
+		struct sched_domain *sd;
+		int dcnt = 0;
+#endif
+
+		/* runqueue-specific stats */
+		seq_printf(seq,
+		    "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
+		    cpu, rq->yld_both_empty,
+		    rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt,
+		    rq->sched_switch, rq->sched_cnt, rq->sched_goidle,
+		    rq->ttwu_cnt, rq->ttwu_local,
+		    rq->rq_sched_info.cpu_time,
+		    rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt);
+
+		seq_printf(seq, "\n");
+
+#ifdef CONFIG_SMP
+		/* domain-specific stats */
+		for_each_domain(cpu, sd) {
+			enum idle_type itype;
+			char mask_str[NR_CPUS];
+
+			cpumask_scnprintf(mask_str, NR_CPUS, sd->span);
+			seq_printf(seq, "domain%d %s", dcnt++, mask_str);
+			for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES;
+					itype++) {
+				seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu",
+				    sd->lb_cnt[itype],
+				    sd->lb_balanced[itype],
+				    sd->lb_failed[itype],
+				    sd->lb_imbalance[itype],
+				    sd->lb_gained[itype],
+				    sd->lb_hot_gained[itype],
+				    sd->lb_nobusyq[itype],
+				    sd->lb_nobusyg[itype]);
+			}
+			seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu\n",
+			    sd->alb_cnt, sd->alb_failed, sd->alb_pushed,
+			    sd->sbe_pushed, sd->sbe_attempts,
+			    sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance);
+		}
+#endif
+	}
+	return 0;
+}
+
+static int schedstat_open(struct inode *inode, struct file *file)
+{
+	unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
+	char *buf = kmalloc(size, GFP_KERNEL);
+	struct seq_file *m;
+	int res;
+
+	if (!buf)
+		return -ENOMEM;
+	res = single_open(file, show_schedstat, NULL);
+	if (!res) {
+		m = file->private_data;
+		m->buf = buf;
+		m->size = size;
+	} else
+		kfree(buf);
+	return res;
+}
+
+struct file_operations proc_schedstat_operations = {
+	.open    = schedstat_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = single_release,
+};
+
+# define schedstat_inc(rq, field)	do { (rq)->field++; } while (0)
+# define schedstat_add(rq, field, amt)	do { (rq)->field += (amt); } while (0)
+#else /* !CONFIG_SCHEDSTATS */
+# define schedstat_inc(rq, field)	do { } while (0)
+# define schedstat_add(rq, field, amt)	do { } while (0)
+#endif
+
+/*
+ * rq_lock - lock a given runqueue and disable interrupts.
+ */
+static inline runqueue_t *this_rq_lock(void)
+	__acquires(rq->lock)
+{
+	runqueue_t *rq;
+
+	local_irq_disable();
+	rq = this_rq();
+	spin_lock(&rq->lock);
+
+	return rq;
+}
+
+#ifdef CONFIG_SCHED_SMT
+static int cpu_and_siblings_are_idle(int cpu)
+{
+	int sib;
+	for_each_cpu_mask(sib, cpu_sibling_map[cpu]) {
+		if (idle_cpu(sib))
+			continue;
+		return 0;
+	}
+
+	return 1;
+}
+#else
+#define cpu_and_siblings_are_idle(A) idle_cpu(A)
+#endif
+
+#ifdef CONFIG_SCHEDSTATS
+/*
+ * Called when a process is dequeued from the active array and given
+ * the cpu.  We should note that with the exception of interactive
+ * tasks, the expired queue will become the active queue after the active
+ * queue is empty, without explicitly dequeuing and requeuing tasks in the
+ * expired queue.  (Interactive tasks may be requeued directly to the
+ * active queue, thus delaying tasks in the expired queue from running;
+ * see scheduler_tick()).
+ *
+ * This function is only called from sched_info_arrive(), rather than
+ * dequeue_task(). Even though a task may be queued and dequeued multiple
+ * times as it is shuffled about, we're really interested in knowing how
+ * long it was from the *first* time it was queued to the time that it
+ * finally hit a cpu.
+ */
+static inline void sched_info_dequeued(task_t *t)
+{
+	t->sched_info.last_queued = 0;
+}
+
+/*
+ * Called when a task finally hits the cpu.  We can now calculate how
+ * long it was waiting to run.  We also note when it began so that we
+ * can keep stats on how long its timeslice is.
+ */
+static inline void sched_info_arrive(task_t *t)
+{
+	unsigned long now = jiffies, diff = 0;
+	struct runqueue *rq = task_rq(t);
+
+	if (t->sched_info.last_queued)
+		diff = now - t->sched_info.last_queued;
+	sched_info_dequeued(t);
+	t->sched_info.run_delay += diff;
+	t->sched_info.last_arrival = now;
+	t->sched_info.pcnt++;
+
+	if (!rq)
+		return;
+
+	rq->rq_sched_info.run_delay += diff;
+	rq->rq_sched_info.pcnt++;
+}
+
+/*
+ * Called when a process is queued into either the active or expired
+ * array.  The time is noted and later used to determine how long we
+ * had to wait for us to reach the cpu.  Since the expired queue will
+ * become the active queue after active queue is empty, without dequeuing
+ * and requeuing any tasks, we are interested in queuing to either. It
+ * is unusual but not impossible for tasks to be dequeued and immediately
+ * requeued in the same or another array: this can happen in sched_yield(),
+ * set_user_nice(), and even load_balance() as it moves tasks from runqueue
+ * to runqueue.
+ *
+ * This function is only called from enqueue_task(), but also only updates
+ * the timestamp if it is already not set.  It's assumed that
+ * sched_info_dequeued() will clear that stamp when appropriate.
+ */
+static inline void sched_info_queued(task_t *t)
+{
+	if (!t->sched_info.last_queued)
+		t->sched_info.last_queued = jiffies;
+}
+
+/*
+ * Called when a process ceases being the active-running process, either
+ * voluntarily or involuntarily.  Now we can calculate how long we ran.
+ */
+static inline void sched_info_depart(task_t *t)
+{
+	struct runqueue *rq = task_rq(t);
+	unsigned long diff = jiffies - t->sched_info.last_arrival;
+
+	t->sched_info.cpu_time += diff;
+
+	if (rq)
+		rq->rq_sched_info.cpu_time += diff;
+}
+
+/*
+ * Called when tasks are switched involuntarily due, typically, to expiring
+ * their time slice.  (This may also be called when switching to or from
+ * the idle task.)  We are only called when prev != next.
+ */
+static inline void sched_info_switch(task_t *prev, task_t *next)
+{
+	struct runqueue *rq = task_rq(prev);
+
+	/*
+	 * prev now departs the cpu.  It's not interesting to record
+	 * stats about how efficient we were at scheduling the idle
+	 * process, however.
+	 */
+	if (prev != rq->idle)
+		sched_info_depart(prev);
+
+	if (next != rq->idle)
+		sched_info_arrive(next);
+}
+#else
+#define sched_info_queued(t)		do { } while (0)
+#define sched_info_switch(t, next)	do { } while (0)
+#endif /* CONFIG_SCHEDSTATS */
+
+/*
+ * Adding/removing a task to/from a priority array:
+ */
+static void dequeue_task(struct task_struct *p, prio_array_t *array)
+{
+	array->nr_active--;
+	list_del(&p->run_list);
+	if (list_empty(array->queue + p->prio))
+		__clear_bit(p->prio, array->bitmap);
+}
+
+static void enqueue_task(struct task_struct *p, prio_array_t *array)
+{
+	sched_info_queued(p);
+	list_add_tail(&p->run_list, array->queue + p->prio);
+	__set_bit(p->prio, array->bitmap);
+	array->nr_active++;
+	p->array = array;
+}
+
+/*
+ * Put task to the end of the run list without the overhead of dequeue
+ * followed by enqueue.
+ */
+static void requeue_task(struct task_struct *p, prio_array_t *array)
+{
+	list_move_tail(&p->run_list, array->queue + p->prio);
+}
+
+static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array)
+{
+	list_add(&p->run_list, array->queue + p->prio);
+	__set_bit(p->prio, array->bitmap);
+	array->nr_active++;
+	p->array = array;
+}
+
+/*
+ * effective_prio - return the priority that is based on the static
+ * priority but is modified by bonuses/penalties.
+ *
+ * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
+ * into the -5 ... 0 ... +5 bonus/penalty range.
+ *
+ * We use 25% of the full 0...39 priority range so that:
+ *
+ * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs.
+ * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks.
+ *
+ * Both properties are important to certain workloads.
+ */
+static int effective_prio(task_t *p)
+{
+	int bonus, prio;
+
+	if (rt_task(p))
+		return p->prio;
+
+	bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
+
+	prio = p->static_prio - bonus;
+	if (prio < MAX_RT_PRIO)
+		prio = MAX_RT_PRIO;
+	if (prio > MAX_PRIO-1)
+		prio = MAX_PRIO-1;
+	return prio;
+}
+
+/*
+ * __activate_task - move a task to the runqueue.
+ */
+static inline void __activate_task(task_t *p, runqueue_t *rq)
+{
+	enqueue_task(p, rq->active);
+	rq->nr_running++;
+}
+
+/*
+ * __activate_idle_task - move idle task to the _front_ of runqueue.
+ */
+static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
+{
+	enqueue_task_head(p, rq->active);
+	rq->nr_running++;
+}
+
+static void recalc_task_prio(task_t *p, unsigned long long now)
+{
+	/* Caller must always ensure 'now >= p->timestamp' */
+	unsigned long long __sleep_time = now - p->timestamp;
+	unsigned long sleep_time;
+
+	if (__sleep_time > NS_MAX_SLEEP_AVG)
+		sleep_time = NS_MAX_SLEEP_AVG;
+	else
+		sleep_time = (unsigned long)__sleep_time;
+
+	if (likely(sleep_time > 0)) {
+		/*
+		 * User tasks that sleep a long time are categorised as
+		 * idle and will get just interactive status to stay active &
+		 * prevent them suddenly becoming cpu hogs and starving
+		 * other processes.
+		 */
+		if (p->mm && p->activated != -1 &&
+			sleep_time > INTERACTIVE_SLEEP(p)) {
+				p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG -
+						DEF_TIMESLICE);
+		} else {
+			/*
+			 * The lower the sleep avg a task has the more
+			 * rapidly it will rise with sleep time.
+			 */
+			sleep_time *= (MAX_BONUS - CURRENT_BONUS(p)) ? : 1;
+
+			/*
+			 * Tasks waking from uninterruptible sleep are
+			 * limited in their sleep_avg rise as they
+			 * are likely to be waiting on I/O
+			 */
+			if (p->activated == -1 && p->mm) {
+				if (p->sleep_avg >= INTERACTIVE_SLEEP(p))
+					sleep_time = 0;
+				else if (p->sleep_avg + sleep_time >=
+						INTERACTIVE_SLEEP(p)) {
+					p->sleep_avg = INTERACTIVE_SLEEP(p);
+					sleep_time = 0;
+				}
+			}
+
+			/*
+			 * This code gives a bonus to interactive tasks.
+			 *
+			 * The boost works by updating the 'average sleep time'
+			 * value here, based on ->timestamp. The more time a
+			 * task spends sleeping, the higher the average gets -
+			 * and the higher the priority boost gets as well.
+			 */
+			p->sleep_avg += sleep_time;
+
+			if (p->sleep_avg > NS_MAX_SLEEP_AVG)
+				p->sleep_avg = NS_MAX_SLEEP_AVG;
+		}
+	}
+
+	p->prio = effective_prio(p);
+}
+
+/*
+ * activate_task - move a task to the runqueue and do priority recalculation
+ *
+ * Update all the scheduling statistics stuff. (sleep average
+ * calculation, priority modifiers, etc.)
+ */
+static void activate_task(task_t *p, runqueue_t *rq, int local)
+{
+	unsigned long long now;
+
+	now = sched_clock();
+#ifdef CONFIG_SMP
+	if (!local) {
+		/* Compensate for drifting sched_clock */
+		runqueue_t *this_rq = this_rq();
+		now = (now - this_rq->timestamp_last_tick)
+			+ rq->timestamp_last_tick;
+	}
+#endif
+
+	recalc_task_prio(p, now);
+
+	/*
+	 * This checks to make sure it's not an uninterruptible task
+	 * that is now waking up.
+	 */
+	if (!p->activated) {
+		/*
+		 * Tasks which were woken up by interrupts (ie. hw events)
+		 * are most likely of interactive nature. So we give them
+		 * the credit of extending their sleep time to the period
+		 * of time they spend on the runqueue, waiting for execution
+		 * on a CPU, first time around:
+		 */
+		if (in_interrupt())
+			p->activated = 2;
+		else {
+			/*
+			 * Normal first-time wakeups get a credit too for
+			 * on-runqueue time, but it will be weighted down:
+			 */
+			p->activated = 1;
+		}
+	}
+	p->timestamp = now;
+
+	__activate_task(p, rq);
+}
+
+/*
+ * deactivate_task - remove a task from the runqueue.
+ */
+static void deactivate_task(struct task_struct *p, runqueue_t *rq)
+{
+	rq->nr_running--;
+	dequeue_task(p, p->array);
+	p->array = NULL;
+}
+
+/*
+ * resched_task - mark a task 'to be rescheduled now'.
+ *
+ * On UP this means the setting of the need_resched flag, on SMP it
+ * might also involve a cross-CPU call to trigger the scheduler on
+ * the target CPU.
+ */
+#ifdef CONFIG_SMP
+static void resched_task(task_t *p)
+{
+	int need_resched, nrpolling;
+
+	assert_spin_locked(&task_rq(p)->lock);
+
+	/* minimise the chance of sending an interrupt to poll_idle() */
+	nrpolling = test_tsk_thread_flag(p,TIF_POLLING_NRFLAG);
+	need_resched = test_and_set_tsk_thread_flag(p,TIF_NEED_RESCHED);
+	nrpolling |= test_tsk_thread_flag(p,TIF_POLLING_NRFLAG);
+
+	if (!need_resched && !nrpolling && (task_cpu(p) != smp_processor_id()))
+		smp_send_reschedule(task_cpu(p));
+}
+#else
+static inline void resched_task(task_t *p)
+{
+	set_tsk_need_resched(p);
+}
+#endif
+
+/**
+ * task_curr - is this task currently executing on a CPU?
+ * @p: the task in question.
+ */
+inline int task_curr(const task_t *p)
+{
+	return cpu_curr(task_cpu(p)) == p;
+}
+
+#ifdef CONFIG_SMP
+enum request_type {
+	REQ_MOVE_TASK,
+	REQ_SET_DOMAIN,
+};
+
+typedef struct {
+	struct list_head list;
+	enum request_type type;
+
+	/* For REQ_MOVE_TASK */
+	task_t *task;
+	int dest_cpu;
+
+	/* For REQ_SET_DOMAIN */
+	struct sched_domain *sd;
+
+	struct completion done;
+} migration_req_t;
+
+/*
+ * The task's runqueue lock must be held.
+ * Returns true if you have to wait for migration thread.
+ */
+static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req)
+{
+	runqueue_t *rq = task_rq(p);
+
+	/*
+	 * If the task is not on a runqueue (and not running), then
+	 * it is sufficient to simply update the task's cpu field.
+	 */
+	if (!p->array && !task_running(rq, p)) {
+		set_task_cpu(p, dest_cpu);
+		return 0;
+	}
+
+	init_completion(&req->done);
+	req->type = REQ_MOVE_TASK;
+	req->task = p;
+	req->dest_cpu = dest_cpu;
+	list_add(&req->list, &rq->migration_queue);
+	return 1;
+}
+
+/*
+ * wait_task_inactive - wait for a thread to unschedule.
+ *
+ * The caller must ensure that the task *will* unschedule sometime soon,
+ * else this function might spin for a *long* time. This function can't
+ * be called with interrupts off, or it may introduce deadlock with
+ * smp_call_function() if an IPI is sent by the same process we are
+ * waiting to become inactive.
+ */
+void wait_task_inactive(task_t * p)
+{
+	unsigned long flags;
+	runqueue_t *rq;
+	int preempted;
+
+repeat:
+	rq = task_rq_lock(p, &flags);
+	/* Must be off runqueue entirely, not preempted. */
+	if (unlikely(p->array || task_running(rq, p))) {
+		/* If it's preempted, we yield.  It could be a while. */
+		preempted = !task_running(rq, p);
+		task_rq_unlock(rq, &flags);
+		cpu_relax();
+		if (preempted)
+			yield();
+		goto repeat;
+	}
+	task_rq_unlock(rq, &flags);
+}
+
+/***
+ * kick_process - kick a running thread to enter/exit the kernel
+ * @p: the to-be-kicked thread
+ *
+ * Cause a process which is running on another CPU to enter
+ * kernel-mode, without any delay. (to get signals handled.)
+ *
+ * NOTE: this function doesnt have to take the runqueue lock,
+ * because all it wants to ensure is that the remote task enters
+ * the kernel. If the IPI races and the task has been migrated
+ * to another CPU then no harm is done and the purpose has been
+ * achieved as well.
+ */
+void kick_process(task_t *p)
+{
+	int cpu;
+
+	preempt_disable();
+	cpu = task_cpu(p);
+	if ((cpu != smp_processor_id()) && task_curr(p))
+		smp_send_reschedule(cpu);
+	preempt_enable();
+}
+
+/*
+ * Return a low guess at the load of a migration-source cpu.
+ *
+ * We want to under-estimate the load of migration sources, to
+ * balance conservatively.
+ */
+static inline unsigned long source_load(int cpu)
+{
+	runqueue_t *rq = cpu_rq(cpu);
+	unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
+
+	return min(rq->cpu_load, load_now);
+}
+
+/*
+ * Return a high guess at the load of a migration-target cpu
+ */
+static inline unsigned long target_load(int cpu)
+{
+	runqueue_t *rq = cpu_rq(cpu);
+	unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
+
+	return max(rq->cpu_load, load_now);
+}
+
+#endif
+
+/*
+ * wake_idle() will wake a task on an idle cpu if task->cpu is
+ * not idle and an idle cpu is available.  The span of cpus to
+ * search starts with cpus closest then further out as needed,
+ * so we always favor a closer, idle cpu.
+ *
+ * Returns the CPU we should wake onto.
+ */
+#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
+static int wake_idle(int cpu, task_t *p)
+{
+	cpumask_t tmp;
+	struct sched_domain *sd;
+	int i;
+
+	if (idle_cpu(cpu))
+		return cpu;
+
+	for_each_domain(cpu, sd) {
+		if (sd->flags & SD_WAKE_IDLE) {
+			cpus_and(tmp, sd->span, cpu_online_map);
+			cpus_and(tmp, tmp, p->cpus_allowed);
+			for_each_cpu_mask(i, tmp) {
+				if (idle_cpu(i))
+					return i;
+			}
+		}
+		else break;
+	}
+	return cpu;
+}
+#else
+static inline int wake_idle(int cpu, task_t *p)
+{
+	return cpu;
+}
+#endif
+
+/***
+ * try_to_wake_up - wake up a thread
+ * @p: the to-be-woken-up thread
+ * @state: the mask of task states that can be woken
+ * @sync: do a synchronous wakeup?
+ *
+ * Put it on the run-queue if it's not already there. The "current"
+ * thread is always on the run-queue (except when the actual
+ * re-schedule is in progress), and as such you're allowed to do
+ * the simpler "current->state = TASK_RUNNING" to mark yourself
+ * runnable without the overhead of this.
+ *
+ * returns failure only if the task is already active.
+ */
+static int try_to_wake_up(task_t * p, unsigned int state, int sync)
+{
+	int cpu, this_cpu, success = 0;
+	unsigned long flags;
+	long old_state;
+	runqueue_t *rq;
+#ifdef CONFIG_SMP
+	unsigned long load, this_load;
+	struct sched_domain *sd;
+	int new_cpu;
+#endif
+
+	rq = task_rq_lock(p, &flags);
+	old_state = p->state;
+	if (!(old_state & state))
+		goto out;
+
+	if (p->array)
+		goto out_running;
+
+	cpu = task_cpu(p);
+	this_cpu = smp_processor_id();
+
+#ifdef CONFIG_SMP
+	if (unlikely(task_running(rq, p)))
+		goto out_activate;
+
+#ifdef CONFIG_SCHEDSTATS
+	schedstat_inc(rq, ttwu_cnt);
+	if (cpu == this_cpu) {
+		schedstat_inc(rq, ttwu_local);
+	} else {
+		for_each_domain(this_cpu, sd) {
+			if (cpu_isset(cpu, sd->span)) {
+				schedstat_inc(sd, ttwu_wake_remote);
+				break;
+			}
+		}
+	}
+#endif
+
+	new_cpu = cpu;
+	if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
+		goto out_set_cpu;
+
+	load = source_load(cpu);
+	this_load = target_load(this_cpu);
+
+	/*
+	 * If sync wakeup then subtract the (maximum possible) effect of
+	 * the currently running task from the load of the current CPU:
+	 */
+	if (sync)
+		this_load -= SCHED_LOAD_SCALE;
+
+	/* Don't pull the task off an idle CPU to a busy one */
+	if (load < SCHED_LOAD_SCALE/2 && this_load > SCHED_LOAD_SCALE/2)
+		goto out_set_cpu;
+
+	new_cpu = this_cpu; /* Wake to this CPU if we can */
+
+	/*
+	 * Scan domains for affine wakeup and passive balancing
+	 * possibilities.
+	 */
+	for_each_domain(this_cpu, sd) {
+		unsigned int imbalance;
+		/*
+		 * Start passive balancing when half the imbalance_pct
+		 * limit is reached.
+		 */
+		imbalance = sd->imbalance_pct + (sd->imbalance_pct - 100) / 2;
+
+		if ((sd->flags & SD_WAKE_AFFINE) &&
+				!task_hot(p, rq->timestamp_last_tick, sd)) {
+			/*
+			 * This domain has SD_WAKE_AFFINE and p is cache cold
+			 * in this domain.
+			 */
+			if (cpu_isset(cpu, sd->span)) {
+				schedstat_inc(sd, ttwu_move_affine);
+				goto out_set_cpu;
+			}
+		} else if ((sd->flags & SD_WAKE_BALANCE) &&
+				imbalance*this_load <= 100*load) {
+			/*
+			 * This domain has SD_WAKE_BALANCE and there is
+			 * an imbalance.
+			 */
+			if (cpu_isset(cpu, sd->span)) {
+				schedstat_inc(sd, ttwu_move_balance);
+				goto out_set_cpu;
+			}
+		}
+	}
+
+	new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
+out_set_cpu:
+	new_cpu = wake_idle(new_cpu, p);
+	if (new_cpu != cpu) {
+		set_task_cpu(p, new_cpu);
+		task_rq_unlock(rq, &flags);
+		/* might preempt at this point */
+		rq = task_rq_lock(p, &flags);
+		old_state = p->state;
+		if (!(old_state & state))
+			goto out;
+		if (p->array)
+			goto out_running;
+
+		this_cpu = smp_processor_id();
+		cpu = task_cpu(p);
+	}
+
+out_activate:
+#endif /* CONFIG_SMP */
+	if (old_state == TASK_UNINTERRUPTIBLE) {
+		rq->nr_uninterruptible--;
+		/*
+		 * Tasks on involuntary sleep don't earn
+		 * sleep_avg beyond just interactive state.
+		 */
+		p->activated = -1;
+	}
+
+	/*
+	 * Sync wakeups (i.e. those types of wakeups where the waker
+	 * has indicated that it will leave the CPU in short order)
+	 * don't trigger a preemption, if the woken up task will run on
+	 * this cpu. (in this case the 'I will reschedule' promise of
+	 * the waker guarantees that the freshly woken up task is going
+	 * to be considered on this CPU.)
+	 */
+	activate_task(p, rq, cpu == this_cpu);
+	if (!sync || cpu != this_cpu) {
+		if (TASK_PREEMPTS_CURR(p, rq))
+			resched_task(rq->curr);
+	}
+	success = 1;
+
+out_running:
+	p->state = TASK_RUNNING;
+out:
+	task_rq_unlock(rq, &flags);
+
+	return success;
+}
+
+int fastcall wake_up_process(task_t * p)
+{
+	return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED |
+				 TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
+}
+
+EXPORT_SYMBOL(wake_up_process);
+
+int fastcall wake_up_state(task_t *p, unsigned int state)
+{
+	return try_to_wake_up(p, state, 0);
+}
+
+#ifdef CONFIG_SMP
+static int find_idlest_cpu(struct task_struct *p, int this_cpu,
+			   struct sched_domain *sd);
+#endif
+
+/*
+ * Perform scheduler related setup for a newly forked process p.
+ * p is forked by current.
+ */
+void fastcall sched_fork(task_t *p)
+{
+	/*
+	 * We mark the process as running here, but have not actually
+	 * inserted it onto the runqueue yet. This guarantees that
+	 * nobody will actually run it, and a signal or other external
+	 * event cannot wake it up and insert it on the runqueue either.
+	 */
+	p->state = TASK_RUNNING;
+	INIT_LIST_HEAD(&p->run_list);
+	p->array = NULL;
+	spin_lock_init(&p->switch_lock);
+#ifdef CONFIG_SCHEDSTATS
+	memset(&p->sched_info, 0, sizeof(p->sched_info));
+#endif
+#ifdef CONFIG_PREEMPT
+	/*
+	 * During context-switch we hold precisely one spinlock, which
+	 * schedule_tail drops. (in the common case it's this_rq()->lock,
+	 * but it also can be p->switch_lock.) So we compensate with a count
+	 * of 1. Also, we want to start with kernel preemption disabled.
+	 */
+	p->thread_info->preempt_count = 1;
+#endif
+	/*
+	 * Share the timeslice between parent and child, thus the
+	 * total amount of pending timeslices in the system doesn't change,
+	 * resulting in more scheduling fairness.
+	 */
+	local_irq_disable();
+	p->time_slice = (current->time_slice + 1) >> 1;
+	/*
+	 * The remainder of the first timeslice might be recovered by
+	 * the parent if the child exits early enough.
+	 */
+	p->first_time_slice = 1;
+	current->time_slice >>= 1;
+	p->timestamp = sched_clock();
+	if (unlikely(!current->time_slice)) {
+		/*
+		 * This case is rare, it happens when the parent has only
+		 * a single jiffy left from its timeslice. Taking the
+		 * runqueue lock is not a problem.
+		 */
+		current->time_slice = 1;
+		preempt_disable();
+		scheduler_tick();
+		local_irq_enable();
+		preempt_enable();
+	} else
+		local_irq_enable();
+}
+
+/*
+ * wake_up_new_task - wake up a newly created task for the first time.
+ *
+ * This function will do some initial scheduler statistics housekeeping
+ * that must be done for every newly created context, then puts the task
+ * on the runqueue and wakes it.
+ */
+void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags)
+{
+	unsigned long flags;
+	int this_cpu, cpu;
+	runqueue_t *rq, *this_rq;
+
+	rq = task_rq_lock(p, &flags);
+	cpu = task_cpu(p);
+	this_cpu = smp_processor_id();
+
+	BUG_ON(p->state != TASK_RUNNING);
+
+	/*
+	 * We decrease the sleep average of forking parents
+	 * and children as well, to keep max-interactive tasks
+	 * from forking tasks that are max-interactive. The parent
+	 * (current) is done further down, under its lock.
+	 */
+	p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) *
+		CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
+
+	p->prio = effective_prio(p);
+
+	if (likely(cpu == this_cpu)) {
+		if (!(clone_flags & CLONE_VM)) {
+			/*
+			 * The VM isn't cloned, so we're in a good position to
+			 * do child-runs-first in anticipation of an exec. This
+			 * usually avoids a lot of COW overhead.
+			 */
+			if (unlikely(!current->array))
+				__activate_task(p, rq);
+			else {
+				p->prio = current->prio;
+				list_add_tail(&p->run_list, &current->run_list);
+				p->array = current->array;
+				p->array->nr_active++;
+				rq->nr_running++;
+			}
+			set_need_resched();
+		} else
+			/* Run child last */
+			__activate_task(p, rq);
+		/*
+		 * We skip the following code due to cpu == this_cpu
+	 	 *
+		 *   task_rq_unlock(rq, &flags);
+		 *   this_rq = task_rq_lock(current, &flags);
+		 */
+		this_rq = rq;
+	} else {
+		this_rq = cpu_rq(this_cpu);
+
+		/*
+		 * Not the local CPU - must adjust timestamp. This should
+		 * get optimised away in the !CONFIG_SMP case.
+		 */
+		p->timestamp = (p->timestamp - this_rq->timestamp_last_tick)
+					+ rq->timestamp_last_tick;
+		__activate_task(p, rq);
+		if (TASK_PREEMPTS_CURR(p, rq))
+			resched_task(rq->curr);
+
+		/*
+		 * Parent and child are on different CPUs, now get the
+		 * parent runqueue to update the parent's ->sleep_avg:
+		 */
+		task_rq_unlock(rq, &flags);
+		this_rq = task_rq_lock(current, &flags);
+	}
+	current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) *
+		PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
+	task_rq_unlock(this_rq, &flags);
+}
+
+/*
+ * Potentially available exiting-child timeslices are
+ * retrieved here - this way the parent does not get
+ * penalized for creating too many threads.
+ *
+ * (this cannot be used to 'generate' timeslices
+ * artificially, because any timeslice recovered here
+ * was given away by the parent in the first place.)
+ */
+void fastcall sched_exit(task_t * p)
+{
+	unsigned long flags;
+	runqueue_t *rq;
+
+	/*
+	 * If the child was a (relative-) CPU hog then decrease
+	 * the sleep_avg of the parent as well.
+	 */
+	rq = task_rq_lock(p->parent, &flags);
+	if (p->first_time_slice) {
+		p->parent->time_slice += p->time_slice;
+		if (unlikely(p->parent->time_slice > task_timeslice(p)))
+			p->parent->time_slice = task_timeslice(p);
+	}
+	if (p->sleep_avg < p->parent->sleep_avg)
+		p->parent->sleep_avg = p->parent->sleep_avg /
+		(EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg /
+		(EXIT_WEIGHT + 1);
+	task_rq_unlock(rq, &flags);
+}
+
+/**
+ * finish_task_switch - clean up after a task-switch
+ * @prev: the thread we just switched away from.
+ *
+ * We enter this with the runqueue still locked, and finish_arch_switch()
+ * will unlock it along with doing any other architecture-specific cleanup
+ * actions.
+ *
+ * Note that we may have delayed dropping an mm in context_switch(). If
+ * so, we finish that here outside of the runqueue lock.  (Doing it
+ * with the lock held can cause deadlocks; see schedule() for
+ * details.)
+ */
+static inline void finish_task_switch(task_t *prev)
+	__releases(rq->lock)
+{
+	runqueue_t *rq = this_rq();
+	struct mm_struct *mm = rq->prev_mm;
+	unsigned long prev_task_flags;
+
+	rq->prev_mm = NULL;
+
+	/*
+	 * A task struct has one reference for the use as "current".
+	 * If a task dies, then it sets EXIT_ZOMBIE in tsk->exit_state and
+	 * calls schedule one last time. The schedule call will never return,
+	 * and the scheduled task must drop that reference.
+	 * The test for EXIT_ZOMBIE must occur while the runqueue locks are
+	 * still held, otherwise prev could be scheduled on another cpu, die
+	 * there before we look at prev->state, and then the reference would
+	 * be dropped twice.
+	 *		Manfred Spraul <manfred@colorfullife.com>
+	 */
+	prev_task_flags = prev->flags;
+	finish_arch_switch(rq, prev);
+	if (mm)
+		mmdrop(mm);
+	if (unlikely(prev_task_flags & PF_DEAD))
+		put_task_struct(prev);
+}
+
+/**
+ * schedule_tail - first thing a freshly forked thread must call.
+ * @prev: the thread we just switched away from.
+ */
+asmlinkage void schedule_tail(task_t *prev)
+	__releases(rq->lock)
+{
+	finish_task_switch(prev);
+
+	if (current->set_child_tid)
+		put_user(current->pid, current->set_child_tid);
+}
+
+/*
+ * context_switch - switch to the new MM and the new
+ * thread's register state.
+ */
+static inline
+task_t * context_switch(runqueue_t *rq, task_t *prev, task_t *next)
+{
+	struct mm_struct *mm = next->mm;
+	struct mm_struct *oldmm = prev->active_mm;
+
+	if (unlikely(!mm)) {
+		next->active_mm = oldmm;
+		atomic_inc(&oldmm->mm_count);
+		enter_lazy_tlb(oldmm, next);
+	} else
+		switch_mm(oldmm, mm, next);
+
+	if (unlikely(!prev->mm)) {
+		prev->active_mm = NULL;
+		WARN_ON(rq->prev_mm);
+		rq->prev_mm = oldmm;
+	}
+
+	/* Here we just switch the register state and the stack. */
+	switch_to(prev, next, prev);
+
+	return prev;
+}
+
+/*
+ * nr_running, nr_uninterruptible and nr_context_switches:
+ *
+ * externally visible scheduler statistics: current number of runnable
+ * threads, current number of uninterruptible-sleeping threads, total
+ * number of context switches performed since bootup.
+ */
+unsigned long nr_running(void)
+{
+	unsigned long i, sum = 0;
+
+	for_each_online_cpu(i)
+		sum += cpu_rq(i)->nr_running;
+
+	return sum;
+}
+
+unsigned long nr_uninterruptible(void)
+{
+	unsigned long i, sum = 0;
+
+	for_each_cpu(i)
+		sum += cpu_rq(i)->nr_uninterruptible;
+
+	/*
+	 * Since we read the counters lockless, it might be slightly
+	 * inaccurate. Do not allow it to go below zero though:
+	 */
+	if (unlikely((long)sum < 0))
+		sum = 0;
+
+	return sum;
+}
+
+unsigned long long nr_context_switches(void)
+{
+	unsigned long long i, sum = 0;
+
+	for_each_cpu(i)
+		sum += cpu_rq(i)->nr_switches;
+
+	return sum;
+}
+
+unsigned long nr_iowait(void)
+{
+	unsigned long i, sum = 0;
+
+	for_each_cpu(i)
+		sum += atomic_read(&cpu_rq(i)->nr_iowait);
+
+	return sum;
+}
+
+#ifdef CONFIG_SMP
+
+/*
+ * double_rq_lock - safely lock two runqueues
+ *
+ * Note this does not disable interrupts like task_rq_lock,
+ * you need to do so manually before calling.
+ */
+static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
+	__acquires(rq1->lock)
+	__acquires(rq2->lock)
+{
+	if (rq1 == rq2) {
+		spin_lock(&rq1->lock);
+		__acquire(rq2->lock);	/* Fake it out ;) */
+	} else {
+		if (rq1 < rq2) {
+			spin_lock(&rq1->lock);
+			spin_lock(&rq2->lock);
+		} else {
+			spin_lock(&rq2->lock);
+			spin_lock(&rq1->lock);
+		}
+	}
+}
+
+/*
+ * double_rq_unlock - safely unlock two runqueues
+ *
+ * Note this does not restore interrupts like task_rq_unlock,
+ * you need to do so manually after calling.
+ */
+static void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2)
+	__releases(rq1->lock)
+	__releases(rq2->lock)
+{
+	spin_unlock(&rq1->lock);
+	if (rq1 != rq2)
+		spin_unlock(&rq2->lock);
+	else
+		__release(rq2->lock);
+}
+
+/*
+ * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
+ */
+static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest)
+	__releases(this_rq->lock)
+	__acquires(busiest->lock)
+	__acquires(this_rq->lock)
+{
+	if (unlikely(!spin_trylock(&busiest->lock))) {
+		if (busiest < this_rq) {
+			spin_unlock(&this_rq->lock);
+			spin_lock(&busiest->lock);
+			spin_lock(&this_rq->lock);
+		} else
+			spin_lock(&busiest->lock);
+	}
+}
+
+/*
+ * find_idlest_cpu - find the least busy runqueue.
+ */
+static int find_idlest_cpu(struct task_struct *p, int this_cpu,
+			   struct sched_domain *sd)
+{
+	unsigned long load, min_load, this_load;
+	int i, min_cpu;
+	cpumask_t mask;
+
+	min_cpu = UINT_MAX;
+	min_load = ULONG_MAX;
+
+	cpus_and(mask, sd->span, p->cpus_allowed);
+
+	for_each_cpu_mask(i, mask) {
+		load = target_load(i);
+
+		if (load < min_load) {
+			min_cpu = i;
+			min_load = load;
+
+			/* break out early on an idle CPU: */
+			if (!min_load)
+				break;
+		}
+	}
+
+	/* add +1 to account for the new task */
+	this_load = source_load(this_cpu) + SCHED_LOAD_SCALE;
+
+	/*
+	 * Would with the addition of the new task to the
+	 * current CPU there be an imbalance between this
+	 * CPU and the idlest CPU?
+	 *
+	 * Use half of the balancing threshold - new-context is
+	 * a good opportunity to balance.
+	 */
+	if (min_load*(100 + (sd->imbalance_pct-100)/2) < this_load*100)
+		return min_cpu;
+
+	return this_cpu;
+}
+
+/*
+ * If dest_cpu is allowed for this process, migrate the task to it.
+ * This is accomplished by forcing the cpu_allowed mask to only
+ * allow dest_cpu, which will force the cpu onto dest_cpu.  Then
+ * the cpu_allowed mask is restored.
+ */
+static void sched_migrate_task(task_t *p, int dest_cpu)
+{
+	migration_req_t req;
+	runqueue_t *rq;
+	unsigned long flags;
+
+	rq = task_rq_lock(p, &flags);
+	if (!cpu_isset(dest_cpu, p->cpus_allowed)
+	    || unlikely(cpu_is_offline(dest_cpu)))
+		goto out;
+
+	/* force the process onto the specified CPU */
+	if (migrate_task(p, dest_cpu, &req)) {
+		/* Need to wait for migration thread (might exit: take ref). */
+		struct task_struct *mt = rq->migration_thread;
+		get_task_struct(mt);
+		task_rq_unlock(rq, &flags);
+		wake_up_process(mt);
+		put_task_struct(mt);
+		wait_for_completion(&req.done);
+		return;
+	}
+out:
+	task_rq_unlock(rq, &flags);
+}
+
+/*
+ * sched_exec(): find the highest-level, exec-balance-capable
+ * domain and try to migrate the task to the least loaded CPU.
+ *
+ * execve() is a valuable balancing opportunity, because at this point
+ * the task has the smallest effective memory and cache footprint.
+ */
+void sched_exec(void)
+{
+	struct sched_domain *tmp, *sd = NULL;
+	int new_cpu, this_cpu = get_cpu();
+
+	/* Prefer the current CPU if there's only this task running */
+	if (this_rq()->nr_running <= 1)
+		goto out;
+
+	for_each_domain(this_cpu, tmp)
+		if (tmp->flags & SD_BALANCE_EXEC)
+			sd = tmp;
+
+	if (sd) {
+		schedstat_inc(sd, sbe_attempts);
+		new_cpu = find_idlest_cpu(current, this_cpu, sd);
+		if (new_cpu != this_cpu) {
+			schedstat_inc(sd, sbe_pushed);
+			put_cpu();
+			sched_migrate_task(current, new_cpu);
+			return;
+		}
+	}
+out:
+	put_cpu();
+}
+
+/*
+ * pull_task - move a task from a remote runqueue to the local runqueue.
+ * Both runqueues must be locked.
+ */
+static inline
+void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
+	       runqueue_t *this_rq, prio_array_t *this_array, int this_cpu)
+{
+	dequeue_task(p, src_array);
+	src_rq->nr_running--;
+	set_task_cpu(p, this_cpu);
+	this_rq->nr_running++;
+	enqueue_task(p, this_array);
+	p->timestamp = (p->timestamp - src_rq->timestamp_last_tick)
+				+ this_rq->timestamp_last_tick;
+	/*
+	 * Note that idle threads have a prio of MAX_PRIO, for this test
+	 * to be always true for them.
+	 */
+	if (TASK_PREEMPTS_CURR(p, this_rq))
+		resched_task(this_rq->curr);
+}
+
+/*
+ * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
+ */
+static inline
+int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
+		     struct sched_domain *sd, enum idle_type idle)
+{
+	/*
+	 * We do not migrate tasks that are:
+	 * 1) running (obviously), or
+	 * 2) cannot be migrated to this CPU due to cpus_allowed, or
+	 * 3) are cache-hot on their current CPU.
+	 */
+	if (task_running(rq, p))
+		return 0;
+	if (!cpu_isset(this_cpu, p->cpus_allowed))
+		return 0;
+
+	/*
+	 * Aggressive migration if:
+	 * 1) the [whole] cpu is idle, or
+	 * 2) too many balance attempts have failed.
+	 */
+
+	if (cpu_and_siblings_are_idle(this_cpu) || \
+			sd->nr_balance_failed > sd->cache_nice_tries)
+		return 1;
+
+	if (task_hot(p, rq->timestamp_last_tick, sd))
+			return 0;
+	return 1;
+}
+
+/*
+ * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq,
+ * as part of a balancing operation within "domain". Returns the number of
+ * tasks moved.
+ *
+ * Called with both runqueues locked.
+ */
+static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
+		      unsigned long max_nr_move, struct sched_domain *sd,
+		      enum idle_type idle)
+{
+	prio_array_t *array, *dst_array;
+	struct list_head *head, *curr;
+	int idx, pulled = 0;
+	task_t *tmp;
+
+	if (max_nr_move <= 0 || busiest->nr_running <= 1)
+		goto out;
+
+	/*
+	 * We first consider expired tasks. Those will likely not be
+	 * executed in the near future, and they are most likely to
+	 * be cache-cold, thus switching CPUs has the least effect
+	 * on them.
+	 */
+	if (busiest->expired->nr_active) {
+		array = busiest->expired;
+		dst_array = this_rq->expired;
+	} else {
+		array = busiest->active;
+		dst_array = this_rq->active;
+	}
+
+new_array:
+	/* Start searching at priority 0: */
+	idx = 0;
+skip_bitmap:
+	if (!idx)
+		idx = sched_find_first_bit(array->bitmap);
+	else
+		idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
+	if (idx >= MAX_PRIO) {
+		if (array == busiest->expired && busiest->active->nr_active) {
+			array = busiest->active;
+			dst_array = this_rq->active;
+			goto new_array;
+		}
+		goto out;
+	}
+
+	head = array->queue + idx;
+	curr = head->prev;
+skip_queue:
+	tmp = list_entry(curr, task_t, run_list);
+
+	curr = curr->prev;
+
+	if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) {
+		if (curr != head)
+			goto skip_queue;
+		idx++;
+		goto skip_bitmap;
+	}
+
+#ifdef CONFIG_SCHEDSTATS
+	if (task_hot(tmp, busiest->timestamp_last_tick, sd))
+		schedstat_inc(sd, lb_hot_gained[idle]);
+#endif
+
+	pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
+	pulled++;
+
+	/* We only want to steal up to the prescribed number of tasks. */
+	if (pulled < max_nr_move) {
+		if (curr != head)
+			goto skip_queue;
+		idx++;
+		goto skip_bitmap;
+	}
+out:
+	/*
+	 * Right now, this is the only place pull_task() is called,
+	 * so we can safely collect pull_task() stats here rather than
+	 * inside pull_task().
+	 */
+	schedstat_add(sd, lb_gained[idle], pulled);
+	return pulled;
+}
+
+/*
+ * find_busiest_group finds and returns the busiest CPU group within the
+ * domain. It calculates and returns the number of tasks which should be
+ * moved to restore balance via the imbalance parameter.
+ */
+static struct sched_group *
+find_busiest_group(struct sched_domain *sd, int this_cpu,
+		   unsigned long *imbalance, enum idle_type idle)
+{
+	struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
+	unsigned long max_load, avg_load, total_load, this_load, total_pwr;
+
+	max_load = this_load = total_load = total_pwr = 0;
+
+	do {
+		unsigned long load;
+		int local_group;
+		int i;
+
+		local_group = cpu_isset(this_cpu, group->cpumask);
+
+		/* Tally up the load of all CPUs in the group */
+		avg_load = 0;
+
+		for_each_cpu_mask(i, group->cpumask) {
+			/* Bias balancing toward cpus of our domain */
+			if (local_group)
+				load = target_load(i);
+			else
+				load = source_load(i);
+
+			avg_load += load;
+		}
+
+		total_load += avg_load;
+		total_pwr += group->cpu_power;
+
+		/* Adjust by relative CPU power of the group */
+		avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
+
+		if (local_group) {
+			this_load = avg_load;
+			this = group;
+			goto nextgroup;
+		} else if (avg_load > max_load) {
+			max_load = avg_load;
+			busiest = group;
+		}
+nextgroup:
+		group = group->next;
+	} while (group != sd->groups);
+
+	if (!busiest || this_load >= max_load)
+		goto out_balanced;
+
+	avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
+
+	if (this_load >= avg_load ||
+			100*max_load <= sd->imbalance_pct*this_load)
+		goto out_balanced;
+
+	/*
+	 * We're trying to get all the cpus to the average_load, so we don't
+	 * want to push ourselves above the average load, nor do we wish to
+	 * reduce the max loaded cpu below the average load, as either of these
+	 * actions would just result in more rebalancing later, and ping-pong
+	 * tasks around. Thus we look for the minimum possible imbalance.
+	 * Negative imbalances (*we* are more loaded than anyone else) will
+	 * be counted as no imbalance for these purposes -- we can't fix that
+	 * by pulling tasks to us.  Be careful of negative numbers as they'll
+	 * appear as very large values with unsigned longs.
+	 */
+	/* How much load to actually move to equalise the imbalance */
+	*imbalance = min((max_load - avg_load) * busiest->cpu_power,
+				(avg_load - this_load) * this->cpu_power)
+			/ SCHED_LOAD_SCALE;
+
+	if (*imbalance < SCHED_LOAD_SCALE) {
+		unsigned long pwr_now = 0, pwr_move = 0;
+		unsigned long tmp;
+
+		if (max_load - this_load >= SCHED_LOAD_SCALE*2) {
+			*imbalance = 1;
+			return busiest;
+		}
+
+		/*
+		 * OK, we don't have enough imbalance to justify moving tasks,
+		 * however we may be able to increase total CPU power used by
+		 * moving them.
+		 */
+
+		pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load);
+		pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load);
+		pwr_now /= SCHED_LOAD_SCALE;
+
+		/* Amount of load we'd subtract */
+		tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power;
+		if (max_load > tmp)
+			pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE,
+							max_load - tmp);
+
+		/* Amount of load we'd add */
+		if (max_load*busiest->cpu_power <
+				SCHED_LOAD_SCALE*SCHED_LOAD_SCALE)
+			tmp = max_load*busiest->cpu_power/this->cpu_power;
+		else
+			tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power;
+		pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp);
+		pwr_move /= SCHED_LOAD_SCALE;
+
+		/* Move if we gain throughput */
+		if (pwr_move <= pwr_now)
+			goto out_balanced;
+
+		*imbalance = 1;
+		return busiest;
+	}
+
+	/* Get rid of the scaling factor, rounding down as we divide */
+	*imbalance = *imbalance / SCHED_LOAD_SCALE;
+
+	return busiest;
+
+out_balanced:
+	if (busiest && (idle == NEWLY_IDLE ||
+			(idle == SCHED_IDLE && max_load > SCHED_LOAD_SCALE)) ) {
+		*imbalance = 1;
+		return busiest;
+	}
+
+	*imbalance = 0;
+	return NULL;
+}
+
+/*
+ * find_busiest_queue - find the busiest runqueue among the cpus in group.
+ */
+static runqueue_t *find_busiest_queue(struct sched_group *group)
+{
+	unsigned long load, max_load = 0;
+	runqueue_t *busiest = NULL;
+	int i;
+
+	for_each_cpu_mask(i, group->cpumask) {
+		load = source_load(i);
+
+		if (load > max_load) {
+			max_load = load;
+			busiest = cpu_rq(i);
+		}
+	}
+
+	return busiest;
+}
+
+/*
+ * Check this_cpu to ensure it is balanced within domain. Attempt to move
+ * tasks if there is an imbalance.
+ *
+ * Called with this_rq unlocked.
+ */
+static int load_balance(int this_cpu, runqueue_t *this_rq,
+			struct sched_domain *sd, enum idle_type idle)
+{
+	struct sched_group *group;
+	runqueue_t *busiest;
+	unsigned long imbalance;
+	int nr_moved;
+
+	spin_lock(&this_rq->lock);
+	schedstat_inc(sd, lb_cnt[idle]);
+
+	group = find_busiest_group(sd, this_cpu, &imbalance, idle);
+	if (!group) {
+		schedstat_inc(sd, lb_nobusyg[idle]);
+		goto out_balanced;
+	}
+
+	busiest = find_busiest_queue(group);
+	if (!busiest) {
+		schedstat_inc(sd, lb_nobusyq[idle]);
+		goto out_balanced;
+	}
+
+	/*
+	 * This should be "impossible", but since load
+	 * balancing is inherently racy and statistical,
+	 * it could happen in theory.
+	 */
+	if (unlikely(busiest == this_rq)) {
+		WARN_ON(1);
+		goto out_balanced;
+	}
+
+	schedstat_add(sd, lb_imbalance[idle], imbalance);
+
+	nr_moved = 0;
+	if (busiest->nr_running > 1) {
+		/*
+		 * Attempt to move tasks. If find_busiest_group has found
+		 * an imbalance but busiest->nr_running <= 1, the group is
+		 * still unbalanced. nr_moved simply stays zero, so it is
+		 * correctly treated as an imbalance.
+		 */
+		double_lock_balance(this_rq, busiest);
+		nr_moved = move_tasks(this_rq, this_cpu, busiest,
+						imbalance, sd, idle);
+		spin_unlock(&busiest->lock);
+	}
+	spin_unlock(&this_rq->lock);
+
+	if (!nr_moved) {
+		schedstat_inc(sd, lb_failed[idle]);
+		sd->nr_balance_failed++;
+
+		if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
+			int wake = 0;
+
+			spin_lock(&busiest->lock);
+			if (!busiest->active_balance) {
+				busiest->active_balance = 1;
+				busiest->push_cpu = this_cpu;
+				wake = 1;
+			}
+			spin_unlock(&busiest->lock);
+			if (wake)
+				wake_up_process(busiest->migration_thread);
+
+			/*
+			 * We've kicked active balancing, reset the failure
+			 * counter.
+			 */
+			sd->nr_balance_failed = sd->cache_nice_tries;
+		}
+
+		/*
+		 * We were unbalanced, but unsuccessful in move_tasks(),
+		 * so bump the balance_interval to lessen the lock contention.
+		 */
+		if (sd->balance_interval < sd->max_interval)
+			sd->balance_interval++;
+	} else {
+		sd->nr_balance_failed = 0;
+
+		/* We were unbalanced, so reset the balancing interval */
+		sd->balance_interval = sd->min_interval;
+	}
+
+	return nr_moved;
+
+out_balanced:
+	spin_unlock(&this_rq->lock);
+
+	schedstat_inc(sd, lb_balanced[idle]);
+
+	/* tune up the balancing interval */
+	if (sd->balance_interval < sd->max_interval)
+		sd->balance_interval *= 2;
+
+	return 0;
+}
+
+/*
+ * Check this_cpu to ensure it is balanced within domain. Attempt to move
+ * tasks if there is an imbalance.
+ *
+ * Called from schedule when this_rq is about to become idle (NEWLY_IDLE).
+ * this_rq is locked.
+ */
+static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
+				struct sched_domain *sd)
+{
+	struct sched_group *group;
+	runqueue_t *busiest = NULL;
+	unsigned long imbalance;
+	int nr_moved = 0;
+
+	schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
+	group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE);
+	if (!group) {
+		schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
+		schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
+		goto out;
+	}
+
+	busiest = find_busiest_queue(group);
+	if (!busiest || busiest == this_rq) {
+		schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
+		schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
+		goto out;
+	}
+
+	/* Attempt to move tasks */
+	double_lock_balance(this_rq, busiest);
+
+	schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance);
+	nr_moved = move_tasks(this_rq, this_cpu, busiest,
+					imbalance, sd, NEWLY_IDLE);
+	if (!nr_moved)
+		schedstat_inc(sd, lb_failed[NEWLY_IDLE]);
+
+	spin_unlock(&busiest->lock);
+
+out:
+	return nr_moved;
+}
+
+/*
+ * idle_balance is called by schedule() if this_cpu is about to become
+ * idle. Attempts to pull tasks from other CPUs.
+ */
+static inline void idle_balance(int this_cpu, runqueue_t *this_rq)
+{
+	struct sched_domain *sd;
+
+	for_each_domain(this_cpu, sd) {
+		if (sd->flags & SD_BALANCE_NEWIDLE) {
+			if (load_balance_newidle(this_cpu, this_rq, sd)) {
+				/* We've pulled tasks over so stop searching */
+				break;
+			}
+		}
+	}
+}
+
+/*
+ * active_load_balance is run by migration threads. It pushes running tasks
+ * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
+ * running on each physical CPU where possible, and avoids physical /
+ * logical imbalances.
+ *
+ * Called with busiest_rq locked.
+ */
+static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu)
+{
+	struct sched_domain *sd;
+	struct sched_group *cpu_group;
+	runqueue_t *target_rq;
+	cpumask_t visited_cpus;
+	int cpu;
+
+	/*
+	 * Search for suitable CPUs to push tasks to in successively higher
+	 * domains with SD_LOAD_BALANCE set.
+	 */
+	visited_cpus = CPU_MASK_NONE;
+	for_each_domain(busiest_cpu, sd) {
+		if (!(sd->flags & SD_LOAD_BALANCE))
+			/* no more domains to search */
+			break;
+
+		schedstat_inc(sd, alb_cnt);
+
+		cpu_group = sd->groups;
+		do {
+			for_each_cpu_mask(cpu, cpu_group->cpumask) {
+				if (busiest_rq->nr_running <= 1)
+					/* no more tasks left to move */
+					return;
+				if (cpu_isset(cpu, visited_cpus))
+					continue;
+				cpu_set(cpu, visited_cpus);
+				if (!cpu_and_siblings_are_idle(cpu) || cpu == busiest_cpu)
+					continue;
+
+				target_rq = cpu_rq(cpu);
+				/*
+				 * This condition is "impossible", if it occurs
+				 * we need to fix it.  Originally reported by
+				 * Bjorn Helgaas on a 128-cpu setup.
+				 */
+				BUG_ON(busiest_rq == target_rq);
+
+				/* move a task from busiest_rq to target_rq */
+				double_lock_balance(busiest_rq, target_rq);
+				if (move_tasks(target_rq, cpu, busiest_rq,
+						1, sd, SCHED_IDLE)) {
+					schedstat_inc(sd, alb_pushed);
+				} else {
+					schedstat_inc(sd, alb_failed);
+				}
+				spin_unlock(&target_rq->lock);
+			}
+			cpu_group = cpu_group->next;
+		} while (cpu_group != sd->groups);
+	}
+}
+
+/*
+ * rebalance_tick will get called every timer tick, on every CPU.
+ *
+ * It checks each scheduling domain to see if it is due to be balanced,
+ * and initiates a balancing operation if so.
+ *
+ * Balancing parameters are set up in arch_init_sched_domains.
+ */
+
+/* Don't have all balancing operations going off at once */
+#define CPU_OFFSET(cpu) (HZ * cpu / NR_CPUS)
+
+static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
+			   enum idle_type idle)
+{
+	unsigned long old_load, this_load;
+	unsigned long j = jiffies + CPU_OFFSET(this_cpu);
+	struct sched_domain *sd;
+
+	/* Update our load */
+	old_load = this_rq->cpu_load;
+	this_load = this_rq->nr_running * SCHED_LOAD_SCALE;
+	/*
+	 * Round up the averaging division if load is increasing. This
+	 * prevents us from getting stuck on 9 if the load is 10, for
+	 * example.
+	 */
+	if (this_load > old_load)
+		old_load++;
+	this_rq->cpu_load = (old_load + this_load) / 2;
+
+	for_each_domain(this_cpu, sd) {
+		unsigned long interval;
+
+		if (!(sd->flags & SD_LOAD_BALANCE))
+			continue;
+
+		interval = sd->balance_interval;
+		if (idle != SCHED_IDLE)
+			interval *= sd->busy_factor;
+
+		/* scale ms to jiffies */
+		interval = msecs_to_jiffies(interval);
+		if (unlikely(!interval))
+			interval = 1;
+
+		if (j - sd->last_balance >= interval) {
+			if (load_balance(this_cpu, this_rq, sd, idle)) {
+				/* We've pulled tasks over so no longer idle */
+				idle = NOT_IDLE;
+			}
+			sd->last_balance += interval;
+		}
+	}
+}
+#else
+/*
+ * on UP we do not need to balance between CPUs:
+ */
+static inline void rebalance_tick(int cpu, runqueue_t *rq, enum idle_type idle)
+{
+}
+static inline void idle_balance(int cpu, runqueue_t *rq)
+{
+}
+#endif
+
+static inline int wake_priority_sleeper(runqueue_t *rq)
+{
+	int ret = 0;
+#ifdef CONFIG_SCHED_SMT
+	spin_lock(&rq->lock);
+	/*
+	 * If an SMT sibling task has been put to sleep for priority
+	 * reasons reschedule the idle task to see if it can now run.
+	 */
+	if (rq->nr_running) {
+		resched_task(rq->idle);
+		ret = 1;
+	}
+	spin_unlock(&rq->lock);
+#endif
+	return ret;
+}
+
+DEFINE_PER_CPU(struct kernel_stat, kstat);
+
+EXPORT_PER_CPU_SYMBOL(kstat);
+
+/*
+ * This is called on clock ticks and on context switches.
+ * Bank in p->sched_time the ns elapsed since the last tick or switch.
+ */
+static inline void update_cpu_clock(task_t *p, runqueue_t *rq,
+				    unsigned long long now)
+{
+	unsigned long long last = max(p->timestamp, rq->timestamp_last_tick);
+	p->sched_time += now - last;
+}
+
+/*
+ * Return current->sched_time plus any more ns on the sched_clock
+ * that have not yet been banked.
+ */
+unsigned long long current_sched_time(const task_t *tsk)
+{
+	unsigned long long ns;
+	unsigned long flags;
+	local_irq_save(flags);
+	ns = max(tsk->timestamp, task_rq(tsk)->timestamp_last_tick);
+	ns = tsk->sched_time + (sched_clock() - ns);
+	local_irq_restore(flags);
+	return ns;
+}
+
+/*
+ * We place interactive tasks back into the active array, if possible.
+ *
+ * To guarantee that this does not starve expired tasks we ignore the
+ * interactivity of a task if the first expired task had to wait more
+ * than a 'reasonable' amount of time. This deadline timeout is
+ * load-dependent, as the frequency of array switched decreases with
+ * increasing number of running tasks. We also ignore the interactivity
+ * if a better static_prio task has expired:
+ */
+#define EXPIRED_STARVING(rq) \
+	((STARVATION_LIMIT && ((rq)->expired_timestamp && \
+		(jiffies - (rq)->expired_timestamp >= \
+			STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \
+			((rq)->curr->static_prio > (rq)->best_expired_prio))
+
+/*
+ * Account user cpu time to a process.
+ * @p: the process that the cpu time gets accounted to
+ * @hardirq_offset: the offset to subtract from hardirq_count()
+ * @cputime: the cpu time spent in user space since the last update
+ */
+void account_user_time(struct task_struct *p, cputime_t cputime)
+{
+	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+	cputime64_t tmp;
+
+	p->utime = cputime_add(p->utime, cputime);
+
+	/* Add user time to cpustat. */
+	tmp = cputime_to_cputime64(cputime);
+	if (TASK_NICE(p) > 0)
+		cpustat->nice = cputime64_add(cpustat->nice, tmp);
+	else
+		cpustat->user = cputime64_add(cpustat->user, tmp);
+}
+
+/*
+ * Account system cpu time to a process.
+ * @p: the process that the cpu time gets accounted to
+ * @hardirq_offset: the offset to subtract from hardirq_count()
+ * @cputime: the cpu time spent in kernel space since the last update
+ */
+void account_system_time(struct task_struct *p, int hardirq_offset,
+			 cputime_t cputime)
+{
+	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+	runqueue_t *rq = this_rq();
+	cputime64_t tmp;
+
+	p->stime = cputime_add(p->stime, cputime);
+
+	/* Add system time to cpustat. */
+	tmp = cputime_to_cputime64(cputime);
+	if (hardirq_count() - hardirq_offset)
+		cpustat->irq = cputime64_add(cpustat->irq, tmp);
+	else if (softirq_count())
+		cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
+	else if (p != rq->idle)
+		cpustat->system = cputime64_add(cpustat->system, tmp);
+	else if (atomic_read(&rq->nr_iowait) > 0)
+		cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
+	else
+		cpustat->idle = cputime64_add(cpustat->idle, tmp);
+	/* Account for system time used */
+	acct_update_integrals(p);
+	/* Update rss highwater mark */
+	update_mem_hiwater(p);
+}
+
+/*
+ * Account for involuntary wait time.
+ * @p: the process from which the cpu time has been stolen
+ * @steal: the cpu time spent in involuntary wait
+ */
+void account_steal_time(struct task_struct *p, cputime_t steal)
+{
+	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+	cputime64_t tmp = cputime_to_cputime64(steal);
+	runqueue_t *rq = this_rq();
+
+	if (p == rq->idle) {
+		p->stime = cputime_add(p->stime, steal);
+		if (atomic_read(&rq->nr_iowait) > 0)
+			cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
+		else
+			cpustat->idle = cputime64_add(cpustat->idle, tmp);
+	} else
+		cpustat->steal = cputime64_add(cpustat->steal, tmp);
+}
+
+/*
+ * This function gets called by the timer code, with HZ frequency.
+ * We call it with interrupts disabled.
+ *
+ * It also gets called by the fork code, when changing the parent's
+ * timeslices.
+ */
+void scheduler_tick(void)
+{
+	int cpu = smp_processor_id();
+	runqueue_t *rq = this_rq();
+	task_t *p = current;
+	unsigned long long now = sched_clock();
+
+	update_cpu_clock(p, rq, now);
+
+	rq->timestamp_last_tick = now;
+
+	if (p == rq->idle) {
+		if (wake_priority_sleeper(rq))
+			goto out;
+		rebalance_tick(cpu, rq, SCHED_IDLE);
+		return;
+	}
+
+	/* Task might have expired already, but not scheduled off yet */
+	if (p->array != rq->active) {
+		set_tsk_need_resched(p);
+		goto out;
+	}
+	spin_lock(&rq->lock);
+	/*
+	 * The task was running during this tick - update the
+	 * time slice counter. Note: we do not update a thread's
+	 * priority until it either goes to sleep or uses up its
+	 * timeslice. This makes it possible for interactive tasks
+	 * to use up their timeslices at their highest priority levels.
+	 */
+	if (rt_task(p)) {
+		/*
+		 * RR tasks need a special form of timeslice management.
+		 * FIFO tasks have no timeslices.
+		 */
+		if ((p->policy == SCHED_RR) && !--p->time_slice) {
+			p->time_slice = task_timeslice(p);
+			p->first_time_slice = 0;
+			set_tsk_need_resched(p);
+
+			/* put it at the end of the queue: */
+			requeue_task(p, rq->active);
+		}
+		goto out_unlock;
+	}
+	if (!--p->time_slice) {
+		dequeue_task(p, rq->active);
+		set_tsk_need_resched(p);
+		p->prio = effective_prio(p);
+		p->time_slice = task_timeslice(p);
+		p->first_time_slice = 0;
+
+		if (!rq->expired_timestamp)
+			rq->expired_timestamp = jiffies;
+		if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
+			enqueue_task(p, rq->expired);
+			if (p->static_prio < rq->best_expired_prio)
+				rq->best_expired_prio = p->static_prio;
+		} else
+			enqueue_task(p, rq->active);
+	} else {
+		/*
+		 * Prevent a too long timeslice allowing a task to monopolize
+		 * the CPU. We do this by splitting up the timeslice into
+		 * smaller pieces.
+		 *
+		 * Note: this does not mean the task's timeslices expire or
+		 * get lost in any way, they just might be preempted by
+		 * another task of equal priority. (one with higher
+		 * priority would have preempted this task already.) We
+		 * requeue this task to the end of the list on this priority
+		 * level, which is in essence a round-robin of tasks with
+		 * equal priority.
+		 *
+		 * This only applies to tasks in the interactive
+		 * delta range with at least TIMESLICE_GRANULARITY to requeue.
+		 */
+		if (TASK_INTERACTIVE(p) && !((task_timeslice(p) -
+			p->time_slice) % TIMESLICE_GRANULARITY(p)) &&
+			(p->time_slice >= TIMESLICE_GRANULARITY(p)) &&
+			(p->array == rq->active)) {
+
+			requeue_task(p, rq->active);
+			set_tsk_need_resched(p);
+		}
+	}
+out_unlock:
+	spin_unlock(&rq->lock);
+out:
+	rebalance_tick(cpu, rq, NOT_IDLE);
+}
+
+#ifdef CONFIG_SCHED_SMT
+static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
+{
+	struct sched_domain *sd = this_rq->sd;
+	cpumask_t sibling_map;
+	int i;
+
+	if (!(sd->flags & SD_SHARE_CPUPOWER))
+		return;
+
+	/*
+	 * Unlock the current runqueue because we have to lock in
+	 * CPU order to avoid deadlocks. Caller knows that we might
+	 * unlock. We keep IRQs disabled.
+	 */
+	spin_unlock(&this_rq->lock);
+
+	sibling_map = sd->span;
+
+	for_each_cpu_mask(i, sibling_map)
+		spin_lock(&cpu_rq(i)->lock);
+	/*
+	 * We clear this CPU from the mask. This both simplifies the
+	 * inner loop and keps this_rq locked when we exit:
+	 */
+	cpu_clear(this_cpu, sibling_map);
+
+	for_each_cpu_mask(i, sibling_map) {
+		runqueue_t *smt_rq = cpu_rq(i);
+
+		/*
+		 * If an SMT sibling task is sleeping due to priority
+		 * reasons wake it up now.
+		 */
+		if (smt_rq->curr == smt_rq->idle && smt_rq->nr_running)
+			resched_task(smt_rq->idle);
+	}
+
+	for_each_cpu_mask(i, sibling_map)
+		spin_unlock(&cpu_rq(i)->lock);
+	/*
+	 * We exit with this_cpu's rq still held and IRQs
+	 * still disabled:
+	 */
+}
+
+static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
+{
+	struct sched_domain *sd = this_rq->sd;
+	cpumask_t sibling_map;
+	prio_array_t *array;
+	int ret = 0, i;
+	task_t *p;
+
+	if (!(sd->flags & SD_SHARE_CPUPOWER))
+		return 0;
+
+	/*
+	 * The same locking rules and details apply as for
+	 * wake_sleeping_dependent():
+	 */
+	spin_unlock(&this_rq->lock);
+	sibling_map = sd->span;
+	for_each_cpu_mask(i, sibling_map)
+		spin_lock(&cpu_rq(i)->lock);
+	cpu_clear(this_cpu, sibling_map);
+
+	/*
+	 * Establish next task to be run - it might have gone away because
+	 * we released the runqueue lock above:
+	 */
+	if (!this_rq->nr_running)
+		goto out_unlock;
+	array = this_rq->active;
+	if (!array->nr_active)
+		array = this_rq->expired;
+	BUG_ON(!array->nr_active);
+
+	p = list_entry(array->queue[sched_find_first_bit(array->bitmap)].next,
+		task_t, run_list);
+
+	for_each_cpu_mask(i, sibling_map) {
+		runqueue_t *smt_rq = cpu_rq(i);
+		task_t *smt_curr = smt_rq->curr;
+
+		/*
+		 * If a user task with lower static priority than the
+		 * running task on the SMT sibling is trying to schedule,
+		 * delay it till there is proportionately less timeslice
+		 * left of the sibling task to prevent a lower priority
+		 * task from using an unfair proportion of the
+		 * physical cpu's resources. -ck
+		 */
+		if (((smt_curr->time_slice * (100 - sd->per_cpu_gain) / 100) >
+			task_timeslice(p) || rt_task(smt_curr)) &&
+			p->mm && smt_curr->mm && !rt_task(p))
+				ret = 1;
+
+		/*
+		 * Reschedule a lower priority task on the SMT sibling,
+		 * or wake it up if it has been put to sleep for priority
+		 * reasons.
+		 */
+		if ((((p->time_slice * (100 - sd->per_cpu_gain) / 100) >
+			task_timeslice(smt_curr) || rt_task(p)) &&
+			smt_curr->mm && p->mm && !rt_task(smt_curr)) ||
+			(smt_curr == smt_rq->idle && smt_rq->nr_running))
+				resched_task(smt_curr);
+	}
+out_unlock:
+	for_each_cpu_mask(i, sibling_map)
+		spin_unlock(&cpu_rq(i)->lock);
+	return ret;
+}
+#else
+static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
+{
+}
+
+static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
+{
+	return 0;
+}
+#endif
+
+#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
+
+void fastcall add_preempt_count(int val)
+{
+	/*
+	 * Underflow?
+	 */
+	BUG_ON(((int)preempt_count() < 0));
+	preempt_count() += val;
+	/*
+	 * Spinlock count overflowing soon?
+	 */
+	BUG_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10);
+}
+EXPORT_SYMBOL(add_preempt_count);
+
+void fastcall sub_preempt_count(int val)
+{
+	/*
+	 * Underflow?
+	 */
+	BUG_ON(val > preempt_count());
+	/*
+	 * Is the spinlock portion underflowing?
+	 */
+	BUG_ON((val < PREEMPT_MASK) && !(preempt_count() & PREEMPT_MASK));
+	preempt_count() -= val;
+}
+EXPORT_SYMBOL(sub_preempt_count);
+
+#endif
+
+/*
+ * schedule() is the main scheduler function.
+ */
+asmlinkage void __sched schedule(void)
+{
+	long *switch_count;
+	task_t *prev, *next;
+	runqueue_t *rq;
+	prio_array_t *array;
+	struct list_head *queue;
+	unsigned long long now;
+	unsigned long run_time;
+	int cpu, idx;
+
+	/*
+	 * Test if we are atomic.  Since do_exit() needs to call into
+	 * schedule() atomically, we ignore that path for now.
+	 * Otherwise, whine if we are scheduling when we should not be.
+	 */
+	if (likely(!current->exit_state)) {
+		if (unlikely(in_atomic())) {
+			printk(KERN_ERR "scheduling while atomic: "
+				"%s/0x%08x/%d\n",
+				current->comm, preempt_count(), current->pid);
+			dump_stack();
+		}
+	}
+	profile_hit(SCHED_PROFILING, __builtin_return_address(0));
+
+need_resched:
+	preempt_disable();
+	prev = current;
+	release_kernel_lock(prev);
+need_resched_nonpreemptible:
+	rq = this_rq();
+
+	/*
+	 * The idle thread is not allowed to schedule!
+	 * Remove this check after it has been exercised a bit.
+	 */
+	if (unlikely(prev == rq->idle) && prev->state != TASK_RUNNING) {
+		printk(KERN_ERR "bad: scheduling from the idle thread!\n");
+		dump_stack();
+	}
+
+	schedstat_inc(rq, sched_cnt);
+	now = sched_clock();
+	if (likely((long long)now - prev->timestamp < NS_MAX_SLEEP_AVG)) {
+		run_time = now - prev->timestamp;
+		if (unlikely((long long)now - prev->timestamp < 0))
+			run_time = 0;
+	} else
+		run_time = NS_MAX_SLEEP_AVG;
+
+	/*
+	 * Tasks charged proportionately less run_time at high sleep_avg to
+	 * delay them losing their interactive status
+	 */
+	run_time /= (CURRENT_BONUS(prev) ? : 1);
+
+	spin_lock_irq(&rq->lock);
+
+	if (unlikely(prev->flags & PF_DEAD))
+		prev->state = EXIT_DEAD;
+
+	switch_count = &prev->nivcsw;
+	if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
+		switch_count = &prev->nvcsw;
+		if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
+				unlikely(signal_pending(prev))))
+			prev->state = TASK_RUNNING;
+		else {
+			if (prev->state == TASK_UNINTERRUPTIBLE)
+				rq->nr_uninterruptible++;
+			deactivate_task(prev, rq);
+		}
+	}
+
+	cpu = smp_processor_id();
+	if (unlikely(!rq->nr_running)) {
+go_idle:
+		idle_balance(cpu, rq);
+		if (!rq->nr_running) {
+			next = rq->idle;
+			rq->expired_timestamp = 0;
+			wake_sleeping_dependent(cpu, rq);
+			/*
+			 * wake_sleeping_dependent() might have released
+			 * the runqueue, so break out if we got new
+			 * tasks meanwhile:
+			 */
+			if (!rq->nr_running)
+				goto switch_tasks;
+		}
+	} else {
+		if (dependent_sleeper(cpu, rq)) {
+			next = rq->idle;
+			goto switch_tasks;
+		}
+		/*
+		 * dependent_sleeper() releases and reacquires the runqueue
+		 * lock, hence go into the idle loop if the rq went
+		 * empty meanwhile:
+		 */
+		if (unlikely(!rq->nr_running))
+			goto go_idle;
+	}
+
+	array = rq->active;
+	if (unlikely(!array->nr_active)) {
+		/*
+		 * Switch the active and expired arrays.
+		 */
+		schedstat_inc(rq, sched_switch);
+		rq->active = rq->expired;
+		rq->expired = array;
+		array = rq->active;
+		rq->expired_timestamp = 0;
+		rq->best_expired_prio = MAX_PRIO;
+	}
+
+	idx = sched_find_first_bit(array->bitmap);
+	queue = array->queue + idx;
+	next = list_entry(queue->next, task_t, run_list);
+
+	if (!rt_task(next) && next->activated > 0) {
+		unsigned long long delta = now - next->timestamp;
+		if (unlikely((long long)now - next->timestamp < 0))
+			delta = 0;
+
+		if (next->activated == 1)
+			delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
+
+		array = next->array;
+		dequeue_task(next, array);
+		recalc_task_prio(next, next->timestamp + delta);
+		enqueue_task(next, array);
+	}
+	next->activated = 0;
+switch_tasks:
+	if (next == rq->idle)
+		schedstat_inc(rq, sched_goidle);
+	prefetch(next);
+	clear_tsk_need_resched(prev);
+	rcu_qsctr_inc(task_cpu(prev));
+
+	update_cpu_clock(prev, rq, now);
+
+	prev->sleep_avg -= run_time;
+	if ((long)prev->sleep_avg <= 0)
+		prev->sleep_avg = 0;
+	prev->timestamp = prev->last_ran = now;
+
+	sched_info_switch(prev, next);
+	if (likely(prev != next)) {
+		next->timestamp = now;
+		rq->nr_switches++;
+		rq->curr = next;
+		++*switch_count;
+
+		prepare_arch_switch(rq, next);
+		prev = context_switch(rq, prev, next);
+		barrier();
+
+		finish_task_switch(prev);
+	} else
+		spin_unlock_irq(&rq->lock);
+
+	prev = current;
+	if (unlikely(reacquire_kernel_lock(prev) < 0))
+		goto need_resched_nonpreemptible;
+	preempt_enable_no_resched();
+	if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
+		goto need_resched;
+}
+
+EXPORT_SYMBOL(schedule);
+
+#ifdef CONFIG_PREEMPT
+/*
+ * this is is the entry point to schedule() from in-kernel preemption
+ * off of preempt_enable.  Kernel preemptions off return from interrupt
+ * occur there and call schedule directly.
+ */
+asmlinkage void __sched preempt_schedule(void)
+{
+	struct thread_info *ti = current_thread_info();
+#ifdef CONFIG_PREEMPT_BKL
+	struct task_struct *task = current;
+	int saved_lock_depth;
+#endif
+	/*
+	 * If there is a non-zero preempt_count or interrupts are disabled,
+	 * we do not want to preempt the current task.  Just return..
+	 */
+	if (unlikely(ti->preempt_count || irqs_disabled()))
+		return;
+
+need_resched:
+	add_preempt_count(PREEMPT_ACTIVE);
+	/*
+	 * We keep the big kernel semaphore locked, but we
+	 * clear ->lock_depth so that schedule() doesnt
+	 * auto-release the semaphore:
+	 */
+#ifdef CONFIG_PREEMPT_BKL
+	saved_lock_depth = task->lock_depth;
+	task->lock_depth = -1;
+#endif
+	schedule();
+#ifdef CONFIG_PREEMPT_BKL
+	task->lock_depth = saved_lock_depth;
+#endif
+	sub_preempt_count(PREEMPT_ACTIVE);
+
+	/* we could miss a preemption opportunity between schedule and now */
+	barrier();
+	if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
+		goto need_resched;
+}
+
+EXPORT_SYMBOL(preempt_schedule);
+
+/*
+ * this is is the entry point to schedule() from kernel preemption
+ * off of irq context.
+ * Note, that this is called and return with irqs disabled. This will
+ * protect us against recursive calling from irq.
+ */
+asmlinkage void __sched preempt_schedule_irq(void)
+{
+	struct thread_info *ti = current_thread_info();
+#ifdef CONFIG_PREEMPT_BKL
+	struct task_struct *task = current;
+	int saved_lock_depth;
+#endif
+	/* Catch callers which need to be fixed*/
+	BUG_ON(ti->preempt_count || !irqs_disabled());
+
+need_resched:
+	add_preempt_count(PREEMPT_ACTIVE);
+	/*
+	 * We keep the big kernel semaphore locked, but we
+	 * clear ->lock_depth so that schedule() doesnt
+	 * auto-release the semaphore:
+	 */
+#ifdef CONFIG_PREEMPT_BKL
+	saved_lock_depth = task->lock_depth;
+	task->lock_depth = -1;
+#endif
+	local_irq_enable();
+	schedule();
+	local_irq_disable();
+#ifdef CONFIG_PREEMPT_BKL
+	task->lock_depth = saved_lock_depth;
+#endif
+	sub_preempt_count(PREEMPT_ACTIVE);
+
+	/* we could miss a preemption opportunity between schedule and now */
+	barrier();
+	if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
+		goto need_resched;
+}
+
+#endif /* CONFIG_PREEMPT */
+
+int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key)
+{
+	task_t *p = curr->task;
+	return try_to_wake_up(p, mode, sync);
+}
+
+EXPORT_SYMBOL(default_wake_function);
+
+/*
+ * The core wakeup function.  Non-exclusive wakeups (nr_exclusive == 0) just
+ * wake everything up.  If it's an exclusive wakeup (nr_exclusive == small +ve
+ * number) then we wake all the non-exclusive tasks and one exclusive task.
+ *
+ * There are circumstances in which we can try to wake a task which has already
+ * started to run but is not in state TASK_RUNNING.  try_to_wake_up() returns
+ * zero in this (rare) case, and we handle it by continuing to scan the queue.
+ */
+static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
+			     int nr_exclusive, int sync, void *key)
+{
+	struct list_head *tmp, *next;
+
+	list_for_each_safe(tmp, next, &q->task_list) {
+		wait_queue_t *curr;
+		unsigned flags;
+		curr = list_entry(tmp, wait_queue_t, task_list);
+		flags = curr->flags;
+		if (curr->func(curr, mode, sync, key) &&
+		    (flags & WQ_FLAG_EXCLUSIVE) &&
+		    !--nr_exclusive)
+			break;
+	}
+}
+
+/**
+ * __wake_up - wake up threads blocked on a waitqueue.
+ * @q: the waitqueue
+ * @mode: which threads
+ * @nr_exclusive: how many wake-one or wake-many threads to wake up
+ */
+void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,
+				int nr_exclusive, void *key)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&q->lock, flags);
+	__wake_up_common(q, mode, nr_exclusive, 0, key);
+	spin_unlock_irqrestore(&q->lock, flags);
+}
+
+EXPORT_SYMBOL(__wake_up);
+
+/*
+ * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
+ */
+void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
+{
+	__wake_up_common(q, mode, 1, 0, NULL);
+}
+
+/**
+ * __wake_up - sync- wake up threads blocked on a waitqueue.
+ * @q: the waitqueue
+ * @mode: which threads
+ * @nr_exclusive: how many wake-one or wake-many threads to wake up
+ *
+ * The sync wakeup differs that the waker knows that it will schedule
+ * away soon, so while the target thread will be woken up, it will not
+ * be migrated to another CPU - ie. the two threads are 'synchronized'
+ * with each other. This can prevent needless bouncing between CPUs.
+ *
+ * On UP it can prevent extra preemption.
+ */
+void fastcall __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
+{
+	unsigned long flags;
+	int sync = 1;
+
+	if (unlikely(!q))
+		return;
+
+	if (unlikely(!nr_exclusive))
+		sync = 0;
+
+	spin_lock_irqsave(&q->lock, flags);
+	__wake_up_common(q, mode, nr_exclusive, sync, NULL);
+	spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL_GPL(__wake_up_sync);	/* For internal use only */
+
+void fastcall complete(struct completion *x)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&x->wait.lock, flags);
+	x->done++;
+	__wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
+			 1, 0, NULL);
+	spin_unlock_irqrestore(&x->wait.lock, flags);
+}
+EXPORT_SYMBOL(complete);
+
+void fastcall complete_all(struct completion *x)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&x->wait.lock, flags);
+	x->done += UINT_MAX/2;
+	__wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
+			 0, 0, NULL);
+	spin_unlock_irqrestore(&x->wait.lock, flags);
+}
+EXPORT_SYMBOL(complete_all);
+
+void fastcall __sched wait_for_completion(struct completion *x)
+{
+	might_sleep();
+	spin_lock_irq(&x->wait.lock);
+	if (!x->done) {
+		DECLARE_WAITQUEUE(wait, current);
+
+		wait.flags |= WQ_FLAG_EXCLUSIVE;
+		__add_wait_queue_tail(&x->wait, &wait);
+		do {
+			__set_current_state(TASK_UNINTERRUPTIBLE);
+			spin_unlock_irq(&x->wait.lock);
+			schedule();
+			spin_lock_irq(&x->wait.lock);
+		} while (!x->done);
+		__remove_wait_queue(&x->wait, &wait);
+	}
+	x->done--;
+	spin_unlock_irq(&x->wait.lock);
+}
+EXPORT_SYMBOL(wait_for_completion);
+
+unsigned long fastcall __sched
+wait_for_completion_timeout(struct completion *x, unsigned long timeout)
+{
+	might_sleep();
+
+	spin_lock_irq(&x->wait.lock);
+	if (!x->done) {
+		DECLARE_WAITQUEUE(wait, current);
+
+		wait.flags |= WQ_FLAG_EXCLUSIVE;
+		__add_wait_queue_tail(&x->wait, &wait);
+		do {
+			__set_current_state(TASK_UNINTERRUPTIBLE);
+			spin_unlock_irq(&x->wait.lock);
+			timeout = schedule_timeout(timeout);
+			spin_lock_irq(&x->wait.lock);
+			if (!timeout) {
+				__remove_wait_queue(&x->wait, &wait);
+				goto out;
+			}
+		} while (!x->done);
+		__remove_wait_queue(&x->wait, &wait);
+	}
+	x->done--;
+out:
+	spin_unlock_irq(&x->wait.lock);
+	return timeout;
+}
+EXPORT_SYMBOL(wait_for_completion_timeout);
+
+int fastcall __sched wait_for_completion_interruptible(struct completion *x)
+{
+	int ret = 0;
+
+	might_sleep();
+
+	spin_lock_irq(&x->wait.lock);
+	if (!x->done) {
+		DECLARE_WAITQUEUE(wait, current);
+
+		wait.flags |= WQ_FLAG_EXCLUSIVE;
+		__add_wait_queue_tail(&x->wait, &wait);
+		do {
+			if (signal_pending(current)) {
+				ret = -ERESTARTSYS;
+				__remove_wait_queue(&x->wait, &wait);
+				goto out;
+			}
+			__set_current_state(TASK_INTERRUPTIBLE);
+			spin_unlock_irq(&x->wait.lock);
+			schedule();
+			spin_lock_irq(&x->wait.lock);
+		} while (!x->done);
+		__remove_wait_queue(&x->wait, &wait);
+	}
+	x->done--;
+out:
+	spin_unlock_irq(&x->wait.lock);
+
+	return ret;
+}
+EXPORT_SYMBOL(wait_for_completion_interruptible);
+
+unsigned long fastcall __sched
+wait_for_completion_interruptible_timeout(struct completion *x,
+					  unsigned long timeout)
+{
+	might_sleep();
+
+	spin_lock_irq(&x->wait.lock);
+	if (!x->done) {
+		DECLARE_WAITQUEUE(wait, current);
+
+		wait.flags |= WQ_FLAG_EXCLUSIVE;
+		__add_wait_queue_tail(&x->wait, &wait);
+		do {
+			if (signal_pending(current)) {
+				timeout = -ERESTARTSYS;
+				__remove_wait_queue(&x->wait, &wait);
+				goto out;
+			}
+			__set_current_state(TASK_INTERRUPTIBLE);
+			spin_unlock_irq(&x->wait.lock);
+			timeout = schedule_timeout(timeout);
+			spin_lock_irq(&x->wait.lock);
+			if (!timeout) {
+				__remove_wait_queue(&x->wait, &wait);
+				goto out;
+			}
+		} while (!x->done);
+		__remove_wait_queue(&x->wait, &wait);
+	}
+	x->done--;
+out:
+	spin_unlock_irq(&x->wait.lock);
+	return timeout;
+}
+EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
+
+
+#define	SLEEP_ON_VAR					\
+	unsigned long flags;				\
+	wait_queue_t wait;				\
+	init_waitqueue_entry(&wait, current);
+
+#define SLEEP_ON_HEAD					\
+	spin_lock_irqsave(&q->lock,flags);		\
+	__add_wait_queue(q, &wait);			\
+	spin_unlock(&q->lock);
+
+#define	SLEEP_ON_TAIL					\
+	spin_lock_irq(&q->lock);			\
+	__remove_wait_queue(q, &wait);			\
+	spin_unlock_irqrestore(&q->lock, flags);
+
+void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q)
+{
+	SLEEP_ON_VAR
+
+	current->state = TASK_INTERRUPTIBLE;
+
+	SLEEP_ON_HEAD
+	schedule();
+	SLEEP_ON_TAIL
+}
+
+EXPORT_SYMBOL(interruptible_sleep_on);
+
+long fastcall __sched interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
+{
+	SLEEP_ON_VAR
+
+	current->state = TASK_INTERRUPTIBLE;
+
+	SLEEP_ON_HEAD
+	timeout = schedule_timeout(timeout);
+	SLEEP_ON_TAIL
+
+	return timeout;
+}
+
+EXPORT_SYMBOL(interruptible_sleep_on_timeout);
+
+void fastcall __sched sleep_on(wait_queue_head_t *q)
+{
+	SLEEP_ON_VAR
+
+	current->state = TASK_UNINTERRUPTIBLE;
+
+	SLEEP_ON_HEAD
+	schedule();
+	SLEEP_ON_TAIL
+}
+
+EXPORT_SYMBOL(sleep_on);
+
+long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
+{
+	SLEEP_ON_VAR
+
+	current->state = TASK_UNINTERRUPTIBLE;
+
+	SLEEP_ON_HEAD
+	timeout = schedule_timeout(timeout);
+	SLEEP_ON_TAIL
+
+	return timeout;
+}
+
+EXPORT_SYMBOL(sleep_on_timeout);
+
+void set_user_nice(task_t *p, long nice)
+{
+	unsigned long flags;
+	prio_array_t *array;
+	runqueue_t *rq;
+	int old_prio, new_prio, delta;
+
+	if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
+		return;
+	/*
+	 * We have to be careful, if called from sys_setpriority(),
+	 * the task might be in the middle of scheduling on another CPU.
+	 */
+	rq = task_rq_lock(p, &flags);
+	/*
+	 * The RT priorities are set via sched_setscheduler(), but we still
+	 * allow the 'normal' nice value to be set - but as expected
+	 * it wont have any effect on scheduling until the task is
+	 * not SCHED_NORMAL:
+	 */
+	if (rt_task(p)) {
+		p->static_prio = NICE_TO_PRIO(nice);
+		goto out_unlock;
+	}
+	array = p->array;
+	if (array)
+		dequeue_task(p, array);
+
+	old_prio = p->prio;
+	new_prio = NICE_TO_PRIO(nice);
+	delta = new_prio - old_prio;
+	p->static_prio = NICE_TO_PRIO(nice);
+	p->prio += delta;
+
+	if (array) {
+		enqueue_task(p, array);
+		/*
+		 * If the task increased its priority or is running and
+		 * lowered its priority, then reschedule its CPU:
+		 */
+		if (delta < 0 || (delta > 0 && task_running(rq, p)))
+			resched_task(rq->curr);
+	}
+out_unlock:
+	task_rq_unlock(rq, &flags);
+}
+
+EXPORT_SYMBOL(set_user_nice);
+
+#ifdef __ARCH_WANT_SYS_NICE
+
+/*
+ * sys_nice - change the priority of the current process.
+ * @increment: priority increment
+ *
+ * sys_setpriority is a more generic, but much slower function that
+ * does similar things.
+ */
+asmlinkage long sys_nice(int increment)
+{
+	int retval;
+	long nice;
+
+	/*
+	 * Setpriority might change our priority at the same moment.
+	 * We don't have to worry. Conceptually one call occurs first
+	 * and we have a single winner.
+	 */
+	if (increment < 0) {
+		if (!capable(CAP_SYS_NICE))
+			return -EPERM;
+		if (increment < -40)
+			increment = -40;
+	}
+	if (increment > 40)
+		increment = 40;
+
+	nice = PRIO_TO_NICE(current->static_prio) + increment;
+	if (nice < -20)
+		nice = -20;
+	if (nice > 19)
+		nice = 19;
+
+	retval = security_task_setnice(current, nice);
+	if (retval)
+		return retval;
+
+	set_user_nice(current, nice);
+	return 0;
+}
+
+#endif
+
+/**
+ * task_prio - return the priority value of a given task.
+ * @p: the task in question.
+ *
+ * This is the priority value as seen by users in /proc.
+ * RT tasks are offset by -200. Normal tasks are centered
+ * around 0, value goes from -16 to +15.
+ */
+int task_prio(const task_t *p)
+{
+	return p->prio - MAX_RT_PRIO;
+}
+
+/**
+ * task_nice - return the nice value of a given task.
+ * @p: the task in question.
+ */
+int task_nice(const task_t *p)
+{
+	return TASK_NICE(p);
+}
+
+/*
+ * The only users of task_nice are binfmt_elf and binfmt_elf32.
+ * binfmt_elf is no longer modular, but binfmt_elf32 still is.
+ * Therefore, task_nice is needed if there is a compat_mode.
+ */
+#ifdef CONFIG_COMPAT
+EXPORT_SYMBOL_GPL(task_nice);
+#endif
+
+/**
+ * idle_cpu - is a given cpu idle currently?
+ * @cpu: the processor in question.
+ */
+int idle_cpu(int cpu)
+{
+	return cpu_curr(cpu) == cpu_rq(cpu)->idle;
+}
+
+EXPORT_SYMBOL_GPL(idle_cpu);
+
+/**
+ * idle_task - return the idle task for a given cpu.
+ * @cpu: the processor in question.
+ */
+task_t *idle_task(int cpu)
+{
+	return cpu_rq(cpu)->idle;
+}
+
+/**
+ * find_process_by_pid - find a process with a matching PID value.
+ * @pid: the pid in question.
+ */
+static inline task_t *find_process_by_pid(pid_t pid)
+{
+	return pid ? find_task_by_pid(pid) : current;
+}
+
+/* Actually do priority change: must hold rq lock. */
+static void __setscheduler(struct task_struct *p, int policy, int prio)
+{
+	BUG_ON(p->array);
+	p->policy = policy;
+	p->rt_priority = prio;
+	if (policy != SCHED_NORMAL)
+		p->prio = MAX_USER_RT_PRIO-1 - p->rt_priority;
+	else
+		p->prio = p->static_prio;
+}
+
+/**
+ * sched_setscheduler - change the scheduling policy and/or RT priority of
+ * a thread.
+ * @p: the task in question.
+ * @policy: new policy.
+ * @param: structure containing the new RT priority.
+ */
+int sched_setscheduler(struct task_struct *p, int policy, struct sched_param *param)
+{
+	int retval;
+	int oldprio, oldpolicy = -1;
+	prio_array_t *array;
+	unsigned long flags;
+	runqueue_t *rq;
+
+recheck:
+	/* double check policy once rq lock held */
+	if (policy < 0)
+		policy = oldpolicy = p->policy;
+	else if (policy != SCHED_FIFO && policy != SCHED_RR &&
+				policy != SCHED_NORMAL)
+			return -EINVAL;
+	/*
+	 * Valid priorities for SCHED_FIFO and SCHED_RR are
+	 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0.
+	 */
+	if (param->sched_priority < 0 ||
+	    param->sched_priority > MAX_USER_RT_PRIO-1)
+		return -EINVAL;
+	if ((policy == SCHED_NORMAL) != (param->sched_priority == 0))
+		return -EINVAL;
+
+	if ((policy == SCHED_FIFO || policy == SCHED_RR) &&
+	    !capable(CAP_SYS_NICE))
+		return -EPERM;
+	if ((current->euid != p->euid) && (current->euid != p->uid) &&
+	    !capable(CAP_SYS_NICE))
+		return -EPERM;
+
+	retval = security_task_setscheduler(p, policy, param);
+	if (retval)
+		return retval;
+	/*
+	 * To be able to change p->policy safely, the apropriate
+	 * runqueue lock must be held.
+	 */
+	rq = task_rq_lock(p, &flags);
+	/* recheck policy now with rq lock held */
+	if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
+		policy = oldpolicy = -1;
+		task_rq_unlock(rq, &flags);
+		goto recheck;
+	}
+	array = p->array;
+	if (array)
+		deactivate_task(p, rq);
+	oldprio = p->prio;
+	__setscheduler(p, policy, param->sched_priority);
+	if (array) {
+		__activate_task(p, rq);
+		/*
+		 * Reschedule if we are currently running on this runqueue and
+		 * our priority decreased, or if we are not currently running on
+		 * this runqueue and our priority is higher than the current's
+		 */
+		if (task_running(rq, p)) {
+			if (p->prio > oldprio)
+				resched_task(rq->curr);
+		} else if (TASK_PREEMPTS_CURR(p, rq))
+			resched_task(rq->curr);
+	}
+	task_rq_unlock(rq, &flags);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(sched_setscheduler);
+
+static int do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
+{
+	int retval;
+	struct sched_param lparam;
+	struct task_struct *p;
+
+	if (!param || pid < 0)
+		return -EINVAL;
+	if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
+		return -EFAULT;
+	read_lock_irq(&tasklist_lock);
+	p = find_process_by_pid(pid);
+	if (!p) {
+		read_unlock_irq(&tasklist_lock);
+		return -ESRCH;
+	}
+	retval = sched_setscheduler(p, policy, &lparam);
+	read_unlock_irq(&tasklist_lock);
+	return retval;
+}
+
+/**
+ * sys_sched_setscheduler - set/change the scheduler policy and RT priority
+ * @pid: the pid in question.
+ * @policy: new policy.
+ * @param: structure containing the new RT priority.
+ */
+asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
+				       struct sched_param __user *param)
+{
+	return do_sched_setscheduler(pid, policy, param);
+}
+
+/**
+ * sys_sched_setparam - set/change the RT priority of a thread
+ * @pid: the pid in question.
+ * @param: structure containing the new RT priority.
+ */
+asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
+{
+	return do_sched_setscheduler(pid, -1, param);
+}
+
+/**
+ * sys_sched_getscheduler - get the policy (scheduling class) of a thread
+ * @pid: the pid in question.
+ */
+asmlinkage long sys_sched_getscheduler(pid_t pid)
+{
+	int retval = -EINVAL;
+	task_t *p;
+
+	if (pid < 0)
+		goto out_nounlock;
+
+	retval = -ESRCH;
+	read_lock(&tasklist_lock);
+	p = find_process_by_pid(pid);
+	if (p) {
+		retval = security_task_getscheduler(p);
+		if (!retval)
+			retval = p->policy;
+	}
+	read_unlock(&tasklist_lock);
+
+out_nounlock:
+	return retval;
+}
+
+/**
+ * sys_sched_getscheduler - get the RT priority of a thread
+ * @pid: the pid in question.
+ * @param: structure containing the RT priority.
+ */
+asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
+{
+	struct sched_param lp;
+	int retval = -EINVAL;
+	task_t *p;
+
+	if (!param || pid < 0)
+		goto out_nounlock;
+
+	read_lock(&tasklist_lock);
+	p = find_process_by_pid(pid);
+	retval = -ESRCH;
+	if (!p)
+		goto out_unlock;
+
+	retval = security_task_getscheduler(p);
+	if (retval)
+		goto out_unlock;
+
+	lp.sched_priority = p->rt_priority;
+	read_unlock(&tasklist_lock);
+
+	/*
+	 * This one might sleep, we cannot do it with a spinlock held ...
+	 */
+	retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
+
+out_nounlock:
+	return retval;
+
+out_unlock:
+	read_unlock(&tasklist_lock);
+	return retval;
+}
+
+long sched_setaffinity(pid_t pid, cpumask_t new_mask)
+{
+	task_t *p;
+	int retval;
+	cpumask_t cpus_allowed;
+
+	lock_cpu_hotplug();
+	read_lock(&tasklist_lock);
+
+	p = find_process_by_pid(pid);
+	if (!p) {
+		read_unlock(&tasklist_lock);
+		unlock_cpu_hotplug();
+		return -ESRCH;
+	}
+
+	/*
+	 * It is not safe to call set_cpus_allowed with the
+	 * tasklist_lock held.  We will bump the task_struct's
+	 * usage count and then drop tasklist_lock.
+	 */
+	get_task_struct(p);
+	read_unlock(&tasklist_lock);
+
+	retval = -EPERM;
+	if ((current->euid != p->euid) && (current->euid != p->uid) &&
+			!capable(CAP_SYS_NICE))
+		goto out_unlock;
+
+	cpus_allowed = cpuset_cpus_allowed(p);
+	cpus_and(new_mask, new_mask, cpus_allowed);
+	retval = set_cpus_allowed(p, new_mask);
+
+out_unlock:
+	put_task_struct(p);
+	unlock_cpu_hotplug();
+	return retval;
+}
+
+static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
+			     cpumask_t *new_mask)
+{
+	if (len < sizeof(cpumask_t)) {
+		memset(new_mask, 0, sizeof(cpumask_t));
+	} else if (len > sizeof(cpumask_t)) {
+		len = sizeof(cpumask_t);
+	}
+	return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
+}
+
+/**
+ * sys_sched_setaffinity - set the cpu affinity of a process
+ * @pid: pid of the process
+ * @len: length in bytes of the bitmask pointed to by user_mask_ptr
+ * @user_mask_ptr: user-space pointer to the new cpu mask
+ */
+asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
+				      unsigned long __user *user_mask_ptr)
+{
+	cpumask_t new_mask;
+	int retval;
+
+	retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask);
+	if (retval)
+		return retval;
+
+	return sched_setaffinity(pid, new_mask);
+}
+
+/*
+ * Represents all cpu's present in the system
+ * In systems capable of hotplug, this map could dynamically grow
+ * as new cpu's are detected in the system via any platform specific
+ * method, such as ACPI for e.g.
+ */
+
+cpumask_t cpu_present_map;
+EXPORT_SYMBOL(cpu_present_map);
+
+#ifndef CONFIG_SMP
+cpumask_t cpu_online_map = CPU_MASK_ALL;
+cpumask_t cpu_possible_map = CPU_MASK_ALL;
+#endif
+
+long sched_getaffinity(pid_t pid, cpumask_t *mask)
+{
+	int retval;
+	task_t *p;
+
+	lock_cpu_hotplug();
+	read_lock(&tasklist_lock);
+
+	retval = -ESRCH;
+	p = find_process_by_pid(pid);
+	if (!p)
+		goto out_unlock;
+
+	retval = 0;
+	cpus_and(*mask, p->cpus_allowed, cpu_possible_map);
+
+out_unlock:
+	read_unlock(&tasklist_lock);
+	unlock_cpu_hotplug();
+	if (retval)
+		return retval;
+
+	return 0;
+}
+
+/**
+ * sys_sched_getaffinity - get the cpu affinity of a process
+ * @pid: pid of the process
+ * @len: length in bytes of the bitmask pointed to by user_mask_ptr
+ * @user_mask_ptr: user-space pointer to hold the current cpu mask
+ */
+asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
+				      unsigned long __user *user_mask_ptr)
+{
+	int ret;
+	cpumask_t mask;
+
+	if (len < sizeof(cpumask_t))
+		return -EINVAL;
+
+	ret = sched_getaffinity(pid, &mask);
+	if (ret < 0)
+		return ret;
+
+	if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t)))
+		return -EFAULT;
+
+	return sizeof(cpumask_t);
+}
+
+/**
+ * sys_sched_yield - yield the current processor to other threads.
+ *
+ * this function yields the current CPU by moving the calling thread
+ * to the expired array. If there are no other threads running on this
+ * CPU then this function will return.
+ */
+asmlinkage long sys_sched_yield(void)
+{
+	runqueue_t *rq = this_rq_lock();
+	prio_array_t *array = current->array;
+	prio_array_t *target = rq->expired;
+
+	schedstat_inc(rq, yld_cnt);
+	/*
+	 * We implement yielding by moving the task into the expired
+	 * queue.
+	 *
+	 * (special rule: RT tasks will just roundrobin in the active
+	 *  array.)
+	 */
+	if (rt_task(current))
+		target = rq->active;
+
+	if (current->array->nr_active == 1) {
+		schedstat_inc(rq, yld_act_empty);
+		if (!rq->expired->nr_active)
+			schedstat_inc(rq, yld_both_empty);
+	} else if (!rq->expired->nr_active)
+		schedstat_inc(rq, yld_exp_empty);
+
+	if (array != target) {
+		dequeue_task(current, array);
+		enqueue_task(current, target);
+	} else
+		/*
+		 * requeue_task is cheaper so perform that if possible.
+		 */
+		requeue_task(current, array);
+
+	/*
+	 * Since we are going to call schedule() anyway, there's
+	 * no need to preempt or enable interrupts:
+	 */
+	__release(rq->lock);
+	_raw_spin_unlock(&rq->lock);
+	preempt_enable_no_resched();
+
+	schedule();
+
+	return 0;
+}
+
+static inline void __cond_resched(void)
+{
+	do {
+		add_preempt_count(PREEMPT_ACTIVE);
+		schedule();
+		sub_preempt_count(PREEMPT_ACTIVE);
+	} while (need_resched());
+}
+
+int __sched cond_resched(void)
+{
+	if (need_resched()) {
+		__cond_resched();
+		return 1;
+	}
+	return 0;
+}
+
+EXPORT_SYMBOL(cond_resched);
+
+/*
+ * cond_resched_lock() - if a reschedule is pending, drop the given lock,
+ * call schedule, and on return reacquire the lock.
+ *
+ * This works OK both with and without CONFIG_PREEMPT.  We do strange low-level
+ * operations here to prevent schedule() from being called twice (once via
+ * spin_unlock(), once by hand).
+ */
+int cond_resched_lock(spinlock_t * lock)
+{
+	if (need_lockbreak(lock)) {
+		spin_unlock(lock);
+		cpu_relax();
+		spin_lock(lock);
+	}
+	if (need_resched()) {
+		_raw_spin_unlock(lock);
+		preempt_enable_no_resched();
+		__cond_resched();
+		spin_lock(lock);
+		return 1;
+	}
+	return 0;
+}
+
+EXPORT_SYMBOL(cond_resched_lock);
+
+int __sched cond_resched_softirq(void)
+{
+	BUG_ON(!in_softirq());
+
+	if (need_resched()) {
+		__local_bh_enable();
+		__cond_resched();
+		local_bh_disable();
+		return 1;
+	}
+	return 0;
+}
+
+EXPORT_SYMBOL(cond_resched_softirq);
+
+
+/**
+ * yield - yield the current processor to other threads.
+ *
+ * this is a shortcut for kernel-space yielding - it marks the
+ * thread runnable and calls sys_sched_yield().
+ */
+void __sched yield(void)
+{
+	set_current_state(TASK_RUNNING);
+	sys_sched_yield();
+}
+
+EXPORT_SYMBOL(yield);
+
+/*
+ * This task is about to go to sleep on IO.  Increment rq->nr_iowait so
+ * that process accounting knows that this is a task in IO wait state.
+ *
+ * But don't do that if it is a deliberate, throttling IO wait (this task
+ * has set its backing_dev_info: the queue against which it should throttle)
+ */
+void __sched io_schedule(void)
+{
+	struct runqueue *rq = &per_cpu(runqueues, _smp_processor_id());
+
+	atomic_inc(&rq->nr_iowait);
+	schedule();
+	atomic_dec(&rq->nr_iowait);
+}
+
+EXPORT_SYMBOL(io_schedule);
+
+long __sched io_schedule_timeout(long timeout)
+{
+	struct runqueue *rq = &per_cpu(runqueues, _smp_processor_id());
+	long ret;
+
+	atomic_inc(&rq->nr_iowait);
+	ret = schedule_timeout(timeout);
+	atomic_dec(&rq->nr_iowait);
+	return ret;
+}
+
+/**
+ * sys_sched_get_priority_max - return maximum RT priority.
+ * @policy: scheduling class.
+ *
+ * this syscall returns the maximum rt_priority that can be used
+ * by a given scheduling class.
+ */
+asmlinkage long sys_sched_get_priority_max(int policy)
+{
+	int ret = -EINVAL;
+
+	switch (policy) {
+	case SCHED_FIFO:
+	case SCHED_RR:
+		ret = MAX_USER_RT_PRIO-1;
+		break;
+	case SCHED_NORMAL:
+		ret = 0;
+		break;
+	}
+	return ret;
+}
+
+/**
+ * sys_sched_get_priority_min - return minimum RT priority.
+ * @policy: scheduling class.
+ *
+ * this syscall returns the minimum rt_priority that can be used
+ * by a given scheduling class.
+ */
+asmlinkage long sys_sched_get_priority_min(int policy)
+{
+	int ret = -EINVAL;
+
+	switch (policy) {
+	case SCHED_FIFO:
+	case SCHED_RR:
+		ret = 1;
+		break;
+	case SCHED_NORMAL:
+		ret = 0;
+	}
+	return ret;
+}
+
+/**
+ * sys_sched_rr_get_interval - return the default timeslice of a process.
+ * @pid: pid of the process.
+ * @interval: userspace pointer to the timeslice value.
+ *
+ * this syscall writes the default timeslice value of a given process
+ * into the user-space timespec buffer. A value of '0' means infinity.
+ */
+asmlinkage
+long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
+{
+	int retval = -EINVAL;
+	struct timespec t;
+	task_t *p;
+
+	if (pid < 0)
+		goto out_nounlock;
+
+	retval = -ESRCH;
+	read_lock(&tasklist_lock);
+	p = find_process_by_pid(pid);
+	if (!p)
+		goto out_unlock;
+
+	retval = security_task_getscheduler(p);
+	if (retval)
+		goto out_unlock;
+
+	jiffies_to_timespec(p->policy & SCHED_FIFO ?
+				0 : task_timeslice(p), &t);
+	read_unlock(&tasklist_lock);
+	retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
+out_nounlock:
+	return retval;
+out_unlock:
+	read_unlock(&tasklist_lock);
+	return retval;
+}
+
+static inline struct task_struct *eldest_child(struct task_struct *p)
+{
+	if (list_empty(&p->children)) return NULL;
+	return list_entry(p->children.next,struct task_struct,sibling);
+}
+
+static inline struct task_struct *older_sibling(struct task_struct *p)
+{
+	if (p->sibling.prev==&p->parent->children) return NULL;
+	return list_entry(p->sibling.prev,struct task_struct,sibling);
+}
+
+static inline struct task_struct *younger_sibling(struct task_struct *p)
+{
+	if (p->sibling.next==&p->parent->children) return NULL;
+	return list_entry(p->sibling.next,struct task_struct,sibling);
+}
+
+static void show_task(task_t * p)
+{
+	task_t *relative;
+	unsigned state;
+	unsigned long free = 0;
+	static const char *stat_nam[] = { "R", "S", "D", "T", "t", "Z", "X" };
+
+	printk("%-13.13s ", p->comm);
+	state = p->state ? __ffs(p->state) + 1 : 0;
+	if (state < ARRAY_SIZE(stat_nam))
+		printk(stat_nam[state]);
+	else
+		printk("?");
+#if (BITS_PER_LONG == 32)
+	if (state == TASK_RUNNING)
+		printk(" running ");
+	else
+		printk(" %08lX ", thread_saved_pc(p));
+#else
+	if (state == TASK_RUNNING)
+		printk("  running task   ");
+	else
+		printk(" %016lx ", thread_saved_pc(p));
+#endif
+#ifdef CONFIG_DEBUG_STACK_USAGE
+	{
+		unsigned long * n = (unsigned long *) (p->thread_info+1);
+		while (!*n)
+			n++;
+		free = (unsigned long) n - (unsigned long)(p->thread_info+1);
+	}
+#endif
+	printk("%5lu %5d %6d ", free, p->pid, p->parent->pid);
+	if ((relative = eldest_child(p)))
+		printk("%5d ", relative->pid);
+	else
+		printk("      ");
+	if ((relative = younger_sibling(p)))
+		printk("%7d", relative->pid);
+	else
+		printk("       ");
+	if ((relative = older_sibling(p)))
+		printk(" %5d", relative->pid);
+	else
+		printk("      ");
+	if (!p->mm)
+		printk(" (L-TLB)\n");
+	else
+		printk(" (NOTLB)\n");
+
+	if (state != TASK_RUNNING)
+		show_stack(p, NULL);
+}
+
+void show_state(void)
+{
+	task_t *g, *p;
+
+#if (BITS_PER_LONG == 32)
+	printk("\n"
+	       "                                               sibling\n");
+	printk("  task             PC      pid father child younger older\n");
+#else
+	printk("\n"
+	       "                                                       sibling\n");
+	printk("  task                 PC          pid father child younger older\n");
+#endif
+	read_lock(&tasklist_lock);
+	do_each_thread(g, p) {
+		/*
+		 * reset the NMI-timeout, listing all files on a slow
+		 * console might take alot of time:
+		 */
+		touch_nmi_watchdog();
+		show_task(p);
+	} while_each_thread(g, p);
+
+	read_unlock(&tasklist_lock);
+}
+
+void __devinit init_idle(task_t *idle, int cpu)
+{
+	runqueue_t *rq = cpu_rq(cpu);
+	unsigned long flags;
+
+	idle->sleep_avg = 0;
+	idle->array = NULL;
+	idle->prio = MAX_PRIO;
+	idle->state = TASK_RUNNING;
+	idle->cpus_allowed = cpumask_of_cpu(cpu);
+	set_task_cpu(idle, cpu);
+
+	spin_lock_irqsave(&rq->lock, flags);
+	rq->curr = rq->idle = idle;
+	set_tsk_need_resched(idle);
+	spin_unlock_irqrestore(&rq->lock, flags);
+
+	/* Set the preempt count _outside_ the spinlocks! */
+#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL)
+	idle->thread_info->preempt_count = (idle->lock_depth >= 0);
+#else
+	idle->thread_info->preempt_count = 0;
+#endif
+}
+
+/*
+ * In a system that switches off the HZ timer nohz_cpu_mask
+ * indicates which cpus entered this state. This is used
+ * in the rcu update to wait only for active cpus. For system
+ * which do not switch off the HZ timer nohz_cpu_mask should
+ * always be CPU_MASK_NONE.
+ */
+cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
+
+#ifdef CONFIG_SMP
+/*
+ * This is how migration works:
+ *
+ * 1) we queue a migration_req_t structure in the source CPU's
+ *    runqueue and wake up that CPU's migration thread.
+ * 2) we down() the locked semaphore => thread blocks.
+ * 3) migration thread wakes up (implicitly it forces the migrated
+ *    thread off the CPU)
+ * 4) it gets the migration request and checks whether the migrated
+ *    task is still in the wrong runqueue.
+ * 5) if it's in the wrong runqueue then the migration thread removes
+ *    it and puts it into the right queue.
+ * 6) migration thread up()s the semaphore.
+ * 7) we wake up and the migration is done.
+ */
+
+/*
+ * Change a given task's CPU affinity. Migrate the thread to a
+ * proper CPU and schedule it away if the CPU it's executing on
+ * is removed from the allowed bitmask.
+ *
+ * NOTE: the caller must have a valid reference to the task, the
+ * task must not exit() & deallocate itself prematurely.  The
+ * call is not atomic; no spinlocks may be held.
+ */
+int set_cpus_allowed(task_t *p, cpumask_t new_mask)
+{
+	unsigned long flags;
+	int ret = 0;
+	migration_req_t req;
+	runqueue_t *rq;
+
+	rq = task_rq_lock(p, &flags);
+	if (!cpus_intersects(new_mask, cpu_online_map)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	p->cpus_allowed = new_mask;
+	/* Can the task run on the task's current CPU? If so, we're done */
+	if (cpu_isset(task_cpu(p), new_mask))
+		goto out;
+
+	if (migrate_task(p, any_online_cpu(new_mask), &req)) {
+		/* Need help from migration thread: drop lock and wait. */
+		task_rq_unlock(rq, &flags);
+		wake_up_process(rq->migration_thread);
+		wait_for_completion(&req.done);
+		tlb_migrate_finish(p->mm);
+		return 0;
+	}
+out:
+	task_rq_unlock(rq, &flags);
+	return ret;
+}
+
+EXPORT_SYMBOL_GPL(set_cpus_allowed);
+
+/*
+ * Move (not current) task off this cpu, onto dest cpu.  We're doing
+ * this because either it can't run here any more (set_cpus_allowed()
+ * away from this CPU, or CPU going down), or because we're
+ * attempting to rebalance this task on exec (sched_exec).
+ *
+ * So we race with normal scheduler movements, but that's OK, as long
+ * as the task is no longer on this CPU.
+ */
+static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
+{
+	runqueue_t *rq_dest, *rq_src;
+
+	if (unlikely(cpu_is_offline(dest_cpu)))
+		return;
+
+	rq_src = cpu_rq(src_cpu);
+	rq_dest = cpu_rq(dest_cpu);
+
+	double_rq_lock(rq_src, rq_dest);
+	/* Already moved. */
+	if (task_cpu(p) != src_cpu)
+		goto out;
+	/* Affinity changed (again). */
+	if (!cpu_isset(dest_cpu, p->cpus_allowed))
+		goto out;
+
+	set_task_cpu(p, dest_cpu);
+	if (p->array) {
+		/*
+		 * Sync timestamp with rq_dest's before activating.
+		 * The same thing could be achieved by doing this step
+		 * afterwards, and pretending it was a local activate.
+		 * This way is cleaner and logically correct.
+		 */
+		p->timestamp = p->timestamp - rq_src->timestamp_last_tick
+				+ rq_dest->timestamp_last_tick;
+		deactivate_task(p, rq_src);
+		activate_task(p, rq_dest, 0);
+		if (TASK_PREEMPTS_CURR(p, rq_dest))
+			resched_task(rq_dest->curr);
+	}
+
+out:
+	double_rq_unlock(rq_src, rq_dest);
+}
+
+/*
+ * migration_thread - this is a highprio system thread that performs
+ * thread migration by bumping thread off CPU then 'pushing' onto
+ * another runqueue.
+ */
+static int migration_thread(void * data)
+{
+	runqueue_t *rq;
+	int cpu = (long)data;
+
+	rq = cpu_rq(cpu);
+	BUG_ON(rq->migration_thread != current);
+
+	set_current_state(TASK_INTERRUPTIBLE);
+	while (!kthread_should_stop()) {
+		struct list_head *head;
+		migration_req_t *req;
+
+		if (current->flags & PF_FREEZE)
+			refrigerator(PF_FREEZE);
+
+		spin_lock_irq(&rq->lock);
+
+		if (cpu_is_offline(cpu)) {
+			spin_unlock_irq(&rq->lock);
+			goto wait_to_die;
+		}
+
+		if (rq->active_balance) {
+			active_load_balance(rq, cpu);
+			rq->active_balance = 0;
+		}
+
+		head = &rq->migration_queue;
+
+		if (list_empty(head)) {
+			spin_unlock_irq(&rq->lock);
+			schedule();
+			set_current_state(TASK_INTERRUPTIBLE);
+			continue;
+		}
+		req = list_entry(head->next, migration_req_t, list);
+		list_del_init(head->next);
+
+		if (req->type == REQ_MOVE_TASK) {
+			spin_unlock(&rq->lock);
+			__migrate_task(req->task, cpu, req->dest_cpu);
+			local_irq_enable();
+		} else if (req->type == REQ_SET_DOMAIN) {
+			rq->sd = req->sd;
+			spin_unlock_irq(&rq->lock);
+		} else {
+			spin_unlock_irq(&rq->lock);
+			WARN_ON(1);
+		}
+
+		complete(&req->done);
+	}
+	__set_current_state(TASK_RUNNING);
+	return 0;
+
+wait_to_die:
+	/* Wait for kthread_stop */
+	set_current_state(TASK_INTERRUPTIBLE);
+	while (!kthread_should_stop()) {
+		schedule();
+		set_current_state(TASK_INTERRUPTIBLE);
+	}
+	__set_current_state(TASK_RUNNING);
+	return 0;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+/* Figure out where task on dead CPU should go, use force if neccessary. */
+static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk)
+{
+	int dest_cpu;
+	cpumask_t mask;
+
+	/* On same node? */
+	mask = node_to_cpumask(cpu_to_node(dead_cpu));
+	cpus_and(mask, mask, tsk->cpus_allowed);
+	dest_cpu = any_online_cpu(mask);
+
+	/* On any allowed CPU? */
+	if (dest_cpu == NR_CPUS)
+		dest_cpu = any_online_cpu(tsk->cpus_allowed);
+
+	/* No more Mr. Nice Guy. */
+	if (dest_cpu == NR_CPUS) {
+		tsk->cpus_allowed = cpuset_cpus_allowed(tsk);
+		dest_cpu = any_online_cpu(tsk->cpus_allowed);
+
+		/*
+		 * Don't tell them about moving exiting tasks or
+		 * kernel threads (both mm NULL), since they never
+		 * leave kernel.
+		 */
+		if (tsk->mm && printk_ratelimit())
+			printk(KERN_INFO "process %d (%s) no "
+			       "longer affine to cpu%d\n",
+			       tsk->pid, tsk->comm, dead_cpu);
+	}
+	__migrate_task(tsk, dead_cpu, dest_cpu);
+}
+
+/*
+ * While a dead CPU has no uninterruptible tasks queued at this point,
+ * it might still have a nonzero ->nr_uninterruptible counter, because
+ * for performance reasons the counter is not stricly tracking tasks to
+ * their home CPUs. So we just add the counter to another CPU's counter,
+ * to keep the global sum constant after CPU-down:
+ */
+static void migrate_nr_uninterruptible(runqueue_t *rq_src)
+{
+	runqueue_t *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL));
+	unsigned long flags;
+
+	local_irq_save(flags);
+	double_rq_lock(rq_src, rq_dest);
+	rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
+	rq_src->nr_uninterruptible = 0;
+	double_rq_unlock(rq_src, rq_dest);
+	local_irq_restore(flags);
+}
+
+/* Run through task list and migrate tasks from the dead cpu. */
+static void migrate_live_tasks(int src_cpu)
+{
+	struct task_struct *tsk, *t;
+
+	write_lock_irq(&tasklist_lock);
+
+	do_each_thread(t, tsk) {
+		if (tsk == current)
+			continue;
+
+		if (task_cpu(tsk) == src_cpu)
+			move_task_off_dead_cpu(src_cpu, tsk);
+	} while_each_thread(t, tsk);
+
+	write_unlock_irq(&tasklist_lock);
+}
+
+/* Schedules idle task to be the next runnable task on current CPU.
+ * It does so by boosting its priority to highest possible and adding it to
+ * the _front_ of runqueue. Used by CPU offline code.
+ */
+void sched_idle_next(void)
+{
+	int cpu = smp_processor_id();
+	runqueue_t *rq = this_rq();
+	struct task_struct *p = rq->idle;
+	unsigned long flags;
+
+	/* cpu has to be offline */
+	BUG_ON(cpu_online(cpu));
+
+	/* Strictly not necessary since rest of the CPUs are stopped by now
+	 * and interrupts disabled on current cpu.
+	 */
+	spin_lock_irqsave(&rq->lock, flags);
+
+	__setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1);
+	/* Add idle task to _front_ of it's priority queue */
+	__activate_idle_task(p, rq);
+
+	spin_unlock_irqrestore(&rq->lock, flags);
+}
+
+/* Ensures that the idle task is using init_mm right before its cpu goes
+ * offline.
+ */
+void idle_task_exit(void)
+{
+	struct mm_struct *mm = current->active_mm;
+
+	BUG_ON(cpu_online(smp_processor_id()));
+
+	if (mm != &init_mm)
+		switch_mm(mm, &init_mm, current);
+	mmdrop(mm);
+}
+
+static void migrate_dead(unsigned int dead_cpu, task_t *tsk)
+{
+	struct runqueue *rq = cpu_rq(dead_cpu);
+
+	/* Must be exiting, otherwise would be on tasklist. */
+	BUG_ON(tsk->exit_state != EXIT_ZOMBIE && tsk->exit_state != EXIT_DEAD);
+
+	/* Cannot have done final schedule yet: would have vanished. */
+	BUG_ON(tsk->flags & PF_DEAD);
+
+	get_task_struct(tsk);
+
+	/*
+	 * Drop lock around migration; if someone else moves it,
+	 * that's OK.  No task can be added to this CPU, so iteration is
+	 * fine.
+	 */
+	spin_unlock_irq(&rq->lock);
+	move_task_off_dead_cpu(dead_cpu, tsk);
+	spin_lock_irq(&rq->lock);
+
+	put_task_struct(tsk);
+}
+
+/* release_task() removes task from tasklist, so we won't find dead tasks. */
+static void migrate_dead_tasks(unsigned int dead_cpu)
+{
+	unsigned arr, i;
+	struct runqueue *rq = cpu_rq(dead_cpu);
+
+	for (arr = 0; arr < 2; arr++) {
+		for (i = 0; i < MAX_PRIO; i++) {
+			struct list_head *list = &rq->arrays[arr].queue[i];
+			while (!list_empty(list))
+				migrate_dead(dead_cpu,
+					     list_entry(list->next, task_t,
+							run_list));
+		}
+	}
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
+/*
+ * migration_call - callback that gets triggered when a CPU is added.
+ * Here we can start up the necessary migration thread for the new CPU.
+ */
+static int migration_call(struct notifier_block *nfb, unsigned long action,
+			  void *hcpu)
+{
+	int cpu = (long)hcpu;
+	struct task_struct *p;
+	struct runqueue *rq;
+	unsigned long flags;
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+		p = kthread_create(migration_thread, hcpu, "migration/%d",cpu);
+		if (IS_ERR(p))
+			return NOTIFY_BAD;
+		p->flags |= PF_NOFREEZE;
+		kthread_bind(p, cpu);
+		/* Must be high prio: stop_machine expects to yield to it. */
+		rq = task_rq_lock(p, &flags);
+		__setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1);
+		task_rq_unlock(rq, &flags);
+		cpu_rq(cpu)->migration_thread = p;
+		break;
+	case CPU_ONLINE:
+		/* Strictly unneccessary, as first user will wake it. */
+		wake_up_process(cpu_rq(cpu)->migration_thread);
+		break;
+#ifdef CONFIG_HOTPLUG_CPU
+	case CPU_UP_CANCELED:
+		/* Unbind it from offline cpu so it can run.  Fall thru. */
+		kthread_bind(cpu_rq(cpu)->migration_thread,smp_processor_id());
+		kthread_stop(cpu_rq(cpu)->migration_thread);
+		cpu_rq(cpu)->migration_thread = NULL;
+		break;
+	case CPU_DEAD:
+		migrate_live_tasks(cpu);
+		rq = cpu_rq(cpu);
+		kthread_stop(rq->migration_thread);
+		rq->migration_thread = NULL;
+		/* Idle task back to normal (off runqueue, low prio) */
+		rq = task_rq_lock(rq->idle, &flags);
+		deactivate_task(rq->idle, rq);
+		rq->idle->static_prio = MAX_PRIO;
+		__setscheduler(rq->idle, SCHED_NORMAL, 0);
+		migrate_dead_tasks(cpu);
+		task_rq_unlock(rq, &flags);
+		migrate_nr_uninterruptible(rq);
+		BUG_ON(rq->nr_running != 0);
+
+		/* No need to migrate the tasks: it was best-effort if
+		 * they didn't do lock_cpu_hotplug().  Just wake up
+		 * the requestors. */
+		spin_lock_irq(&rq->lock);
+		while (!list_empty(&rq->migration_queue)) {
+			migration_req_t *req;
+			req = list_entry(rq->migration_queue.next,
+					 migration_req_t, list);
+			BUG_ON(req->type != REQ_MOVE_TASK);
+			list_del_init(&req->list);
+			complete(&req->done);
+		}
+		spin_unlock_irq(&rq->lock);
+		break;
+#endif
+	}
+	return NOTIFY_OK;
+}
+
+/* Register at highest priority so that task migration (migrate_all_tasks)
+ * happens before everything else.
+ */
+static struct notifier_block __devinitdata migration_notifier = {
+	.notifier_call = migration_call,
+	.priority = 10
+};
+
+int __init migration_init(void)
+{
+	void *cpu = (void *)(long)smp_processor_id();
+	/* Start one for boot CPU. */
+	migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
+	migration_call(&migration_notifier, CPU_ONLINE, cpu);
+	register_cpu_notifier(&migration_notifier);
+	return 0;
+}
+#endif
+
+#ifdef CONFIG_SMP
+#define SCHED_DOMAIN_DEBUG
+#ifdef SCHED_DOMAIN_DEBUG
+static void sched_domain_debug(struct sched_domain *sd, int cpu)
+{
+	int level = 0;
+
+	printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
+
+	do {
+		int i;
+		char str[NR_CPUS];
+		struct sched_group *group = sd->groups;
+		cpumask_t groupmask;
+
+		cpumask_scnprintf(str, NR_CPUS, sd->span);
+		cpus_clear(groupmask);
+
+		printk(KERN_DEBUG);
+		for (i = 0; i < level + 1; i++)
+			printk(" ");
+		printk("domain %d: ", level);
+
+		if (!(sd->flags & SD_LOAD_BALANCE)) {
+			printk("does not load-balance\n");
+			if (sd->parent)
+				printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent");
+			break;
+		}
+
+		printk("span %s\n", str);
+
+		if (!cpu_isset(cpu, sd->span))
+			printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu);
+		if (!cpu_isset(cpu, group->cpumask))
+			printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu);
+
+		printk(KERN_DEBUG);
+		for (i = 0; i < level + 2; i++)
+			printk(" ");
+		printk("groups:");
+		do {
+			if (!group) {
+				printk("\n");
+				printk(KERN_ERR "ERROR: group is NULL\n");
+				break;
+			}
+
+			if (!group->cpu_power) {
+				printk("\n");
+				printk(KERN_ERR "ERROR: domain->cpu_power not set\n");
+			}
+
+			if (!cpus_weight(group->cpumask)) {
+				printk("\n");
+				printk(KERN_ERR "ERROR: empty group\n");
+			}
+
+			if (cpus_intersects(groupmask, group->cpumask)) {
+				printk("\n");
+				printk(KERN_ERR "ERROR: repeated CPUs\n");
+			}
+
+			cpus_or(groupmask, groupmask, group->cpumask);
+
+			cpumask_scnprintf(str, NR_CPUS, group->cpumask);
+			printk(" %s", str);
+
+			group = group->next;
+		} while (group != sd->groups);
+		printk("\n");
+
+		if (!cpus_equal(sd->span, groupmask))
+			printk(KERN_ERR "ERROR: groups don't span domain->span\n");
+
+		level++;
+		sd = sd->parent;
+
+		if (sd) {
+			if (!cpus_subset(groupmask, sd->span))
+				printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n");
+		}
+
+	} while (sd);
+}
+#else
+#define sched_domain_debug(sd, cpu) {}
+#endif
+
+/*
+ * Attach the domain 'sd' to 'cpu' as its base domain.  Callers must
+ * hold the hotplug lock.
+ */
+void __devinit cpu_attach_domain(struct sched_domain *sd, int cpu)
+{
+	migration_req_t req;
+	unsigned long flags;
+	runqueue_t *rq = cpu_rq(cpu);
+	int local = 1;
+
+	sched_domain_debug(sd, cpu);
+
+	spin_lock_irqsave(&rq->lock, flags);
+
+	if (cpu == smp_processor_id() || !cpu_online(cpu)) {
+		rq->sd = sd;
+	} else {
+		init_completion(&req.done);
+		req.type = REQ_SET_DOMAIN;
+		req.sd = sd;
+		list_add(&req.list, &rq->migration_queue);
+		local = 0;
+	}
+
+	spin_unlock_irqrestore(&rq->lock, flags);
+
+	if (!local) {
+		wake_up_process(rq->migration_thread);
+		wait_for_completion(&req.done);
+	}
+}
+
+/* cpus with isolated domains */
+cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE;
+
+/* Setup the mask of cpus configured for isolated domains */
+static int __init isolated_cpu_setup(char *str)
+{
+	int ints[NR_CPUS], i;
+
+	str = get_options(str, ARRAY_SIZE(ints), ints);
+	cpus_clear(cpu_isolated_map);
+	for (i = 1; i <= ints[0]; i++)
+		if (ints[i] < NR_CPUS)
+			cpu_set(ints[i], cpu_isolated_map);
+	return 1;
+}
+
+__setup ("isolcpus=", isolated_cpu_setup);
+
+/*
+ * init_sched_build_groups takes an array of groups, the cpumask we wish
+ * to span, and a pointer to a function which identifies what group a CPU
+ * belongs to. The return value of group_fn must be a valid index into the
+ * groups[] array, and must be >= 0 and < NR_CPUS (due to the fact that we
+ * keep track of groups covered with a cpumask_t).
+ *
+ * init_sched_build_groups will build a circular linked list of the groups
+ * covered by the given span, and will set each group's ->cpumask correctly,
+ * and ->cpu_power to 0.
+ */
+void __devinit init_sched_build_groups(struct sched_group groups[],
+			cpumask_t span, int (*group_fn)(int cpu))
+{
+	struct sched_group *first = NULL, *last = NULL;
+	cpumask_t covered = CPU_MASK_NONE;
+	int i;
+
+	for_each_cpu_mask(i, span) {
+		int group = group_fn(i);
+		struct sched_group *sg = &groups[group];
+		int j;
+
+		if (cpu_isset(i, covered))
+			continue;
+
+		sg->cpumask = CPU_MASK_NONE;
+		sg->cpu_power = 0;
+
+		for_each_cpu_mask(j, span) {
+			if (group_fn(j) != group)
+				continue;
+
+			cpu_set(j, covered);
+			cpu_set(j, sg->cpumask);
+		}
+		if (!first)
+			first = sg;
+		if (last)
+			last->next = sg;
+		last = sg;
+	}
+	last->next = first;
+}
+
+
+#ifdef ARCH_HAS_SCHED_DOMAIN
+extern void __devinit arch_init_sched_domains(void);
+extern void __devinit arch_destroy_sched_domains(void);
+#else
+#ifdef CONFIG_SCHED_SMT
+static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
+static struct sched_group sched_group_cpus[NR_CPUS];
+static int __devinit cpu_to_cpu_group(int cpu)
+{
+	return cpu;
+}
+#endif
+
+static DEFINE_PER_CPU(struct sched_domain, phys_domains);
+static struct sched_group sched_group_phys[NR_CPUS];
+static int __devinit cpu_to_phys_group(int cpu)
+{
+#ifdef CONFIG_SCHED_SMT
+	return first_cpu(cpu_sibling_map[cpu]);
+#else
+	return cpu;
+#endif
+}
+
+#ifdef CONFIG_NUMA
+
+static DEFINE_PER_CPU(struct sched_domain, node_domains);
+static struct sched_group sched_group_nodes[MAX_NUMNODES];
+static int __devinit cpu_to_node_group(int cpu)
+{
+	return cpu_to_node(cpu);
+}
+#endif
+
+#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
+/*
+ * The domains setup code relies on siblings not spanning
+ * multiple nodes. Make sure the architecture has a proper
+ * siblings map:
+ */
+static void check_sibling_maps(void)
+{
+	int i, j;
+
+	for_each_online_cpu(i) {
+		for_each_cpu_mask(j, cpu_sibling_map[i]) {
+			if (cpu_to_node(i) != cpu_to_node(j)) {
+				printk(KERN_INFO "warning: CPU %d siblings map "
+					"to different node - isolating "
+					"them.\n", i);
+				cpu_sibling_map[i] = cpumask_of_cpu(i);
+				break;
+			}
+		}
+	}
+}
+#endif
+
+/*
+ * Set up scheduler domains and groups.  Callers must hold the hotplug lock.
+ */
+static void __devinit arch_init_sched_domains(void)
+{
+	int i;
+	cpumask_t cpu_default_map;
+
+#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
+	check_sibling_maps();
+#endif
+	/*
+	 * Setup mask for cpus without special case scheduling requirements.
+	 * For now this just excludes isolated cpus, but could be used to
+	 * exclude other special cases in the future.
+	 */
+	cpus_complement(cpu_default_map, cpu_isolated_map);
+	cpus_and(cpu_default_map, cpu_default_map, cpu_online_map);
+
+	/*
+	 * Set up domains. Isolated domains just stay on the dummy domain.
+	 */
+	for_each_cpu_mask(i, cpu_default_map) {
+		int group;
+		struct sched_domain *sd = NULL, *p;
+		cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
+
+		cpus_and(nodemask, nodemask, cpu_default_map);
+
+#ifdef CONFIG_NUMA
+		sd = &per_cpu(node_domains, i);
+		group = cpu_to_node_group(i);
+		*sd = SD_NODE_INIT;
+		sd->span = cpu_default_map;
+		sd->groups = &sched_group_nodes[group];
+#endif
+
+		p = sd;
+		sd = &per_cpu(phys_domains, i);
+		group = cpu_to_phys_group(i);
+		*sd = SD_CPU_INIT;
+		sd->span = nodemask;
+		sd->parent = p;
+		sd->groups = &sched_group_phys[group];
+
+#ifdef CONFIG_SCHED_SMT
+		p = sd;
+		sd = &per_cpu(cpu_domains, i);
+		group = cpu_to_cpu_group(i);
+		*sd = SD_SIBLING_INIT;
+		sd->span = cpu_sibling_map[i];
+		cpus_and(sd->span, sd->span, cpu_default_map);
+		sd->parent = p;
+		sd->groups = &sched_group_cpus[group];
+#endif
+	}
+
+#ifdef CONFIG_SCHED_SMT
+	/* Set up CPU (sibling) groups */
+	for_each_online_cpu(i) {
+		cpumask_t this_sibling_map = cpu_sibling_map[i];
+		cpus_and(this_sibling_map, this_sibling_map, cpu_default_map);
+		if (i != first_cpu(this_sibling_map))
+			continue;
+
+		init_sched_build_groups(sched_group_cpus, this_sibling_map,
+						&cpu_to_cpu_group);
+	}
+#endif
+
+	/* Set up physical groups */
+	for (i = 0; i < MAX_NUMNODES; i++) {
+		cpumask_t nodemask = node_to_cpumask(i);
+
+		cpus_and(nodemask, nodemask, cpu_default_map);
+		if (cpus_empty(nodemask))
+			continue;
+
+		init_sched_build_groups(sched_group_phys, nodemask,
+						&cpu_to_phys_group);
+	}
+
+#ifdef CONFIG_NUMA
+	/* Set up node groups */
+	init_sched_build_groups(sched_group_nodes, cpu_default_map,
+					&cpu_to_node_group);
+#endif
+
+	/* Calculate CPU power for physical packages and nodes */
+	for_each_cpu_mask(i, cpu_default_map) {
+		int power;
+		struct sched_domain *sd;
+#ifdef CONFIG_SCHED_SMT
+		sd = &per_cpu(cpu_domains, i);
+		power = SCHED_LOAD_SCALE;
+		sd->groups->cpu_power = power;
+#endif
+
+		sd = &per_cpu(phys_domains, i);
+		power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
+				(cpus_weight(sd->groups->cpumask)-1) / 10;
+		sd->groups->cpu_power = power;
+
+#ifdef CONFIG_NUMA
+		if (i == first_cpu(sd->groups->cpumask)) {
+			/* Only add "power" once for each physical package. */
+			sd = &per_cpu(node_domains, i);
+			sd->groups->cpu_power += power;
+		}
+#endif
+	}
+
+	/* Attach the domains */
+	for_each_online_cpu(i) {
+		struct sched_domain *sd;
+#ifdef CONFIG_SCHED_SMT
+		sd = &per_cpu(cpu_domains, i);
+#else
+		sd = &per_cpu(phys_domains, i);
+#endif
+		cpu_attach_domain(sd, i);
+	}
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static void __devinit arch_destroy_sched_domains(void)
+{
+	/* Do nothing: everything is statically allocated. */
+}
+#endif
+
+#endif /* ARCH_HAS_SCHED_DOMAIN */
+
+/*
+ * Initial dummy domain for early boot and for hotplug cpu. Being static,
+ * it is initialized to zero, so all balancing flags are cleared which is
+ * what we want.
+ */
+static struct sched_domain sched_domain_dummy;
+
+#ifdef CONFIG_HOTPLUG_CPU
+/*
+ * Force a reinitialization of the sched domains hierarchy.  The domains
+ * and groups cannot be updated in place without racing with the balancing
+ * code, so we temporarily attach all running cpus to a "dummy" domain
+ * which will prevent rebalancing while the sched domains are recalculated.
+ */
+static int update_sched_domains(struct notifier_block *nfb,
+				unsigned long action, void *hcpu)
+{
+	int i;
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+	case CPU_DOWN_PREPARE:
+		for_each_online_cpu(i)
+			cpu_attach_domain(&sched_domain_dummy, i);
+		arch_destroy_sched_domains();
+		return NOTIFY_OK;
+
+	case CPU_UP_CANCELED:
+	case CPU_DOWN_FAILED:
+	case CPU_ONLINE:
+	case CPU_DEAD:
+		/*
+		 * Fall through and re-initialise the domains.
+		 */
+		break;
+	default:
+		return NOTIFY_DONE;
+	}
+
+	/* The hotplug lock is already held by cpu_up/cpu_down */
+	arch_init_sched_domains();
+
+	return NOTIFY_OK;
+}
+#endif
+
+void __init sched_init_smp(void)
+{
+	lock_cpu_hotplug();
+	arch_init_sched_domains();
+	unlock_cpu_hotplug();
+	/* XXX: Theoretical race here - CPU may be hotplugged now */
+	hotcpu_notifier(update_sched_domains, 0);
+}
+#else
+void __init sched_init_smp(void)
+{
+}
+#endif /* CONFIG_SMP */
+
+int in_sched_functions(unsigned long addr)
+{
+	/* Linker adds these: start and end of __sched functions */
+	extern char __sched_text_start[], __sched_text_end[];
+	return in_lock_functions(addr) ||
+		(addr >= (unsigned long)__sched_text_start
+		&& addr < (unsigned long)__sched_text_end);
+}
+
+void __init sched_init(void)
+{
+	runqueue_t *rq;
+	int i, j, k;
+
+	for (i = 0; i < NR_CPUS; i++) {
+		prio_array_t *array;
+
+		rq = cpu_rq(i);
+		spin_lock_init(&rq->lock);
+		rq->active = rq->arrays;
+		rq->expired = rq->arrays + 1;
+		rq->best_expired_prio = MAX_PRIO;
+
+#ifdef CONFIG_SMP
+		rq->sd = &sched_domain_dummy;
+		rq->cpu_load = 0;
+		rq->active_balance = 0;
+		rq->push_cpu = 0;
+		rq->migration_thread = NULL;
+		INIT_LIST_HEAD(&rq->migration_queue);
+#endif
+		atomic_set(&rq->nr_iowait, 0);
+
+		for (j = 0; j < 2; j++) {
+			array = rq->arrays + j;
+			for (k = 0; k < MAX_PRIO; k++) {
+				INIT_LIST_HEAD(array->queue + k);
+				__clear_bit(k, array->bitmap);
+			}
+			// delimiter for bitsearch
+			__set_bit(MAX_PRIO, array->bitmap);
+		}
+	}
+
+	/*
+	 * The boot idle thread does lazy MMU switching as well:
+	 */
+	atomic_inc(&init_mm.mm_count);
+	enter_lazy_tlb(&init_mm, current);
+
+	/*
+	 * Make us the idle thread. Technically, schedule() should not be
+	 * called from this thread, however somewhere below it might be,
+	 * but because we are the idle thread, we just pick up running again
+	 * when this runqueue becomes "idle".
+	 */
+	init_idle(current, smp_processor_id());
+}
+
+#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
+void __might_sleep(char *file, int line)
+{
+#if defined(in_atomic)
+	static unsigned long prev_jiffy;	/* ratelimiting */
+
+	if ((in_atomic() || irqs_disabled()) &&
+	    system_state == SYSTEM_RUNNING && !oops_in_progress) {
+		if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
+			return;
+		prev_jiffy = jiffies;
+		printk(KERN_ERR "Debug: sleeping function called from invalid"
+				" context at %s:%d\n", file, line);
+		printk("in_atomic():%d, irqs_disabled():%d\n",
+			in_atomic(), irqs_disabled());
+		dump_stack();
+	}
+#endif
+}
+EXPORT_SYMBOL(__might_sleep);
+#endif
+
+#ifdef CONFIG_MAGIC_SYSRQ
+void normalize_rt_tasks(void)
+{
+	struct task_struct *p;
+	prio_array_t *array;
+	unsigned long flags;
+	runqueue_t *rq;
+
+	read_lock_irq(&tasklist_lock);
+	for_each_process (p) {
+		if (!rt_task(p))
+			continue;
+
+		rq = task_rq_lock(p, &flags);
+
+		array = p->array;
+		if (array)
+			deactivate_task(p, task_rq(p));
+		__setscheduler(p, SCHED_NORMAL, 0);
+		if (array) {
+			__activate_task(p, task_rq(p));
+			resched_task(rq->curr);
+		}
+
+		task_rq_unlock(rq, &flags);
+	}
+	read_unlock_irq(&tasklist_lock);
+}
+
+#endif /* CONFIG_MAGIC_SYSRQ */
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
new file mode 100644
index 000000000000..c3391b6020e8
--- /dev/null
+++ b/kernel/seccomp.c
@@ -0,0 +1,56 @@
+/*
+ * linux/kernel/seccomp.c
+ *
+ * Copyright 2004-2005  Andrea Arcangeli <andrea@cpushare.com>
+ *
+ * This defines a simple but solid secure-computing mode.
+ */
+
+#include <linux/seccomp.h>
+#include <linux/sched.h>
+
+/* #define SECCOMP_DEBUG 1 */
+
+/*
+ * Secure computing mode 1 allows only read/write/exit/sigreturn.
+ * To be fully secure this must be combined with rlimit
+ * to limit the stack allocations too.
+ */
+static int mode1_syscalls[] = {
+	__NR_seccomp_read, __NR_seccomp_write, __NR_seccomp_exit, __NR_seccomp_sigreturn,
+	0, /* null terminated */
+};
+
+#ifdef TIF_32BIT
+static int mode1_syscalls_32[] = {
+	__NR_seccomp_read_32, __NR_seccomp_write_32, __NR_seccomp_exit_32, __NR_seccomp_sigreturn_32,
+	0, /* null terminated */
+};
+#endif
+
+void __secure_computing(int this_syscall)
+{
+	int mode = current->seccomp.mode;
+	int * syscall;
+
+	switch (mode) {
+	case 1:
+		syscall = mode1_syscalls;
+#ifdef TIF_32BIT
+		if (test_thread_flag(TIF_32BIT))
+			syscall = mode1_syscalls_32;
+#endif
+		do {
+			if (*syscall == this_syscall)
+				return;
+		} while (*++syscall);
+		break;
+	default:
+		BUG();
+	}
+
+#ifdef SECCOMP_DEBUG
+	dump_stack();
+#endif
+	do_exit(SIGKILL);
+}
diff --git a/kernel/signal.c b/kernel/signal.c
new file mode 100644
index 000000000000..f00a1d610f0b
--- /dev/null
+++ b/kernel/signal.c
@@ -0,0 +1,2662 @@
+/*
+ *  linux/kernel/signal.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ *  1997-11-02  Modified for POSIX.1b signals by Richard Henderson
+ *
+ *  2003-06-02  Jim Houston - Concurrent Computer Corp.
+ *		Changes to use preallocated sigqueue structures
+ *		to allow signals to be sent reliably.
+ */
+
+#include <linux/config.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/smp_lock.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/tty.h>
+#include <linux/binfmts.h>
+#include <linux/security.h>
+#include <linux/syscalls.h>
+#include <linux/ptrace.h>
+#include <linux/posix-timers.h>
+#include <asm/param.h>
+#include <asm/uaccess.h>
+#include <asm/unistd.h>
+#include <asm/siginfo.h>
+
+/*
+ * SLAB caches for signal bits.
+ */
+
+static kmem_cache_t *sigqueue_cachep;
+
+/*
+ * In POSIX a signal is sent either to a specific thread (Linux task)
+ * or to the process as a whole (Linux thread group).  How the signal
+ * is sent determines whether it's to one thread or the whole group,
+ * which determines which signal mask(s) are involved in blocking it
+ * from being delivered until later.  When the signal is delivered,
+ * either it's caught or ignored by a user handler or it has a default
+ * effect that applies to the whole thread group (POSIX process).
+ *
+ * The possible effects an unblocked signal set to SIG_DFL can have are:
+ *   ignore	- Nothing Happens
+ *   terminate	- kill the process, i.e. all threads in the group,
+ * 		  similar to exit_group.  The group leader (only) reports
+ *		  WIFSIGNALED status to its parent.
+ *   coredump	- write a core dump file describing all threads using
+ *		  the same mm and then kill all those threads
+ *   stop 	- stop all the threads in the group, i.e. TASK_STOPPED state
+ *
+ * SIGKILL and SIGSTOP cannot be caught, blocked, or ignored.
+ * Other signals when not blocked and set to SIG_DFL behaves as follows.
+ * The job control signals also have other special effects.
+ *
+ *	+--------------------+------------------+
+ *	|  POSIX signal      |  default action  |
+ *	+--------------------+------------------+
+ *	|  SIGHUP            |  terminate	|
+ *	|  SIGINT            |	terminate	|
+ *	|  SIGQUIT           |	coredump 	|
+ *	|  SIGILL            |	coredump 	|
+ *	|  SIGTRAP           |	coredump 	|
+ *	|  SIGABRT/SIGIOT    |	coredump 	|
+ *	|  SIGBUS            |	coredump 	|
+ *	|  SIGFPE            |	coredump 	|
+ *	|  SIGKILL           |	terminate(+)	|
+ *	|  SIGUSR1           |	terminate	|
+ *	|  SIGSEGV           |	coredump 	|
+ *	|  SIGUSR2           |	terminate	|
+ *	|  SIGPIPE           |	terminate	|
+ *	|  SIGALRM           |	terminate	|
+ *	|  SIGTERM           |	terminate	|
+ *	|  SIGCHLD           |	ignore   	|
+ *	|  SIGCONT           |	ignore(*)	|
+ *	|  SIGSTOP           |	stop(*)(+)  	|
+ *	|  SIGTSTP           |	stop(*)  	|
+ *	|  SIGTTIN           |	stop(*)  	|
+ *	|  SIGTTOU           |	stop(*)  	|
+ *	|  SIGURG            |	ignore   	|
+ *	|  SIGXCPU           |	coredump 	|
+ *	|  SIGXFSZ           |	coredump 	|
+ *	|  SIGVTALRM         |	terminate	|
+ *	|  SIGPROF           |	terminate	|
+ *	|  SIGPOLL/SIGIO     |	terminate	|
+ *	|  SIGSYS/SIGUNUSED  |	coredump 	|
+ *	|  SIGSTKFLT         |	terminate	|
+ *	|  SIGWINCH          |	ignore   	|
+ *	|  SIGPWR            |	terminate	|
+ *	|  SIGRTMIN-SIGRTMAX |	terminate       |
+ *	+--------------------+------------------+
+ *	|  non-POSIX signal  |  default action  |
+ *	+--------------------+------------------+
+ *	|  SIGEMT            |  coredump	|
+ *	+--------------------+------------------+
+ *
+ * (+) For SIGKILL and SIGSTOP the action is "always", not just "default".
+ * (*) Special job control effects:
+ * When SIGCONT is sent, it resumes the process (all threads in the group)
+ * from TASK_STOPPED state and also clears any pending/queued stop signals
+ * (any of those marked with "stop(*)").  This happens regardless of blocking,
+ * catching, or ignoring SIGCONT.  When any stop signal is sent, it clears
+ * any pending/queued SIGCONT signals; this happens regardless of blocking,
+ * catching, or ignored the stop signal, though (except for SIGSTOP) the
+ * default action of stopping the process may happen later or never.
+ */
+
+#ifdef SIGEMT
+#define M_SIGEMT	M(SIGEMT)
+#else
+#define M_SIGEMT	0
+#endif
+
+#if SIGRTMIN > BITS_PER_LONG
+#define M(sig) (1ULL << ((sig)-1))
+#else
+#define M(sig) (1UL << ((sig)-1))
+#endif
+#define T(sig, mask) (M(sig) & (mask))
+
+#define SIG_KERNEL_ONLY_MASK (\
+	M(SIGKILL)   |  M(SIGSTOP)                                   )
+
+#define SIG_KERNEL_STOP_MASK (\
+	M(SIGSTOP)   |  M(SIGTSTP)   |  M(SIGTTIN)   |  M(SIGTTOU)   )
+
+#define SIG_KERNEL_COREDUMP_MASK (\
+        M(SIGQUIT)   |  M(SIGILL)    |  M(SIGTRAP)   |  M(SIGABRT)   | \
+        M(SIGFPE)    |  M(SIGSEGV)   |  M(SIGBUS)    |  M(SIGSYS)    | \
+        M(SIGXCPU)   |  M(SIGXFSZ)   |  M_SIGEMT                     )
+
+#define SIG_KERNEL_IGNORE_MASK (\
+        M(SIGCONT)   |  M(SIGCHLD)   |  M(SIGWINCH)  |  M(SIGURG)    )
+
+#define sig_kernel_only(sig) \
+		(((sig) < SIGRTMIN)  && T(sig, SIG_KERNEL_ONLY_MASK))
+#define sig_kernel_coredump(sig) \
+		(((sig) < SIGRTMIN)  && T(sig, SIG_KERNEL_COREDUMP_MASK))
+#define sig_kernel_ignore(sig) \
+		(((sig) < SIGRTMIN)  && T(sig, SIG_KERNEL_IGNORE_MASK))
+#define sig_kernel_stop(sig) \
+		(((sig) < SIGRTMIN)  && T(sig, SIG_KERNEL_STOP_MASK))
+
+#define sig_user_defined(t, signr) \
+	(((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_DFL) &&	\
+	 ((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_IGN))
+
+#define sig_fatal(t, signr) \
+	(!T(signr, SIG_KERNEL_IGNORE_MASK|SIG_KERNEL_STOP_MASK) && \
+	 (t)->sighand->action[(signr)-1].sa.sa_handler == SIG_DFL)
+
+static int sig_ignored(struct task_struct *t, int sig)
+{
+	void __user * handler;
+
+	/*
+	 * Tracers always want to know about signals..
+	 */
+	if (t->ptrace & PT_PTRACED)
+		return 0;
+
+	/*
+	 * Blocked signals are never ignored, since the
+	 * signal handler may change by the time it is
+	 * unblocked.
+	 */
+	if (sigismember(&t->blocked, sig))
+		return 0;
+
+	/* Is it explicitly or implicitly ignored? */
+	handler = t->sighand->action[sig-1].sa.sa_handler;
+	return   handler == SIG_IGN ||
+		(handler == SIG_DFL && sig_kernel_ignore(sig));
+}
+
+/*
+ * Re-calculate pending state from the set of locally pending
+ * signals, globally pending signals, and blocked signals.
+ */
+static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked)
+{
+	unsigned long ready;
+	long i;
+
+	switch (_NSIG_WORDS) {
+	default:
+		for (i = _NSIG_WORDS, ready = 0; --i >= 0 ;)
+			ready |= signal->sig[i] &~ blocked->sig[i];
+		break;
+
+	case 4: ready  = signal->sig[3] &~ blocked->sig[3];
+		ready |= signal->sig[2] &~ blocked->sig[2];
+		ready |= signal->sig[1] &~ blocked->sig[1];
+		ready |= signal->sig[0] &~ blocked->sig[0];
+		break;
+
+	case 2: ready  = signal->sig[1] &~ blocked->sig[1];
+		ready |= signal->sig[0] &~ blocked->sig[0];
+		break;
+
+	case 1: ready  = signal->sig[0] &~ blocked->sig[0];
+	}
+	return ready !=	0;
+}
+
+#define PENDING(p,b) has_pending_signals(&(p)->signal, (b))
+
+fastcall void recalc_sigpending_tsk(struct task_struct *t)
+{
+	if (t->signal->group_stop_count > 0 ||
+	    PENDING(&t->pending, &t->blocked) ||
+	    PENDING(&t->signal->shared_pending, &t->blocked))
+		set_tsk_thread_flag(t, TIF_SIGPENDING);
+	else
+		clear_tsk_thread_flag(t, TIF_SIGPENDING);
+}
+
+void recalc_sigpending(void)
+{
+	recalc_sigpending_tsk(current);
+}
+
+/* Given the mask, find the first available signal that should be serviced. */
+
+static int
+next_signal(struct sigpending *pending, sigset_t *mask)
+{
+	unsigned long i, *s, *m, x;
+	int sig = 0;
+	
+	s = pending->signal.sig;
+	m = mask->sig;
+	switch (_NSIG_WORDS) {
+	default:
+		for (i = 0; i < _NSIG_WORDS; ++i, ++s, ++m)
+			if ((x = *s &~ *m) != 0) {
+				sig = ffz(~x) + i*_NSIG_BPW + 1;
+				break;
+			}
+		break;
+
+	case 2: if ((x = s[0] &~ m[0]) != 0)
+			sig = 1;
+		else if ((x = s[1] &~ m[1]) != 0)
+			sig = _NSIG_BPW + 1;
+		else
+			break;
+		sig += ffz(~x);
+		break;
+
+	case 1: if ((x = *s &~ *m) != 0)
+			sig = ffz(~x) + 1;
+		break;
+	}
+	
+	return sig;
+}
+
+static struct sigqueue *__sigqueue_alloc(struct task_struct *t, unsigned int __nocast flags,
+					 int override_rlimit)
+{
+	struct sigqueue *q = NULL;
+
+	atomic_inc(&t->user->sigpending);
+	if (override_rlimit ||
+	    atomic_read(&t->user->sigpending) <=
+			t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur)
+		q = kmem_cache_alloc(sigqueue_cachep, flags);
+	if (unlikely(q == NULL)) {
+		atomic_dec(&t->user->sigpending);
+	} else {
+		INIT_LIST_HEAD(&q->list);
+		q->flags = 0;
+		q->lock = NULL;
+		q->user = get_uid(t->user);
+	}
+	return(q);
+}
+
+static inline void __sigqueue_free(struct sigqueue *q)
+{
+	if (q->flags & SIGQUEUE_PREALLOC)
+		return;
+	atomic_dec(&q->user->sigpending);
+	free_uid(q->user);
+	kmem_cache_free(sigqueue_cachep, q);
+}
+
+static void flush_sigqueue(struct sigpending *queue)
+{
+	struct sigqueue *q;
+
+	sigemptyset(&queue->signal);
+	while (!list_empty(&queue->list)) {
+		q = list_entry(queue->list.next, struct sigqueue , list);
+		list_del_init(&q->list);
+		__sigqueue_free(q);
+	}
+}
+
+/*
+ * Flush all pending signals for a task.
+ */
+
+void
+flush_signals(struct task_struct *t)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&t->sighand->siglock, flags);
+	clear_tsk_thread_flag(t,TIF_SIGPENDING);
+	flush_sigqueue(&t->pending);
+	flush_sigqueue(&t->signal->shared_pending);
+	spin_unlock_irqrestore(&t->sighand->siglock, flags);
+}
+
+/*
+ * This function expects the tasklist_lock write-locked.
+ */
+void __exit_sighand(struct task_struct *tsk)
+{
+	struct sighand_struct * sighand = tsk->sighand;
+
+	/* Ok, we're done with the signal handlers */
+	tsk->sighand = NULL;
+	if (atomic_dec_and_test(&sighand->count))
+		kmem_cache_free(sighand_cachep, sighand);
+}
+
+void exit_sighand(struct task_struct *tsk)
+{
+	write_lock_irq(&tasklist_lock);
+	__exit_sighand(tsk);
+	write_unlock_irq(&tasklist_lock);
+}
+
+/*
+ * This function expects the tasklist_lock write-locked.
+ */
+void __exit_signal(struct task_struct *tsk)
+{
+	struct signal_struct * sig = tsk->signal;
+	struct sighand_struct * sighand = tsk->sighand;
+
+	if (!sig)
+		BUG();
+	if (!atomic_read(&sig->count))
+		BUG();
+	spin_lock(&sighand->siglock);
+	posix_cpu_timers_exit(tsk);
+	if (atomic_dec_and_test(&sig->count)) {
+		posix_cpu_timers_exit_group(tsk);
+		if (tsk == sig->curr_target)
+			sig->curr_target = next_thread(tsk);
+		tsk->signal = NULL;
+		spin_unlock(&sighand->siglock);
+		flush_sigqueue(&sig->shared_pending);
+	} else {
+		/*
+		 * If there is any task waiting for the group exit
+		 * then notify it:
+		 */
+		if (sig->group_exit_task && atomic_read(&sig->count) == sig->notify_count) {
+			wake_up_process(sig->group_exit_task);
+			sig->group_exit_task = NULL;
+		}
+		if (tsk == sig->curr_target)
+			sig->curr_target = next_thread(tsk);
+		tsk->signal = NULL;
+		/*
+		 * Accumulate here the counters for all threads but the
+		 * group leader as they die, so they can be added into
+		 * the process-wide totals when those are taken.
+		 * The group leader stays around as a zombie as long
+		 * as there are other threads.  When it gets reaped,
+		 * the exit.c code will add its counts into these totals.
+		 * We won't ever get here for the group leader, since it
+		 * will have been the last reference on the signal_struct.
+		 */
+		sig->utime = cputime_add(sig->utime, tsk->utime);
+		sig->stime = cputime_add(sig->stime, tsk->stime);
+		sig->min_flt += tsk->min_flt;
+		sig->maj_flt += tsk->maj_flt;
+		sig->nvcsw += tsk->nvcsw;
+		sig->nivcsw += tsk->nivcsw;
+		sig->sched_time += tsk->sched_time;
+		spin_unlock(&sighand->siglock);
+		sig = NULL;	/* Marker for below.  */
+	}
+	clear_tsk_thread_flag(tsk,TIF_SIGPENDING);
+	flush_sigqueue(&tsk->pending);
+	if (sig) {
+		/*
+		 * We are cleaning up the signal_struct here.  We delayed
+		 * calling exit_itimers until after flush_sigqueue, just in
+		 * case our thread-local pending queue contained a queued
+		 * timer signal that would have been cleared in
+		 * exit_itimers.  When that called sigqueue_free, it would
+		 * attempt to re-take the tasklist_lock and deadlock.  This
+		 * can never happen if we ensure that all queues the
+		 * timer's signal might be queued on have been flushed
+		 * first.  The shared_pending queue, and our own pending
+		 * queue are the only queues the timer could be on, since
+		 * there are no other threads left in the group and timer
+		 * signals are constrained to threads inside the group.
+		 */
+		exit_itimers(sig);
+		exit_thread_group_keys(sig);
+		kmem_cache_free(signal_cachep, sig);
+	}
+}
+
+void exit_signal(struct task_struct *tsk)
+{
+	write_lock_irq(&tasklist_lock);
+	__exit_signal(tsk);
+	write_unlock_irq(&tasklist_lock);
+}
+
+/*
+ * Flush all handlers for a task.
+ */
+
+void
+flush_signal_handlers(struct task_struct *t, int force_default)
+{
+	int i;
+	struct k_sigaction *ka = &t->sighand->action[0];
+	for (i = _NSIG ; i != 0 ; i--) {
+		if (force_default || ka->sa.sa_handler != SIG_IGN)
+			ka->sa.sa_handler = SIG_DFL;
+		ka->sa.sa_flags = 0;
+		sigemptyset(&ka->sa.sa_mask);
+		ka++;
+	}
+}
+
+
+/* Notify the system that a driver wants to block all signals for this
+ * process, and wants to be notified if any signals at all were to be
+ * sent/acted upon.  If the notifier routine returns non-zero, then the
+ * signal will be acted upon after all.  If the notifier routine returns 0,
+ * then then signal will be blocked.  Only one block per process is
+ * allowed.  priv is a pointer to private data that the notifier routine
+ * can use to determine if the signal should be blocked or not.  */
+
+void
+block_all_signals(int (*notifier)(void *priv), void *priv, sigset_t *mask)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&current->sighand->siglock, flags);
+	current->notifier_mask = mask;
+	current->notifier_data = priv;
+	current->notifier = notifier;
+	spin_unlock_irqrestore(&current->sighand->siglock, flags);
+}
+
+/* Notify the system that blocking has ended. */
+
+void
+unblock_all_signals(void)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&current->sighand->siglock, flags);
+	current->notifier = NULL;
+	current->notifier_data = NULL;
+	recalc_sigpending();
+	spin_unlock_irqrestore(&current->sighand->siglock, flags);
+}
+
+static inline int collect_signal(int sig, struct sigpending *list, siginfo_t *info)
+{
+	struct sigqueue *q, *first = NULL;
+	int still_pending = 0;
+
+	if (unlikely(!sigismember(&list->signal, sig)))
+		return 0;
+
+	/*
+	 * Collect the siginfo appropriate to this signal.  Check if
+	 * there is another siginfo for the same signal.
+	*/
+	list_for_each_entry(q, &list->list, list) {
+		if (q->info.si_signo == sig) {
+			if (first) {
+				still_pending = 1;
+				break;
+			}
+			first = q;
+		}
+	}
+	if (first) {
+		list_del_init(&first->list);
+		copy_siginfo(info, &first->info);
+		__sigqueue_free(first);
+		if (!still_pending)
+			sigdelset(&list->signal, sig);
+	} else {
+
+		/* Ok, it wasn't in the queue.  This must be
+		   a fast-pathed signal or we must have been
+		   out of queue space.  So zero out the info.
+		 */
+		sigdelset(&list->signal, sig);
+		info->si_signo = sig;
+		info->si_errno = 0;
+		info->si_code = 0;
+		info->si_pid = 0;
+		info->si_uid = 0;
+	}
+	return 1;
+}
+
+static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
+			siginfo_t *info)
+{
+	int sig = 0;
+
+	sig = next_signal(pending, mask);
+	if (sig) {
+		if (current->notifier) {
+			if (sigismember(current->notifier_mask, sig)) {
+				if (!(current->notifier)(current->notifier_data)) {
+					clear_thread_flag(TIF_SIGPENDING);
+					return 0;
+				}
+			}
+		}
+
+		if (!collect_signal(sig, pending, info))
+			sig = 0;
+				
+	}
+	recalc_sigpending();
+
+	return sig;
+}
+
+/*
+ * Dequeue a signal and return the element to the caller, which is 
+ * expected to free it.
+ *
+ * All callers have to hold the siglock.
+ */
+int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
+{
+	int signr = __dequeue_signal(&tsk->pending, mask, info);
+	if (!signr)
+		signr = __dequeue_signal(&tsk->signal->shared_pending,
+					 mask, info);
+ 	if (signr && unlikely(sig_kernel_stop(signr))) {
+ 		/*
+ 		 * Set a marker that we have dequeued a stop signal.  Our
+ 		 * caller might release the siglock and then the pending
+ 		 * stop signal it is about to process is no longer in the
+ 		 * pending bitmasks, but must still be cleared by a SIGCONT
+ 		 * (and overruled by a SIGKILL).  So those cases clear this
+ 		 * shared flag after we've set it.  Note that this flag may
+ 		 * remain set after the signal we return is ignored or
+ 		 * handled.  That doesn't matter because its only purpose
+ 		 * is to alert stop-signal processing code when another
+ 		 * processor has come along and cleared the flag.
+ 		 */
+ 		tsk->signal->flags |= SIGNAL_STOP_DEQUEUED;
+ 	}
+	if ( signr &&
+	     ((info->si_code & __SI_MASK) == __SI_TIMER) &&
+	     info->si_sys_private){
+		/*
+		 * Release the siglock to ensure proper locking order
+		 * of timer locks outside of siglocks.  Note, we leave
+		 * irqs disabled here, since the posix-timers code is
+		 * about to disable them again anyway.
+		 */
+		spin_unlock(&tsk->sighand->siglock);
+		do_schedule_next_timer(info);
+		spin_lock(&tsk->sighand->siglock);
+	}
+	return signr;
+}
+
+/*
+ * Tell a process that it has a new active signal..
+ *
+ * NOTE! we rely on the previous spin_lock to
+ * lock interrupts for us! We can only be called with
+ * "siglock" held, and the local interrupt must
+ * have been disabled when that got acquired!
+ *
+ * No need to set need_resched since signal event passing
+ * goes through ->blocked
+ */
+void signal_wake_up(struct task_struct *t, int resume)
+{
+	unsigned int mask;
+
+	set_tsk_thread_flag(t, TIF_SIGPENDING);
+
+	/*
+	 * For SIGKILL, we want to wake it up in the stopped/traced case.
+	 * We don't check t->state here because there is a race with it
+	 * executing another processor and just now entering stopped state.
+	 * By using wake_up_state, we ensure the process will wake up and
+	 * handle its death signal.
+	 */
+	mask = TASK_INTERRUPTIBLE;
+	if (resume)
+		mask |= TASK_STOPPED | TASK_TRACED;
+	if (!wake_up_state(t, mask))
+		kick_process(t);
+}
+
+/*
+ * Remove signals in mask from the pending set and queue.
+ * Returns 1 if any signals were found.
+ *
+ * All callers must be holding the siglock.
+ */
+static int rm_from_queue(unsigned long mask, struct sigpending *s)
+{
+	struct sigqueue *q, *n;
+
+	if (!sigtestsetmask(&s->signal, mask))
+		return 0;
+
+	sigdelsetmask(&s->signal, mask);
+	list_for_each_entry_safe(q, n, &s->list, list) {
+		if (q->info.si_signo < SIGRTMIN &&
+		    (mask & sigmask(q->info.si_signo))) {
+			list_del_init(&q->list);
+			__sigqueue_free(q);
+		}
+	}
+	return 1;
+}
+
+/*
+ * Bad permissions for sending the signal
+ */
+static int check_kill_permission(int sig, struct siginfo *info,
+				 struct task_struct *t)
+{
+	int error = -EINVAL;
+	if (sig < 0 || sig > _NSIG)
+		return error;
+	error = -EPERM;
+	if ((!info || ((unsigned long)info != 1 &&
+			(unsigned long)info != 2 && SI_FROMUSER(info)))
+	    && ((sig != SIGCONT) ||
+		(current->signal->session != t->signal->session))
+	    && (current->euid ^ t->suid) && (current->euid ^ t->uid)
+	    && (current->uid ^ t->suid) && (current->uid ^ t->uid)
+	    && !capable(CAP_KILL))
+		return error;
+	return security_task_kill(t, info, sig);
+}
+
+/* forward decl */
+static void do_notify_parent_cldstop(struct task_struct *tsk,
+				     struct task_struct *parent,
+				     int why);
+
+/*
+ * Handle magic process-wide effects of stop/continue signals.
+ * Unlike the signal actions, these happen immediately at signal-generation
+ * time regardless of blocking, ignoring, or handling.  This does the
+ * actual continuing for SIGCONT, but not the actual stopping for stop
+ * signals.  The process stop is done as a signal action for SIG_DFL.
+ */
+static void handle_stop_signal(int sig, struct task_struct *p)
+{
+	struct task_struct *t;
+
+	if (p->flags & SIGNAL_GROUP_EXIT)
+		/*
+		 * The process is in the middle of dying already.
+		 */
+		return;
+
+	if (sig_kernel_stop(sig)) {
+		/*
+		 * This is a stop signal.  Remove SIGCONT from all queues.
+		 */
+		rm_from_queue(sigmask(SIGCONT), &p->signal->shared_pending);
+		t = p;
+		do {
+			rm_from_queue(sigmask(SIGCONT), &t->pending);
+			t = next_thread(t);
+		} while (t != p);
+	} else if (sig == SIGCONT) {
+		/*
+		 * Remove all stop signals from all queues,
+		 * and wake all threads.
+		 */
+		if (unlikely(p->signal->group_stop_count > 0)) {
+			/*
+			 * There was a group stop in progress.  We'll
+			 * pretend it finished before we got here.  We are
+			 * obliged to report it to the parent: if the
+			 * SIGSTOP happened "after" this SIGCONT, then it
+			 * would have cleared this pending SIGCONT.  If it
+			 * happened "before" this SIGCONT, then the parent
+			 * got the SIGCHLD about the stop finishing before
+			 * the continue happened.  We do the notification
+			 * now, and it's as if the stop had finished and
+			 * the SIGCHLD was pending on entry to this kill.
+			 */
+			p->signal->group_stop_count = 0;
+			p->signal->flags = SIGNAL_STOP_CONTINUED;
+			spin_unlock(&p->sighand->siglock);
+			if (p->ptrace & PT_PTRACED)
+				do_notify_parent_cldstop(p, p->parent,
+							 CLD_STOPPED);
+			else
+				do_notify_parent_cldstop(
+					p->group_leader,
+					p->group_leader->real_parent,
+							 CLD_STOPPED);
+			spin_lock(&p->sighand->siglock);
+		}
+		rm_from_queue(SIG_KERNEL_STOP_MASK, &p->signal->shared_pending);
+		t = p;
+		do {
+			unsigned int state;
+			rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending);
+			
+			/*
+			 * If there is a handler for SIGCONT, we must make
+			 * sure that no thread returns to user mode before
+			 * we post the signal, in case it was the only
+			 * thread eligible to run the signal handler--then
+			 * it must not do anything between resuming and
+			 * running the handler.  With the TIF_SIGPENDING
+			 * flag set, the thread will pause and acquire the
+			 * siglock that we hold now and until we've queued
+			 * the pending signal. 
+			 *
+			 * Wake up the stopped thread _after_ setting
+			 * TIF_SIGPENDING
+			 */
+			state = TASK_STOPPED;
+			if (sig_user_defined(t, SIGCONT) && !sigismember(&t->blocked, SIGCONT)) {
+				set_tsk_thread_flag(t, TIF_SIGPENDING);
+				state |= TASK_INTERRUPTIBLE;
+			}
+			wake_up_state(t, state);
+
+			t = next_thread(t);
+		} while (t != p);
+
+		if (p->signal->flags & SIGNAL_STOP_STOPPED) {
+			/*
+			 * We were in fact stopped, and are now continued.
+			 * Notify the parent with CLD_CONTINUED.
+			 */
+			p->signal->flags = SIGNAL_STOP_CONTINUED;
+			p->signal->group_exit_code = 0;
+			spin_unlock(&p->sighand->siglock);
+			if (p->ptrace & PT_PTRACED)
+				do_notify_parent_cldstop(p, p->parent,
+							 CLD_CONTINUED);
+			else
+				do_notify_parent_cldstop(
+					p->group_leader,
+					p->group_leader->real_parent,
+							 CLD_CONTINUED);
+			spin_lock(&p->sighand->siglock);
+		} else {
+			/*
+			 * We are not stopped, but there could be a stop
+			 * signal in the middle of being processed after
+			 * being removed from the queue.  Clear that too.
+			 */
+			p->signal->flags = 0;
+		}
+	} else if (sig == SIGKILL) {
+		/*
+		 * Make sure that any pending stop signal already dequeued
+		 * is undone by the wakeup for SIGKILL.
+		 */
+		p->signal->flags = 0;
+	}
+}
+
+static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
+			struct sigpending *signals)
+{
+	struct sigqueue * q = NULL;
+	int ret = 0;
+
+	/*
+	 * fast-pathed signals for kernel-internal things like SIGSTOP
+	 * or SIGKILL.
+	 */
+	if ((unsigned long)info == 2)
+		goto out_set;
+
+	/* Real-time signals must be queued if sent by sigqueue, or
+	   some other real-time mechanism.  It is implementation
+	   defined whether kill() does so.  We attempt to do so, on
+	   the principle of least surprise, but since kill is not
+	   allowed to fail with EAGAIN when low on memory we just
+	   make sure at least one signal gets delivered and don't
+	   pass on the info struct.  */
+
+	q = __sigqueue_alloc(t, GFP_ATOMIC, (sig < SIGRTMIN &&
+					     ((unsigned long) info < 2 ||
+					      info->si_code >= 0)));
+	if (q) {
+		list_add_tail(&q->list, &signals->list);
+		switch ((unsigned long) info) {
+		case 0:
+			q->info.si_signo = sig;
+			q->info.si_errno = 0;
+			q->info.si_code = SI_USER;
+			q->info.si_pid = current->pid;
+			q->info.si_uid = current->uid;
+			break;
+		case 1:
+			q->info.si_signo = sig;
+			q->info.si_errno = 0;
+			q->info.si_code = SI_KERNEL;
+			q->info.si_pid = 0;
+			q->info.si_uid = 0;
+			break;
+		default:
+			copy_siginfo(&q->info, info);
+			break;
+		}
+	} else {
+		if (sig >= SIGRTMIN && info && (unsigned long)info != 1
+		   && info->si_code != SI_USER)
+		/*
+		 * Queue overflow, abort.  We may abort if the signal was rt
+		 * and sent by user using something other than kill().
+		 */
+			return -EAGAIN;
+		if (((unsigned long)info > 1) && (info->si_code == SI_TIMER))
+			/*
+			 * Set up a return to indicate that we dropped 
+			 * the signal.
+			 */
+			ret = info->si_sys_private;
+	}
+
+out_set:
+	sigaddset(&signals->signal, sig);
+	return ret;
+}
+
+#define LEGACY_QUEUE(sigptr, sig) \
+	(((sig) < SIGRTMIN) && sigismember(&(sigptr)->signal, (sig)))
+
+
+static int
+specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t)
+{
+	int ret = 0;
+
+	if (!irqs_disabled())
+		BUG();
+	assert_spin_locked(&t->sighand->siglock);
+
+	if (((unsigned long)info > 2) && (info->si_code == SI_TIMER))
+		/*
+		 * Set up a return to indicate that we dropped the signal.
+		 */
+		ret = info->si_sys_private;
+
+	/* Short-circuit ignored signals.  */
+	if (sig_ignored(t, sig))
+		goto out;
+
+	/* Support queueing exactly one non-rt signal, so that we
+	   can get more detailed information about the cause of
+	   the signal. */
+	if (LEGACY_QUEUE(&t->pending, sig))
+		goto out;
+
+	ret = send_signal(sig, info, t, &t->pending);
+	if (!ret && !sigismember(&t->blocked, sig))
+		signal_wake_up(t, sig == SIGKILL);
+out:
+	return ret;
+}
+
+/*
+ * Force a signal that the process can't ignore: if necessary
+ * we unblock the signal and change any SIG_IGN to SIG_DFL.
+ */
+
+int
+force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
+{
+	unsigned long int flags;
+	int ret;
+
+	spin_lock_irqsave(&t->sighand->siglock, flags);
+	if (sigismember(&t->blocked, sig) || t->sighand->action[sig-1].sa.sa_handler == SIG_IGN) {
+		t->sighand->action[sig-1].sa.sa_handler = SIG_DFL;
+		sigdelset(&t->blocked, sig);
+		recalc_sigpending_tsk(t);
+	}
+	ret = specific_send_sig_info(sig, info, t);
+	spin_unlock_irqrestore(&t->sighand->siglock, flags);
+
+	return ret;
+}
+
+void
+force_sig_specific(int sig, struct task_struct *t)
+{
+	unsigned long int flags;
+
+	spin_lock_irqsave(&t->sighand->siglock, flags);
+	if (t->sighand->action[sig-1].sa.sa_handler == SIG_IGN)
+		t->sighand->action[sig-1].sa.sa_handler = SIG_DFL;
+	sigdelset(&t->blocked, sig);
+	recalc_sigpending_tsk(t);
+	specific_send_sig_info(sig, (void *)2, t);
+	spin_unlock_irqrestore(&t->sighand->siglock, flags);
+}
+
+/*
+ * Test if P wants to take SIG.  After we've checked all threads with this,
+ * it's equivalent to finding no threads not blocking SIG.  Any threads not
+ * blocking SIG were ruled out because they are not running and already
+ * have pending signals.  Such threads will dequeue from the shared queue
+ * as soon as they're available, so putting the signal on the shared queue
+ * will be equivalent to sending it to one such thread.
+ */
+#define wants_signal(sig, p, mask) 			\
+	(!sigismember(&(p)->blocked, sig)		\
+	 && !((p)->state & mask)			\
+	 && !((p)->flags & PF_EXITING)			\
+	 && (task_curr(p) || !signal_pending(p)))
+
+
+static void
+__group_complete_signal(int sig, struct task_struct *p)
+{
+	unsigned int mask;
+	struct task_struct *t;
+
+	/*
+	 * Don't bother traced and stopped tasks (but
+	 * SIGKILL will punch through that).
+	 */
+	mask = TASK_STOPPED | TASK_TRACED;
+	if (sig == SIGKILL)
+		mask = 0;
+
+	/*
+	 * Now find a thread we can wake up to take the signal off the queue.
+	 *
+	 * If the main thread wants the signal, it gets first crack.
+	 * Probably the least surprising to the average bear.
+	 */
+	if (wants_signal(sig, p, mask))
+		t = p;
+	else if (thread_group_empty(p))
+		/*
+		 * There is just one thread and it does not need to be woken.
+		 * It will dequeue unblocked signals before it runs again.
+		 */
+		return;
+	else {
+		/*
+		 * Otherwise try to find a suitable thread.
+		 */
+		t = p->signal->curr_target;
+		if (t == NULL)
+			/* restart balancing at this thread */
+			t = p->signal->curr_target = p;
+		BUG_ON(t->tgid != p->tgid);
+
+		while (!wants_signal(sig, t, mask)) {
+			t = next_thread(t);
+			if (t == p->signal->curr_target)
+				/*
+				 * No thread needs to be woken.
+				 * Any eligible threads will see
+				 * the signal in the queue soon.
+				 */
+				return;
+		}
+		p->signal->curr_target = t;
+	}
+
+	/*
+	 * Found a killable thread.  If the signal will be fatal,
+	 * then start taking the whole group down immediately.
+	 */
+	if (sig_fatal(p, sig) && !(p->signal->flags & SIGNAL_GROUP_EXIT) &&
+	    !sigismember(&t->real_blocked, sig) &&
+	    (sig == SIGKILL || !(t->ptrace & PT_PTRACED))) {
+		/*
+		 * This signal will be fatal to the whole group.
+		 */
+		if (!sig_kernel_coredump(sig)) {
+			/*
+			 * Start a group exit and wake everybody up.
+			 * This way we don't have other threads
+			 * running and doing things after a slower
+			 * thread has the fatal signal pending.
+			 */
+			p->signal->flags = SIGNAL_GROUP_EXIT;
+			p->signal->group_exit_code = sig;
+			p->signal->group_stop_count = 0;
+			t = p;
+			do {
+				sigaddset(&t->pending.signal, SIGKILL);
+				signal_wake_up(t, 1);
+				t = next_thread(t);
+			} while (t != p);
+			return;
+		}
+
+		/*
+		 * There will be a core dump.  We make all threads other
+		 * than the chosen one go into a group stop so that nothing
+		 * happens until it gets scheduled, takes the signal off
+		 * the shared queue, and does the core dump.  This is a
+		 * little more complicated than strictly necessary, but it
+		 * keeps the signal state that winds up in the core dump
+		 * unchanged from the death state, e.g. which thread had
+		 * the core-dump signal unblocked.
+		 */
+		rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending);
+		rm_from_queue(SIG_KERNEL_STOP_MASK, &p->signal->shared_pending);
+		p->signal->group_stop_count = 0;
+		p->signal->group_exit_task = t;
+		t = p;
+		do {
+			p->signal->group_stop_count++;
+			signal_wake_up(t, 0);
+			t = next_thread(t);
+		} while (t != p);
+		wake_up_process(p->signal->group_exit_task);
+		return;
+	}
+
+	/*
+	 * The signal is already in the shared-pending queue.
+	 * Tell the chosen thread to wake up and dequeue it.
+	 */
+	signal_wake_up(t, sig == SIGKILL);
+	return;
+}
+
+int
+__group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
+{
+	int ret = 0;
+
+	assert_spin_locked(&p->sighand->siglock);
+	handle_stop_signal(sig, p);
+
+	if (((unsigned long)info > 2) && (info->si_code == SI_TIMER))
+		/*
+		 * Set up a return to indicate that we dropped the signal.
+		 */
+		ret = info->si_sys_private;
+
+	/* Short-circuit ignored signals.  */
+	if (sig_ignored(p, sig))
+		return ret;
+
+	if (LEGACY_QUEUE(&p->signal->shared_pending, sig))
+		/* This is a non-RT signal and we already have one queued.  */
+		return ret;
+
+	/*
+	 * Put this signal on the shared-pending queue, or fail with EAGAIN.
+	 * We always use the shared queue for process-wide signals,
+	 * to avoid several races.
+	 */
+	ret = send_signal(sig, info, p, &p->signal->shared_pending);
+	if (unlikely(ret))
+		return ret;
+
+	__group_complete_signal(sig, p);
+	return 0;
+}
+
+/*
+ * Nuke all other threads in the group.
+ */
+void zap_other_threads(struct task_struct *p)
+{
+	struct task_struct *t;
+
+	p->signal->flags = SIGNAL_GROUP_EXIT;
+	p->signal->group_stop_count = 0;
+
+	if (thread_group_empty(p))
+		return;
+
+	for (t = next_thread(p); t != p; t = next_thread(t)) {
+		/*
+		 * Don't bother with already dead threads
+		 */
+		if (t->exit_state)
+			continue;
+
+		/*
+		 * We don't want to notify the parent, since we are
+		 * killed as part of a thread group due to another
+		 * thread doing an execve() or similar. So set the
+		 * exit signal to -1 to allow immediate reaping of
+		 * the process.  But don't detach the thread group
+		 * leader.
+		 */
+		if (t != p->group_leader)
+			t->exit_signal = -1;
+
+		sigaddset(&t->pending.signal, SIGKILL);
+		rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending);
+		signal_wake_up(t, 1);
+	}
+}
+
+/*
+ * Must be called with the tasklist_lock held for reading!
+ */
+int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
+{
+	unsigned long flags;
+	int ret;
+
+	ret = check_kill_permission(sig, info, p);
+	if (!ret && sig && p->sighand) {
+		spin_lock_irqsave(&p->sighand->siglock, flags);
+		ret = __group_send_sig_info(sig, info, p);
+		spin_unlock_irqrestore(&p->sighand->siglock, flags);
+	}
+
+	return ret;
+}
+
+/*
+ * kill_pg_info() sends a signal to a process group: this is what the tty
+ * control characters do (^C, ^Z etc)
+ */
+
+int __kill_pg_info(int sig, struct siginfo *info, pid_t pgrp)
+{
+	struct task_struct *p = NULL;
+	int retval, success;
+
+	if (pgrp <= 0)
+		return -EINVAL;
+
+	success = 0;
+	retval = -ESRCH;
+	do_each_task_pid(pgrp, PIDTYPE_PGID, p) {
+		int err = group_send_sig_info(sig, info, p);
+		success |= !err;
+		retval = err;
+	} while_each_task_pid(pgrp, PIDTYPE_PGID, p);
+	return success ? 0 : retval;
+}
+
+int
+kill_pg_info(int sig, struct siginfo *info, pid_t pgrp)
+{
+	int retval;
+
+	read_lock(&tasklist_lock);
+	retval = __kill_pg_info(sig, info, pgrp);
+	read_unlock(&tasklist_lock);
+
+	return retval;
+}
+
+int
+kill_proc_info(int sig, struct siginfo *info, pid_t pid)
+{
+	int error;
+	struct task_struct *p;
+
+	read_lock(&tasklist_lock);
+	p = find_task_by_pid(pid);
+	error = -ESRCH;
+	if (p)
+		error = group_send_sig_info(sig, info, p);
+	read_unlock(&tasklist_lock);
+	return error;
+}
+
+
+/*
+ * kill_something_info() interprets pid in interesting ways just like kill(2).
+ *
+ * POSIX specifies that kill(-1,sig) is unspecified, but what we have
+ * is probably wrong.  Should make it like BSD or SYSV.
+ */
+
+static int kill_something_info(int sig, struct siginfo *info, int pid)
+{
+	if (!pid) {
+		return kill_pg_info(sig, info, process_group(current));
+	} else if (pid == -1) {
+		int retval = 0, count = 0;
+		struct task_struct * p;
+
+		read_lock(&tasklist_lock);
+		for_each_process(p) {
+			if (p->pid > 1 && p->tgid != current->tgid) {
+				int err = group_send_sig_info(sig, info, p);
+				++count;
+				if (err != -EPERM)
+					retval = err;
+			}
+		}
+		read_unlock(&tasklist_lock);
+		return count ? retval : -ESRCH;
+	} else if (pid < 0) {
+		return kill_pg_info(sig, info, -pid);
+	} else {
+		return kill_proc_info(sig, info, pid);
+	}
+}
+
+/*
+ * These are for backward compatibility with the rest of the kernel source.
+ */
+
+/*
+ * These two are the most common entry points.  They send a signal
+ * just to the specific thread.
+ */
+int
+send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
+{
+	int ret;
+	unsigned long flags;
+
+	/*
+	 * Make sure legacy kernel users don't send in bad values
+	 * (normal paths check this in check_kill_permission).
+	 */
+	if (sig < 0 || sig > _NSIG)
+		return -EINVAL;
+
+	/*
+	 * We need the tasklist lock even for the specific
+	 * thread case (when we don't need to follow the group
+	 * lists) in order to avoid races with "p->sighand"
+	 * going away or changing from under us.
+	 */
+	read_lock(&tasklist_lock);  
+	spin_lock_irqsave(&p->sighand->siglock, flags);
+	ret = specific_send_sig_info(sig, info, p);
+	spin_unlock_irqrestore(&p->sighand->siglock, flags);
+	read_unlock(&tasklist_lock);
+	return ret;
+}
+
+int
+send_sig(int sig, struct task_struct *p, int priv)
+{
+	return send_sig_info(sig, (void*)(long)(priv != 0), p);
+}
+
+/*
+ * This is the entry point for "process-wide" signals.
+ * They will go to an appropriate thread in the thread group.
+ */
+int
+send_group_sig_info(int sig, struct siginfo *info, struct task_struct *p)
+{
+	int ret;
+	read_lock(&tasklist_lock);
+	ret = group_send_sig_info(sig, info, p);
+	read_unlock(&tasklist_lock);
+	return ret;
+}
+
+void
+force_sig(int sig, struct task_struct *p)
+{
+	force_sig_info(sig, (void*)1L, p);
+}
+
+/*
+ * When things go south during signal handling, we
+ * will force a SIGSEGV. And if the signal that caused
+ * the problem was already a SIGSEGV, we'll want to
+ * make sure we don't even try to deliver the signal..
+ */
+int
+force_sigsegv(int sig, struct task_struct *p)
+{
+	if (sig == SIGSEGV) {
+		unsigned long flags;
+		spin_lock_irqsave(&p->sighand->siglock, flags);
+		p->sighand->action[sig - 1].sa.sa_handler = SIG_DFL;
+		spin_unlock_irqrestore(&p->sighand->siglock, flags);
+	}
+	force_sig(SIGSEGV, p);
+	return 0;
+}
+
+int
+kill_pg(pid_t pgrp, int sig, int priv)
+{
+	return kill_pg_info(sig, (void *)(long)(priv != 0), pgrp);
+}
+
+int
+kill_proc(pid_t pid, int sig, int priv)
+{
+	return kill_proc_info(sig, (void *)(long)(priv != 0), pid);
+}
+
+/*
+ * These functions support sending signals using preallocated sigqueue
+ * structures.  This is needed "because realtime applications cannot
+ * afford to lose notifications of asynchronous events, like timer
+ * expirations or I/O completions".  In the case of Posix Timers 
+ * we allocate the sigqueue structure from the timer_create.  If this
+ * allocation fails we are able to report the failure to the application
+ * with an EAGAIN error.
+ */
+ 
+struct sigqueue *sigqueue_alloc(void)
+{
+	struct sigqueue *q;
+
+	if ((q = __sigqueue_alloc(current, GFP_KERNEL, 0)))
+		q->flags |= SIGQUEUE_PREALLOC;
+	return(q);
+}
+
+void sigqueue_free(struct sigqueue *q)
+{
+	unsigned long flags;
+	BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
+	/*
+	 * If the signal is still pending remove it from the
+	 * pending queue.
+	 */
+	if (unlikely(!list_empty(&q->list))) {
+		read_lock(&tasklist_lock);  
+		spin_lock_irqsave(q->lock, flags);
+		if (!list_empty(&q->list))
+			list_del_init(&q->list);
+		spin_unlock_irqrestore(q->lock, flags);
+		read_unlock(&tasklist_lock);
+	}
+	q->flags &= ~SIGQUEUE_PREALLOC;
+	__sigqueue_free(q);
+}
+
+int
+send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
+{
+	unsigned long flags;
+	int ret = 0;
+
+	/*
+	 * We need the tasklist lock even for the specific
+	 * thread case (when we don't need to follow the group
+	 * lists) in order to avoid races with "p->sighand"
+	 * going away or changing from under us.
+	 */
+	BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
+	read_lock(&tasklist_lock);  
+	spin_lock_irqsave(&p->sighand->siglock, flags);
+	
+	if (unlikely(!list_empty(&q->list))) {
+		/*
+		 * If an SI_TIMER entry is already queue just increment
+		 * the overrun count.
+		 */
+		if (q->info.si_code != SI_TIMER)
+			BUG();
+		q->info.si_overrun++;
+		goto out;
+	} 
+	/* Short-circuit ignored signals.  */
+	if (sig_ignored(p, sig)) {
+		ret = 1;
+		goto out;
+	}
+
+	q->lock = &p->sighand->siglock;
+	list_add_tail(&q->list, &p->pending.list);
+	sigaddset(&p->pending.signal, sig);
+	if (!sigismember(&p->blocked, sig))
+		signal_wake_up(p, sig == SIGKILL);
+
+out:
+	spin_unlock_irqrestore(&p->sighand->siglock, flags);
+	read_unlock(&tasklist_lock);
+	return(ret);
+}
+
+int
+send_group_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
+{
+	unsigned long flags;
+	int ret = 0;
+
+	BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
+	read_lock(&tasklist_lock);
+	spin_lock_irqsave(&p->sighand->siglock, flags);
+	handle_stop_signal(sig, p);
+
+	/* Short-circuit ignored signals.  */
+	if (sig_ignored(p, sig)) {
+		ret = 1;
+		goto out;
+	}
+
+	if (unlikely(!list_empty(&q->list))) {
+		/*
+		 * If an SI_TIMER entry is already queue just increment
+		 * the overrun count.  Other uses should not try to
+		 * send the signal multiple times.
+		 */
+		if (q->info.si_code != SI_TIMER)
+			BUG();
+		q->info.si_overrun++;
+		goto out;
+	} 
+
+	/*
+	 * Put this signal on the shared-pending queue.
+	 * We always use the shared queue for process-wide signals,
+	 * to avoid several races.
+	 */
+	q->lock = &p->sighand->siglock;
+	list_add_tail(&q->list, &p->signal->shared_pending.list);
+	sigaddset(&p->signal->shared_pending.signal, sig);
+
+	__group_complete_signal(sig, p);
+out:
+	spin_unlock_irqrestore(&p->sighand->siglock, flags);
+	read_unlock(&tasklist_lock);
+	return(ret);
+}
+
+/*
+ * Wake up any threads in the parent blocked in wait* syscalls.
+ */
+static inline void __wake_up_parent(struct task_struct *p,
+				    struct task_struct *parent)
+{
+	wake_up_interruptible_sync(&parent->signal->wait_chldexit);
+}
+
+/*
+ * Let a parent know about the death of a child.
+ * For a stopped/continued status change, use do_notify_parent_cldstop instead.
+ */
+
+void do_notify_parent(struct task_struct *tsk, int sig)
+{
+	struct siginfo info;
+	unsigned long flags;
+	struct sighand_struct *psig;
+
+	BUG_ON(sig == -1);
+
+ 	/* do_notify_parent_cldstop should have been called instead.  */
+ 	BUG_ON(tsk->state & (TASK_STOPPED|TASK_TRACED));
+
+	BUG_ON(!tsk->ptrace &&
+	       (tsk->group_leader != tsk || !thread_group_empty(tsk)));
+
+	info.si_signo = sig;
+	info.si_errno = 0;
+	info.si_pid = tsk->pid;
+	info.si_uid = tsk->uid;
+
+	/* FIXME: find out whether or not this is supposed to be c*time. */
+	info.si_utime = cputime_to_jiffies(cputime_add(tsk->utime,
+						       tsk->signal->utime));
+	info.si_stime = cputime_to_jiffies(cputime_add(tsk->stime,
+						       tsk->signal->stime));
+
+	info.si_status = tsk->exit_code & 0x7f;
+	if (tsk->exit_code & 0x80)
+		info.si_code = CLD_DUMPED;
+	else if (tsk->exit_code & 0x7f)
+		info.si_code = CLD_KILLED;
+	else {
+		info.si_code = CLD_EXITED;
+		info.si_status = tsk->exit_code >> 8;
+	}
+
+	psig = tsk->parent->sighand;
+	spin_lock_irqsave(&psig->siglock, flags);
+	if (sig == SIGCHLD &&
+	    (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN ||
+	     (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT))) {
+		/*
+		 * We are exiting and our parent doesn't care.  POSIX.1
+		 * defines special semantics for setting SIGCHLD to SIG_IGN
+		 * or setting the SA_NOCLDWAIT flag: we should be reaped
+		 * automatically and not left for our parent's wait4 call.
+		 * Rather than having the parent do it as a magic kind of
+		 * signal handler, we just set this to tell do_exit that we
+		 * can be cleaned up without becoming a zombie.  Note that
+		 * we still call __wake_up_parent in this case, because a
+		 * blocked sys_wait4 might now return -ECHILD.
+		 *
+		 * Whether we send SIGCHLD or not for SA_NOCLDWAIT
+		 * is implementation-defined: we do (if you don't want
+		 * it, just use SIG_IGN instead).
+		 */
+		tsk->exit_signal = -1;
+		if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN)
+			sig = 0;
+	}
+	if (sig > 0 && sig <= _NSIG)
+		__group_send_sig_info(sig, &info, tsk->parent);
+	__wake_up_parent(tsk, tsk->parent);
+	spin_unlock_irqrestore(&psig->siglock, flags);
+}
+
+static void
+do_notify_parent_cldstop(struct task_struct *tsk, struct task_struct *parent,
+			 int why)
+{
+	struct siginfo info;
+	unsigned long flags;
+	struct sighand_struct *sighand;
+
+	info.si_signo = SIGCHLD;
+	info.si_errno = 0;
+	info.si_pid = tsk->pid;
+	info.si_uid = tsk->uid;
+
+	/* FIXME: find out whether or not this is supposed to be c*time. */
+	info.si_utime = cputime_to_jiffies(tsk->utime);
+	info.si_stime = cputime_to_jiffies(tsk->stime);
+
+ 	info.si_code = why;
+ 	switch (why) {
+ 	case CLD_CONTINUED:
+ 		info.si_status = SIGCONT;
+ 		break;
+ 	case CLD_STOPPED:
+ 		info.si_status = tsk->signal->group_exit_code & 0x7f;
+ 		break;
+ 	case CLD_TRAPPED:
+ 		info.si_status = tsk->exit_code & 0x7f;
+ 		break;
+ 	default:
+ 		BUG();
+ 	}
+
+	sighand = parent->sighand;
+	spin_lock_irqsave(&sighand->siglock, flags);
+	if (sighand->action[SIGCHLD-1].sa.sa_handler != SIG_IGN &&
+	    !(sighand->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDSTOP))
+		__group_send_sig_info(SIGCHLD, &info, parent);
+	/*
+	 * Even if SIGCHLD is not generated, we must wake up wait4 calls.
+	 */
+	__wake_up_parent(tsk, parent);
+	spin_unlock_irqrestore(&sighand->siglock, flags);
+}
+
+/*
+ * This must be called with current->sighand->siglock held.
+ *
+ * This should be the path for all ptrace stops.
+ * We always set current->last_siginfo while stopped here.
+ * That makes it a way to test a stopped process for
+ * being ptrace-stopped vs being job-control-stopped.
+ *
+ * If we actually decide not to stop at all because the tracer is gone,
+ * we leave nostop_code in current->exit_code.
+ */
+static void ptrace_stop(int exit_code, int nostop_code, siginfo_t *info)
+{
+	/*
+	 * If there is a group stop in progress,
+	 * we must participate in the bookkeeping.
+	 */
+	if (current->signal->group_stop_count > 0)
+		--current->signal->group_stop_count;
+
+	current->last_siginfo = info;
+	current->exit_code = exit_code;
+
+	/* Let the debugger run.  */
+	set_current_state(TASK_TRACED);
+	spin_unlock_irq(&current->sighand->siglock);
+	read_lock(&tasklist_lock);
+	if (likely(current->ptrace & PT_PTRACED) &&
+	    likely(current->parent != current->real_parent ||
+		   !(current->ptrace & PT_ATTACHED)) &&
+	    (likely(current->parent->signal != current->signal) ||
+	     !unlikely(current->signal->flags & SIGNAL_GROUP_EXIT))) {
+		do_notify_parent_cldstop(current, current->parent,
+					 CLD_TRAPPED);
+		read_unlock(&tasklist_lock);
+		schedule();
+	} else {
+		/*
+		 * By the time we got the lock, our tracer went away.
+		 * Don't stop here.
+		 */
+		read_unlock(&tasklist_lock);
+		set_current_state(TASK_RUNNING);
+		current->exit_code = nostop_code;
+	}
+
+	/*
+	 * We are back.  Now reacquire the siglock before touching
+	 * last_siginfo, so that we are sure to have synchronized with
+	 * any signal-sending on another CPU that wants to examine it.
+	 */
+	spin_lock_irq(&current->sighand->siglock);
+	current->last_siginfo = NULL;
+
+	/*
+	 * Queued signals ignored us while we were stopped for tracing.
+	 * So check for any that we should take before resuming user mode.
+	 */
+	recalc_sigpending();
+}
+
+void ptrace_notify(int exit_code)
+{
+	siginfo_t info;
+
+	BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP);
+
+	memset(&info, 0, sizeof info);
+	info.si_signo = SIGTRAP;
+	info.si_code = exit_code;
+	info.si_pid = current->pid;
+	info.si_uid = current->uid;
+
+	/* Let the debugger run.  */
+	spin_lock_irq(&current->sighand->siglock);
+	ptrace_stop(exit_code, 0, &info);
+	spin_unlock_irq(&current->sighand->siglock);
+}
+
+#ifndef HAVE_ARCH_GET_SIGNAL_TO_DELIVER
+
+static void
+finish_stop(int stop_count)
+{
+	/*
+	 * If there are no other threads in the group, or if there is
+	 * a group stop in progress and we are the last to stop,
+	 * report to the parent.  When ptraced, every thread reports itself.
+	 */
+	if (stop_count < 0 || (current->ptrace & PT_PTRACED)) {
+		read_lock(&tasklist_lock);
+		do_notify_parent_cldstop(current, current->parent,
+					 CLD_STOPPED);
+		read_unlock(&tasklist_lock);
+	}
+	else if (stop_count == 0) {
+		read_lock(&tasklist_lock);
+		do_notify_parent_cldstop(current->group_leader,
+					 current->group_leader->real_parent,
+					 CLD_STOPPED);
+		read_unlock(&tasklist_lock);
+	}
+
+	schedule();
+	/*
+	 * Now we don't run again until continued.
+	 */
+	current->exit_code = 0;
+}
+
+/*
+ * This performs the stopping for SIGSTOP and other stop signals.
+ * We have to stop all threads in the thread group.
+ * Returns nonzero if we've actually stopped and released the siglock.
+ * Returns zero if we didn't stop and still hold the siglock.
+ */
+static int
+do_signal_stop(int signr)
+{
+	struct signal_struct *sig = current->signal;
+	struct sighand_struct *sighand = current->sighand;
+	int stop_count = -1;
+
+	if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED))
+		return 0;
+
+	if (sig->group_stop_count > 0) {
+		/*
+		 * There is a group stop in progress.  We don't need to
+		 * start another one.
+		 */
+		signr = sig->group_exit_code;
+		stop_count = --sig->group_stop_count;
+		current->exit_code = signr;
+		set_current_state(TASK_STOPPED);
+		if (stop_count == 0)
+			sig->flags = SIGNAL_STOP_STOPPED;
+		spin_unlock_irq(&sighand->siglock);
+	}
+	else if (thread_group_empty(current)) {
+		/*
+		 * Lock must be held through transition to stopped state.
+		 */
+		current->exit_code = current->signal->group_exit_code = signr;
+		set_current_state(TASK_STOPPED);
+		sig->flags = SIGNAL_STOP_STOPPED;
+		spin_unlock_irq(&sighand->siglock);
+	}
+	else {
+		/*
+		 * There is no group stop already in progress.
+		 * We must initiate one now, but that requires
+		 * dropping siglock to get both the tasklist lock
+		 * and siglock again in the proper order.  Note that
+		 * this allows an intervening SIGCONT to be posted.
+		 * We need to check for that and bail out if necessary.
+		 */
+		struct task_struct *t;
+
+		spin_unlock_irq(&sighand->siglock);
+
+		/* signals can be posted during this window */
+
+		read_lock(&tasklist_lock);
+		spin_lock_irq(&sighand->siglock);
+
+		if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED)) {
+			/*
+			 * Another stop or continue happened while we
+			 * didn't have the lock.  We can just swallow this
+			 * signal now.  If we raced with a SIGCONT, that
+			 * should have just cleared it now.  If we raced
+			 * with another processor delivering a stop signal,
+			 * then the SIGCONT that wakes us up should clear it.
+			 */
+			read_unlock(&tasklist_lock);
+			return 0;
+		}
+
+		if (sig->group_stop_count == 0) {
+			sig->group_exit_code = signr;
+			stop_count = 0;
+			for (t = next_thread(current); t != current;
+			     t = next_thread(t))
+				/*
+				 * Setting state to TASK_STOPPED for a group
+				 * stop is always done with the siglock held,
+				 * so this check has no races.
+				 */
+				if (t->state < TASK_STOPPED) {
+					stop_count++;
+					signal_wake_up(t, 0);
+				}
+			sig->group_stop_count = stop_count;
+		}
+		else {
+			/* A race with another thread while unlocked.  */
+			signr = sig->group_exit_code;
+			stop_count = --sig->group_stop_count;
+		}
+
+		current->exit_code = signr;
+		set_current_state(TASK_STOPPED);
+		if (stop_count == 0)
+			sig->flags = SIGNAL_STOP_STOPPED;
+
+		spin_unlock_irq(&sighand->siglock);
+		read_unlock(&tasklist_lock);
+	}
+
+	finish_stop(stop_count);
+	return 1;
+}
+
+/*
+ * Do appropriate magic when group_stop_count > 0.
+ * We return nonzero if we stopped, after releasing the siglock.
+ * We return zero if we still hold the siglock and should look
+ * for another signal without checking group_stop_count again.
+ */
+static inline int handle_group_stop(void)
+{
+	int stop_count;
+
+	if (current->signal->group_exit_task == current) {
+		/*
+		 * Group stop is so we can do a core dump,
+		 * We are the initiating thread, so get on with it.
+		 */
+		current->signal->group_exit_task = NULL;
+		return 0;
+	}
+
+	if (current->signal->flags & SIGNAL_GROUP_EXIT)
+		/*
+		 * Group stop is so another thread can do a core dump,
+		 * or else we are racing against a death signal.
+		 * Just punt the stop so we can get the next signal.
+		 */
+		return 0;
+
+	/*
+	 * There is a group stop in progress.  We stop
+	 * without any associated signal being in our queue.
+	 */
+	stop_count = --current->signal->group_stop_count;
+	if (stop_count == 0)
+		current->signal->flags = SIGNAL_STOP_STOPPED;
+	current->exit_code = current->signal->group_exit_code;
+	set_current_state(TASK_STOPPED);
+	spin_unlock_irq(&current->sighand->siglock);
+	finish_stop(stop_count);
+	return 1;
+}
+
+int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka,
+			  struct pt_regs *regs, void *cookie)
+{
+	sigset_t *mask = &current->blocked;
+	int signr = 0;
+
+relock:
+	spin_lock_irq(&current->sighand->siglock);
+	for (;;) {
+		struct k_sigaction *ka;
+
+		if (unlikely(current->signal->group_stop_count > 0) &&
+		    handle_group_stop())
+			goto relock;
+
+		signr = dequeue_signal(current, mask, info);
+
+		if (!signr)
+			break; /* will return 0 */
+
+		if ((current->ptrace & PT_PTRACED) && signr != SIGKILL) {
+			ptrace_signal_deliver(regs, cookie);
+
+			/* Let the debugger run.  */
+			ptrace_stop(signr, signr, info);
+
+			/* We're back.  Did the debugger cancel the sig?  */
+			signr = current->exit_code;
+			if (signr == 0)
+				continue;
+
+			current->exit_code = 0;
+
+			/* Update the siginfo structure if the signal has
+			   changed.  If the debugger wanted something
+			   specific in the siginfo structure then it should
+			   have updated *info via PTRACE_SETSIGINFO.  */
+			if (signr != info->si_signo) {
+				info->si_signo = signr;
+				info->si_errno = 0;
+				info->si_code = SI_USER;
+				info->si_pid = current->parent->pid;
+				info->si_uid = current->parent->uid;
+			}
+
+			/* If the (new) signal is now blocked, requeue it.  */
+			if (sigismember(&current->blocked, signr)) {
+				specific_send_sig_info(signr, info, current);
+				continue;
+			}
+		}
+
+		ka = &current->sighand->action[signr-1];
+		if (ka->sa.sa_handler == SIG_IGN) /* Do nothing.  */
+			continue;
+		if (ka->sa.sa_handler != SIG_DFL) {
+			/* Run the handler.  */
+			*return_ka = *ka;
+
+			if (ka->sa.sa_flags & SA_ONESHOT)
+				ka->sa.sa_handler = SIG_DFL;
+
+			break; /* will return non-zero "signr" value */
+		}
+
+		/*
+		 * Now we are doing the default action for this signal.
+		 */
+		if (sig_kernel_ignore(signr)) /* Default is nothing. */
+			continue;
+
+		/* Init gets no signals it doesn't want.  */
+		if (current->pid == 1)
+			continue;
+
+		if (sig_kernel_stop(signr)) {
+			/*
+			 * The default action is to stop all threads in
+			 * the thread group.  The job control signals
+			 * do nothing in an orphaned pgrp, but SIGSTOP
+			 * always works.  Note that siglock needs to be
+			 * dropped during the call to is_orphaned_pgrp()
+			 * because of lock ordering with tasklist_lock.
+			 * This allows an intervening SIGCONT to be posted.
+			 * We need to check for that and bail out if necessary.
+			 */
+			if (signr != SIGSTOP) {
+				spin_unlock_irq(&current->sighand->siglock);
+
+				/* signals can be posted during this window */
+
+				if (is_orphaned_pgrp(process_group(current)))
+					goto relock;
+
+				spin_lock_irq(&current->sighand->siglock);
+			}
+
+			if (likely(do_signal_stop(signr))) {
+				/* It released the siglock.  */
+				goto relock;
+			}
+
+			/*
+			 * We didn't actually stop, due to a race
+			 * with SIGCONT or something like that.
+			 */
+			continue;
+		}
+
+		spin_unlock_irq(&current->sighand->siglock);
+
+		/*
+		 * Anything else is fatal, maybe with a core dump.
+		 */
+		current->flags |= PF_SIGNALED;
+		if (sig_kernel_coredump(signr)) {
+			/*
+			 * If it was able to dump core, this kills all
+			 * other threads in the group and synchronizes with
+			 * their demise.  If we lost the race with another
+			 * thread getting here, it set group_exit_code
+			 * first and our do_group_exit call below will use
+			 * that value and ignore the one we pass it.
+			 */
+			do_coredump((long)signr, signr, regs);
+		}
+
+		/*
+		 * Death signals, no core dump.
+		 */
+		do_group_exit(signr);
+		/* NOTREACHED */
+	}
+	spin_unlock_irq(&current->sighand->siglock);
+	return signr;
+}
+
+#endif
+
+EXPORT_SYMBOL(recalc_sigpending);
+EXPORT_SYMBOL_GPL(dequeue_signal);
+EXPORT_SYMBOL(flush_signals);
+EXPORT_SYMBOL(force_sig);
+EXPORT_SYMBOL(kill_pg);
+EXPORT_SYMBOL(kill_proc);
+EXPORT_SYMBOL(ptrace_notify);
+EXPORT_SYMBOL(send_sig);
+EXPORT_SYMBOL(send_sig_info);
+EXPORT_SYMBOL(sigprocmask);
+EXPORT_SYMBOL(block_all_signals);
+EXPORT_SYMBOL(unblock_all_signals);
+
+
+/*
+ * System call entry points.
+ */
+
+asmlinkage long sys_restart_syscall(void)
+{
+	struct restart_block *restart = &current_thread_info()->restart_block;
+	return restart->fn(restart);
+}
+
+long do_no_restart_syscall(struct restart_block *param)
+{
+	return -EINTR;
+}
+
+/*
+ * We don't need to get the kernel lock - this is all local to this
+ * particular thread.. (and that's good, because this is _heavily_
+ * used by various programs)
+ */
+
+/*
+ * This is also useful for kernel threads that want to temporarily
+ * (or permanently) block certain signals.
+ *
+ * NOTE! Unlike the user-mode sys_sigprocmask(), the kernel
+ * interface happily blocks "unblockable" signals like SIGKILL
+ * and friends.
+ */
+int sigprocmask(int how, sigset_t *set, sigset_t *oldset)
+{
+	int error;
+	sigset_t old_block;
+
+	spin_lock_irq(&current->sighand->siglock);
+	old_block = current->blocked;
+	error = 0;
+	switch (how) {
+	case SIG_BLOCK:
+		sigorsets(&current->blocked, &current->blocked, set);
+		break;
+	case SIG_UNBLOCK:
+		signandsets(&current->blocked, &current->blocked, set);
+		break;
+	case SIG_SETMASK:
+		current->blocked = *set;
+		break;
+	default:
+		error = -EINVAL;
+	}
+	recalc_sigpending();
+	spin_unlock_irq(&current->sighand->siglock);
+	if (oldset)
+		*oldset = old_block;
+	return error;
+}
+
+asmlinkage long
+sys_rt_sigprocmask(int how, sigset_t __user *set, sigset_t __user *oset, size_t sigsetsize)
+{
+	int error = -EINVAL;
+	sigset_t old_set, new_set;
+
+	/* XXX: Don't preclude handling different sized sigset_t's.  */
+	if (sigsetsize != sizeof(sigset_t))
+		goto out;
+
+	if (set) {
+		error = -EFAULT;
+		if (copy_from_user(&new_set, set, sizeof(*set)))
+			goto out;
+		sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP));
+
+		error = sigprocmask(how, &new_set, &old_set);
+		if (error)
+			goto out;
+		if (oset)
+			goto set_old;
+	} else if (oset) {
+		spin_lock_irq(&current->sighand->siglock);
+		old_set = current->blocked;
+		spin_unlock_irq(&current->sighand->siglock);
+
+	set_old:
+		error = -EFAULT;
+		if (copy_to_user(oset, &old_set, sizeof(*oset)))
+			goto out;
+	}
+	error = 0;
+out:
+	return error;
+}
+
+long do_sigpending(void __user *set, unsigned long sigsetsize)
+{
+	long error = -EINVAL;
+	sigset_t pending;
+
+	if (sigsetsize > sizeof(sigset_t))
+		goto out;
+
+	spin_lock_irq(&current->sighand->siglock);
+	sigorsets(&pending, &current->pending.signal,
+		  &current->signal->shared_pending.signal);
+	spin_unlock_irq(&current->sighand->siglock);
+
+	/* Outside the lock because only this thread touches it.  */
+	sigandsets(&pending, &current->blocked, &pending);
+
+	error = -EFAULT;
+	if (!copy_to_user(set, &pending, sigsetsize))
+		error = 0;
+
+out:
+	return error;
+}	
+
+asmlinkage long
+sys_rt_sigpending(sigset_t __user *set, size_t sigsetsize)
+{
+	return do_sigpending(set, sigsetsize);
+}
+
+#ifndef HAVE_ARCH_COPY_SIGINFO_TO_USER
+
+int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from)
+{
+	int err;
+
+	if (!access_ok (VERIFY_WRITE, to, sizeof(siginfo_t)))
+		return -EFAULT;
+	if (from->si_code < 0)
+		return __copy_to_user(to, from, sizeof(siginfo_t))
+			? -EFAULT : 0;
+	/*
+	 * If you change siginfo_t structure, please be sure
+	 * this code is fixed accordingly.
+	 * It should never copy any pad contained in the structure
+	 * to avoid security leaks, but must copy the generic
+	 * 3 ints plus the relevant union member.
+	 */
+	err = __put_user(from->si_signo, &to->si_signo);
+	err |= __put_user(from->si_errno, &to->si_errno);
+	err |= __put_user((short)from->si_code, &to->si_code);
+	switch (from->si_code & __SI_MASK) {
+	case __SI_KILL:
+		err |= __put_user(from->si_pid, &to->si_pid);
+		err |= __put_user(from->si_uid, &to->si_uid);
+		break;
+	case __SI_TIMER:
+		 err |= __put_user(from->si_tid, &to->si_tid);
+		 err |= __put_user(from->si_overrun, &to->si_overrun);
+		 err |= __put_user(from->si_ptr, &to->si_ptr);
+		break;
+	case __SI_POLL:
+		err |= __put_user(from->si_band, &to->si_band);
+		err |= __put_user(from->si_fd, &to->si_fd);
+		break;
+	case __SI_FAULT:
+		err |= __put_user(from->si_addr, &to->si_addr);
+#ifdef __ARCH_SI_TRAPNO
+		err |= __put_user(from->si_trapno, &to->si_trapno);
+#endif
+		break;
+	case __SI_CHLD:
+		err |= __put_user(from->si_pid, &to->si_pid);
+		err |= __put_user(from->si_uid, &to->si_uid);
+		err |= __put_user(from->si_status, &to->si_status);
+		err |= __put_user(from->si_utime, &to->si_utime);
+		err |= __put_user(from->si_stime, &to->si_stime);
+		break;
+	case __SI_RT: /* This is not generated by the kernel as of now. */
+	case __SI_MESGQ: /* But this is */
+		err |= __put_user(from->si_pid, &to->si_pid);
+		err |= __put_user(from->si_uid, &to->si_uid);
+		err |= __put_user(from->si_ptr, &to->si_ptr);
+		break;
+	default: /* this is just in case for now ... */
+		err |= __put_user(from->si_pid, &to->si_pid);
+		err |= __put_user(from->si_uid, &to->si_uid);
+		break;
+	}
+	return err;
+}
+
+#endif
+
+asmlinkage long
+sys_rt_sigtimedwait(const sigset_t __user *uthese,
+		    siginfo_t __user *uinfo,
+		    const struct timespec __user *uts,
+		    size_t sigsetsize)
+{
+	int ret, sig;
+	sigset_t these;
+	struct timespec ts;
+	siginfo_t info;
+	long timeout = 0;
+
+	/* XXX: Don't preclude handling different sized sigset_t's.  */
+	if (sigsetsize != sizeof(sigset_t))
+		return -EINVAL;
+
+	if (copy_from_user(&these, uthese, sizeof(these)))
+		return -EFAULT;
+		
+	/*
+	 * Invert the set of allowed signals to get those we
+	 * want to block.
+	 */
+	sigdelsetmask(&these, sigmask(SIGKILL)|sigmask(SIGSTOP));
+	signotset(&these);
+
+	if (uts) {
+		if (copy_from_user(&ts, uts, sizeof(ts)))
+			return -EFAULT;
+		if (ts.tv_nsec >= 1000000000L || ts.tv_nsec < 0
+		    || ts.tv_sec < 0)
+			return -EINVAL;
+	}
+
+	spin_lock_irq(&current->sighand->siglock);
+	sig = dequeue_signal(current, &these, &info);
+	if (!sig) {
+		timeout = MAX_SCHEDULE_TIMEOUT;
+		if (uts)
+			timeout = (timespec_to_jiffies(&ts)
+				   + (ts.tv_sec || ts.tv_nsec));
+
+		if (timeout) {
+			/* None ready -- temporarily unblock those we're
+			 * interested while we are sleeping in so that we'll
+			 * be awakened when they arrive.  */
+			current->real_blocked = current->blocked;
+			sigandsets(&current->blocked, &current->blocked, &these);
+			recalc_sigpending();
+			spin_unlock_irq(&current->sighand->siglock);
+
+			current->state = TASK_INTERRUPTIBLE;
+			timeout = schedule_timeout(timeout);
+
+			if (current->flags & PF_FREEZE)
+				refrigerator(PF_FREEZE);
+			spin_lock_irq(&current->sighand->siglock);
+			sig = dequeue_signal(current, &these, &info);
+			current->blocked = current->real_blocked;
+			siginitset(&current->real_blocked, 0);
+			recalc_sigpending();
+		}
+	}
+	spin_unlock_irq(&current->sighand->siglock);
+
+	if (sig) {
+		ret = sig;
+		if (uinfo) {
+			if (copy_siginfo_to_user(uinfo, &info))
+				ret = -EFAULT;
+		}
+	} else {
+		ret = -EAGAIN;
+		if (timeout)
+			ret = -EINTR;
+	}
+
+	return ret;
+}
+
+asmlinkage long
+sys_kill(int pid, int sig)
+{
+	struct siginfo info;
+
+	info.si_signo = sig;
+	info.si_errno = 0;
+	info.si_code = SI_USER;
+	info.si_pid = current->tgid;
+	info.si_uid = current->uid;
+
+	return kill_something_info(sig, &info, pid);
+}
+
+/**
+ *  sys_tgkill - send signal to one specific thread
+ *  @tgid: the thread group ID of the thread
+ *  @pid: the PID of the thread
+ *  @sig: signal to be sent
+ *
+ *  This syscall also checks the tgid and returns -ESRCH even if the PID
+ *  exists but it's not belonging to the target process anymore. This
+ *  method solves the problem of threads exiting and PIDs getting reused.
+ */
+asmlinkage long sys_tgkill(int tgid, int pid, int sig)
+{
+	struct siginfo info;
+	int error;
+	struct task_struct *p;
+
+	/* This is only valid for single tasks */
+	if (pid <= 0 || tgid <= 0)
+		return -EINVAL;
+
+	info.si_signo = sig;
+	info.si_errno = 0;
+	info.si_code = SI_TKILL;
+	info.si_pid = current->tgid;
+	info.si_uid = current->uid;
+
+	read_lock(&tasklist_lock);
+	p = find_task_by_pid(pid);
+	error = -ESRCH;
+	if (p && (p->tgid == tgid)) {
+		error = check_kill_permission(sig, &info, p);
+		/*
+		 * The null signal is a permissions and process existence
+		 * probe.  No signal is actually delivered.
+		 */
+		if (!error && sig && p->sighand) {
+			spin_lock_irq(&p->sighand->siglock);
+			handle_stop_signal(sig, p);
+			error = specific_send_sig_info(sig, &info, p);
+			spin_unlock_irq(&p->sighand->siglock);
+		}
+	}
+	read_unlock(&tasklist_lock);
+	return error;
+}
+
+/*
+ *  Send a signal to only one task, even if it's a CLONE_THREAD task.
+ */
+asmlinkage long
+sys_tkill(int pid, int sig)
+{
+	struct siginfo info;
+	int error;
+	struct task_struct *p;
+
+	/* This is only valid for single tasks */
+	if (pid <= 0)
+		return -EINVAL;
+
+	info.si_signo = sig;
+	info.si_errno = 0;
+	info.si_code = SI_TKILL;
+	info.si_pid = current->tgid;
+	info.si_uid = current->uid;
+
+	read_lock(&tasklist_lock);
+	p = find_task_by_pid(pid);
+	error = -ESRCH;
+	if (p) {
+		error = check_kill_permission(sig, &info, p);
+		/*
+		 * The null signal is a permissions and process existence
+		 * probe.  No signal is actually delivered.
+		 */
+		if (!error && sig && p->sighand) {
+			spin_lock_irq(&p->sighand->siglock);
+			handle_stop_signal(sig, p);
+			error = specific_send_sig_info(sig, &info, p);
+			spin_unlock_irq(&p->sighand->siglock);
+		}
+	}
+	read_unlock(&tasklist_lock);
+	return error;
+}
+
+asmlinkage long
+sys_rt_sigqueueinfo(int pid, int sig, siginfo_t __user *uinfo)
+{
+	siginfo_t info;
+
+	if (copy_from_user(&info, uinfo, sizeof(siginfo_t)))
+		return -EFAULT;
+
+	/* Not even root can pretend to send signals from the kernel.
+	   Nor can they impersonate a kill(), which adds source info.  */
+	if (info.si_code >= 0)
+		return -EPERM;
+	info.si_signo = sig;
+
+	/* POSIX.1b doesn't mention process groups.  */
+	return kill_proc_info(sig, &info, pid);
+}
+
+int
+do_sigaction(int sig, const struct k_sigaction *act, struct k_sigaction *oact)
+{
+	struct k_sigaction *k;
+
+	if (sig < 1 || sig > _NSIG || (act && sig_kernel_only(sig)))
+		return -EINVAL;
+
+	k = &current->sighand->action[sig-1];
+
+	spin_lock_irq(&current->sighand->siglock);
+	if (signal_pending(current)) {
+		/*
+		 * If there might be a fatal signal pending on multiple
+		 * threads, make sure we take it before changing the action.
+		 */
+		spin_unlock_irq(&current->sighand->siglock);
+		return -ERESTARTNOINTR;
+	}
+
+	if (oact)
+		*oact = *k;
+
+	if (act) {
+		/*
+		 * POSIX 3.3.1.3:
+		 *  "Setting a signal action to SIG_IGN for a signal that is
+		 *   pending shall cause the pending signal to be discarded,
+		 *   whether or not it is blocked."
+		 *
+		 *  "Setting a signal action to SIG_DFL for a signal that is
+		 *   pending and whose default action is to ignore the signal
+		 *   (for example, SIGCHLD), shall cause the pending signal to
+		 *   be discarded, whether or not it is blocked"
+		 */
+		if (act->sa.sa_handler == SIG_IGN ||
+		    (act->sa.sa_handler == SIG_DFL &&
+		     sig_kernel_ignore(sig))) {
+			/*
+			 * This is a fairly rare case, so we only take the
+			 * tasklist_lock once we're sure we'll need it.
+			 * Now we must do this little unlock and relock
+			 * dance to maintain the lock hierarchy.
+			 */
+			struct task_struct *t = current;
+			spin_unlock_irq(&t->sighand->siglock);
+			read_lock(&tasklist_lock);
+			spin_lock_irq(&t->sighand->siglock);
+			*k = *act;
+			sigdelsetmask(&k->sa.sa_mask,
+				      sigmask(SIGKILL) | sigmask(SIGSTOP));
+			rm_from_queue(sigmask(sig), &t->signal->shared_pending);
+			do {
+				rm_from_queue(sigmask(sig), &t->pending);
+				recalc_sigpending_tsk(t);
+				t = next_thread(t);
+			} while (t != current);
+			spin_unlock_irq(&current->sighand->siglock);
+			read_unlock(&tasklist_lock);
+			return 0;
+		}
+
+		*k = *act;
+		sigdelsetmask(&k->sa.sa_mask,
+			      sigmask(SIGKILL) | sigmask(SIGSTOP));
+	}
+
+	spin_unlock_irq(&current->sighand->siglock);
+	return 0;
+}
+
+int 
+do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long sp)
+{
+	stack_t oss;
+	int error;
+
+	if (uoss) {
+		oss.ss_sp = (void __user *) current->sas_ss_sp;
+		oss.ss_size = current->sas_ss_size;
+		oss.ss_flags = sas_ss_flags(sp);
+	}
+
+	if (uss) {
+		void __user *ss_sp;
+		size_t ss_size;
+		int ss_flags;
+
+		error = -EFAULT;
+		if (!access_ok(VERIFY_READ, uss, sizeof(*uss))
+		    || __get_user(ss_sp, &uss->ss_sp)
+		    || __get_user(ss_flags, &uss->ss_flags)
+		    || __get_user(ss_size, &uss->ss_size))
+			goto out;
+
+		error = -EPERM;
+		if (on_sig_stack(sp))
+			goto out;
+
+		error = -EINVAL;
+		/*
+		 *
+		 * Note - this code used to test ss_flags incorrectly
+		 *  	  old code may have been written using ss_flags==0
+		 *	  to mean ss_flags==SS_ONSTACK (as this was the only
+		 *	  way that worked) - this fix preserves that older
+		 *	  mechanism
+		 */
+		if (ss_flags != SS_DISABLE && ss_flags != SS_ONSTACK && ss_flags != 0)
+			goto out;
+
+		if (ss_flags == SS_DISABLE) {
+			ss_size = 0;
+			ss_sp = NULL;
+		} else {
+			error = -ENOMEM;
+			if (ss_size < MINSIGSTKSZ)
+				goto out;
+		}
+
+		current->sas_ss_sp = (unsigned long) ss_sp;
+		current->sas_ss_size = ss_size;
+	}
+
+	if (uoss) {
+		error = -EFAULT;
+		if (copy_to_user(uoss, &oss, sizeof(oss)))
+			goto out;
+	}
+
+	error = 0;
+out:
+	return error;
+}
+
+#ifdef __ARCH_WANT_SYS_SIGPENDING
+
+asmlinkage long
+sys_sigpending(old_sigset_t __user *set)
+{
+	return do_sigpending(set, sizeof(*set));
+}
+
+#endif
+
+#ifdef __ARCH_WANT_SYS_SIGPROCMASK
+/* Some platforms have their own version with special arguments others
+   support only sys_rt_sigprocmask.  */
+
+asmlinkage long
+sys_sigprocmask(int how, old_sigset_t __user *set, old_sigset_t __user *oset)
+{
+	int error;
+	old_sigset_t old_set, new_set;
+
+	if (set) {
+		error = -EFAULT;
+		if (copy_from_user(&new_set, set, sizeof(*set)))
+			goto out;
+		new_set &= ~(sigmask(SIGKILL) | sigmask(SIGSTOP));
+
+		spin_lock_irq(&current->sighand->siglock);
+		old_set = current->blocked.sig[0];
+
+		error = 0;
+		switch (how) {
+		default:
+			error = -EINVAL;
+			break;
+		case SIG_BLOCK:
+			sigaddsetmask(&current->blocked, new_set);
+			break;
+		case SIG_UNBLOCK:
+			sigdelsetmask(&current->blocked, new_set);
+			break;
+		case SIG_SETMASK:
+			current->blocked.sig[0] = new_set;
+			break;
+		}
+
+		recalc_sigpending();
+		spin_unlock_irq(&current->sighand->siglock);
+		if (error)
+			goto out;
+		if (oset)
+			goto set_old;
+	} else if (oset) {
+		old_set = current->blocked.sig[0];
+	set_old:
+		error = -EFAULT;
+		if (copy_to_user(oset, &old_set, sizeof(*oset)))
+			goto out;
+	}
+	error = 0;
+out:
+	return error;
+}
+#endif /* __ARCH_WANT_SYS_SIGPROCMASK */
+
+#ifdef __ARCH_WANT_SYS_RT_SIGACTION
+asmlinkage long
+sys_rt_sigaction(int sig,
+		 const struct sigaction __user *act,
+		 struct sigaction __user *oact,
+		 size_t sigsetsize)
+{
+	struct k_sigaction new_sa, old_sa;
+	int ret = -EINVAL;
+
+	/* XXX: Don't preclude handling different sized sigset_t's.  */
+	if (sigsetsize != sizeof(sigset_t))
+		goto out;
+
+	if (act) {
+		if (copy_from_user(&new_sa.sa, act, sizeof(new_sa.sa)))
+			return -EFAULT;
+	}
+
+	ret = do_sigaction(sig, act ? &new_sa : NULL, oact ? &old_sa : NULL);
+
+	if (!ret && oact) {
+		if (copy_to_user(oact, &old_sa.sa, sizeof(old_sa.sa)))
+			return -EFAULT;
+	}
+out:
+	return ret;
+}
+#endif /* __ARCH_WANT_SYS_RT_SIGACTION */
+
+#ifdef __ARCH_WANT_SYS_SGETMASK
+
+/*
+ * For backwards compatibility.  Functionality superseded by sigprocmask.
+ */
+asmlinkage long
+sys_sgetmask(void)
+{
+	/* SMP safe */
+	return current->blocked.sig[0];
+}
+
+asmlinkage long
+sys_ssetmask(int newmask)
+{
+	int old;
+
+	spin_lock_irq(&current->sighand->siglock);
+	old = current->blocked.sig[0];
+
+	siginitset(&current->blocked, newmask & ~(sigmask(SIGKILL)|
+						  sigmask(SIGSTOP)));
+	recalc_sigpending();
+	spin_unlock_irq(&current->sighand->siglock);
+
+	return old;
+}
+#endif /* __ARCH_WANT_SGETMASK */
+
+#ifdef __ARCH_WANT_SYS_SIGNAL
+/*
+ * For backwards compatibility.  Functionality superseded by sigaction.
+ */
+asmlinkage unsigned long
+sys_signal(int sig, __sighandler_t handler)
+{
+	struct k_sigaction new_sa, old_sa;
+	int ret;
+
+	new_sa.sa.sa_handler = handler;
+	new_sa.sa.sa_flags = SA_ONESHOT | SA_NOMASK;
+
+	ret = do_sigaction(sig, &new_sa, &old_sa);
+
+	return ret ? ret : (unsigned long)old_sa.sa.sa_handler;
+}
+#endif /* __ARCH_WANT_SYS_SIGNAL */
+
+#ifdef __ARCH_WANT_SYS_PAUSE
+
+asmlinkage long
+sys_pause(void)
+{
+	current->state = TASK_INTERRUPTIBLE;
+	schedule();
+	return -ERESTARTNOHAND;
+}
+
+#endif
+
+void __init signals_init(void)
+{
+	sigqueue_cachep =
+		kmem_cache_create("sigqueue",
+				  sizeof(struct sigqueue),
+				  __alignof__(struct sigqueue),
+				  SLAB_PANIC, NULL, NULL);
+}
diff --git a/kernel/softirq.c b/kernel/softirq.c
new file mode 100644
index 000000000000..b4ab6af1dea8
--- /dev/null
+++ b/kernel/softirq.c
@@ -0,0 +1,496 @@
+/*
+ *	linux/kernel/softirq.c
+ *
+ *	Copyright (C) 1992 Linus Torvalds
+ *
+ * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903)
+ */
+
+#include <linux/module.h>
+#include <linux/kernel_stat.h>
+#include <linux/interrupt.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/notifier.h>
+#include <linux/percpu.h>
+#include <linux/cpu.h>
+#include <linux/kthread.h>
+#include <linux/rcupdate.h>
+
+#include <asm/irq.h>
+/*
+   - No shared variables, all the data are CPU local.
+   - If a softirq needs serialization, let it serialize itself
+     by its own spinlocks.
+   - Even if softirq is serialized, only local cpu is marked for
+     execution. Hence, we get something sort of weak cpu binding.
+     Though it is still not clear, will it result in better locality
+     or will not.
+
+   Examples:
+   - NET RX softirq. It is multithreaded and does not require
+     any global serialization.
+   - NET TX softirq. It kicks software netdevice queues, hence
+     it is logically serialized per device, but this serialization
+     is invisible to common code.
+   - Tasklets: serialized wrt itself.
+ */
+
+#ifndef __ARCH_IRQ_STAT
+irq_cpustat_t irq_stat[NR_CPUS] ____cacheline_aligned;
+EXPORT_SYMBOL(irq_stat);
+#endif
+
+static struct softirq_action softirq_vec[32] __cacheline_aligned_in_smp;
+
+static DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
+
+/*
+ * we cannot loop indefinitely here to avoid userspace starvation,
+ * but we also don't want to introduce a worst case 1/HZ latency
+ * to the pending events, so lets the scheduler to balance
+ * the softirq load for us.
+ */
+static inline void wakeup_softirqd(void)
+{
+	/* Interrupts are disabled: no need to stop preemption */
+	struct task_struct *tsk = __get_cpu_var(ksoftirqd);
+
+	if (tsk && tsk->state != TASK_RUNNING)
+		wake_up_process(tsk);
+}
+
+/*
+ * We restart softirq processing MAX_SOFTIRQ_RESTART times,
+ * and we fall back to softirqd after that.
+ *
+ * This number has been established via experimentation.
+ * The two things to balance is latency against fairness -
+ * we want to handle softirqs as soon as possible, but they
+ * should not be able to lock up the box.
+ */
+#define MAX_SOFTIRQ_RESTART 10
+
+asmlinkage void __do_softirq(void)
+{
+	struct softirq_action *h;
+	__u32 pending;
+	int max_restart = MAX_SOFTIRQ_RESTART;
+	int cpu;
+
+	pending = local_softirq_pending();
+
+	local_bh_disable();
+	cpu = smp_processor_id();
+restart:
+	/* Reset the pending bitmask before enabling irqs */
+	local_softirq_pending() = 0;
+
+	local_irq_enable();
+
+	h = softirq_vec;
+
+	do {
+		if (pending & 1) {
+			h->action(h);
+			rcu_bh_qsctr_inc(cpu);
+		}
+		h++;
+		pending >>= 1;
+	} while (pending);
+
+	local_irq_disable();
+
+	pending = local_softirq_pending();
+	if (pending && --max_restart)
+		goto restart;
+
+	if (pending)
+		wakeup_softirqd();
+
+	__local_bh_enable();
+}
+
+#ifndef __ARCH_HAS_DO_SOFTIRQ
+
+asmlinkage void do_softirq(void)
+{
+	__u32 pending;
+	unsigned long flags;
+
+	if (in_interrupt())
+		return;
+
+	local_irq_save(flags);
+
+	pending = local_softirq_pending();
+
+	if (pending)
+		__do_softirq();
+
+	local_irq_restore(flags);
+}
+
+EXPORT_SYMBOL(do_softirq);
+
+#endif
+
+void local_bh_enable(void)
+{
+	WARN_ON(irqs_disabled());
+	/*
+	 * Keep preemption disabled until we are done with
+	 * softirq processing:
+ 	 */
+ 	sub_preempt_count(SOFTIRQ_OFFSET - 1);
+
+	if (unlikely(!in_interrupt() && local_softirq_pending()))
+		do_softirq();
+
+	dec_preempt_count();
+	preempt_check_resched();
+}
+EXPORT_SYMBOL(local_bh_enable);
+
+#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
+# define invoke_softirq()	__do_softirq()
+#else
+# define invoke_softirq()	do_softirq()
+#endif
+
+/*
+ * Exit an interrupt context. Process softirqs if needed and possible:
+ */
+void irq_exit(void)
+{
+	account_system_vtime(current);
+	sub_preempt_count(IRQ_EXIT_OFFSET);
+	if (!in_interrupt() && local_softirq_pending())
+		invoke_softirq();
+	preempt_enable_no_resched();
+}
+
+/*
+ * This function must run with irqs disabled!
+ */
+inline fastcall void raise_softirq_irqoff(unsigned int nr)
+{
+	__raise_softirq_irqoff(nr);
+
+	/*
+	 * If we're in an interrupt or softirq, we're done
+	 * (this also catches softirq-disabled code). We will
+	 * actually run the softirq once we return from
+	 * the irq or softirq.
+	 *
+	 * Otherwise we wake up ksoftirqd to make sure we
+	 * schedule the softirq soon.
+	 */
+	if (!in_interrupt())
+		wakeup_softirqd();
+}
+
+EXPORT_SYMBOL(raise_softirq_irqoff);
+
+void fastcall raise_softirq(unsigned int nr)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	raise_softirq_irqoff(nr);
+	local_irq_restore(flags);
+}
+
+void open_softirq(int nr, void (*action)(struct softirq_action*), void *data)
+{
+	softirq_vec[nr].data = data;
+	softirq_vec[nr].action = action;
+}
+
+EXPORT_SYMBOL(open_softirq);
+
+/* Tasklets */
+struct tasklet_head
+{
+	struct tasklet_struct *list;
+};
+
+/* Some compilers disobey section attribute on statics when not
+   initialized -- RR */
+static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec) = { NULL };
+static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec) = { NULL };
+
+void fastcall __tasklet_schedule(struct tasklet_struct *t)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	t->next = __get_cpu_var(tasklet_vec).list;
+	__get_cpu_var(tasklet_vec).list = t;
+	raise_softirq_irqoff(TASKLET_SOFTIRQ);
+	local_irq_restore(flags);
+}
+
+EXPORT_SYMBOL(__tasklet_schedule);
+
+void fastcall __tasklet_hi_schedule(struct tasklet_struct *t)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	t->next = __get_cpu_var(tasklet_hi_vec).list;
+	__get_cpu_var(tasklet_hi_vec).list = t;
+	raise_softirq_irqoff(HI_SOFTIRQ);
+	local_irq_restore(flags);
+}
+
+EXPORT_SYMBOL(__tasklet_hi_schedule);
+
+static void tasklet_action(struct softirq_action *a)
+{
+	struct tasklet_struct *list;
+
+	local_irq_disable();
+	list = __get_cpu_var(tasklet_vec).list;
+	__get_cpu_var(tasklet_vec).list = NULL;
+	local_irq_enable();
+
+	while (list) {
+		struct tasklet_struct *t = list;
+
+		list = list->next;
+
+		if (tasklet_trylock(t)) {
+			if (!atomic_read(&t->count)) {
+				if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
+					BUG();
+				t->func(t->data);
+				tasklet_unlock(t);
+				continue;
+			}
+			tasklet_unlock(t);
+		}
+
+		local_irq_disable();
+		t->next = __get_cpu_var(tasklet_vec).list;
+		__get_cpu_var(tasklet_vec).list = t;
+		__raise_softirq_irqoff(TASKLET_SOFTIRQ);
+		local_irq_enable();
+	}
+}
+
+static void tasklet_hi_action(struct softirq_action *a)
+{
+	struct tasklet_struct *list;
+
+	local_irq_disable();
+	list = __get_cpu_var(tasklet_hi_vec).list;
+	__get_cpu_var(tasklet_hi_vec).list = NULL;
+	local_irq_enable();
+
+	while (list) {
+		struct tasklet_struct *t = list;
+
+		list = list->next;
+
+		if (tasklet_trylock(t)) {
+			if (!atomic_read(&t->count)) {
+				if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
+					BUG();
+				t->func(t->data);
+				tasklet_unlock(t);
+				continue;
+			}
+			tasklet_unlock(t);
+		}
+
+		local_irq_disable();
+		t->next = __get_cpu_var(tasklet_hi_vec).list;
+		__get_cpu_var(tasklet_hi_vec).list = t;
+		__raise_softirq_irqoff(HI_SOFTIRQ);
+		local_irq_enable();
+	}
+}
+
+
+void tasklet_init(struct tasklet_struct *t,
+		  void (*func)(unsigned long), unsigned long data)
+{
+	t->next = NULL;
+	t->state = 0;
+	atomic_set(&t->count, 0);
+	t->func = func;
+	t->data = data;
+}
+
+EXPORT_SYMBOL(tasklet_init);
+
+void tasklet_kill(struct tasklet_struct *t)
+{
+	if (in_interrupt())
+		printk("Attempt to kill tasklet from interrupt\n");
+
+	while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) {
+		do
+			yield();
+		while (test_bit(TASKLET_STATE_SCHED, &t->state));
+	}
+	tasklet_unlock_wait(t);
+	clear_bit(TASKLET_STATE_SCHED, &t->state);
+}
+
+EXPORT_SYMBOL(tasklet_kill);
+
+void __init softirq_init(void)
+{
+	open_softirq(TASKLET_SOFTIRQ, tasklet_action, NULL);
+	open_softirq(HI_SOFTIRQ, tasklet_hi_action, NULL);
+}
+
+static int ksoftirqd(void * __bind_cpu)
+{
+	set_user_nice(current, 19);
+	current->flags |= PF_NOFREEZE;
+
+	set_current_state(TASK_INTERRUPTIBLE);
+
+	while (!kthread_should_stop()) {
+		preempt_disable();
+		if (!local_softirq_pending()) {
+			preempt_enable_no_resched();
+			schedule();
+			preempt_disable();
+		}
+
+		__set_current_state(TASK_RUNNING);
+
+		while (local_softirq_pending()) {
+			/* Preempt disable stops cpu going offline.
+			   If already offline, we'll be on wrong CPU:
+			   don't process */
+			if (cpu_is_offline((long)__bind_cpu))
+				goto wait_to_die;
+			do_softirq();
+			preempt_enable_no_resched();
+			cond_resched();
+			preempt_disable();
+		}
+		preempt_enable();
+		set_current_state(TASK_INTERRUPTIBLE);
+	}
+	__set_current_state(TASK_RUNNING);
+	return 0;
+
+wait_to_die:
+	preempt_enable();
+	/* Wait for kthread_stop */
+	set_current_state(TASK_INTERRUPTIBLE);
+	while (!kthread_should_stop()) {
+		schedule();
+		set_current_state(TASK_INTERRUPTIBLE);
+	}
+	__set_current_state(TASK_RUNNING);
+	return 0;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+/*
+ * tasklet_kill_immediate is called to remove a tasklet which can already be
+ * scheduled for execution on @cpu.
+ *
+ * Unlike tasklet_kill, this function removes the tasklet
+ * _immediately_, even if the tasklet is in TASKLET_STATE_SCHED state.
+ *
+ * When this function is called, @cpu must be in the CPU_DEAD state.
+ */
+void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu)
+{
+	struct tasklet_struct **i;
+
+	BUG_ON(cpu_online(cpu));
+	BUG_ON(test_bit(TASKLET_STATE_RUN, &t->state));
+
+	if (!test_bit(TASKLET_STATE_SCHED, &t->state))
+		return;
+
+	/* CPU is dead, so no lock needed. */
+	for (i = &per_cpu(tasklet_vec, cpu).list; *i; i = &(*i)->next) {
+		if (*i == t) {
+			*i = t->next;
+			return;
+		}
+	}
+	BUG();
+}
+
+static void takeover_tasklets(unsigned int cpu)
+{
+	struct tasklet_struct **i;
+
+	/* CPU is dead, so no lock needed. */
+	local_irq_disable();
+
+	/* Find end, append list for that CPU. */
+	for (i = &__get_cpu_var(tasklet_vec).list; *i; i = &(*i)->next);
+	*i = per_cpu(tasklet_vec, cpu).list;
+	per_cpu(tasklet_vec, cpu).list = NULL;
+	raise_softirq_irqoff(TASKLET_SOFTIRQ);
+
+	for (i = &__get_cpu_var(tasklet_hi_vec).list; *i; i = &(*i)->next);
+	*i = per_cpu(tasklet_hi_vec, cpu).list;
+	per_cpu(tasklet_hi_vec, cpu).list = NULL;
+	raise_softirq_irqoff(HI_SOFTIRQ);
+
+	local_irq_enable();
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
+static int __devinit cpu_callback(struct notifier_block *nfb,
+				  unsigned long action,
+				  void *hcpu)
+{
+	int hotcpu = (unsigned long)hcpu;
+	struct task_struct *p;
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+		BUG_ON(per_cpu(tasklet_vec, hotcpu).list);
+		BUG_ON(per_cpu(tasklet_hi_vec, hotcpu).list);
+		p = kthread_create(ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu);
+		if (IS_ERR(p)) {
+			printk("ksoftirqd for %i failed\n", hotcpu);
+			return NOTIFY_BAD;
+		}
+		kthread_bind(p, hotcpu);
+  		per_cpu(ksoftirqd, hotcpu) = p;
+ 		break;
+	case CPU_ONLINE:
+		wake_up_process(per_cpu(ksoftirqd, hotcpu));
+		break;
+#ifdef CONFIG_HOTPLUG_CPU
+	case CPU_UP_CANCELED:
+		/* Unbind so it can run.  Fall thru. */
+		kthread_bind(per_cpu(ksoftirqd, hotcpu), smp_processor_id());
+	case CPU_DEAD:
+		p = per_cpu(ksoftirqd, hotcpu);
+		per_cpu(ksoftirqd, hotcpu) = NULL;
+		kthread_stop(p);
+		takeover_tasklets(hotcpu);
+		break;
+#endif /* CONFIG_HOTPLUG_CPU */
+ 	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block __devinitdata cpu_nfb = {
+	.notifier_call = cpu_callback
+};
+
+__init int spawn_ksoftirqd(void)
+{
+	void *cpu = (void *)(long)smp_processor_id();
+	cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
+	cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
+	register_cpu_notifier(&cpu_nfb);
+	return 0;
+}
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
new file mode 100644
index 000000000000..e15ed17863f1
--- /dev/null
+++ b/kernel/spinlock.c
@@ -0,0 +1,371 @@
+/*
+ * Copyright (2004) Linus Torvalds
+ *
+ * Author: Zwane Mwaikambo <zwane@fsmlabs.com>
+ *
+ * Copyright (2004) Ingo Molnar
+ */
+
+#include <linux/config.h>
+#include <linux/linkage.h>
+#include <linux/preempt.h>
+#include <linux/spinlock.h>
+#include <linux/interrupt.h>
+#include <linux/module.h>
+
+/*
+ * Generic declaration of the raw read_trylock() function,
+ * architectures are supposed to optimize this:
+ */
+int __lockfunc generic_raw_read_trylock(rwlock_t *lock)
+{
+	_raw_read_lock(lock);
+	return 1;
+}
+EXPORT_SYMBOL(generic_raw_read_trylock);
+
+int __lockfunc _spin_trylock(spinlock_t *lock)
+{
+	preempt_disable();
+	if (_raw_spin_trylock(lock))
+		return 1;
+	
+	preempt_enable();
+	return 0;
+}
+EXPORT_SYMBOL(_spin_trylock);
+
+int __lockfunc _read_trylock(rwlock_t *lock)
+{
+	preempt_disable();
+	if (_raw_read_trylock(lock))
+		return 1;
+
+	preempt_enable();
+	return 0;
+}
+EXPORT_SYMBOL(_read_trylock);
+
+int __lockfunc _write_trylock(rwlock_t *lock)
+{
+	preempt_disable();
+	if (_raw_write_trylock(lock))
+		return 1;
+
+	preempt_enable();
+	return 0;
+}
+EXPORT_SYMBOL(_write_trylock);
+
+#ifndef CONFIG_PREEMPT
+
+void __lockfunc _read_lock(rwlock_t *lock)
+{
+	preempt_disable();
+	_raw_read_lock(lock);
+}
+EXPORT_SYMBOL(_read_lock);
+
+unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	preempt_disable();
+	_raw_spin_lock_flags(lock, flags);
+	return flags;
+}
+EXPORT_SYMBOL(_spin_lock_irqsave);
+
+void __lockfunc _spin_lock_irq(spinlock_t *lock)
+{
+	local_irq_disable();
+	preempt_disable();
+	_raw_spin_lock(lock);
+}
+EXPORT_SYMBOL(_spin_lock_irq);
+
+void __lockfunc _spin_lock_bh(spinlock_t *lock)
+{
+	local_bh_disable();
+	preempt_disable();
+	_raw_spin_lock(lock);
+}
+EXPORT_SYMBOL(_spin_lock_bh);
+
+unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	preempt_disable();
+	_raw_read_lock(lock);
+	return flags;
+}
+EXPORT_SYMBOL(_read_lock_irqsave);
+
+void __lockfunc _read_lock_irq(rwlock_t *lock)
+{
+	local_irq_disable();
+	preempt_disable();
+	_raw_read_lock(lock);
+}
+EXPORT_SYMBOL(_read_lock_irq);
+
+void __lockfunc _read_lock_bh(rwlock_t *lock)
+{
+	local_bh_disable();
+	preempt_disable();
+	_raw_read_lock(lock);
+}
+EXPORT_SYMBOL(_read_lock_bh);
+
+unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	preempt_disable();
+	_raw_write_lock(lock);
+	return flags;
+}
+EXPORT_SYMBOL(_write_lock_irqsave);
+
+void __lockfunc _write_lock_irq(rwlock_t *lock)
+{
+	local_irq_disable();
+	preempt_disable();
+	_raw_write_lock(lock);
+}
+EXPORT_SYMBOL(_write_lock_irq);
+
+void __lockfunc _write_lock_bh(rwlock_t *lock)
+{
+	local_bh_disable();
+	preempt_disable();
+	_raw_write_lock(lock);
+}
+EXPORT_SYMBOL(_write_lock_bh);
+
+void __lockfunc _spin_lock(spinlock_t *lock)
+{
+	preempt_disable();
+	_raw_spin_lock(lock);
+}
+
+EXPORT_SYMBOL(_spin_lock);
+
+void __lockfunc _write_lock(rwlock_t *lock)
+{
+	preempt_disable();
+	_raw_write_lock(lock);
+}
+
+EXPORT_SYMBOL(_write_lock);
+
+#else /* CONFIG_PREEMPT: */
+
+/*
+ * This could be a long-held lock. We both prepare to spin for a long
+ * time (making _this_ CPU preemptable if possible), and we also signal
+ * towards that other CPU that it should break the lock ASAP.
+ *
+ * (We do this in a function because inlining it would be excessive.)
+ */
+
+#define BUILD_LOCK_OPS(op, locktype)					\
+void __lockfunc _##op##_lock(locktype##_t *lock)			\
+{									\
+	preempt_disable();						\
+	for (;;) {							\
+		if (likely(_raw_##op##_trylock(lock)))			\
+			break;						\
+		preempt_enable();					\
+		if (!(lock)->break_lock)				\
+			(lock)->break_lock = 1;				\
+		while (!op##_can_lock(lock) && (lock)->break_lock)	\
+			cpu_relax();					\
+		preempt_disable();					\
+	}								\
+	(lock)->break_lock = 0;						\
+}									\
+									\
+EXPORT_SYMBOL(_##op##_lock);						\
+									\
+unsigned long __lockfunc _##op##_lock_irqsave(locktype##_t *lock)	\
+{									\
+	unsigned long flags;						\
+									\
+	preempt_disable();						\
+	for (;;) {							\
+		local_irq_save(flags);					\
+		if (likely(_raw_##op##_trylock(lock)))			\
+			break;						\
+		local_irq_restore(flags);				\
+									\
+		preempt_enable();					\
+		if (!(lock)->break_lock)				\
+			(lock)->break_lock = 1;				\
+		while (!op##_can_lock(lock) && (lock)->break_lock)	\
+			cpu_relax();					\
+		preempt_disable();					\
+	}								\
+	(lock)->break_lock = 0;						\
+	return flags;							\
+}									\
+									\
+EXPORT_SYMBOL(_##op##_lock_irqsave);					\
+									\
+void __lockfunc _##op##_lock_irq(locktype##_t *lock)			\
+{									\
+	_##op##_lock_irqsave(lock);					\
+}									\
+									\
+EXPORT_SYMBOL(_##op##_lock_irq);					\
+									\
+void __lockfunc _##op##_lock_bh(locktype##_t *lock)			\
+{									\
+	unsigned long flags;						\
+									\
+	/*							*/	\
+	/* Careful: we must exclude softirqs too, hence the	*/	\
+	/* irq-disabling. We use the generic preemption-aware	*/	\
+	/* function:						*/	\
+	/**/								\
+	flags = _##op##_lock_irqsave(lock);				\
+	local_bh_disable();						\
+	local_irq_restore(flags);					\
+}									\
+									\
+EXPORT_SYMBOL(_##op##_lock_bh)
+
+/*
+ * Build preemption-friendly versions of the following
+ * lock-spinning functions:
+ *
+ *         _[spin|read|write]_lock()
+ *         _[spin|read|write]_lock_irq()
+ *         _[spin|read|write]_lock_irqsave()
+ *         _[spin|read|write]_lock_bh()
+ */
+BUILD_LOCK_OPS(spin, spinlock);
+BUILD_LOCK_OPS(read, rwlock);
+BUILD_LOCK_OPS(write, rwlock);
+
+#endif /* CONFIG_PREEMPT */
+
+void __lockfunc _spin_unlock(spinlock_t *lock)
+{
+	_raw_spin_unlock(lock);
+	preempt_enable();
+}
+EXPORT_SYMBOL(_spin_unlock);
+
+void __lockfunc _write_unlock(rwlock_t *lock)
+{
+	_raw_write_unlock(lock);
+	preempt_enable();
+}
+EXPORT_SYMBOL(_write_unlock);
+
+void __lockfunc _read_unlock(rwlock_t *lock)
+{
+	_raw_read_unlock(lock);
+	preempt_enable();
+}
+EXPORT_SYMBOL(_read_unlock);
+
+void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
+{
+	_raw_spin_unlock(lock);
+	local_irq_restore(flags);
+	preempt_enable();
+}
+EXPORT_SYMBOL(_spin_unlock_irqrestore);
+
+void __lockfunc _spin_unlock_irq(spinlock_t *lock)
+{
+	_raw_spin_unlock(lock);
+	local_irq_enable();
+	preempt_enable();
+}
+EXPORT_SYMBOL(_spin_unlock_irq);
+
+void __lockfunc _spin_unlock_bh(spinlock_t *lock)
+{
+	_raw_spin_unlock(lock);
+	preempt_enable();
+	local_bh_enable();
+}
+EXPORT_SYMBOL(_spin_unlock_bh);
+
+void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
+{
+	_raw_read_unlock(lock);
+	local_irq_restore(flags);
+	preempt_enable();
+}
+EXPORT_SYMBOL(_read_unlock_irqrestore);
+
+void __lockfunc _read_unlock_irq(rwlock_t *lock)
+{
+	_raw_read_unlock(lock);
+	local_irq_enable();
+	preempt_enable();
+}
+EXPORT_SYMBOL(_read_unlock_irq);
+
+void __lockfunc _read_unlock_bh(rwlock_t *lock)
+{
+	_raw_read_unlock(lock);
+	preempt_enable();
+	local_bh_enable();
+}
+EXPORT_SYMBOL(_read_unlock_bh);
+
+void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
+{
+	_raw_write_unlock(lock);
+	local_irq_restore(flags);
+	preempt_enable();
+}
+EXPORT_SYMBOL(_write_unlock_irqrestore);
+
+void __lockfunc _write_unlock_irq(rwlock_t *lock)
+{
+	_raw_write_unlock(lock);
+	local_irq_enable();
+	preempt_enable();
+}
+EXPORT_SYMBOL(_write_unlock_irq);
+
+void __lockfunc _write_unlock_bh(rwlock_t *lock)
+{
+	_raw_write_unlock(lock);
+	preempt_enable();
+	local_bh_enable();
+}
+EXPORT_SYMBOL(_write_unlock_bh);
+
+int __lockfunc _spin_trylock_bh(spinlock_t *lock)
+{
+	local_bh_disable();
+	preempt_disable();
+	if (_raw_spin_trylock(lock))
+		return 1;
+
+	preempt_enable();
+	local_bh_enable();
+	return 0;
+}
+EXPORT_SYMBOL(_spin_trylock_bh);
+
+int in_lock_functions(unsigned long addr)
+{
+	/* Linker adds these: start and end of __lockfunc functions */
+	extern char __lock_text_start[], __lock_text_end[];
+
+	return addr >= (unsigned long)__lock_text_start
+	&& addr < (unsigned long)__lock_text_end;
+}
+EXPORT_SYMBOL(in_lock_functions);
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
new file mode 100644
index 000000000000..c39ed70af174
--- /dev/null
+++ b/kernel/stop_machine.c
@@ -0,0 +1,212 @@
+#include <linux/stop_machine.h>
+#include <linux/kthread.h>
+#include <linux/sched.h>
+#include <linux/cpu.h>
+#include <linux/err.h>
+#include <linux/syscalls.h>
+#include <asm/atomic.h>
+#include <asm/semaphore.h>
+#include <asm/uaccess.h>
+
+/* Since we effect priority and affinity (both of which are visible
+ * to, and settable by outside processes) we do indirection via a
+ * kthread. */
+
+/* Thread to stop each CPU in user context. */
+enum stopmachine_state {
+	STOPMACHINE_WAIT,
+	STOPMACHINE_PREPARE,
+	STOPMACHINE_DISABLE_IRQ,
+	STOPMACHINE_EXIT,
+};
+
+static enum stopmachine_state stopmachine_state;
+static unsigned int stopmachine_num_threads;
+static atomic_t stopmachine_thread_ack;
+static DECLARE_MUTEX(stopmachine_mutex);
+
+static int stopmachine(void *cpu)
+{
+	int irqs_disabled = 0;
+	int prepared = 0;
+
+	set_cpus_allowed(current, cpumask_of_cpu((int)(long)cpu));
+
+	/* Ack: we are alive */
+	mb(); /* Theoretically the ack = 0 might not be on this CPU yet. */
+	atomic_inc(&stopmachine_thread_ack);
+
+	/* Simple state machine */
+	while (stopmachine_state != STOPMACHINE_EXIT) {
+		if (stopmachine_state == STOPMACHINE_DISABLE_IRQ 
+		    && !irqs_disabled) {
+			local_irq_disable();
+			irqs_disabled = 1;
+			/* Ack: irqs disabled. */
+			mb(); /* Must read state first. */
+			atomic_inc(&stopmachine_thread_ack);
+		} else if (stopmachine_state == STOPMACHINE_PREPARE
+			   && !prepared) {
+			/* Everyone is in place, hold CPU. */
+			preempt_disable();
+			prepared = 1;
+			mb(); /* Must read state first. */
+			atomic_inc(&stopmachine_thread_ack);
+		}
+		/* Yield in first stage: migration threads need to
+		 * help our sisters onto their CPUs. */
+		if (!prepared && !irqs_disabled)
+			yield();
+		else
+			cpu_relax();
+	}
+
+	/* Ack: we are exiting. */
+	mb(); /* Must read state first. */
+	atomic_inc(&stopmachine_thread_ack);
+
+	if (irqs_disabled)
+		local_irq_enable();
+	if (prepared)
+		preempt_enable();
+
+	return 0;
+}
+
+/* Change the thread state */
+static void stopmachine_set_state(enum stopmachine_state state)
+{
+	atomic_set(&stopmachine_thread_ack, 0);
+	wmb();
+	stopmachine_state = state;
+	while (atomic_read(&stopmachine_thread_ack) != stopmachine_num_threads)
+		cpu_relax();
+}
+
+static int stop_machine(void)
+{
+	int i, ret = 0;
+	struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
+	mm_segment_t old_fs = get_fs();
+
+	/* One high-prio thread per cpu.  We'll do this one. */
+	set_fs(KERNEL_DS);
+	sys_sched_setscheduler(current->pid, SCHED_FIFO,
+				(struct sched_param __user *)&param);
+	set_fs(old_fs);
+
+	atomic_set(&stopmachine_thread_ack, 0);
+	stopmachine_num_threads = 0;
+	stopmachine_state = STOPMACHINE_WAIT;
+
+	for_each_online_cpu(i) {
+		if (i == _smp_processor_id())
+			continue;
+		ret = kernel_thread(stopmachine, (void *)(long)i,CLONE_KERNEL);
+		if (ret < 0)
+			break;
+		stopmachine_num_threads++;
+	}
+
+	/* Wait for them all to come to life. */
+	while (atomic_read(&stopmachine_thread_ack) != stopmachine_num_threads)
+		yield();
+
+	/* If some failed, kill them all. */
+	if (ret < 0) {
+		stopmachine_set_state(STOPMACHINE_EXIT);
+		up(&stopmachine_mutex);
+		return ret;
+	}
+
+	/* Don't schedule us away at this point, please. */
+	local_irq_disable();
+
+	/* Now they are all started, make them hold the CPUs, ready. */
+	stopmachine_set_state(STOPMACHINE_PREPARE);
+
+	/* Make them disable irqs. */
+	stopmachine_set_state(STOPMACHINE_DISABLE_IRQ);
+
+	return 0;
+}
+
+static void restart_machine(void)
+{
+	stopmachine_set_state(STOPMACHINE_EXIT);
+	local_irq_enable();
+}
+
+struct stop_machine_data
+{
+	int (*fn)(void *);
+	void *data;
+	struct completion done;
+};
+
+static int do_stop(void *_smdata)
+{
+	struct stop_machine_data *smdata = _smdata;
+	int ret;
+
+	ret = stop_machine();
+	if (ret == 0) {
+		ret = smdata->fn(smdata->data);
+		restart_machine();
+	}
+
+	/* We're done: you can kthread_stop us now */
+	complete(&smdata->done);
+
+	/* Wait for kthread_stop */
+	set_current_state(TASK_INTERRUPTIBLE);
+	while (!kthread_should_stop()) {
+		schedule();
+		set_current_state(TASK_INTERRUPTIBLE);
+	}
+	__set_current_state(TASK_RUNNING);
+	return ret;
+}
+
+struct task_struct *__stop_machine_run(int (*fn)(void *), void *data,
+				       unsigned int cpu)
+{
+	struct stop_machine_data smdata;
+	struct task_struct *p;
+
+	smdata.fn = fn;
+	smdata.data = data;
+	init_completion(&smdata.done);
+
+	down(&stopmachine_mutex);
+
+	/* If they don't care which CPU fn runs on, bind to any online one. */
+	if (cpu == NR_CPUS)
+		cpu = _smp_processor_id();
+
+	p = kthread_create(do_stop, &smdata, "kstopmachine");
+	if (!IS_ERR(p)) {
+		kthread_bind(p, cpu);
+		wake_up_process(p);
+		wait_for_completion(&smdata.done);
+	}
+	up(&stopmachine_mutex);
+	return p;
+}
+
+int stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu)
+{
+	struct task_struct *p;
+	int ret;
+
+	/* No CPUs can come up or down during this. */
+	lock_cpu_hotplug();
+	p = __stop_machine_run(fn, data, cpu);
+	if (!IS_ERR(p))
+		ret = kthread_stop(p);
+	else
+		ret = PTR_ERR(p);
+	unlock_cpu_hotplug();
+
+	return ret;
+}
diff --git a/kernel/sys.c b/kernel/sys.c
new file mode 100644
index 000000000000..462d78d55895
--- /dev/null
+++ b/kernel/sys.c
@@ -0,0 +1,1725 @@
+/*
+ *  linux/kernel/sys.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/utsname.h>
+#include <linux/mman.h>
+#include <linux/smp_lock.h>
+#include <linux/notifier.h>
+#include <linux/reboot.h>
+#include <linux/prctl.h>
+#include <linux/init.h>
+#include <linux/highuid.h>
+#include <linux/fs.h>
+#include <linux/workqueue.h>
+#include <linux/device.h>
+#include <linux/key.h>
+#include <linux/times.h>
+#include <linux/posix-timers.h>
+#include <linux/security.h>
+#include <linux/dcookies.h>
+#include <linux/suspend.h>
+#include <linux/tty.h>
+
+#include <linux/compat.h>
+#include <linux/syscalls.h>
+
+#include <asm/uaccess.h>
+#include <asm/io.h>
+#include <asm/unistd.h>
+
+#ifndef SET_UNALIGN_CTL
+# define SET_UNALIGN_CTL(a,b)	(-EINVAL)
+#endif
+#ifndef GET_UNALIGN_CTL
+# define GET_UNALIGN_CTL(a,b)	(-EINVAL)
+#endif
+#ifndef SET_FPEMU_CTL
+# define SET_FPEMU_CTL(a,b)	(-EINVAL)
+#endif
+#ifndef GET_FPEMU_CTL
+# define GET_FPEMU_CTL(a,b)	(-EINVAL)
+#endif
+#ifndef SET_FPEXC_CTL
+# define SET_FPEXC_CTL(a,b)	(-EINVAL)
+#endif
+#ifndef GET_FPEXC_CTL
+# define GET_FPEXC_CTL(a,b)	(-EINVAL)
+#endif
+
+/*
+ * this is where the system-wide overflow UID and GID are defined, for
+ * architectures that now have 32-bit UID/GID but didn't in the past
+ */
+
+int overflowuid = DEFAULT_OVERFLOWUID;
+int overflowgid = DEFAULT_OVERFLOWGID;
+
+#ifdef CONFIG_UID16
+EXPORT_SYMBOL(overflowuid);
+EXPORT_SYMBOL(overflowgid);
+#endif
+
+/*
+ * the same as above, but for filesystems which can only store a 16-bit
+ * UID and GID. as such, this is needed on all architectures
+ */
+
+int fs_overflowuid = DEFAULT_FS_OVERFLOWUID;
+int fs_overflowgid = DEFAULT_FS_OVERFLOWUID;
+
+EXPORT_SYMBOL(fs_overflowuid);
+EXPORT_SYMBOL(fs_overflowgid);
+
+/*
+ * this indicates whether you can reboot with ctrl-alt-del: the default is yes
+ */
+
+int C_A_D = 1;
+int cad_pid = 1;
+
+/*
+ *	Notifier list for kernel code which wants to be called
+ *	at shutdown. This is used to stop any idling DMA operations
+ *	and the like. 
+ */
+
+static struct notifier_block *reboot_notifier_list;
+static DEFINE_RWLOCK(notifier_lock);
+
+/**
+ *	notifier_chain_register	- Add notifier to a notifier chain
+ *	@list: Pointer to root list pointer
+ *	@n: New entry in notifier chain
+ *
+ *	Adds a notifier to a notifier chain.
+ *
+ *	Currently always returns zero.
+ */
+ 
+int notifier_chain_register(struct notifier_block **list, struct notifier_block *n)
+{
+	write_lock(&notifier_lock);
+	while(*list)
+	{
+		if(n->priority > (*list)->priority)
+			break;
+		list= &((*list)->next);
+	}
+	n->next = *list;
+	*list=n;
+	write_unlock(&notifier_lock);
+	return 0;
+}
+
+EXPORT_SYMBOL(notifier_chain_register);
+
+/**
+ *	notifier_chain_unregister - Remove notifier from a notifier chain
+ *	@nl: Pointer to root list pointer
+ *	@n: New entry in notifier chain
+ *
+ *	Removes a notifier from a notifier chain.
+ *
+ *	Returns zero on success, or %-ENOENT on failure.
+ */
+ 
+int notifier_chain_unregister(struct notifier_block **nl, struct notifier_block *n)
+{
+	write_lock(&notifier_lock);
+	while((*nl)!=NULL)
+	{
+		if((*nl)==n)
+		{
+			*nl=n->next;
+			write_unlock(&notifier_lock);
+			return 0;
+		}
+		nl=&((*nl)->next);
+	}
+	write_unlock(&notifier_lock);
+	return -ENOENT;
+}
+
+EXPORT_SYMBOL(notifier_chain_unregister);
+
+/**
+ *	notifier_call_chain - Call functions in a notifier chain
+ *	@n: Pointer to root pointer of notifier chain
+ *	@val: Value passed unmodified to notifier function
+ *	@v: Pointer passed unmodified to notifier function
+ *
+ *	Calls each function in a notifier chain in turn.
+ *
+ *	If the return value of the notifier can be and'd
+ *	with %NOTIFY_STOP_MASK, then notifier_call_chain
+ *	will return immediately, with the return value of
+ *	the notifier function which halted execution.
+ *	Otherwise, the return value is the return value
+ *	of the last notifier function called.
+ */
+ 
+int notifier_call_chain(struct notifier_block **n, unsigned long val, void *v)
+{
+	int ret=NOTIFY_DONE;
+	struct notifier_block *nb = *n;
+
+	while(nb)
+	{
+		ret=nb->notifier_call(nb,val,v);
+		if(ret&NOTIFY_STOP_MASK)
+		{
+			return ret;
+		}
+		nb=nb->next;
+	}
+	return ret;
+}
+
+EXPORT_SYMBOL(notifier_call_chain);
+
+/**
+ *	register_reboot_notifier - Register function to be called at reboot time
+ *	@nb: Info about notifier function to be called
+ *
+ *	Registers a function with the list of functions
+ *	to be called at reboot time.
+ *
+ *	Currently always returns zero, as notifier_chain_register
+ *	always returns zero.
+ */
+ 
+int register_reboot_notifier(struct notifier_block * nb)
+{
+	return notifier_chain_register(&reboot_notifier_list, nb);
+}
+
+EXPORT_SYMBOL(register_reboot_notifier);
+
+/**
+ *	unregister_reboot_notifier - Unregister previously registered reboot notifier
+ *	@nb: Hook to be unregistered
+ *
+ *	Unregisters a previously registered reboot
+ *	notifier function.
+ *
+ *	Returns zero on success, or %-ENOENT on failure.
+ */
+ 
+int unregister_reboot_notifier(struct notifier_block * nb)
+{
+	return notifier_chain_unregister(&reboot_notifier_list, nb);
+}
+
+EXPORT_SYMBOL(unregister_reboot_notifier);
+
+static int set_one_prio(struct task_struct *p, int niceval, int error)
+{
+	int no_nice;
+
+	if (p->uid != current->euid &&
+		p->euid != current->euid && !capable(CAP_SYS_NICE)) {
+		error = -EPERM;
+		goto out;
+	}
+	if (niceval < task_nice(p) && !capable(CAP_SYS_NICE)) {
+		error = -EACCES;
+		goto out;
+	}
+	no_nice = security_task_setnice(p, niceval);
+	if (no_nice) {
+		error = no_nice;
+		goto out;
+	}
+	if (error == -ESRCH)
+		error = 0;
+	set_user_nice(p, niceval);
+out:
+	return error;
+}
+
+asmlinkage long sys_setpriority(int which, int who, int niceval)
+{
+	struct task_struct *g, *p;
+	struct user_struct *user;
+	int error = -EINVAL;
+
+	if (which > 2 || which < 0)
+		goto out;
+
+	/* normalize: avoid signed division (rounding problems) */
+	error = -ESRCH;
+	if (niceval < -20)
+		niceval = -20;
+	if (niceval > 19)
+		niceval = 19;
+
+	read_lock(&tasklist_lock);
+	switch (which) {
+		case PRIO_PROCESS:
+			if (!who)
+				who = current->pid;
+			p = find_task_by_pid(who);
+			if (p)
+				error = set_one_prio(p, niceval, error);
+			break;
+		case PRIO_PGRP:
+			if (!who)
+				who = process_group(current);
+			do_each_task_pid(who, PIDTYPE_PGID, p) {
+				error = set_one_prio(p, niceval, error);
+			} while_each_task_pid(who, PIDTYPE_PGID, p);
+			break;
+		case PRIO_USER:
+			user = current->user;
+			if (!who)
+				who = current->uid;
+			else
+				if ((who != current->uid) && !(user = find_user(who)))
+					goto out_unlock;	/* No processes for this user */
+
+			do_each_thread(g, p)
+				if (p->uid == who)
+					error = set_one_prio(p, niceval, error);
+			while_each_thread(g, p);
+			if (who != current->uid)
+				free_uid(user);		/* For find_user() */
+			break;
+	}
+out_unlock:
+	read_unlock(&tasklist_lock);
+out:
+	return error;
+}
+
+/*
+ * Ugh. To avoid negative return values, "getpriority()" will
+ * not return the normal nice-value, but a negated value that
+ * has been offset by 20 (ie it returns 40..1 instead of -20..19)
+ * to stay compatible.
+ */
+asmlinkage long sys_getpriority(int which, int who)
+{
+	struct task_struct *g, *p;
+	struct user_struct *user;
+	long niceval, retval = -ESRCH;
+
+	if (which > 2 || which < 0)
+		return -EINVAL;
+
+	read_lock(&tasklist_lock);
+	switch (which) {
+		case PRIO_PROCESS:
+			if (!who)
+				who = current->pid;
+			p = find_task_by_pid(who);
+			if (p) {
+				niceval = 20 - task_nice(p);
+				if (niceval > retval)
+					retval = niceval;
+			}
+			break;
+		case PRIO_PGRP:
+			if (!who)
+				who = process_group(current);
+			do_each_task_pid(who, PIDTYPE_PGID, p) {
+				niceval = 20 - task_nice(p);
+				if (niceval > retval)
+					retval = niceval;
+			} while_each_task_pid(who, PIDTYPE_PGID, p);
+			break;
+		case PRIO_USER:
+			user = current->user;
+			if (!who)
+				who = current->uid;
+			else
+				if ((who != current->uid) && !(user = find_user(who)))
+					goto out_unlock;	/* No processes for this user */
+
+			do_each_thread(g, p)
+				if (p->uid == who) {
+					niceval = 20 - task_nice(p);
+					if (niceval > retval)
+						retval = niceval;
+				}
+			while_each_thread(g, p);
+			if (who != current->uid)
+				free_uid(user);		/* for find_user() */
+			break;
+	}
+out_unlock:
+	read_unlock(&tasklist_lock);
+
+	return retval;
+}
+
+
+/*
+ * Reboot system call: for obvious reasons only root may call it,
+ * and even root needs to set up some magic numbers in the registers
+ * so that some mistake won't make this reboot the whole machine.
+ * You can also set the meaning of the ctrl-alt-del-key here.
+ *
+ * reboot doesn't sync: do that yourself before calling this.
+ */
+asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user * arg)
+{
+	char buffer[256];
+
+	/* We only trust the superuser with rebooting the system. */
+	if (!capable(CAP_SYS_BOOT))
+		return -EPERM;
+
+	/* For safety, we require "magic" arguments. */
+	if (magic1 != LINUX_REBOOT_MAGIC1 ||
+	    (magic2 != LINUX_REBOOT_MAGIC2 &&
+	                magic2 != LINUX_REBOOT_MAGIC2A &&
+			magic2 != LINUX_REBOOT_MAGIC2B &&
+	                magic2 != LINUX_REBOOT_MAGIC2C))
+		return -EINVAL;
+
+	lock_kernel();
+	switch (cmd) {
+	case LINUX_REBOOT_CMD_RESTART:
+		notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL);
+		system_state = SYSTEM_RESTART;
+		device_shutdown();
+		printk(KERN_EMERG "Restarting system.\n");
+		machine_restart(NULL);
+		break;
+
+	case LINUX_REBOOT_CMD_CAD_ON:
+		C_A_D = 1;
+		break;
+
+	case LINUX_REBOOT_CMD_CAD_OFF:
+		C_A_D = 0;
+		break;
+
+	case LINUX_REBOOT_CMD_HALT:
+		notifier_call_chain(&reboot_notifier_list, SYS_HALT, NULL);
+		system_state = SYSTEM_HALT;
+		device_shutdown();
+		printk(KERN_EMERG "System halted.\n");
+		machine_halt();
+		unlock_kernel();
+		do_exit(0);
+		break;
+
+	case LINUX_REBOOT_CMD_POWER_OFF:
+		notifier_call_chain(&reboot_notifier_list, SYS_POWER_OFF, NULL);
+		system_state = SYSTEM_POWER_OFF;
+		device_shutdown();
+		printk(KERN_EMERG "Power down.\n");
+		machine_power_off();
+		unlock_kernel();
+		do_exit(0);
+		break;
+
+	case LINUX_REBOOT_CMD_RESTART2:
+		if (strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1) < 0) {
+			unlock_kernel();
+			return -EFAULT;
+		}
+		buffer[sizeof(buffer) - 1] = '\0';
+
+		notifier_call_chain(&reboot_notifier_list, SYS_RESTART, buffer);
+		system_state = SYSTEM_RESTART;
+		device_shutdown();
+		printk(KERN_EMERG "Restarting system with command '%s'.\n", buffer);
+		machine_restart(buffer);
+		break;
+
+#ifdef CONFIG_SOFTWARE_SUSPEND
+	case LINUX_REBOOT_CMD_SW_SUSPEND:
+		{
+			int ret = software_suspend();
+			unlock_kernel();
+			return ret;
+		}
+#endif
+
+	default:
+		unlock_kernel();
+		return -EINVAL;
+	}
+	unlock_kernel();
+	return 0;
+}
+
+static void deferred_cad(void *dummy)
+{
+	notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL);
+	machine_restart(NULL);
+}
+
+/*
+ * This function gets called by ctrl-alt-del - ie the keyboard interrupt.
+ * As it's called within an interrupt, it may NOT sync: the only choice
+ * is whether to reboot at once, or just ignore the ctrl-alt-del.
+ */
+void ctrl_alt_del(void)
+{
+	static DECLARE_WORK(cad_work, deferred_cad, NULL);
+
+	if (C_A_D)
+		schedule_work(&cad_work);
+	else
+		kill_proc(cad_pid, SIGINT, 1);
+}
+	
+
+/*
+ * Unprivileged users may change the real gid to the effective gid
+ * or vice versa.  (BSD-style)
+ *
+ * If you set the real gid at all, or set the effective gid to a value not
+ * equal to the real gid, then the saved gid is set to the new effective gid.
+ *
+ * This makes it possible for a setgid program to completely drop its
+ * privileges, which is often a useful assertion to make when you are doing
+ * a security audit over a program.
+ *
+ * The general idea is that a program which uses just setregid() will be
+ * 100% compatible with BSD.  A program which uses just setgid() will be
+ * 100% compatible with POSIX with saved IDs. 
+ *
+ * SMP: There are not races, the GIDs are checked only by filesystem
+ *      operations (as far as semantic preservation is concerned).
+ */
+asmlinkage long sys_setregid(gid_t rgid, gid_t egid)
+{
+	int old_rgid = current->gid;
+	int old_egid = current->egid;
+	int new_rgid = old_rgid;
+	int new_egid = old_egid;
+	int retval;
+
+	retval = security_task_setgid(rgid, egid, (gid_t)-1, LSM_SETID_RE);
+	if (retval)
+		return retval;
+
+	if (rgid != (gid_t) -1) {
+		if ((old_rgid == rgid) ||
+		    (current->egid==rgid) ||
+		    capable(CAP_SETGID))
+			new_rgid = rgid;
+		else
+			return -EPERM;
+	}
+	if (egid != (gid_t) -1) {
+		if ((old_rgid == egid) ||
+		    (current->egid == egid) ||
+		    (current->sgid == egid) ||
+		    capable(CAP_SETGID))
+			new_egid = egid;
+		else {
+			return -EPERM;
+		}
+	}
+	if (new_egid != old_egid)
+	{
+		current->mm->dumpable = 0;
+		wmb();
+	}
+	if (rgid != (gid_t) -1 ||
+	    (egid != (gid_t) -1 && egid != old_rgid))
+		current->sgid = new_egid;
+	current->fsgid = new_egid;
+	current->egid = new_egid;
+	current->gid = new_rgid;
+	key_fsgid_changed(current);
+	return 0;
+}
+
+/*
+ * setgid() is implemented like SysV w/ SAVED_IDS 
+ *
+ * SMP: Same implicit races as above.
+ */
+asmlinkage long sys_setgid(gid_t gid)
+{
+	int old_egid = current->egid;
+	int retval;
+
+	retval = security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_ID);
+	if (retval)
+		return retval;
+
+	if (capable(CAP_SETGID))
+	{
+		if(old_egid != gid)
+		{
+			current->mm->dumpable=0;
+			wmb();
+		}
+		current->gid = current->egid = current->sgid = current->fsgid = gid;
+	}
+	else if ((gid == current->gid) || (gid == current->sgid))
+	{
+		if(old_egid != gid)
+		{
+			current->mm->dumpable=0;
+			wmb();
+		}
+		current->egid = current->fsgid = gid;
+	}
+	else
+		return -EPERM;
+
+	key_fsgid_changed(current);
+	return 0;
+}
+  
+static int set_user(uid_t new_ruid, int dumpclear)
+{
+	struct user_struct *new_user;
+
+	new_user = alloc_uid(new_ruid);
+	if (!new_user)
+		return -EAGAIN;
+
+	if (atomic_read(&new_user->processes) >=
+				current->signal->rlim[RLIMIT_NPROC].rlim_cur &&
+			new_user != &root_user) {
+		free_uid(new_user);
+		return -EAGAIN;
+	}
+
+	switch_uid(new_user);
+
+	if(dumpclear)
+	{
+		current->mm->dumpable = 0;
+		wmb();
+	}
+	current->uid = new_ruid;
+	return 0;
+}
+
+/*
+ * Unprivileged users may change the real uid to the effective uid
+ * or vice versa.  (BSD-style)
+ *
+ * If you set the real uid at all, or set the effective uid to a value not
+ * equal to the real uid, then the saved uid is set to the new effective uid.
+ *
+ * This makes it possible for a setuid program to completely drop its
+ * privileges, which is often a useful assertion to make when you are doing
+ * a security audit over a program.
+ *
+ * The general idea is that a program which uses just setreuid() will be
+ * 100% compatible with BSD.  A program which uses just setuid() will be
+ * 100% compatible with POSIX with saved IDs. 
+ */
+asmlinkage long sys_setreuid(uid_t ruid, uid_t euid)
+{
+	int old_ruid, old_euid, old_suid, new_ruid, new_euid;
+	int retval;
+
+	retval = security_task_setuid(ruid, euid, (uid_t)-1, LSM_SETID_RE);
+	if (retval)
+		return retval;
+
+	new_ruid = old_ruid = current->uid;
+	new_euid = old_euid = current->euid;
+	old_suid = current->suid;
+
+	if (ruid != (uid_t) -1) {
+		new_ruid = ruid;
+		if ((old_ruid != ruid) &&
+		    (current->euid != ruid) &&
+		    !capable(CAP_SETUID))
+			return -EPERM;
+	}
+
+	if (euid != (uid_t) -1) {
+		new_euid = euid;
+		if ((old_ruid != euid) &&
+		    (current->euid != euid) &&
+		    (current->suid != euid) &&
+		    !capable(CAP_SETUID))
+			return -EPERM;
+	}
+
+	if (new_ruid != old_ruid && set_user(new_ruid, new_euid != old_euid) < 0)
+		return -EAGAIN;
+
+	if (new_euid != old_euid)
+	{
+		current->mm->dumpable=0;
+		wmb();
+	}
+	current->fsuid = current->euid = new_euid;
+	if (ruid != (uid_t) -1 ||
+	    (euid != (uid_t) -1 && euid != old_ruid))
+		current->suid = current->euid;
+	current->fsuid = current->euid;
+
+	key_fsuid_changed(current);
+
+	return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_RE);
+}
+
+
+		
+/*
+ * setuid() is implemented like SysV with SAVED_IDS 
+ * 
+ * Note that SAVED_ID's is deficient in that a setuid root program
+ * like sendmail, for example, cannot set its uid to be a normal 
+ * user and then switch back, because if you're root, setuid() sets
+ * the saved uid too.  If you don't like this, blame the bright people
+ * in the POSIX committee and/or USG.  Note that the BSD-style setreuid()
+ * will allow a root program to temporarily drop privileges and be able to
+ * regain them by swapping the real and effective uid.  
+ */
+asmlinkage long sys_setuid(uid_t uid)
+{
+	int old_euid = current->euid;
+	int old_ruid, old_suid, new_ruid, new_suid;
+	int retval;
+
+	retval = security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_ID);
+	if (retval)
+		return retval;
+
+	old_ruid = new_ruid = current->uid;
+	old_suid = current->suid;
+	new_suid = old_suid;
+	
+	if (capable(CAP_SETUID)) {
+		if (uid != old_ruid && set_user(uid, old_euid != uid) < 0)
+			return -EAGAIN;
+		new_suid = uid;
+	} else if ((uid != current->uid) && (uid != new_suid))
+		return -EPERM;
+
+	if (old_euid != uid)
+	{
+		current->mm->dumpable = 0;
+		wmb();
+	}
+	current->fsuid = current->euid = uid;
+	current->suid = new_suid;
+
+	key_fsuid_changed(current);
+
+	return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_ID);
+}
+
+
+/*
+ * This function implements a generic ability to update ruid, euid,
+ * and suid.  This allows you to implement the 4.4 compatible seteuid().
+ */
+asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
+{
+	int old_ruid = current->uid;
+	int old_euid = current->euid;
+	int old_suid = current->suid;
+	int retval;
+
+	retval = security_task_setuid(ruid, euid, suid, LSM_SETID_RES);
+	if (retval)
+		return retval;
+
+	if (!capable(CAP_SETUID)) {
+		if ((ruid != (uid_t) -1) && (ruid != current->uid) &&
+		    (ruid != current->euid) && (ruid != current->suid))
+			return -EPERM;
+		if ((euid != (uid_t) -1) && (euid != current->uid) &&
+		    (euid != current->euid) && (euid != current->suid))
+			return -EPERM;
+		if ((suid != (uid_t) -1) && (suid != current->uid) &&
+		    (suid != current->euid) && (suid != current->suid))
+			return -EPERM;
+	}
+	if (ruid != (uid_t) -1) {
+		if (ruid != current->uid && set_user(ruid, euid != current->euid) < 0)
+			return -EAGAIN;
+	}
+	if (euid != (uid_t) -1) {
+		if (euid != current->euid)
+		{
+			current->mm->dumpable = 0;
+			wmb();
+		}
+		current->euid = euid;
+	}
+	current->fsuid = current->euid;
+	if (suid != (uid_t) -1)
+		current->suid = suid;
+
+	key_fsuid_changed(current);
+
+	return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_RES);
+}
+
+asmlinkage long sys_getresuid(uid_t __user *ruid, uid_t __user *euid, uid_t __user *suid)
+{
+	int retval;
+
+	if (!(retval = put_user(current->uid, ruid)) &&
+	    !(retval = put_user(current->euid, euid)))
+		retval = put_user(current->suid, suid);
+
+	return retval;
+}
+
+/*
+ * Same as above, but for rgid, egid, sgid.
+ */
+asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid)
+{
+	int retval;
+
+	retval = security_task_setgid(rgid, egid, sgid, LSM_SETID_RES);
+	if (retval)
+		return retval;
+
+	if (!capable(CAP_SETGID)) {
+		if ((rgid != (gid_t) -1) && (rgid != current->gid) &&
+		    (rgid != current->egid) && (rgid != current->sgid))
+			return -EPERM;
+		if ((egid != (gid_t) -1) && (egid != current->gid) &&
+		    (egid != current->egid) && (egid != current->sgid))
+			return -EPERM;
+		if ((sgid != (gid_t) -1) && (sgid != current->gid) &&
+		    (sgid != current->egid) && (sgid != current->sgid))
+			return -EPERM;
+	}
+	if (egid != (gid_t) -1) {
+		if (egid != current->egid)
+		{
+			current->mm->dumpable = 0;
+			wmb();
+		}
+		current->egid = egid;
+	}
+	current->fsgid = current->egid;
+	if (rgid != (gid_t) -1)
+		current->gid = rgid;
+	if (sgid != (gid_t) -1)
+		current->sgid = sgid;
+
+	key_fsgid_changed(current);
+	return 0;
+}
+
+asmlinkage long sys_getresgid(gid_t __user *rgid, gid_t __user *egid, gid_t __user *sgid)
+{
+	int retval;
+
+	if (!(retval = put_user(current->gid, rgid)) &&
+	    !(retval = put_user(current->egid, egid)))
+		retval = put_user(current->sgid, sgid);
+
+	return retval;
+}
+
+
+/*
+ * "setfsuid()" sets the fsuid - the uid used for filesystem checks. This
+ * is used for "access()" and for the NFS daemon (letting nfsd stay at
+ * whatever uid it wants to). It normally shadows "euid", except when
+ * explicitly set by setfsuid() or for access..
+ */
+asmlinkage long sys_setfsuid(uid_t uid)
+{
+	int old_fsuid;
+
+	old_fsuid = current->fsuid;
+	if (security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS))
+		return old_fsuid;
+
+	if (uid == current->uid || uid == current->euid ||
+	    uid == current->suid || uid == current->fsuid || 
+	    capable(CAP_SETUID))
+	{
+		if (uid != old_fsuid)
+		{
+			current->mm->dumpable = 0;
+			wmb();
+		}
+		current->fsuid = uid;
+	}
+
+	key_fsuid_changed(current);
+
+	security_task_post_setuid(old_fsuid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS);
+
+	return old_fsuid;
+}
+
+/*
+ * Samma p� svenska..
+ */
+asmlinkage long sys_setfsgid(gid_t gid)
+{
+	int old_fsgid;
+
+	old_fsgid = current->fsgid;
+	if (security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_FS))
+		return old_fsgid;
+
+	if (gid == current->gid || gid == current->egid ||
+	    gid == current->sgid || gid == current->fsgid || 
+	    capable(CAP_SETGID))
+	{
+		if (gid != old_fsgid)
+		{
+			current->mm->dumpable = 0;
+			wmb();
+		}
+		current->fsgid = gid;
+		key_fsgid_changed(current);
+	}
+	return old_fsgid;
+}
+
+asmlinkage long sys_times(struct tms __user * tbuf)
+{
+	/*
+	 *	In the SMP world we might just be unlucky and have one of
+	 *	the times increment as we use it. Since the value is an
+	 *	atomically safe type this is just fine. Conceptually its
+	 *	as if the syscall took an instant longer to occur.
+	 */
+	if (tbuf) {
+		struct tms tmp;
+		struct task_struct *tsk = current;
+		struct task_struct *t;
+		cputime_t utime, stime, cutime, cstime;
+
+		read_lock(&tasklist_lock);
+		utime = tsk->signal->utime;
+		stime = tsk->signal->stime;
+		t = tsk;
+		do {
+			utime = cputime_add(utime, t->utime);
+			stime = cputime_add(stime, t->stime);
+			t = next_thread(t);
+		} while (t != tsk);
+
+		/*
+		 * While we have tasklist_lock read-locked, no dying thread
+		 * can be updating current->signal->[us]time.  Instead,
+		 * we got their counts included in the live thread loop.
+		 * However, another thread can come in right now and
+		 * do a wait call that updates current->signal->c[us]time.
+		 * To make sure we always see that pair updated atomically,
+		 * we take the siglock around fetching them.
+		 */
+		spin_lock_irq(&tsk->sighand->siglock);
+		cutime = tsk->signal->cutime;
+		cstime = tsk->signal->cstime;
+		spin_unlock_irq(&tsk->sighand->siglock);
+		read_unlock(&tasklist_lock);
+
+		tmp.tms_utime = cputime_to_clock_t(utime);
+		tmp.tms_stime = cputime_to_clock_t(stime);
+		tmp.tms_cutime = cputime_to_clock_t(cutime);
+		tmp.tms_cstime = cputime_to_clock_t(cstime);
+		if (copy_to_user(tbuf, &tmp, sizeof(struct tms)))
+			return -EFAULT;
+	}
+	return (long) jiffies_64_to_clock_t(get_jiffies_64());
+}
+
+/*
+ * This needs some heavy checking ...
+ * I just haven't the stomach for it. I also don't fully
+ * understand sessions/pgrp etc. Let somebody who does explain it.
+ *
+ * OK, I think I have the protection semantics right.... this is really
+ * only important on a multi-user system anyway, to make sure one user
+ * can't send a signal to a process owned by another.  -TYT, 12/12/91
+ *
+ * Auch. Had to add the 'did_exec' flag to conform completely to POSIX.
+ * LBT 04.03.94
+ */
+
+asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
+{
+	struct task_struct *p;
+	int err = -EINVAL;
+
+	if (!pid)
+		pid = current->pid;
+	if (!pgid)
+		pgid = pid;
+	if (pgid < 0)
+		return -EINVAL;
+
+	/* From this point forward we keep holding onto the tasklist lock
+	 * so that our parent does not change from under us. -DaveM
+	 */
+	write_lock_irq(&tasklist_lock);
+
+	err = -ESRCH;
+	p = find_task_by_pid(pid);
+	if (!p)
+		goto out;
+
+	err = -EINVAL;
+	if (!thread_group_leader(p))
+		goto out;
+
+	if (p->parent == current || p->real_parent == current) {
+		err = -EPERM;
+		if (p->signal->session != current->signal->session)
+			goto out;
+		err = -EACCES;
+		if (p->did_exec)
+			goto out;
+	} else {
+		err = -ESRCH;
+		if (p != current)
+			goto out;
+	}
+
+	err = -EPERM;
+	if (p->signal->leader)
+		goto out;
+
+	if (pgid != pid) {
+		struct task_struct *p;
+
+		do_each_task_pid(pgid, PIDTYPE_PGID, p) {
+			if (p->signal->session == current->signal->session)
+				goto ok_pgid;
+		} while_each_task_pid(pgid, PIDTYPE_PGID, p);
+		goto out;
+	}
+
+ok_pgid:
+	err = security_task_setpgid(p, pgid);
+	if (err)
+		goto out;
+
+	if (process_group(p) != pgid) {
+		detach_pid(p, PIDTYPE_PGID);
+		p->signal->pgrp = pgid;
+		attach_pid(p, PIDTYPE_PGID, pgid);
+	}
+
+	err = 0;
+out:
+	/* All paths lead to here, thus we are safe. -DaveM */
+	write_unlock_irq(&tasklist_lock);
+	return err;
+}
+
+asmlinkage long sys_getpgid(pid_t pid)
+{
+	if (!pid) {
+		return process_group(current);
+	} else {
+		int retval;
+		struct task_struct *p;
+
+		read_lock(&tasklist_lock);
+		p = find_task_by_pid(pid);
+
+		retval = -ESRCH;
+		if (p) {
+			retval = security_task_getpgid(p);
+			if (!retval)
+				retval = process_group(p);
+		}
+		read_unlock(&tasklist_lock);
+		return retval;
+	}
+}
+
+#ifdef __ARCH_WANT_SYS_GETPGRP
+
+asmlinkage long sys_getpgrp(void)
+{
+	/* SMP - assuming writes are word atomic this is fine */
+	return process_group(current);
+}
+
+#endif
+
+asmlinkage long sys_getsid(pid_t pid)
+{
+	if (!pid) {
+		return current->signal->session;
+	} else {
+		int retval;
+		struct task_struct *p;
+
+		read_lock(&tasklist_lock);
+		p = find_task_by_pid(pid);
+
+		retval = -ESRCH;
+		if(p) {
+			retval = security_task_getsid(p);
+			if (!retval)
+				retval = p->signal->session;
+		}
+		read_unlock(&tasklist_lock);
+		return retval;
+	}
+}
+
+asmlinkage long sys_setsid(void)
+{
+	struct pid *pid;
+	int err = -EPERM;
+
+	if (!thread_group_leader(current))
+		return -EINVAL;
+
+	down(&tty_sem);
+	write_lock_irq(&tasklist_lock);
+
+	pid = find_pid(PIDTYPE_PGID, current->pid);
+	if (pid)
+		goto out;
+
+	current->signal->leader = 1;
+	__set_special_pids(current->pid, current->pid);
+	current->signal->tty = NULL;
+	current->signal->tty_old_pgrp = 0;
+	err = process_group(current);
+out:
+	write_unlock_irq(&tasklist_lock);
+	up(&tty_sem);
+	return err;
+}
+
+/*
+ * Supplementary group IDs
+ */
+
+/* init to 2 - one for init_task, one to ensure it is never freed */
+struct group_info init_groups = { .usage = ATOMIC_INIT(2) };
+
+struct group_info *groups_alloc(int gidsetsize)
+{
+	struct group_info *group_info;
+	int nblocks;
+	int i;
+
+	nblocks = (gidsetsize + NGROUPS_PER_BLOCK - 1) / NGROUPS_PER_BLOCK;
+	/* Make sure we always allocate at least one indirect block pointer */
+	nblocks = nblocks ? : 1;
+	group_info = kmalloc(sizeof(*group_info) + nblocks*sizeof(gid_t *), GFP_USER);
+	if (!group_info)
+		return NULL;
+	group_info->ngroups = gidsetsize;
+	group_info->nblocks = nblocks;
+	atomic_set(&group_info->usage, 1);
+
+	if (gidsetsize <= NGROUPS_SMALL) {
+		group_info->blocks[0] = group_info->small_block;
+	} else {
+		for (i = 0; i < nblocks; i++) {
+			gid_t *b;
+			b = (void *)__get_free_page(GFP_USER);
+			if (!b)
+				goto out_undo_partial_alloc;
+			group_info->blocks[i] = b;
+		}
+	}
+	return group_info;
+
+out_undo_partial_alloc:
+	while (--i >= 0) {
+		free_page((unsigned long)group_info->blocks[i]);
+	}
+	kfree(group_info);
+	return NULL;
+}
+
+EXPORT_SYMBOL(groups_alloc);
+
+void groups_free(struct group_info *group_info)
+{
+	if (group_info->blocks[0] != group_info->small_block) {
+		int i;
+		for (i = 0; i < group_info->nblocks; i++)
+			free_page((unsigned long)group_info->blocks[i]);
+	}
+	kfree(group_info);
+}
+
+EXPORT_SYMBOL(groups_free);
+
+/* export the group_info to a user-space array */
+static int groups_to_user(gid_t __user *grouplist,
+    struct group_info *group_info)
+{
+	int i;
+	int count = group_info->ngroups;
+
+	for (i = 0; i < group_info->nblocks; i++) {
+		int cp_count = min(NGROUPS_PER_BLOCK, count);
+		int off = i * NGROUPS_PER_BLOCK;
+		int len = cp_count * sizeof(*grouplist);
+
+		if (copy_to_user(grouplist+off, group_info->blocks[i], len))
+			return -EFAULT;
+
+		count -= cp_count;
+	}
+	return 0;
+}
+
+/* fill a group_info from a user-space array - it must be allocated already */
+static int groups_from_user(struct group_info *group_info,
+    gid_t __user *grouplist)
+ {
+	int i;
+	int count = group_info->ngroups;
+
+	for (i = 0; i < group_info->nblocks; i++) {
+		int cp_count = min(NGROUPS_PER_BLOCK, count);
+		int off = i * NGROUPS_PER_BLOCK;
+		int len = cp_count * sizeof(*grouplist);
+
+		if (copy_from_user(group_info->blocks[i], grouplist+off, len))
+			return -EFAULT;
+
+		count -= cp_count;
+	}
+	return 0;
+}
+
+/* a simple shell-metzner sort */
+static void groups_sort(struct group_info *group_info)
+{
+	int base, max, stride;
+	int gidsetsize = group_info->ngroups;
+
+	for (stride = 1; stride < gidsetsize; stride = 3 * stride + 1)
+		; /* nothing */
+	stride /= 3;
+
+	while (stride) {
+		max = gidsetsize - stride;
+		for (base = 0; base < max; base++) {
+			int left = base;
+			int right = left + stride;
+			gid_t tmp = GROUP_AT(group_info, right);
+
+			while (left >= 0 && GROUP_AT(group_info, left) > tmp) {
+				GROUP_AT(group_info, right) =
+				    GROUP_AT(group_info, left);
+				right = left;
+				left -= stride;
+			}
+			GROUP_AT(group_info, right) = tmp;
+		}
+		stride /= 3;
+	}
+}
+
+/* a simple bsearch */
+static int groups_search(struct group_info *group_info, gid_t grp)
+{
+	int left, right;
+
+	if (!group_info)
+		return 0;
+
+	left = 0;
+	right = group_info->ngroups;
+	while (left < right) {
+		int mid = (left+right)/2;
+		int cmp = grp - GROUP_AT(group_info, mid);
+		if (cmp > 0)
+			left = mid + 1;
+		else if (cmp < 0)
+			right = mid;
+		else
+			return 1;
+	}
+	return 0;
+}
+
+/* validate and set current->group_info */
+int set_current_groups(struct group_info *group_info)
+{
+	int retval;
+	struct group_info *old_info;
+
+	retval = security_task_setgroups(group_info);
+	if (retval)
+		return retval;
+
+	groups_sort(group_info);
+	get_group_info(group_info);
+
+	task_lock(current);
+	old_info = current->group_info;
+	current->group_info = group_info;
+	task_unlock(current);
+
+	put_group_info(old_info);
+
+	return 0;
+}
+
+EXPORT_SYMBOL(set_current_groups);
+
+asmlinkage long sys_getgroups(int gidsetsize, gid_t __user *grouplist)
+{
+	int i = 0;
+
+	/*
+	 *	SMP: Nobody else can change our grouplist. Thus we are
+	 *	safe.
+	 */
+
+	if (gidsetsize < 0)
+		return -EINVAL;
+
+	/* no need to grab task_lock here; it cannot change */
+	get_group_info(current->group_info);
+	i = current->group_info->ngroups;
+	if (gidsetsize) {
+		if (i > gidsetsize) {
+			i = -EINVAL;
+			goto out;
+		}
+		if (groups_to_user(grouplist, current->group_info)) {
+			i = -EFAULT;
+			goto out;
+		}
+	}
+out:
+	put_group_info(current->group_info);
+	return i;
+}
+
+/*
+ *	SMP: Our groups are copy-on-write. We can set them safely
+ *	without another task interfering.
+ */
+ 
+asmlinkage long sys_setgroups(int gidsetsize, gid_t __user *grouplist)
+{
+	struct group_info *group_info;
+	int retval;
+
+	if (!capable(CAP_SETGID))
+		return -EPERM;
+	if ((unsigned)gidsetsize > NGROUPS_MAX)
+		return -EINVAL;
+
+	group_info = groups_alloc(gidsetsize);
+	if (!group_info)
+		return -ENOMEM;
+	retval = groups_from_user(group_info, grouplist);
+	if (retval) {
+		put_group_info(group_info);
+		return retval;
+	}
+
+	retval = set_current_groups(group_info);
+	put_group_info(group_info);
+
+	return retval;
+}
+
+/*
+ * Check whether we're fsgid/egid or in the supplemental group..
+ */
+int in_group_p(gid_t grp)
+{
+	int retval = 1;
+	if (grp != current->fsgid) {
+		get_group_info(current->group_info);
+		retval = groups_search(current->group_info, grp);
+		put_group_info(current->group_info);
+	}
+	return retval;
+}
+
+EXPORT_SYMBOL(in_group_p);
+
+int in_egroup_p(gid_t grp)
+{
+	int retval = 1;
+	if (grp != current->egid) {
+		get_group_info(current->group_info);
+		retval = groups_search(current->group_info, grp);
+		put_group_info(current->group_info);
+	}
+	return retval;
+}
+
+EXPORT_SYMBOL(in_egroup_p);
+
+DECLARE_RWSEM(uts_sem);
+
+EXPORT_SYMBOL(uts_sem);
+
+asmlinkage long sys_newuname(struct new_utsname __user * name)
+{
+	int errno = 0;
+
+	down_read(&uts_sem);
+	if (copy_to_user(name,&system_utsname,sizeof *name))
+		errno = -EFAULT;
+	up_read(&uts_sem);
+	return errno;
+}
+
+asmlinkage long sys_sethostname(char __user *name, int len)
+{
+	int errno;
+	char tmp[__NEW_UTS_LEN];
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	if (len < 0 || len > __NEW_UTS_LEN)
+		return -EINVAL;
+	down_write(&uts_sem);
+	errno = -EFAULT;
+	if (!copy_from_user(tmp, name, len)) {
+		memcpy(system_utsname.nodename, tmp, len);
+		system_utsname.nodename[len] = 0;
+		errno = 0;
+	}
+	up_write(&uts_sem);
+	return errno;
+}
+
+#ifdef __ARCH_WANT_SYS_GETHOSTNAME
+
+asmlinkage long sys_gethostname(char __user *name, int len)
+{
+	int i, errno;
+
+	if (len < 0)
+		return -EINVAL;
+	down_read(&uts_sem);
+	i = 1 + strlen(system_utsname.nodename);
+	if (i > len)
+		i = len;
+	errno = 0;
+	if (copy_to_user(name, system_utsname.nodename, i))
+		errno = -EFAULT;
+	up_read(&uts_sem);
+	return errno;
+}
+
+#endif
+
+/*
+ * Only setdomainname; getdomainname can be implemented by calling
+ * uname()
+ */
+asmlinkage long sys_setdomainname(char __user *name, int len)
+{
+	int errno;
+	char tmp[__NEW_UTS_LEN];
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	if (len < 0 || len > __NEW_UTS_LEN)
+		return -EINVAL;
+
+	down_write(&uts_sem);
+	errno = -EFAULT;
+	if (!copy_from_user(tmp, name, len)) {
+		memcpy(system_utsname.domainname, tmp, len);
+		system_utsname.domainname[len] = 0;
+		errno = 0;
+	}
+	up_write(&uts_sem);
+	return errno;
+}
+
+asmlinkage long sys_getrlimit(unsigned int resource, struct rlimit __user *rlim)
+{
+	if (resource >= RLIM_NLIMITS)
+		return -EINVAL;
+	else {
+		struct rlimit value;
+		task_lock(current->group_leader);
+		value = current->signal->rlim[resource];
+		task_unlock(current->group_leader);
+		return copy_to_user(rlim, &value, sizeof(*rlim)) ? -EFAULT : 0;
+	}
+}
+
+#ifdef __ARCH_WANT_SYS_OLD_GETRLIMIT
+
+/*
+ *	Back compatibility for getrlimit. Needed for some apps.
+ */
+ 
+asmlinkage long sys_old_getrlimit(unsigned int resource, struct rlimit __user *rlim)
+{
+	struct rlimit x;
+	if (resource >= RLIM_NLIMITS)
+		return -EINVAL;
+
+	task_lock(current->group_leader);
+	x = current->signal->rlim[resource];
+	task_unlock(current->group_leader);
+	if(x.rlim_cur > 0x7FFFFFFF)
+		x.rlim_cur = 0x7FFFFFFF;
+	if(x.rlim_max > 0x7FFFFFFF)
+		x.rlim_max = 0x7FFFFFFF;
+	return copy_to_user(rlim, &x, sizeof(x))?-EFAULT:0;
+}
+
+#endif
+
+asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim)
+{
+	struct rlimit new_rlim, *old_rlim;
+	int retval;
+
+	if (resource >= RLIM_NLIMITS)
+		return -EINVAL;
+	if(copy_from_user(&new_rlim, rlim, sizeof(*rlim)))
+		return -EFAULT;
+       if (new_rlim.rlim_cur > new_rlim.rlim_max)
+               return -EINVAL;
+	old_rlim = current->signal->rlim + resource;
+	if ((new_rlim.rlim_max > old_rlim->rlim_max) &&
+	    !capable(CAP_SYS_RESOURCE))
+		return -EPERM;
+	if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > NR_OPEN)
+			return -EPERM;
+
+	retval = security_task_setrlimit(resource, &new_rlim);
+	if (retval)
+		return retval;
+
+	task_lock(current->group_leader);
+	*old_rlim = new_rlim;
+	task_unlock(current->group_leader);
+
+	if (resource == RLIMIT_CPU && new_rlim.rlim_cur != RLIM_INFINITY &&
+	    (cputime_eq(current->signal->it_prof_expires, cputime_zero) ||
+	     new_rlim.rlim_cur <= cputime_to_secs(
+		     current->signal->it_prof_expires))) {
+		cputime_t cputime = secs_to_cputime(new_rlim.rlim_cur);
+		read_lock(&tasklist_lock);
+		spin_lock_irq(&current->sighand->siglock);
+		set_process_cpu_timer(current, CPUCLOCK_PROF,
+				      &cputime, NULL);
+		spin_unlock_irq(&current->sighand->siglock);
+		read_unlock(&tasklist_lock);
+	}
+
+	return 0;
+}
+
+/*
+ * It would make sense to put struct rusage in the task_struct,
+ * except that would make the task_struct be *really big*.  After
+ * task_struct gets moved into malloc'ed memory, it would
+ * make sense to do this.  It will make moving the rest of the information
+ * a lot simpler!  (Which we're not doing right now because we're not
+ * measuring them yet).
+ *
+ * This expects to be called with tasklist_lock read-locked or better,
+ * and the siglock not locked.  It may momentarily take the siglock.
+ *
+ * When sampling multiple threads for RUSAGE_SELF, under SMP we might have
+ * races with threads incrementing their own counters.  But since word
+ * reads are atomic, we either get new values or old values and we don't
+ * care which for the sums.  We always take the siglock to protect reading
+ * the c* fields from p->signal from races with exit.c updating those
+ * fields when reaping, so a sample either gets all the additions of a
+ * given child after it's reaped, or none so this sample is before reaping.
+ */
+
+static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
+{
+	struct task_struct *t;
+	unsigned long flags;
+	cputime_t utime, stime;
+
+	memset((char *) r, 0, sizeof *r);
+
+	if (unlikely(!p->signal))
+		return;
+
+	switch (who) {
+		case RUSAGE_CHILDREN:
+			spin_lock_irqsave(&p->sighand->siglock, flags);
+			utime = p->signal->cutime;
+			stime = p->signal->cstime;
+			r->ru_nvcsw = p->signal->cnvcsw;
+			r->ru_nivcsw = p->signal->cnivcsw;
+			r->ru_minflt = p->signal->cmin_flt;
+			r->ru_majflt = p->signal->cmaj_flt;
+			spin_unlock_irqrestore(&p->sighand->siglock, flags);
+			cputime_to_timeval(utime, &r->ru_utime);
+			cputime_to_timeval(stime, &r->ru_stime);
+			break;
+		case RUSAGE_SELF:
+			spin_lock_irqsave(&p->sighand->siglock, flags);
+			utime = stime = cputime_zero;
+			goto sum_group;
+		case RUSAGE_BOTH:
+			spin_lock_irqsave(&p->sighand->siglock, flags);
+			utime = p->signal->cutime;
+			stime = p->signal->cstime;
+			r->ru_nvcsw = p->signal->cnvcsw;
+			r->ru_nivcsw = p->signal->cnivcsw;
+			r->ru_minflt = p->signal->cmin_flt;
+			r->ru_majflt = p->signal->cmaj_flt;
+		sum_group:
+			utime = cputime_add(utime, p->signal->utime);
+			stime = cputime_add(stime, p->signal->stime);
+			r->ru_nvcsw += p->signal->nvcsw;
+			r->ru_nivcsw += p->signal->nivcsw;
+			r->ru_minflt += p->signal->min_flt;
+			r->ru_majflt += p->signal->maj_flt;
+			t = p;
+			do {
+				utime = cputime_add(utime, t->utime);
+				stime = cputime_add(stime, t->stime);
+				r->ru_nvcsw += t->nvcsw;
+				r->ru_nivcsw += t->nivcsw;
+				r->ru_minflt += t->min_flt;
+				r->ru_majflt += t->maj_flt;
+				t = next_thread(t);
+			} while (t != p);
+			spin_unlock_irqrestore(&p->sighand->siglock, flags);
+			cputime_to_timeval(utime, &r->ru_utime);
+			cputime_to_timeval(stime, &r->ru_stime);
+			break;
+		default:
+			BUG();
+	}
+}
+
+int getrusage(struct task_struct *p, int who, struct rusage __user *ru)
+{
+	struct rusage r;
+	read_lock(&tasklist_lock);
+	k_getrusage(p, who, &r);
+	read_unlock(&tasklist_lock);
+	return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0;
+}
+
+asmlinkage long sys_getrusage(int who, struct rusage __user *ru)
+{
+	if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN)
+		return -EINVAL;
+	return getrusage(current, who, ru);
+}
+
+asmlinkage long sys_umask(int mask)
+{
+	mask = xchg(&current->fs->umask, mask & S_IRWXUGO);
+	return mask;
+}
+    
+asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
+			  unsigned long arg4, unsigned long arg5)
+{
+	long error;
+	int sig;
+
+	error = security_task_prctl(option, arg2, arg3, arg4, arg5);
+	if (error)
+		return error;
+
+	switch (option) {
+		case PR_SET_PDEATHSIG:
+			sig = arg2;
+			if (sig < 0 || sig > _NSIG) {
+				error = -EINVAL;
+				break;
+			}
+			current->pdeath_signal = sig;
+			break;
+		case PR_GET_PDEATHSIG:
+			error = put_user(current->pdeath_signal, (int __user *)arg2);
+			break;
+		case PR_GET_DUMPABLE:
+			if (current->mm->dumpable)
+				error = 1;
+			break;
+		case PR_SET_DUMPABLE:
+			if (arg2 != 0 && arg2 != 1) {
+				error = -EINVAL;
+				break;
+			}
+			current->mm->dumpable = arg2;
+			break;
+
+		case PR_SET_UNALIGN:
+			error = SET_UNALIGN_CTL(current, arg2);
+			break;
+		case PR_GET_UNALIGN:
+			error = GET_UNALIGN_CTL(current, arg2);
+			break;
+		case PR_SET_FPEMU:
+			error = SET_FPEMU_CTL(current, arg2);
+			break;
+		case PR_GET_FPEMU:
+			error = GET_FPEMU_CTL(current, arg2);
+			break;
+		case PR_SET_FPEXC:
+			error = SET_FPEXC_CTL(current, arg2);
+			break;
+		case PR_GET_FPEXC:
+			error = GET_FPEXC_CTL(current, arg2);
+			break;
+		case PR_GET_TIMING:
+			error = PR_TIMING_STATISTICAL;
+			break;
+		case PR_SET_TIMING:
+			if (arg2 == PR_TIMING_STATISTICAL)
+				error = 0;
+			else
+				error = -EINVAL;
+			break;
+
+		case PR_GET_KEEPCAPS:
+			if (current->keep_capabilities)
+				error = 1;
+			break;
+		case PR_SET_KEEPCAPS:
+			if (arg2 != 0 && arg2 != 1) {
+				error = -EINVAL;
+				break;
+			}
+			current->keep_capabilities = arg2;
+			break;
+		case PR_SET_NAME: {
+			struct task_struct *me = current;
+			unsigned char ncomm[sizeof(me->comm)];
+
+			ncomm[sizeof(me->comm)-1] = 0;
+			if (strncpy_from_user(ncomm, (char __user *)arg2,
+						sizeof(me->comm)-1) < 0)
+				return -EFAULT;
+			set_task_comm(me, ncomm);
+			return 0;
+		}
+		case PR_GET_NAME: {
+			struct task_struct *me = current;
+			unsigned char tcomm[sizeof(me->comm)];
+
+			get_task_comm(tcomm, me);
+			if (copy_to_user((char __user *)arg2, tcomm, sizeof(tcomm)))
+				return -EFAULT;
+			return 0;
+		}
+		default:
+			error = -EINVAL;
+			break;
+	}
+	return error;
+}
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
new file mode 100644
index 000000000000..1802a311dd3f
--- /dev/null
+++ b/kernel/sys_ni.c
@@ -0,0 +1,86 @@
+
+#include <linux/linkage.h>
+#include <linux/errno.h>
+
+#include <asm/unistd.h>
+
+/*
+ * Non-implemented system calls get redirected here.
+ */
+asmlinkage long sys_ni_syscall(void)
+{
+	return -ENOSYS;
+}
+
+cond_syscall(sys_nfsservctl);
+cond_syscall(sys_quotactl);
+cond_syscall(sys_acct);
+cond_syscall(sys_lookup_dcookie);
+cond_syscall(sys_swapon);
+cond_syscall(sys_swapoff);
+cond_syscall(sys_init_module);
+cond_syscall(sys_delete_module);
+cond_syscall(sys_socketpair);
+cond_syscall(sys_bind);
+cond_syscall(sys_listen);
+cond_syscall(sys_accept);
+cond_syscall(sys_connect);
+cond_syscall(sys_getsockname);
+cond_syscall(sys_getpeername);
+cond_syscall(sys_sendto);
+cond_syscall(sys_send);
+cond_syscall(sys_recvfrom);
+cond_syscall(sys_recv);
+cond_syscall(sys_socket);
+cond_syscall(sys_setsockopt);
+cond_syscall(sys_getsockopt);
+cond_syscall(sys_shutdown);
+cond_syscall(sys_sendmsg);
+cond_syscall(sys_recvmsg);
+cond_syscall(sys_socketcall);
+cond_syscall(sys_futex);
+cond_syscall(compat_sys_futex);
+cond_syscall(sys_epoll_create);
+cond_syscall(sys_epoll_ctl);
+cond_syscall(sys_epoll_wait);
+cond_syscall(sys_semget);
+cond_syscall(sys_semop);
+cond_syscall(sys_semtimedop);
+cond_syscall(sys_semctl);
+cond_syscall(sys_msgget);
+cond_syscall(sys_msgsnd);
+cond_syscall(sys_msgrcv);
+cond_syscall(sys_msgctl);
+cond_syscall(sys_shmget);
+cond_syscall(sys_shmdt);
+cond_syscall(sys_shmctl);
+cond_syscall(sys_mq_open);
+cond_syscall(sys_mq_unlink);
+cond_syscall(sys_mq_timedsend);
+cond_syscall(sys_mq_timedreceive);
+cond_syscall(sys_mq_notify);
+cond_syscall(sys_mq_getsetattr);
+cond_syscall(compat_sys_mq_open);
+cond_syscall(compat_sys_mq_timedsend);
+cond_syscall(compat_sys_mq_timedreceive);
+cond_syscall(compat_sys_mq_notify);
+cond_syscall(compat_sys_mq_getsetattr);
+cond_syscall(sys_mbind);
+cond_syscall(sys_get_mempolicy);
+cond_syscall(sys_set_mempolicy);
+cond_syscall(compat_sys_mbind);
+cond_syscall(compat_sys_get_mempolicy);
+cond_syscall(compat_sys_set_mempolicy);
+cond_syscall(sys_add_key);
+cond_syscall(sys_request_key);
+cond_syscall(sys_keyctl);
+cond_syscall(compat_sys_keyctl);
+cond_syscall(compat_sys_socketcall);
+
+/* arch-specific weak syscall entries */
+cond_syscall(sys_pciconfig_read);
+cond_syscall(sys_pciconfig_write);
+cond_syscall(sys_pciconfig_iobase);
+cond_syscall(sys32_ipc);
+cond_syscall(sys32_sysctl);
+cond_syscall(ppc_rtas);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
new file mode 100644
index 000000000000..79dbd93bd697
--- /dev/null
+++ b/kernel/sysctl.c
@@ -0,0 +1,2337 @@
+/*
+ * sysctl.c: General linux system control interface
+ *
+ * Begun 24 March 1995, Stephen Tweedie
+ * Added /proc support, Dec 1995
+ * Added bdflush entry and intvec min/max checking, 2/23/96, Tom Dyas.
+ * Added hooks for /proc/sys/net (minor, minor patch), 96/4/1, Mike Shaver.
+ * Added kernel/java-{interpreter,appletviewer}, 96/5/10, Mike Shaver.
+ * Dynamic registration fixes, Stephen Tweedie.
+ * Added kswapd-interval, ctrl-alt-del, printk stuff, 1/8/97, Chris Horn.
+ * Made sysctl support optional via CONFIG_SYSCTL, 1/10/97, Chris
+ *  Horn.
+ * Added proc_doulongvec_ms_jiffies_minmax, 09/08/99, Carlos H. Bauer.
+ * Added proc_doulongvec_minmax, 09/08/99, Carlos H. Bauer.
+ * Changed linked lists to use list.h instead of lists.h, 02/24/00, Bill
+ *  Wendling.
+ * The list_for_each() macro wasn't appropriate for the sysctl loop.
+ *  Removed it and replaced it with older style, 03/23/00, Bill Wendling
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/slab.h>
+#include <linux/sysctl.h>
+#include <linux/proc_fs.h>
+#include <linux/ctype.h>
+#include <linux/utsname.h>
+#include <linux/capability.h>
+#include <linux/smp_lock.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sysrq.h>
+#include <linux/highuid.h>
+#include <linux/writeback.h>
+#include <linux/hugetlb.h>
+#include <linux/security.h>
+#include <linux/initrd.h>
+#include <linux/times.h>
+#include <linux/limits.h>
+#include <linux/dcache.h>
+#include <linux/syscalls.h>
+
+#include <asm/uaccess.h>
+#include <asm/processor.h>
+
+#ifdef CONFIG_ROOT_NFS
+#include <linux/nfs_fs.h>
+#endif
+
+#if defined(CONFIG_SYSCTL)
+
+/* External variables not in a header file. */
+extern int C_A_D;
+extern int sysctl_overcommit_memory;
+extern int sysctl_overcommit_ratio;
+extern int max_threads;
+extern int sysrq_enabled;
+extern int core_uses_pid;
+extern char core_pattern[];
+extern int cad_pid;
+extern int pid_max;
+extern int min_free_kbytes;
+extern int printk_ratelimit_jiffies;
+extern int printk_ratelimit_burst;
+extern int pid_max_min, pid_max_max;
+
+#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
+int unknown_nmi_panic;
+extern int proc_unknown_nmi_panic(ctl_table *, int, struct file *,
+				  void __user *, size_t *, loff_t *);
+#endif
+
+/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
+static int maxolduid = 65535;
+static int minolduid;
+
+static int ngroups_max = NGROUPS_MAX;
+
+#ifdef CONFIG_KMOD
+extern char modprobe_path[];
+#endif
+#ifdef CONFIG_HOTPLUG
+extern char hotplug_path[];
+#endif
+#ifdef CONFIG_CHR_DEV_SG
+extern int sg_big_buff;
+#endif
+#ifdef CONFIG_SYSVIPC
+extern size_t shm_ctlmax;
+extern size_t shm_ctlall;
+extern int shm_ctlmni;
+extern int msg_ctlmax;
+extern int msg_ctlmnb;
+extern int msg_ctlmni;
+extern int sem_ctls[];
+#endif
+
+#ifdef __sparc__
+extern char reboot_command [];
+extern int stop_a_enabled;
+extern int scons_pwroff;
+#endif
+
+#ifdef __hppa__
+extern int pwrsw_enabled;
+extern int unaligned_enabled;
+#endif
+
+#ifdef CONFIG_ARCH_S390
+#ifdef CONFIG_MATHEMU
+extern int sysctl_ieee_emulation_warnings;
+#endif
+extern int sysctl_userprocess_debug;
+#endif
+
+extern int sysctl_hz_timer;
+
+#ifdef CONFIG_BSD_PROCESS_ACCT
+extern int acct_parm[];
+#endif
+
+int randomize_va_space = 1;
+
+static int parse_table(int __user *, int, void __user *, size_t __user *, void __user *, size_t,
+		       ctl_table *, void **);
+static int proc_doutsstring(ctl_table *table, int write, struct file *filp,
+		  void __user *buffer, size_t *lenp, loff_t *ppos);
+
+static ctl_table root_table[];
+static struct ctl_table_header root_table_header =
+	{ root_table, LIST_HEAD_INIT(root_table_header.ctl_entry) };
+
+static ctl_table kern_table[];
+static ctl_table vm_table[];
+#ifdef CONFIG_NET
+extern ctl_table net_table[];
+#endif
+static ctl_table proc_table[];
+static ctl_table fs_table[];
+static ctl_table debug_table[];
+static ctl_table dev_table[];
+extern ctl_table random_table[];
+#ifdef CONFIG_UNIX98_PTYS
+extern ctl_table pty_table[];
+#endif
+
+#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT
+int sysctl_legacy_va_layout;
+#endif
+
+/* /proc declarations: */
+
+#ifdef CONFIG_PROC_FS
+
+static ssize_t proc_readsys(struct file *, char __user *, size_t, loff_t *);
+static ssize_t proc_writesys(struct file *, const char __user *, size_t, loff_t *);
+static int proc_opensys(struct inode *, struct file *);
+
+struct file_operations proc_sys_file_operations = {
+	.open		= proc_opensys,
+	.read		= proc_readsys,
+	.write		= proc_writesys,
+};
+
+extern struct proc_dir_entry *proc_sys_root;
+
+static void register_proc_table(ctl_table *, struct proc_dir_entry *);
+static void unregister_proc_table(ctl_table *, struct proc_dir_entry *);
+#endif
+
+/* The default sysctl tables: */
+
+static ctl_table root_table[] = {
+	{
+		.ctl_name	= CTL_KERN,
+		.procname	= "kernel",
+		.mode		= 0555,
+		.child		= kern_table,
+	},
+	{
+		.ctl_name	= CTL_VM,
+		.procname	= "vm",
+		.mode		= 0555,
+		.child		= vm_table,
+	},
+#ifdef CONFIG_NET
+	{
+		.ctl_name	= CTL_NET,
+		.procname	= "net",
+		.mode		= 0555,
+		.child		= net_table,
+	},
+#endif
+	{
+		.ctl_name	= CTL_PROC,
+		.procname	= "proc",
+		.mode		= 0555,
+		.child		= proc_table,
+	},
+	{
+		.ctl_name	= CTL_FS,
+		.procname	= "fs",
+		.mode		= 0555,
+		.child		= fs_table,
+	},
+	{
+		.ctl_name	= CTL_DEBUG,
+		.procname	= "debug",
+		.mode		= 0555,
+		.child		= debug_table,
+	},
+	{
+		.ctl_name	= CTL_DEV,
+		.procname	= "dev",
+		.mode		= 0555,
+		.child		= dev_table,
+	},
+	{ .ctl_name = 0 }
+};
+
+static ctl_table kern_table[] = {
+	{
+		.ctl_name	= KERN_OSTYPE,
+		.procname	= "ostype",
+		.data		= system_utsname.sysname,
+		.maxlen		= sizeof(system_utsname.sysname),
+		.mode		= 0444,
+		.proc_handler	= &proc_doutsstring,
+		.strategy	= &sysctl_string,
+	},
+	{
+		.ctl_name	= KERN_OSRELEASE,
+		.procname	= "osrelease",
+		.data		= system_utsname.release,
+		.maxlen		= sizeof(system_utsname.release),
+		.mode		= 0444,
+		.proc_handler	= &proc_doutsstring,
+		.strategy	= &sysctl_string,
+	},
+	{
+		.ctl_name	= KERN_VERSION,
+		.procname	= "version",
+		.data		= system_utsname.version,
+		.maxlen		= sizeof(system_utsname.version),
+		.mode		= 0444,
+		.proc_handler	= &proc_doutsstring,
+		.strategy	= &sysctl_string,
+	},
+	{
+		.ctl_name	= KERN_NODENAME,
+		.procname	= "hostname",
+		.data		= system_utsname.nodename,
+		.maxlen		= sizeof(system_utsname.nodename),
+		.mode		= 0644,
+		.proc_handler	= &proc_doutsstring,
+		.strategy	= &sysctl_string,
+	},
+	{
+		.ctl_name	= KERN_DOMAINNAME,
+		.procname	= "domainname",
+		.data		= system_utsname.domainname,
+		.maxlen		= sizeof(system_utsname.domainname),
+		.mode		= 0644,
+		.proc_handler	= &proc_doutsstring,
+		.strategy	= &sysctl_string,
+	},
+	{
+		.ctl_name	= KERN_PANIC,
+		.procname	= "panic",
+		.data		= &panic_timeout,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= KERN_CORE_USES_PID,
+		.procname	= "core_uses_pid",
+		.data		= &core_uses_pid,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= KERN_CORE_PATTERN,
+		.procname	= "core_pattern",
+		.data		= core_pattern,
+		.maxlen		= 64,
+		.mode		= 0644,
+		.proc_handler	= &proc_dostring,
+		.strategy	= &sysctl_string,
+	},
+	{
+		.ctl_name	= KERN_TAINTED,
+		.procname	= "tainted",
+		.data		= &tainted,
+		.maxlen		= sizeof(int),
+		.mode		= 0444,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= KERN_CAP_BSET,
+		.procname	= "cap-bound",
+		.data		= &cap_bset,
+		.maxlen		= sizeof(kernel_cap_t),
+		.mode		= 0600,
+		.proc_handler	= &proc_dointvec_bset,
+	},
+#ifdef CONFIG_BLK_DEV_INITRD
+	{
+		.ctl_name	= KERN_REALROOTDEV,
+		.procname	= "real-root-dev",
+		.data		= &real_root_dev,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#endif
+#ifdef __sparc__
+	{
+		.ctl_name	= KERN_SPARC_REBOOT,
+		.procname	= "reboot-cmd",
+		.data		= reboot_command,
+		.maxlen		= 256,
+		.mode		= 0644,
+		.proc_handler	= &proc_dostring,
+		.strategy	= &sysctl_string,
+	},
+	{
+		.ctl_name	= KERN_SPARC_STOP_A,
+		.procname	= "stop-a",
+		.data		= &stop_a_enabled,
+		.maxlen		= sizeof (int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= KERN_SPARC_SCONS_PWROFF,
+		.procname	= "scons-poweroff",
+		.data		= &scons_pwroff,
+		.maxlen		= sizeof (int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#endif
+#ifdef __hppa__
+	{
+		.ctl_name	= KERN_HPPA_PWRSW,
+		.procname	= "soft-power",
+		.data		= &pwrsw_enabled,
+		.maxlen		= sizeof (int),
+	 	.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= KERN_HPPA_UNALIGNED,
+		.procname	= "unaligned-trap",
+		.data		= &unaligned_enabled,
+		.maxlen		= sizeof (int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#endif
+	{
+		.ctl_name	= KERN_CTLALTDEL,
+		.procname	= "ctrl-alt-del",
+		.data		= &C_A_D,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= KERN_PRINTK,
+		.procname	= "printk",
+		.data		= &console_loglevel,
+		.maxlen		= 4*sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#ifdef CONFIG_KMOD
+	{
+		.ctl_name	= KERN_MODPROBE,
+		.procname	= "modprobe",
+		.data		= &modprobe_path,
+		.maxlen		= KMOD_PATH_LEN,
+		.mode		= 0644,
+		.proc_handler	= &proc_dostring,
+		.strategy	= &sysctl_string,
+	},
+#endif
+#ifdef CONFIG_HOTPLUG
+	{
+		.ctl_name	= KERN_HOTPLUG,
+		.procname	= "hotplug",
+		.data		= &hotplug_path,
+		.maxlen		= HOTPLUG_PATH_LEN,
+		.mode		= 0644,
+		.proc_handler	= &proc_dostring,
+		.strategy	= &sysctl_string,
+	},
+#endif
+#ifdef CONFIG_CHR_DEV_SG
+	{
+		.ctl_name	= KERN_SG_BIG_BUFF,
+		.procname	= "sg-big-buff",
+		.data		= &sg_big_buff,
+		.maxlen		= sizeof (int),
+		.mode		= 0444,
+		.proc_handler	= &proc_dointvec,
+	},
+#endif
+#ifdef CONFIG_BSD_PROCESS_ACCT
+	{
+		.ctl_name	= KERN_ACCT,
+		.procname	= "acct",
+		.data		= &acct_parm,
+		.maxlen		= 3*sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#endif
+#ifdef CONFIG_SYSVIPC
+	{
+		.ctl_name	= KERN_SHMMAX,
+		.procname	= "shmmax",
+		.data		= &shm_ctlmax,
+		.maxlen		= sizeof (size_t),
+		.mode		= 0644,
+		.proc_handler	= &proc_doulongvec_minmax,
+	},
+	{
+		.ctl_name	= KERN_SHMALL,
+		.procname	= "shmall",
+		.data		= &shm_ctlall,
+		.maxlen		= sizeof (size_t),
+		.mode		= 0644,
+		.proc_handler	= &proc_doulongvec_minmax,
+	},
+	{
+		.ctl_name	= KERN_SHMMNI,
+		.procname	= "shmmni",
+		.data		= &shm_ctlmni,
+		.maxlen		= sizeof (int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= KERN_MSGMAX,
+		.procname	= "msgmax",
+		.data		= &msg_ctlmax,
+		.maxlen		= sizeof (int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= KERN_MSGMNI,
+		.procname	= "msgmni",
+		.data		= &msg_ctlmni,
+		.maxlen		= sizeof (int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= KERN_MSGMNB,
+		.procname	=  "msgmnb",
+		.data		= &msg_ctlmnb,
+		.maxlen		= sizeof (int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= KERN_SEM,
+		.procname	= "sem",
+		.data		= &sem_ctls,
+		.maxlen		= 4*sizeof (int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#endif
+#ifdef CONFIG_MAGIC_SYSRQ
+	{
+		.ctl_name	= KERN_SYSRQ,
+		.procname	= "sysrq",
+		.data		= &sysrq_enabled,
+		.maxlen		= sizeof (int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#endif
+	{
+		.ctl_name	= KERN_CADPID,
+		.procname	= "cad_pid",
+		.data		= &cad_pid,
+		.maxlen		= sizeof (int),
+		.mode		= 0600,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= KERN_MAX_THREADS,
+		.procname	= "threads-max",
+		.data		= &max_threads,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= KERN_RANDOM,
+		.procname	= "random",
+		.mode		= 0555,
+		.child		= random_table,
+	},
+#ifdef CONFIG_UNIX98_PTYS
+	{
+		.ctl_name	= KERN_PTY,
+		.procname	= "pty",
+		.mode		= 0555,
+		.child		= pty_table,
+	},
+#endif
+	{
+		.ctl_name	= KERN_OVERFLOWUID,
+		.procname	= "overflowuid",
+		.data		= &overflowuid,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &minolduid,
+		.extra2		= &maxolduid,
+	},
+	{
+		.ctl_name	= KERN_OVERFLOWGID,
+		.procname	= "overflowgid",
+		.data		= &overflowgid,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &minolduid,
+		.extra2		= &maxolduid,
+	},
+#ifdef CONFIG_ARCH_S390
+#ifdef CONFIG_MATHEMU
+	{
+		.ctl_name	= KERN_IEEE_EMULATION_WARNINGS,
+		.procname	= "ieee_emulation_warnings",
+		.data		= &sysctl_ieee_emulation_warnings,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#endif
+#ifdef CONFIG_NO_IDLE_HZ
+	{
+		.ctl_name       = KERN_HZ_TIMER,
+		.procname       = "hz_timer",
+		.data           = &sysctl_hz_timer,
+		.maxlen         = sizeof(int),
+		.mode           = 0644,
+		.proc_handler   = &proc_dointvec,
+	},
+#endif
+	{
+		.ctl_name	= KERN_S390_USER_DEBUG_LOGGING,
+		.procname	= "userprocess_debug",
+		.data		= &sysctl_userprocess_debug,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#endif
+	{
+		.ctl_name	= KERN_PIDMAX,
+		.procname	= "pid_max",
+		.data		= &pid_max,
+		.maxlen		= sizeof (int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= sysctl_intvec,
+		.extra1		= &pid_max_min,
+		.extra2		= &pid_max_max,
+	},
+	{
+		.ctl_name	= KERN_PANIC_ON_OOPS,
+		.procname	= "panic_on_oops",
+		.data		= &panic_on_oops,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= KERN_PRINTK_RATELIMIT,
+		.procname	= "printk_ratelimit",
+		.data		= &printk_ratelimit_jiffies,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_jiffies,
+		.strategy	= &sysctl_jiffies,
+	},
+	{
+		.ctl_name	= KERN_PRINTK_RATELIMIT_BURST,
+		.procname	= "printk_ratelimit_burst",
+		.data		= &printk_ratelimit_burst,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= KERN_NGROUPS_MAX,
+		.procname	= "ngroups_max",
+		.data		= &ngroups_max,
+		.maxlen		= sizeof (int),
+		.mode		= 0444,
+		.proc_handler	= &proc_dointvec,
+	},
+#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
+	{
+		.ctl_name       = KERN_UNKNOWN_NMI_PANIC,
+		.procname       = "unknown_nmi_panic",
+		.data           = &unknown_nmi_panic,
+		.maxlen         = sizeof (int),
+		.mode           = 0644,
+		.proc_handler   = &proc_unknown_nmi_panic,
+	},
+#endif
+#if defined(CONFIG_X86)
+	{
+		.ctl_name	= KERN_BOOTLOADER_TYPE,
+		.procname	= "bootloader_type",
+		.data		= &bootloader_type,
+		.maxlen		= sizeof (int),
+		.mode		= 0444,
+		.proc_handler	= &proc_dointvec,
+	},
+#endif
+	{
+		.ctl_name	= KERN_RANDOMIZE,
+		.procname	= "randomize_va_space",
+		.data		= &randomize_va_space,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+
+	{ .ctl_name = 0 }
+};
+
+/* Constants for minimum and maximum testing in vm_table.
+   We use these as one-element integer vectors. */
+static int zero;
+static int one_hundred = 100;
+
+
+static ctl_table vm_table[] = {
+	{
+		.ctl_name	= VM_OVERCOMMIT_MEMORY,
+		.procname	= "overcommit_memory",
+		.data		= &sysctl_overcommit_memory,
+		.maxlen		= sizeof(sysctl_overcommit_memory),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= VM_OVERCOMMIT_RATIO,
+		.procname	= "overcommit_ratio",
+		.data		= &sysctl_overcommit_ratio,
+		.maxlen		= sizeof(sysctl_overcommit_ratio),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= VM_PAGE_CLUSTER,
+		.procname	= "page-cluster", 
+		.data		= &page_cluster,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= VM_DIRTY_BACKGROUND,
+		.procname	= "dirty_background_ratio",
+		.data		= &dirty_background_ratio,
+		.maxlen		= sizeof(dirty_background_ratio),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+		.extra2		= &one_hundred,
+	},
+	{
+		.ctl_name	= VM_DIRTY_RATIO,
+		.procname	= "dirty_ratio",
+		.data		= &vm_dirty_ratio,
+		.maxlen		= sizeof(vm_dirty_ratio),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+		.extra2		= &one_hundred,
+	},
+	{
+		.ctl_name	= VM_DIRTY_WB_CS,
+		.procname	= "dirty_writeback_centisecs",
+		.data		= &dirty_writeback_centisecs,
+		.maxlen		= sizeof(dirty_writeback_centisecs),
+		.mode		= 0644,
+		.proc_handler	= &dirty_writeback_centisecs_handler,
+	},
+	{
+		.ctl_name	= VM_DIRTY_EXPIRE_CS,
+		.procname	= "dirty_expire_centisecs",
+		.data		= &dirty_expire_centisecs,
+		.maxlen		= sizeof(dirty_expire_centisecs),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= VM_NR_PDFLUSH_THREADS,
+		.procname	= "nr_pdflush_threads",
+		.data		= &nr_pdflush_threads,
+		.maxlen		= sizeof nr_pdflush_threads,
+		.mode		= 0444 /* read-only*/,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= VM_SWAPPINESS,
+		.procname	= "swappiness",
+		.data		= &vm_swappiness,
+		.maxlen		= sizeof(vm_swappiness),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+		.extra2		= &one_hundred,
+	},
+#ifdef CONFIG_HUGETLB_PAGE
+	 {
+		.ctl_name	= VM_HUGETLB_PAGES,
+		.procname	= "nr_hugepages",
+		.data		= &max_huge_pages,
+		.maxlen		= sizeof(unsigned long),
+		.mode		= 0644,
+		.proc_handler	= &hugetlb_sysctl_handler,
+		.extra1		= (void *)&hugetlb_zero,
+		.extra2		= (void *)&hugetlb_infinity,
+	 },
+	 {
+		.ctl_name	= VM_HUGETLB_GROUP,
+		.procname	= "hugetlb_shm_group",
+		.data		= &sysctl_hugetlb_shm_group,
+		.maxlen		= sizeof(gid_t),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	 },
+#endif
+	{
+		.ctl_name	= VM_LOWMEM_RESERVE_RATIO,
+		.procname	= "lowmem_reserve_ratio",
+		.data		= &sysctl_lowmem_reserve_ratio,
+		.maxlen		= sizeof(sysctl_lowmem_reserve_ratio),
+		.mode		= 0644,
+		.proc_handler	= &lowmem_reserve_ratio_sysctl_handler,
+		.strategy	= &sysctl_intvec,
+	},
+	{
+		.ctl_name	= VM_MIN_FREE_KBYTES,
+		.procname	= "min_free_kbytes",
+		.data		= &min_free_kbytes,
+		.maxlen		= sizeof(min_free_kbytes),
+		.mode		= 0644,
+		.proc_handler	= &min_free_kbytes_sysctl_handler,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+	},
+#ifdef CONFIG_MMU
+	{
+		.ctl_name	= VM_MAX_MAP_COUNT,
+		.procname	= "max_map_count",
+		.data		= &sysctl_max_map_count,
+		.maxlen		= sizeof(sysctl_max_map_count),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
+#endif
+	{
+		.ctl_name	= VM_LAPTOP_MODE,
+		.procname	= "laptop_mode",
+		.data		= &laptop_mode,
+		.maxlen		= sizeof(laptop_mode),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+	},
+	{
+		.ctl_name	= VM_BLOCK_DUMP,
+		.procname	= "block_dump",
+		.data		= &block_dump,
+		.maxlen		= sizeof(block_dump),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+	},
+	{
+		.ctl_name	= VM_VFS_CACHE_PRESSURE,
+		.procname	= "vfs_cache_pressure",
+		.data		= &sysctl_vfs_cache_pressure,
+		.maxlen		= sizeof(sysctl_vfs_cache_pressure),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+	},
+#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT
+	{
+		.ctl_name	= VM_LEGACY_VA_LAYOUT,
+		.procname	= "legacy_va_layout",
+		.data		= &sysctl_legacy_va_layout,
+		.maxlen		= sizeof(sysctl_legacy_va_layout),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+	},
+#endif
+#ifdef CONFIG_SWAP
+	{
+		.ctl_name	= VM_SWAP_TOKEN_TIMEOUT,
+		.procname	= "swap_token_timeout",
+		.data		= &swap_token_default_timeout,
+		.maxlen		= sizeof(swap_token_default_timeout),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_jiffies,
+		.strategy	= &sysctl_jiffies,
+	},
+#endif
+	{ .ctl_name = 0 }
+};
+
+static ctl_table proc_table[] = {
+	{ .ctl_name = 0 }
+};
+
+static ctl_table fs_table[] = {
+	{
+		.ctl_name	= FS_NRINODE,
+		.procname	= "inode-nr",
+		.data		= &inodes_stat,
+		.maxlen		= 2*sizeof(int),
+		.mode		= 0444,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= FS_STATINODE,
+		.procname	= "inode-state",
+		.data		= &inodes_stat,
+		.maxlen		= 7*sizeof(int),
+		.mode		= 0444,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= FS_NRFILE,
+		.procname	= "file-nr",
+		.data		= &files_stat,
+		.maxlen		= 3*sizeof(int),
+		.mode		= 0444,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= FS_MAXFILE,
+		.procname	= "file-max",
+		.data		= &files_stat.max_files,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= FS_DENTRY,
+		.procname	= "dentry-state",
+		.data		= &dentry_stat,
+		.maxlen		= 6*sizeof(int),
+		.mode		= 0444,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= FS_OVERFLOWUID,
+		.procname	= "overflowuid",
+		.data		= &fs_overflowuid,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &minolduid,
+		.extra2		= &maxolduid,
+	},
+	{
+		.ctl_name	= FS_OVERFLOWGID,
+		.procname	= "overflowgid",
+		.data		= &fs_overflowgid,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &minolduid,
+		.extra2		= &maxolduid,
+	},
+	{
+		.ctl_name	= FS_LEASES,
+		.procname	= "leases-enable",
+		.data		= &leases_enable,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#ifdef CONFIG_DNOTIFY
+	{
+		.ctl_name	= FS_DIR_NOTIFY,
+		.procname	= "dir-notify-enable",
+		.data		= &dir_notify_enable,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#endif
+#ifdef CONFIG_MMU
+	{
+		.ctl_name	= FS_LEASE_TIME,
+		.procname	= "lease-break-time",
+		.data		= &lease_break_time,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= FS_AIO_NR,
+		.procname	= "aio-nr",
+		.data		= &aio_nr,
+		.maxlen		= sizeof(aio_nr),
+		.mode		= 0444,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= FS_AIO_MAX_NR,
+		.procname	= "aio-max-nr",
+		.data		= &aio_max_nr,
+		.maxlen		= sizeof(aio_max_nr),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#endif
+	{ .ctl_name = 0 }
+};
+
+static ctl_table debug_table[] = {
+	{ .ctl_name = 0 }
+};
+
+static ctl_table dev_table[] = {
+	{ .ctl_name = 0 }
+};  
+
+extern void init_irq_proc (void);
+
+void __init sysctl_init(void)
+{
+#ifdef CONFIG_PROC_FS
+	register_proc_table(root_table, proc_sys_root);
+	init_irq_proc();
+#endif
+}
+
+int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp,
+	       void __user *newval, size_t newlen)
+{
+	struct list_head *tmp;
+
+	if (nlen <= 0 || nlen >= CTL_MAXNAME)
+		return -ENOTDIR;
+	if (oldval) {
+		int old_len;
+		if (!oldlenp || get_user(old_len, oldlenp))
+			return -EFAULT;
+	}
+	tmp = &root_table_header.ctl_entry;
+	do {
+		struct ctl_table_header *head =
+			list_entry(tmp, struct ctl_table_header, ctl_entry);
+		void *context = NULL;
+		int error = parse_table(name, nlen, oldval, oldlenp, 
+					newval, newlen, head->ctl_table,
+					&context);
+		if (context)
+			kfree(context);
+		if (error != -ENOTDIR)
+			return error;
+		tmp = tmp->next;
+	} while (tmp != &root_table_header.ctl_entry);
+	return -ENOTDIR;
+}
+
+asmlinkage long sys_sysctl(struct __sysctl_args __user *args)
+{
+	struct __sysctl_args tmp;
+	int error;
+
+	if (copy_from_user(&tmp, args, sizeof(tmp)))
+		return -EFAULT;
+
+	lock_kernel();
+	error = do_sysctl(tmp.name, tmp.nlen, tmp.oldval, tmp.oldlenp,
+			  tmp.newval, tmp.newlen);
+	unlock_kernel();
+	return error;
+}
+
+/*
+ * ctl_perm does NOT grant the superuser all rights automatically, because
+ * some sysctl variables are readonly even to root.
+ */
+
+static int test_perm(int mode, int op)
+{
+	if (!current->euid)
+		mode >>= 6;
+	else if (in_egroup_p(0))
+		mode >>= 3;
+	if ((mode & op & 0007) == op)
+		return 0;
+	return -EACCES;
+}
+
+static inline int ctl_perm(ctl_table *table, int op)
+{
+	int error;
+	error = security_sysctl(table, op);
+	if (error)
+		return error;
+	return test_perm(table->mode, op);
+}
+
+static int parse_table(int __user *name, int nlen,
+		       void __user *oldval, size_t __user *oldlenp,
+		       void __user *newval, size_t newlen,
+		       ctl_table *table, void **context)
+{
+	int n;
+repeat:
+	if (!nlen)
+		return -ENOTDIR;
+	if (get_user(n, name))
+		return -EFAULT;
+	for ( ; table->ctl_name; table++) {
+		if (n == table->ctl_name || table->ctl_name == CTL_ANY) {
+			int error;
+			if (table->child) {
+				if (ctl_perm(table, 001))
+					return -EPERM;
+				if (table->strategy) {
+					error = table->strategy(
+						table, name, nlen,
+						oldval, oldlenp,
+						newval, newlen, context);
+					if (error)
+						return error;
+				}
+				name++;
+				nlen--;
+				table = table->child;
+				goto repeat;
+			}
+			error = do_sysctl_strategy(table, name, nlen,
+						   oldval, oldlenp,
+						   newval, newlen, context);
+			return error;
+		}
+	}
+	return -ENOTDIR;
+}
+
+/* Perform the actual read/write of a sysctl table entry. */
+int do_sysctl_strategy (ctl_table *table, 
+			int __user *name, int nlen,
+			void __user *oldval, size_t __user *oldlenp,
+			void __user *newval, size_t newlen, void **context)
+{
+	int op = 0, rc;
+	size_t len;
+
+	if (oldval)
+		op |= 004;
+	if (newval) 
+		op |= 002;
+	if (ctl_perm(table, op))
+		return -EPERM;
+
+	if (table->strategy) {
+		rc = table->strategy(table, name, nlen, oldval, oldlenp,
+				     newval, newlen, context);
+		if (rc < 0)
+			return rc;
+		if (rc > 0)
+			return 0;
+	}
+
+	/* If there is no strategy routine, or if the strategy returns
+	 * zero, proceed with automatic r/w */
+	if (table->data && table->maxlen) {
+		if (oldval && oldlenp) {
+			if (get_user(len, oldlenp))
+				return -EFAULT;
+			if (len) {
+				if (len > table->maxlen)
+					len = table->maxlen;
+				if(copy_to_user(oldval, table->data, len))
+					return -EFAULT;
+				if(put_user(len, oldlenp))
+					return -EFAULT;
+			}
+		}
+		if (newval && newlen) {
+			len = newlen;
+			if (len > table->maxlen)
+				len = table->maxlen;
+			if(copy_from_user(table->data, newval, len))
+				return -EFAULT;
+		}
+	}
+	return 0;
+}
+
+/**
+ * register_sysctl_table - register a sysctl hierarchy
+ * @table: the top-level table structure
+ * @insert_at_head: whether the entry should be inserted in front or at the end
+ *
+ * Register a sysctl table hierarchy. @table should be a filled in ctl_table
+ * array. An entry with a ctl_name of 0 terminates the table. 
+ *
+ * The members of the &ctl_table structure are used as follows:
+ *
+ * ctl_name - This is the numeric sysctl value used by sysctl(2). The number
+ *            must be unique within that level of sysctl
+ *
+ * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not
+ *            enter a sysctl file
+ *
+ * data - a pointer to data for use by proc_handler
+ *
+ * maxlen - the maximum size in bytes of the data
+ *
+ * mode - the file permissions for the /proc/sys file, and for sysctl(2)
+ *
+ * child - a pointer to the child sysctl table if this entry is a directory, or
+ *         %NULL.
+ *
+ * proc_handler - the text handler routine (described below)
+ *
+ * strategy - the strategy routine (described below)
+ *
+ * de - for internal use by the sysctl routines
+ *
+ * extra1, extra2 - extra pointers usable by the proc handler routines
+ *
+ * Leaf nodes in the sysctl tree will be represented by a single file
+ * under /proc; non-leaf nodes will be represented by directories.
+ *
+ * sysctl(2) can automatically manage read and write requests through
+ * the sysctl table.  The data and maxlen fields of the ctl_table
+ * struct enable minimal validation of the values being written to be
+ * performed, and the mode field allows minimal authentication.
+ *
+ * More sophisticated management can be enabled by the provision of a
+ * strategy routine with the table entry.  This will be called before
+ * any automatic read or write of the data is performed.
+ *
+ * The strategy routine may return
+ *
+ * < 0 - Error occurred (error is passed to user process)
+ *
+ * 0   - OK - proceed with automatic read or write.
+ *
+ * > 0 - OK - read or write has been done by the strategy routine, so
+ *       return immediately.
+ *
+ * There must be a proc_handler routine for any terminal nodes
+ * mirrored under /proc/sys (non-terminals are handled by a built-in
+ * directory handler).  Several default handlers are available to
+ * cover common cases -
+ *
+ * proc_dostring(), proc_dointvec(), proc_dointvec_jiffies(),
+ * proc_dointvec_userhz_jiffies(), proc_dointvec_minmax(), 
+ * proc_doulongvec_ms_jiffies_minmax(), proc_doulongvec_minmax()
+ *
+ * It is the handler's job to read the input buffer from user memory
+ * and process it. The handler should return 0 on success.
+ *
+ * This routine returns %NULL on a failure to register, and a pointer
+ * to the table header on success.
+ */
+struct ctl_table_header *register_sysctl_table(ctl_table * table, 
+					       int insert_at_head)
+{
+	struct ctl_table_header *tmp;
+	tmp = kmalloc(sizeof(struct ctl_table_header), GFP_KERNEL);
+	if (!tmp)
+		return NULL;
+	tmp->ctl_table = table;
+	INIT_LIST_HEAD(&tmp->ctl_entry);
+	if (insert_at_head)
+		list_add(&tmp->ctl_entry, &root_table_header.ctl_entry);
+	else
+		list_add_tail(&tmp->ctl_entry, &root_table_header.ctl_entry);
+#ifdef CONFIG_PROC_FS
+	register_proc_table(table, proc_sys_root);
+#endif
+	return tmp;
+}
+
+/**
+ * unregister_sysctl_table - unregister a sysctl table hierarchy
+ * @header: the header returned from register_sysctl_table
+ *
+ * Unregisters the sysctl table and all children. proc entries may not
+ * actually be removed until they are no longer used by anyone.
+ */
+void unregister_sysctl_table(struct ctl_table_header * header)
+{
+	list_del(&header->ctl_entry);
+#ifdef CONFIG_PROC_FS
+	unregister_proc_table(header->ctl_table, proc_sys_root);
+#endif
+	kfree(header);
+}
+
+/*
+ * /proc/sys support
+ */
+
+#ifdef CONFIG_PROC_FS
+
+/* Scan the sysctl entries in table and add them all into /proc */
+static void register_proc_table(ctl_table * table, struct proc_dir_entry *root)
+{
+	struct proc_dir_entry *de;
+	int len;
+	mode_t mode;
+	
+	for (; table->ctl_name; table++) {
+		/* Can't do anything without a proc name. */
+		if (!table->procname)
+			continue;
+		/* Maybe we can't do anything with it... */
+		if (!table->proc_handler && !table->child) {
+			printk(KERN_WARNING "SYSCTL: Can't register %s\n",
+				table->procname);
+			continue;
+		}
+
+		len = strlen(table->procname);
+		mode = table->mode;
+
+		de = NULL;
+		if (table->proc_handler)
+			mode |= S_IFREG;
+		else {
+			mode |= S_IFDIR;
+			for (de = root->subdir; de; de = de->next) {
+				if (proc_match(len, table->procname, de))
+					break;
+			}
+			/* If the subdir exists already, de is non-NULL */
+		}
+
+		if (!de) {
+			de = create_proc_entry(table->procname, mode, root);
+			if (!de)
+				continue;
+			de->data = (void *) table;
+			if (table->proc_handler)
+				de->proc_fops = &proc_sys_file_operations;
+		}
+		table->de = de;
+		if (de->mode & S_IFDIR)
+			register_proc_table(table->child, de);
+	}
+}
+
+/*
+ * Unregister a /proc sysctl table and any subdirectories.
+ */
+static void unregister_proc_table(ctl_table * table, struct proc_dir_entry *root)
+{
+	struct proc_dir_entry *de;
+	for (; table->ctl_name; table++) {
+		if (!(de = table->de))
+			continue;
+		if (de->mode & S_IFDIR) {
+			if (!table->child) {
+				printk (KERN_ALERT "Help - malformed sysctl tree on free\n");
+				continue;
+			}
+			unregister_proc_table(table->child, de);
+
+			/* Don't unregister directories which still have entries.. */
+			if (de->subdir)
+				continue;
+		}
+
+		/* Don't unregister proc entries that are still being used.. */
+		if (atomic_read(&de->count))
+			continue;
+
+		table->de = NULL;
+		remove_proc_entry(table->procname, root);
+	}
+}
+
+static ssize_t do_rw_proc(int write, struct file * file, char __user * buf,
+			  size_t count, loff_t *ppos)
+{
+	int op;
+	struct proc_dir_entry *de;
+	struct ctl_table *table;
+	size_t res;
+	ssize_t error;
+	
+	de = PDE(file->f_dentry->d_inode);
+	if (!de || !de->data)
+		return -ENOTDIR;
+	table = (struct ctl_table *) de->data;
+	if (!table || !table->proc_handler)
+		return -ENOTDIR;
+	op = (write ? 002 : 004);
+	if (ctl_perm(table, op))
+		return -EPERM;
+	
+	res = count;
+
+	error = (*table->proc_handler) (table, write, file, buf, &res, ppos);
+	if (error)
+		return error;
+	return res;
+}
+
+static int proc_opensys(struct inode *inode, struct file *file)
+{
+	if (file->f_mode & FMODE_WRITE) {
+		/*
+		 * sysctl entries that are not writable,
+		 * are _NOT_ writable, capabilities or not.
+		 */
+		if (!(inode->i_mode & S_IWUSR))
+			return -EPERM;
+	}
+
+	return 0;
+}
+
+static ssize_t proc_readsys(struct file * file, char __user * buf,
+			    size_t count, loff_t *ppos)
+{
+	return do_rw_proc(0, file, buf, count, ppos);
+}
+
+static ssize_t proc_writesys(struct file * file, const char __user * buf,
+			     size_t count, loff_t *ppos)
+{
+	return do_rw_proc(1, file, (char __user *) buf, count, ppos);
+}
+
+/**
+ * proc_dostring - read a string sysctl
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @filp: the file structure
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes a string from/to the user buffer. If the kernel
+ * buffer provided is not large enough to hold the string, the
+ * string is truncated. The copied string is %NULL-terminated.
+ * If the string is being read by the user process, it is copied
+ * and a newline '\n' is added. It is truncated if the buffer is
+ * not large enough.
+ *
+ * Returns 0 on success.
+ */
+int proc_dostring(ctl_table *table, int write, struct file *filp,
+		  void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	size_t len;
+	char __user *p;
+	char c;
+	
+	if (!table->data || !table->maxlen || !*lenp ||
+	    (*ppos && !write)) {
+		*lenp = 0;
+		return 0;
+	}
+	
+	if (write) {
+		len = 0;
+		p = buffer;
+		while (len < *lenp) {
+			if (get_user(c, p++))
+				return -EFAULT;
+			if (c == 0 || c == '\n')
+				break;
+			len++;
+		}
+		if (len >= table->maxlen)
+			len = table->maxlen-1;
+		if(copy_from_user(table->data, buffer, len))
+			return -EFAULT;
+		((char *) table->data)[len] = 0;
+		*ppos += *lenp;
+	} else {
+		len = strlen(table->data);
+		if (len > table->maxlen)
+			len = table->maxlen;
+		if (len > *lenp)
+			len = *lenp;
+		if (len)
+			if(copy_to_user(buffer, table->data, len))
+				return -EFAULT;
+		if (len < *lenp) {
+			if(put_user('\n', ((char __user *) buffer) + len))
+				return -EFAULT;
+			len++;
+		}
+		*lenp = len;
+		*ppos += len;
+	}
+	return 0;
+}
+
+/*
+ *	Special case of dostring for the UTS structure. This has locks
+ *	to observe. Should this be in kernel/sys.c ????
+ */
+ 
+static int proc_doutsstring(ctl_table *table, int write, struct file *filp,
+		  void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	int r;
+
+	if (!write) {
+		down_read(&uts_sem);
+		r=proc_dostring(table,0,filp,buffer,lenp, ppos);
+		up_read(&uts_sem);
+	} else {
+		down_write(&uts_sem);
+		r=proc_dostring(table,1,filp,buffer,lenp, ppos);
+		up_write(&uts_sem);
+	}
+	return r;
+}
+
+static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp,
+				 int *valp,
+				 int write, void *data)
+{
+	if (write) {
+		*valp = *negp ? -*lvalp : *lvalp;
+	} else {
+		int val = *valp;
+		if (val < 0) {
+			*negp = -1;
+			*lvalp = (unsigned long)-val;
+		} else {
+			*negp = 0;
+			*lvalp = (unsigned long)val;
+		}
+	}
+	return 0;
+}
+
+static int do_proc_dointvec(ctl_table *table, int write, struct file *filp,
+		  void __user *buffer, size_t *lenp, loff_t *ppos,
+		  int (*conv)(int *negp, unsigned long *lvalp, int *valp,
+			      int write, void *data),
+		  void *data)
+{
+#define TMPBUFLEN 21
+	int *i, vleft, first=1, neg, val;
+	unsigned long lval;
+	size_t left, len;
+	
+	char buf[TMPBUFLEN], *p;
+	char __user *s = buffer;
+	
+	if (!table->data || !table->maxlen || !*lenp ||
+	    (*ppos && !write)) {
+		*lenp = 0;
+		return 0;
+	}
+	
+	i = (int *) table->data;
+	vleft = table->maxlen / sizeof(*i);
+	left = *lenp;
+
+	if (!conv)
+		conv = do_proc_dointvec_conv;
+
+	for (; left && vleft--; i++, first=0) {
+		if (write) {
+			while (left) {
+				char c;
+				if (get_user(c, s))
+					return -EFAULT;
+				if (!isspace(c))
+					break;
+				left--;
+				s++;
+			}
+			if (!left)
+				break;
+			neg = 0;
+			len = left;
+			if (len > sizeof(buf) - 1)
+				len = sizeof(buf) - 1;
+			if (copy_from_user(buf, s, len))
+				return -EFAULT;
+			buf[len] = 0;
+			p = buf;
+			if (*p == '-' && left > 1) {
+				neg = 1;
+				left--, p++;
+			}
+			if (*p < '0' || *p > '9')
+				break;
+
+			lval = simple_strtoul(p, &p, 0);
+
+			len = p-buf;
+			if ((len < left) && *p && !isspace(*p))
+				break;
+			if (neg)
+				val = -val;
+			s += len;
+			left -= len;
+
+			if (conv(&neg, &lval, i, 1, data))
+				break;
+		} else {
+			p = buf;
+			if (!first)
+				*p++ = '\t';
+	
+			if (conv(&neg, &lval, i, 0, data))
+				break;
+
+			sprintf(p, "%s%lu", neg ? "-" : "", lval);
+			len = strlen(buf);
+			if (len > left)
+				len = left;
+			if(copy_to_user(s, buf, len))
+				return -EFAULT;
+			left -= len;
+			s += len;
+		}
+	}
+
+	if (!write && !first && left) {
+		if(put_user('\n', s))
+			return -EFAULT;
+		left--, s++;
+	}
+	if (write) {
+		while (left) {
+			char c;
+			if (get_user(c, s++))
+				return -EFAULT;
+			if (!isspace(c))
+				break;
+			left--;
+		}
+	}
+	if (write && first)
+		return -EINVAL;
+	*lenp -= left;
+	*ppos += *lenp;
+	return 0;
+#undef TMPBUFLEN
+}
+
+/**
+ * proc_dointvec - read a vector of integers
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @filp: the file structure
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
+ * values from/to the user buffer, treated as an ASCII string. 
+ *
+ * Returns 0 on success.
+ */
+int proc_dointvec(ctl_table *table, int write, struct file *filp,
+		     void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+    return do_proc_dointvec(table,write,filp,buffer,lenp,ppos,
+		    	    NULL,NULL);
+}
+
+#define OP_SET	0
+#define OP_AND	1
+#define OP_OR	2
+#define OP_MAX	3
+#define OP_MIN	4
+
+static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp,
+				      int *valp,
+				      int write, void *data)
+{
+	int op = *(int *)data;
+	if (write) {
+		int val = *negp ? -*lvalp : *lvalp;
+		switch(op) {
+		case OP_SET:	*valp = val; break;
+		case OP_AND:	*valp &= val; break;
+		case OP_OR:	*valp |= val; break;
+		case OP_MAX:	if(*valp < val)
+					*valp = val;
+				break;
+		case OP_MIN:	if(*valp > val)
+				*valp = val;
+				break;
+		}
+	} else {
+		int val = *valp;
+		if (val < 0) {
+			*negp = -1;
+			*lvalp = (unsigned long)-val;
+		} else {
+			*negp = 0;
+			*lvalp = (unsigned long)val;
+		}
+	}
+	return 0;
+}
+
+/*
+ *	init may raise the set.
+ */
+ 
+int proc_dointvec_bset(ctl_table *table, int write, struct file *filp,
+			void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	int op;
+
+	if (!capable(CAP_SYS_MODULE)) {
+		return -EPERM;
+	}
+
+	op = (current->pid == 1) ? OP_SET : OP_AND;
+	return do_proc_dointvec(table,write,filp,buffer,lenp,ppos,
+				do_proc_dointvec_bset_conv,&op);
+}
+
+struct do_proc_dointvec_minmax_conv_param {
+	int *min;
+	int *max;
+};
+
+static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp, 
+					int *valp, 
+					int write, void *data)
+{
+	struct do_proc_dointvec_minmax_conv_param *param = data;
+	if (write) {
+		int val = *negp ? -*lvalp : *lvalp;
+		if ((param->min && *param->min > val) ||
+		    (param->max && *param->max < val))
+			return -EINVAL;
+		*valp = val;
+	} else {
+		int val = *valp;
+		if (val < 0) {
+			*negp = -1;
+			*lvalp = (unsigned long)-val;
+		} else {
+			*negp = 0;
+			*lvalp = (unsigned long)val;
+		}
+	}
+	return 0;
+}
+
+/**
+ * proc_dointvec_minmax - read a vector of integers with min/max values
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @filp: the file structure
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
+ * values from/to the user buffer, treated as an ASCII string.
+ *
+ * This routine will ensure the values are within the range specified by
+ * table->extra1 (min) and table->extra2 (max).
+ *
+ * Returns 0 on success.
+ */
+int proc_dointvec_minmax(ctl_table *table, int write, struct file *filp,
+		  void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct do_proc_dointvec_minmax_conv_param param = {
+		.min = (int *) table->extra1,
+		.max = (int *) table->extra2,
+	};
+	return do_proc_dointvec(table, write, filp, buffer, lenp, ppos,
+				do_proc_dointvec_minmax_conv, &param);
+}
+
+static int do_proc_doulongvec_minmax(ctl_table *table, int write,
+				     struct file *filp,
+				     void __user *buffer,
+				     size_t *lenp, loff_t *ppos,
+				     unsigned long convmul,
+				     unsigned long convdiv)
+{
+#define TMPBUFLEN 21
+	unsigned long *i, *min, *max, val;
+	int vleft, first=1, neg;
+	size_t len, left;
+	char buf[TMPBUFLEN], *p;
+	char __user *s = buffer;
+	
+	if (!table->data || !table->maxlen || !*lenp ||
+	    (*ppos && !write)) {
+		*lenp = 0;
+		return 0;
+	}
+	
+	i = (unsigned long *) table->data;
+	min = (unsigned long *) table->extra1;
+	max = (unsigned long *) table->extra2;
+	vleft = table->maxlen / sizeof(unsigned long);
+	left = *lenp;
+	
+	for (; left && vleft--; i++, min++, max++, first=0) {
+		if (write) {
+			while (left) {
+				char c;
+				if (get_user(c, s))
+					return -EFAULT;
+				if (!isspace(c))
+					break;
+				left--;
+				s++;
+			}
+			if (!left)
+				break;
+			neg = 0;
+			len = left;
+			if (len > TMPBUFLEN-1)
+				len = TMPBUFLEN-1;
+			if (copy_from_user(buf, s, len))
+				return -EFAULT;
+			buf[len] = 0;
+			p = buf;
+			if (*p == '-' && left > 1) {
+				neg = 1;
+				left--, p++;
+			}
+			if (*p < '0' || *p > '9')
+				break;
+			val = simple_strtoul(p, &p, 0) * convmul / convdiv ;
+			len = p-buf;
+			if ((len < left) && *p && !isspace(*p))
+				break;
+			if (neg)
+				val = -val;
+			s += len;
+			left -= len;
+
+			if(neg)
+				continue;
+			if ((min && val < *min) || (max && val > *max))
+				continue;
+			*i = val;
+		} else {
+			p = buf;
+			if (!first)
+				*p++ = '\t';
+			sprintf(p, "%lu", convdiv * (*i) / convmul);
+			len = strlen(buf);
+			if (len > left)
+				len = left;
+			if(copy_to_user(s, buf, len))
+				return -EFAULT;
+			left -= len;
+			s += len;
+		}
+	}
+
+	if (!write && !first && left) {
+		if(put_user('\n', s))
+			return -EFAULT;
+		left--, s++;
+	}
+	if (write) {
+		while (left) {
+			char c;
+			if (get_user(c, s++))
+				return -EFAULT;
+			if (!isspace(c))
+				break;
+			left--;
+		}
+	}
+	if (write && first)
+		return -EINVAL;
+	*lenp -= left;
+	*ppos += *lenp;
+	return 0;
+#undef TMPBUFLEN
+}
+
+/**
+ * proc_doulongvec_minmax - read a vector of long integers with min/max values
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @filp: the file structure
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned long) unsigned long
+ * values from/to the user buffer, treated as an ASCII string.
+ *
+ * This routine will ensure the values are within the range specified by
+ * table->extra1 (min) and table->extra2 (max).
+ *
+ * Returns 0 on success.
+ */
+int proc_doulongvec_minmax(ctl_table *table, int write, struct file *filp,
+			   void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+    return do_proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos, 1l, 1l);
+}
+
+/**
+ * proc_doulongvec_ms_jiffies_minmax - read a vector of millisecond values with min/max values
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @filp: the file structure
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned long) unsigned long
+ * values from/to the user buffer, treated as an ASCII string. The values
+ * are treated as milliseconds, and converted to jiffies when they are stored.
+ *
+ * This routine will ensure the values are within the range specified by
+ * table->extra1 (min) and table->extra2 (max).
+ *
+ * Returns 0 on success.
+ */
+int proc_doulongvec_ms_jiffies_minmax(ctl_table *table, int write,
+				      struct file *filp,
+				      void __user *buffer,
+				      size_t *lenp, loff_t *ppos)
+{
+    return do_proc_doulongvec_minmax(table, write, filp, buffer,
+				     lenp, ppos, HZ, 1000l);
+}
+
+
+static int do_proc_dointvec_jiffies_conv(int *negp, unsigned long *lvalp,
+					 int *valp,
+					 int write, void *data)
+{
+	if (write) {
+		*valp = *negp ? -(*lvalp*HZ) : (*lvalp*HZ);
+	} else {
+		int val = *valp;
+		unsigned long lval;
+		if (val < 0) {
+			*negp = -1;
+			lval = (unsigned long)-val;
+		} else {
+			*negp = 0;
+			lval = (unsigned long)val;
+		}
+		*lvalp = lval / HZ;
+	}
+	return 0;
+}
+
+static int do_proc_dointvec_userhz_jiffies_conv(int *negp, unsigned long *lvalp,
+						int *valp,
+						int write, void *data)
+{
+	if (write) {
+		*valp = clock_t_to_jiffies(*negp ? -*lvalp : *lvalp);
+	} else {
+		int val = *valp;
+		unsigned long lval;
+		if (val < 0) {
+			*negp = -1;
+			lval = (unsigned long)-val;
+		} else {
+			*negp = 0;
+			lval = (unsigned long)val;
+		}
+		*lvalp = jiffies_to_clock_t(lval);
+	}
+	return 0;
+}
+
+static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp,
+					    int *valp,
+					    int write, void *data)
+{
+	if (write) {
+		*valp = msecs_to_jiffies(*negp ? -*lvalp : *lvalp);
+	} else {
+		int val = *valp;
+		unsigned long lval;
+		if (val < 0) {
+			*negp = -1;
+			lval = (unsigned long)-val;
+		} else {
+			*negp = 0;
+			lval = (unsigned long)val;
+		}
+		*lvalp = jiffies_to_msecs(lval);
+	}
+	return 0;
+}
+
+/**
+ * proc_dointvec_jiffies - read a vector of integers as seconds
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @filp: the file structure
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
+ * values from/to the user buffer, treated as an ASCII string. 
+ * The values read are assumed to be in seconds, and are converted into
+ * jiffies.
+ *
+ * Returns 0 on success.
+ */
+int proc_dointvec_jiffies(ctl_table *table, int write, struct file *filp,
+			  void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+    return do_proc_dointvec(table,write,filp,buffer,lenp,ppos,
+		    	    do_proc_dointvec_jiffies_conv,NULL);
+}
+
+/**
+ * proc_dointvec_userhz_jiffies - read a vector of integers as 1/USER_HZ seconds
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @filp: the file structure
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
+ * values from/to the user buffer, treated as an ASCII string. 
+ * The values read are assumed to be in 1/USER_HZ seconds, and 
+ * are converted into jiffies.
+ *
+ * Returns 0 on success.
+ */
+int proc_dointvec_userhz_jiffies(ctl_table *table, int write, struct file *filp,
+				 void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+    return do_proc_dointvec(table,write,filp,buffer,lenp,ppos,
+		    	    do_proc_dointvec_userhz_jiffies_conv,NULL);
+}
+
+/**
+ * proc_dointvec_ms_jiffies - read a vector of integers as 1 milliseconds
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @filp: the file structure
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
+ * values from/to the user buffer, treated as an ASCII string. 
+ * The values read are assumed to be in 1/1000 seconds, and 
+ * are converted into jiffies.
+ *
+ * Returns 0 on success.
+ */
+int proc_dointvec_ms_jiffies(ctl_table *table, int write, struct file *filp,
+			     void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	return do_proc_dointvec(table, write, filp, buffer, lenp, ppos,
+				do_proc_dointvec_ms_jiffies_conv, NULL);
+}
+
+#else /* CONFIG_PROC_FS */
+
+int proc_dostring(ctl_table *table, int write, struct file *filp,
+		  void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	return -ENOSYS;
+}
+
+static int proc_doutsstring(ctl_table *table, int write, struct file *filp,
+			    void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	return -ENOSYS;
+}
+
+int proc_dointvec(ctl_table *table, int write, struct file *filp,
+		  void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	return -ENOSYS;
+}
+
+int proc_dointvec_bset(ctl_table *table, int write, struct file *filp,
+			void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	return -ENOSYS;
+}
+
+int proc_dointvec_minmax(ctl_table *table, int write, struct file *filp,
+		    void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	return -ENOSYS;
+}
+
+int proc_dointvec_jiffies(ctl_table *table, int write, struct file *filp,
+		    void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	return -ENOSYS;
+}
+
+int proc_dointvec_userhz_jiffies(ctl_table *table, int write, struct file *filp,
+		    void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	return -ENOSYS;
+}
+
+int proc_dointvec_ms_jiffies(ctl_table *table, int write, struct file *filp,
+			     void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	return -ENOSYS;
+}
+
+int proc_doulongvec_minmax(ctl_table *table, int write, struct file *filp,
+		    void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	return -ENOSYS;
+}
+
+int proc_doulongvec_ms_jiffies_minmax(ctl_table *table, int write,
+				      struct file *filp,
+				      void __user *buffer,
+				      size_t *lenp, loff_t *ppos)
+{
+    return -ENOSYS;
+}
+
+
+#endif /* CONFIG_PROC_FS */
+
+
+/*
+ * General sysctl support routines 
+ */
+
+/* The generic string strategy routine: */
+int sysctl_string(ctl_table *table, int __user *name, int nlen,
+		  void __user *oldval, size_t __user *oldlenp,
+		  void __user *newval, size_t newlen, void **context)
+{
+	size_t l, len;
+	
+	if (!table->data || !table->maxlen) 
+		return -ENOTDIR;
+	
+	if (oldval && oldlenp) {
+		if (get_user(len, oldlenp))
+			return -EFAULT;
+		if (len) {
+			l = strlen(table->data);
+			if (len > l) len = l;
+			if (len >= table->maxlen)
+				len = table->maxlen;
+			if(copy_to_user(oldval, table->data, len))
+				return -EFAULT;
+			if(put_user(0, ((char __user *) oldval) + len))
+				return -EFAULT;
+			if(put_user(len, oldlenp))
+				return -EFAULT;
+		}
+	}
+	if (newval && newlen) {
+		len = newlen;
+		if (len > table->maxlen)
+			len = table->maxlen;
+		if(copy_from_user(table->data, newval, len))
+			return -EFAULT;
+		if (len == table->maxlen)
+			len--;
+		((char *) table->data)[len] = 0;
+	}
+	return 0;
+}
+
+/*
+ * This function makes sure that all of the integers in the vector
+ * are between the minimum and maximum values given in the arrays
+ * table->extra1 and table->extra2, respectively.
+ */
+int sysctl_intvec(ctl_table *table, int __user *name, int nlen,
+		void __user *oldval, size_t __user *oldlenp,
+		void __user *newval, size_t newlen, void **context)
+{
+
+	if (newval && newlen) {
+		int __user *vec = (int __user *) newval;
+		int *min = (int *) table->extra1;
+		int *max = (int *) table->extra2;
+		size_t length;
+		int i;
+
+		if (newlen % sizeof(int) != 0)
+			return -EINVAL;
+
+		if (!table->extra1 && !table->extra2)
+			return 0;
+
+		if (newlen > table->maxlen)
+			newlen = table->maxlen;
+		length = newlen / sizeof(int);
+
+		for (i = 0; i < length; i++) {
+			int value;
+			if (get_user(value, vec + i))
+				return -EFAULT;
+			if (min && value < min[i])
+				return -EINVAL;
+			if (max && value > max[i])
+				return -EINVAL;
+		}
+	}
+	return 0;
+}
+
+/* Strategy function to convert jiffies to seconds */ 
+int sysctl_jiffies(ctl_table *table, int __user *name, int nlen,
+		void __user *oldval, size_t __user *oldlenp,
+		void __user *newval, size_t newlen, void **context)
+{
+	if (oldval) {
+		size_t olen;
+		if (oldlenp) { 
+			if (get_user(olen, oldlenp))
+				return -EFAULT;
+			if (olen!=sizeof(int))
+				return -EINVAL; 
+		}
+		if (put_user(*(int *)(table->data)/HZ, (int __user *)oldval) ||
+		    (oldlenp && put_user(sizeof(int),oldlenp)))
+			return -EFAULT;
+	}
+	if (newval && newlen) { 
+		int new;
+		if (newlen != sizeof(int))
+			return -EINVAL; 
+		if (get_user(new, (int __user *)newval))
+			return -EFAULT;
+		*(int *)(table->data) = new*HZ; 
+	}
+	return 1;
+}
+
+/* Strategy function to convert jiffies to seconds */ 
+int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen,
+		void __user *oldval, size_t __user *oldlenp,
+		void __user *newval, size_t newlen, void **context)
+{
+	if (oldval) {
+		size_t olen;
+		if (oldlenp) { 
+			if (get_user(olen, oldlenp))
+				return -EFAULT;
+			if (olen!=sizeof(int))
+				return -EINVAL; 
+		}
+		if (put_user(jiffies_to_msecs(*(int *)(table->data)), (int __user *)oldval) ||
+		    (oldlenp && put_user(sizeof(int),oldlenp)))
+			return -EFAULT;
+	}
+	if (newval && newlen) { 
+		int new;
+		if (newlen != sizeof(int))
+			return -EINVAL; 
+		if (get_user(new, (int __user *)newval))
+			return -EFAULT;
+		*(int *)(table->data) = msecs_to_jiffies(new);
+	}
+	return 1;
+}
+
+#else /* CONFIG_SYSCTL */
+
+
+asmlinkage long sys_sysctl(struct __sysctl_args __user *args)
+{
+	return -ENOSYS;
+}
+
+int sysctl_string(ctl_table *table, int __user *name, int nlen,
+		  void __user *oldval, size_t __user *oldlenp,
+		  void __user *newval, size_t newlen, void **context)
+{
+	return -ENOSYS;
+}
+
+int sysctl_intvec(ctl_table *table, int __user *name, int nlen,
+		void __user *oldval, size_t __user *oldlenp,
+		void __user *newval, size_t newlen, void **context)
+{
+	return -ENOSYS;
+}
+
+int sysctl_jiffies(ctl_table *table, int __user *name, int nlen,
+		void __user *oldval, size_t __user *oldlenp,
+		void __user *newval, size_t newlen, void **context)
+{
+	return -ENOSYS;
+}
+
+int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen,
+		void __user *oldval, size_t __user *oldlenp,
+		void __user *newval, size_t newlen, void **context)
+{
+	return -ENOSYS;
+}
+
+int proc_dostring(ctl_table *table, int write, struct file *filp,
+		  void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	return -ENOSYS;
+}
+
+int proc_dointvec(ctl_table *table, int write, struct file *filp,
+		  void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	return -ENOSYS;
+}
+
+int proc_dointvec_bset(ctl_table *table, int write, struct file *filp,
+			void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	return -ENOSYS;
+}
+
+int proc_dointvec_minmax(ctl_table *table, int write, struct file *filp,
+		    void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	return -ENOSYS;
+}
+
+int proc_dointvec_jiffies(ctl_table *table, int write, struct file *filp,
+			  void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	return -ENOSYS;
+}
+
+int proc_dointvec_userhz_jiffies(ctl_table *table, int write, struct file *filp,
+			  void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	return -ENOSYS;
+}
+
+int proc_dointvec_ms_jiffies(ctl_table *table, int write, struct file *filp,
+			     void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	return -ENOSYS;
+}
+
+int proc_doulongvec_minmax(ctl_table *table, int write, struct file *filp,
+		    void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	return -ENOSYS;
+}
+
+int proc_doulongvec_ms_jiffies_minmax(ctl_table *table, int write,
+				      struct file *filp,
+				      void __user *buffer,
+				      size_t *lenp, loff_t *ppos)
+{
+    return -ENOSYS;
+}
+
+struct ctl_table_header * register_sysctl_table(ctl_table * table, 
+						int insert_at_head)
+{
+	return NULL;
+}
+
+void unregister_sysctl_table(struct ctl_table_header * table)
+{
+}
+
+#endif /* CONFIG_SYSCTL */
+
+/*
+ * No sense putting this after each symbol definition, twice,
+ * exception granted :-)
+ */
+EXPORT_SYMBOL(proc_dointvec);
+EXPORT_SYMBOL(proc_dointvec_jiffies);
+EXPORT_SYMBOL(proc_dointvec_minmax);
+EXPORT_SYMBOL(proc_dointvec_userhz_jiffies);
+EXPORT_SYMBOL(proc_dointvec_ms_jiffies);
+EXPORT_SYMBOL(proc_dostring);
+EXPORT_SYMBOL(proc_doulongvec_minmax);
+EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax);
+EXPORT_SYMBOL(register_sysctl_table);
+EXPORT_SYMBOL(sysctl_intvec);
+EXPORT_SYMBOL(sysctl_jiffies);
+EXPORT_SYMBOL(sysctl_ms_jiffies);
+EXPORT_SYMBOL(sysctl_string);
+EXPORT_SYMBOL(unregister_sysctl_table);
diff --git a/kernel/time.c b/kernel/time.c
new file mode 100644
index 000000000000..96fd0f499631
--- /dev/null
+++ b/kernel/time.c
@@ -0,0 +1,599 @@
+/*
+ *  linux/kernel/time.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ *  This file contains the interface functions for the various
+ *  time related system calls: time, stime, gettimeofday, settimeofday,
+ *			       adjtime
+ */
+/*
+ * Modification history kernel/time.c
+ * 
+ * 1993-09-02    Philip Gladstone
+ *      Created file with time related functions from sched.c and adjtimex() 
+ * 1993-10-08    Torsten Duwe
+ *      adjtime interface update and CMOS clock write code
+ * 1995-08-13    Torsten Duwe
+ *      kernel PLL updated to 1994-12-13 specs (rfc-1589)
+ * 1999-01-16    Ulrich Windl
+ *	Introduced error checking for many cases in adjtimex().
+ *	Updated NTP code according to technical memorandum Jan '96
+ *	"A Kernel Model for Precision Timekeeping" by Dave Mills
+ *	Allow time_constant larger than MAXTC(6) for NTP v4 (MAXTC == 10)
+ *	(Even though the technical memorandum forbids it)
+ * 2004-07-14	 Christoph Lameter
+ *	Added getnstimeofday to allow the posix timer functions to return
+ *	with nanosecond accuracy
+ */
+
+#include <linux/module.h>
+#include <linux/timex.h>
+#include <linux/errno.h>
+#include <linux/smp_lock.h>
+#include <linux/syscalls.h>
+#include <linux/security.h>
+#include <linux/fs.h>
+#include <linux/module.h>
+
+#include <asm/uaccess.h>
+#include <asm/unistd.h>
+
+/* 
+ * The timezone where the local system is located.  Used as a default by some
+ * programs who obtain this value by using gettimeofday.
+ */
+struct timezone sys_tz;
+
+EXPORT_SYMBOL(sys_tz);
+
+#ifdef __ARCH_WANT_SYS_TIME
+
+/*
+ * sys_time() can be implemented in user-level using
+ * sys_gettimeofday().  Is this for backwards compatibility?  If so,
+ * why not move it into the appropriate arch directory (for those
+ * architectures that need it).
+ */
+asmlinkage long sys_time(time_t __user * tloc)
+{
+	time_t i;
+	struct timeval tv;
+
+	do_gettimeofday(&tv);
+	i = tv.tv_sec;
+
+	if (tloc) {
+		if (put_user(i,tloc))
+			i = -EFAULT;
+	}
+	return i;
+}
+
+/*
+ * sys_stime() can be implemented in user-level using
+ * sys_settimeofday().  Is this for backwards compatibility?  If so,
+ * why not move it into the appropriate arch directory (for those
+ * architectures that need it).
+ */
+ 
+asmlinkage long sys_stime(time_t __user *tptr)
+{
+	struct timespec tv;
+	int err;
+
+	if (get_user(tv.tv_sec, tptr))
+		return -EFAULT;
+
+	tv.tv_nsec = 0;
+
+	err = security_settime(&tv, NULL);
+	if (err)
+		return err;
+
+	do_settimeofday(&tv);
+	return 0;
+}
+
+#endif /* __ARCH_WANT_SYS_TIME */
+
+asmlinkage long sys_gettimeofday(struct timeval __user *tv, struct timezone __user *tz)
+{
+	if (likely(tv != NULL)) {
+		struct timeval ktv;
+		do_gettimeofday(&ktv);
+		if (copy_to_user(tv, &ktv, sizeof(ktv)))
+			return -EFAULT;
+	}
+	if (unlikely(tz != NULL)) {
+		if (copy_to_user(tz, &sys_tz, sizeof(sys_tz)))
+			return -EFAULT;
+	}
+	return 0;
+}
+
+/*
+ * Adjust the time obtained from the CMOS to be UTC time instead of
+ * local time.
+ * 
+ * This is ugly, but preferable to the alternatives.  Otherwise we
+ * would either need to write a program to do it in /etc/rc (and risk
+ * confusion if the program gets run more than once; it would also be 
+ * hard to make the program warp the clock precisely n hours)  or
+ * compile in the timezone information into the kernel.  Bad, bad....
+ *
+ *              				- TYT, 1992-01-01
+ *
+ * The best thing to do is to keep the CMOS clock in universal time (UTC)
+ * as real UNIX machines always do it. This avoids all headaches about
+ * daylight saving times and warping kernel clocks.
+ */
+inline static void warp_clock(void)
+{
+	write_seqlock_irq(&xtime_lock);
+	wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60;
+	xtime.tv_sec += sys_tz.tz_minuteswest * 60;
+	time_interpolator_reset();
+	write_sequnlock_irq(&xtime_lock);
+	clock_was_set();
+}
+
+/*
+ * In case for some reason the CMOS clock has not already been running
+ * in UTC, but in some local time: The first time we set the timezone,
+ * we will warp the clock so that it is ticking UTC time instead of
+ * local time. Presumably, if someone is setting the timezone then we
+ * are running in an environment where the programs understand about
+ * timezones. This should be done at boot time in the /etc/rc script,
+ * as soon as possible, so that the clock can be set right. Otherwise,
+ * various programs will get confused when the clock gets warped.
+ */
+
+int do_sys_settimeofday(struct timespec *tv, struct timezone *tz)
+{
+	static int firsttime = 1;
+	int error = 0;
+
+	error = security_settime(tv, tz);
+	if (error)
+		return error;
+
+	if (tz) {
+		/* SMP safe, global irq locking makes it work. */
+		sys_tz = *tz;
+		if (firsttime) {
+			firsttime = 0;
+			if (!tv)
+				warp_clock();
+		}
+	}
+	if (tv)
+	{
+		/* SMP safe, again the code in arch/foo/time.c should
+		 * globally block out interrupts when it runs.
+		 */
+		return do_settimeofday(tv);
+	}
+	return 0;
+}
+
+asmlinkage long sys_settimeofday(struct timeval __user *tv,
+				struct timezone __user *tz)
+{
+	struct timeval user_tv;
+	struct timespec	new_ts;
+	struct timezone new_tz;
+
+	if (tv) {
+		if (copy_from_user(&user_tv, tv, sizeof(*tv)))
+			return -EFAULT;
+		new_ts.tv_sec = user_tv.tv_sec;
+		new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC;
+	}
+	if (tz) {
+		if (copy_from_user(&new_tz, tz, sizeof(*tz)))
+			return -EFAULT;
+	}
+
+	return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL);
+}
+
+long pps_offset;		/* pps time offset (us) */
+long pps_jitter = MAXTIME;	/* time dispersion (jitter) (us) */
+
+long pps_freq;			/* frequency offset (scaled ppm) */
+long pps_stabil = MAXFREQ;	/* frequency dispersion (scaled ppm) */
+
+long pps_valid = PPS_VALID;	/* pps signal watchdog counter */
+
+int pps_shift = PPS_SHIFT;	/* interval duration (s) (shift) */
+
+long pps_jitcnt;		/* jitter limit exceeded */
+long pps_calcnt;		/* calibration intervals */
+long pps_errcnt;		/* calibration errors */
+long pps_stbcnt;		/* stability limit exceeded */
+
+/* hook for a loadable hardpps kernel module */
+void (*hardpps_ptr)(struct timeval *);
+
+/* we call this to notify the arch when the clock is being
+ * controlled.  If no such arch routine, do nothing.
+ */
+void __attribute__ ((weak)) notify_arch_cmos_timer(void)
+{
+	return;
+}
+
+/* adjtimex mainly allows reading (and writing, if superuser) of
+ * kernel time-keeping variables. used by xntpd.
+ */
+int do_adjtimex(struct timex *txc)
+{
+        long ltemp, mtemp, save_adjust;
+	int result;
+
+	/* In order to modify anything, you gotta be super-user! */
+	if (txc->modes && !capable(CAP_SYS_TIME))
+		return -EPERM;
+		
+	/* Now we validate the data before disabling interrupts */
+
+	if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT)
+	  /* singleshot must not be used with any other mode bits */
+		if (txc->modes != ADJ_OFFSET_SINGLESHOT)
+			return -EINVAL;
+
+	if (txc->modes != ADJ_OFFSET_SINGLESHOT && (txc->modes & ADJ_OFFSET))
+	  /* adjustment Offset limited to +- .512 seconds */
+		if (txc->offset <= - MAXPHASE || txc->offset >= MAXPHASE )
+			return -EINVAL;	
+
+	/* if the quartz is off by more than 10% something is VERY wrong ! */
+	if (txc->modes & ADJ_TICK)
+		if (txc->tick <  900000/USER_HZ ||
+		    txc->tick > 1100000/USER_HZ)
+			return -EINVAL;
+
+	write_seqlock_irq(&xtime_lock);
+	result = time_state;	/* mostly `TIME_OK' */
+
+	/* Save for later - semantics of adjtime is to return old value */
+	save_adjust = time_next_adjust ? time_next_adjust : time_adjust;
+
+#if 0	/* STA_CLOCKERR is never set yet */
+	time_status &= ~STA_CLOCKERR;		/* reset STA_CLOCKERR */
+#endif
+	/* If there are input parameters, then process them */
+	if (txc->modes)
+	{
+	    if (txc->modes & ADJ_STATUS)	/* only set allowed bits */
+		time_status =  (txc->status & ~STA_RONLY) |
+			      (time_status & STA_RONLY);
+
+	    if (txc->modes & ADJ_FREQUENCY) {	/* p. 22 */
+		if (txc->freq > MAXFREQ || txc->freq < -MAXFREQ) {
+		    result = -EINVAL;
+		    goto leave;
+		}
+		time_freq = txc->freq - pps_freq;
+	    }
+
+	    if (txc->modes & ADJ_MAXERROR) {
+		if (txc->maxerror < 0 || txc->maxerror >= NTP_PHASE_LIMIT) {
+		    result = -EINVAL;
+		    goto leave;
+		}
+		time_maxerror = txc->maxerror;
+	    }
+
+	    if (txc->modes & ADJ_ESTERROR) {
+		if (txc->esterror < 0 || txc->esterror >= NTP_PHASE_LIMIT) {
+		    result = -EINVAL;
+		    goto leave;
+		}
+		time_esterror = txc->esterror;
+	    }
+
+	    if (txc->modes & ADJ_TIMECONST) {	/* p. 24 */
+		if (txc->constant < 0) {	/* NTP v4 uses values > 6 */
+		    result = -EINVAL;
+		    goto leave;
+		}
+		time_constant = txc->constant;
+	    }
+
+	    if (txc->modes & ADJ_OFFSET) {	/* values checked earlier */
+		if (txc->modes == ADJ_OFFSET_SINGLESHOT) {
+		    /* adjtime() is independent from ntp_adjtime() */
+		    if ((time_next_adjust = txc->offset) == 0)
+			 time_adjust = 0;
+		}
+		else if ( time_status & (STA_PLL | STA_PPSTIME) ) {
+		    ltemp = (time_status & (STA_PPSTIME | STA_PPSSIGNAL)) ==
+		            (STA_PPSTIME | STA_PPSSIGNAL) ?
+		            pps_offset : txc->offset;
+
+		    /*
+		     * Scale the phase adjustment and
+		     * clamp to the operating range.
+		     */
+		    if (ltemp > MAXPHASE)
+		        time_offset = MAXPHASE << SHIFT_UPDATE;
+		    else if (ltemp < -MAXPHASE)
+			time_offset = -(MAXPHASE << SHIFT_UPDATE);
+		    else
+		        time_offset = ltemp << SHIFT_UPDATE;
+
+		    /*
+		     * Select whether the frequency is to be controlled
+		     * and in which mode (PLL or FLL). Clamp to the operating
+		     * range. Ugly multiply/divide should be replaced someday.
+		     */
+
+		    if (time_status & STA_FREQHOLD || time_reftime == 0)
+		        time_reftime = xtime.tv_sec;
+		    mtemp = xtime.tv_sec - time_reftime;
+		    time_reftime = xtime.tv_sec;
+		    if (time_status & STA_FLL) {
+		        if (mtemp >= MINSEC) {
+			    ltemp = (time_offset / mtemp) << (SHIFT_USEC -
+							      SHIFT_UPDATE);
+			    if (ltemp < 0)
+			        time_freq -= -ltemp >> SHIFT_KH;
+			    else
+			        time_freq += ltemp >> SHIFT_KH;
+			} else /* calibration interval too short (p. 12) */
+				result = TIME_ERROR;
+		    } else {	/* PLL mode */
+		        if (mtemp < MAXSEC) {
+			    ltemp *= mtemp;
+			    if (ltemp < 0)
+			        time_freq -= -ltemp >> (time_constant +
+							time_constant +
+							SHIFT_KF - SHIFT_USEC);
+			    else
+			        time_freq += ltemp >> (time_constant +
+						       time_constant +
+						       SHIFT_KF - SHIFT_USEC);
+			} else /* calibration interval too long (p. 12) */
+				result = TIME_ERROR;
+		    }
+		    if (time_freq > time_tolerance)
+		        time_freq = time_tolerance;
+		    else if (time_freq < -time_tolerance)
+		        time_freq = -time_tolerance;
+		} /* STA_PLL || STA_PPSTIME */
+	    } /* txc->modes & ADJ_OFFSET */
+	    if (txc->modes & ADJ_TICK) {
+		tick_usec = txc->tick;
+		tick_nsec = TICK_USEC_TO_NSEC(tick_usec);
+	    }
+	} /* txc->modes */
+leave:	if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0
+	    || ((time_status & (STA_PPSFREQ|STA_PPSTIME)) != 0
+		&& (time_status & STA_PPSSIGNAL) == 0)
+	    /* p. 24, (b) */
+	    || ((time_status & (STA_PPSTIME|STA_PPSJITTER))
+		== (STA_PPSTIME|STA_PPSJITTER))
+	    /* p. 24, (c) */
+	    || ((time_status & STA_PPSFREQ) != 0
+		&& (time_status & (STA_PPSWANDER|STA_PPSERROR)) != 0))
+	    /* p. 24, (d) */
+		result = TIME_ERROR;
+	
+	if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT)
+	    txc->offset	   = save_adjust;
+	else {
+	    if (time_offset < 0)
+		txc->offset = -(-time_offset >> SHIFT_UPDATE);
+	    else
+		txc->offset = time_offset >> SHIFT_UPDATE;
+	}
+	txc->freq	   = time_freq + pps_freq;
+	txc->maxerror	   = time_maxerror;
+	txc->esterror	   = time_esterror;
+	txc->status	   = time_status;
+	txc->constant	   = time_constant;
+	txc->precision	   = time_precision;
+	txc->tolerance	   = time_tolerance;
+	txc->tick	   = tick_usec;
+	txc->ppsfreq	   = pps_freq;
+	txc->jitter	   = pps_jitter >> PPS_AVG;
+	txc->shift	   = pps_shift;
+	txc->stabil	   = pps_stabil;
+	txc->jitcnt	   = pps_jitcnt;
+	txc->calcnt	   = pps_calcnt;
+	txc->errcnt	   = pps_errcnt;
+	txc->stbcnt	   = pps_stbcnt;
+	write_sequnlock_irq(&xtime_lock);
+	do_gettimeofday(&txc->time);
+	notify_arch_cmos_timer();
+	return(result);
+}
+
+asmlinkage long sys_adjtimex(struct timex __user *txc_p)
+{
+	struct timex txc;		/* Local copy of parameter */
+	int ret;
+
+	/* Copy the user data space into the kernel copy
+	 * structure. But bear in mind that the structures
+	 * may change
+	 */
+	if(copy_from_user(&txc, txc_p, sizeof(struct timex)))
+		return -EFAULT;
+	ret = do_adjtimex(&txc);
+	return copy_to_user(txc_p, &txc, sizeof(struct timex)) ? -EFAULT : ret;
+}
+
+inline struct timespec current_kernel_time(void)
+{
+        struct timespec now;
+        unsigned long seq;
+
+	do {
+		seq = read_seqbegin(&xtime_lock);
+		
+		now = xtime;
+	} while (read_seqretry(&xtime_lock, seq));
+
+	return now; 
+}
+
+EXPORT_SYMBOL(current_kernel_time);
+
+/**
+ * current_fs_time - Return FS time
+ * @sb: Superblock.
+ *
+ * Return the current time truncated to the time granuality supported by
+ * the fs.
+ */
+struct timespec current_fs_time(struct super_block *sb)
+{
+	struct timespec now = current_kernel_time();
+	return timespec_trunc(now, sb->s_time_gran);
+}
+EXPORT_SYMBOL(current_fs_time);
+
+/**
+ * timespec_trunc - Truncate timespec to a granuality
+ * @t: Timespec
+ * @gran: Granuality in ns.
+ *
+ * Truncate a timespec to a granuality. gran must be smaller than a second.
+ * Always rounds down.
+ *
+ * This function should be only used for timestamps returned by
+ * current_kernel_time() or CURRENT_TIME, not with do_gettimeofday() because
+ * it doesn't handle the better resolution of the later.
+ */
+struct timespec timespec_trunc(struct timespec t, unsigned gran)
+{
+	/*
+	 * Division is pretty slow so avoid it for common cases.
+	 * Currently current_kernel_time() never returns better than
+	 * jiffies resolution. Exploit that.
+	 */
+	if (gran <= jiffies_to_usecs(1) * 1000) {
+		/* nothing */
+	} else if (gran == 1000000000) {
+		t.tv_nsec = 0;
+	} else {
+		t.tv_nsec -= t.tv_nsec % gran;
+	}
+	return t;
+}
+EXPORT_SYMBOL(timespec_trunc);
+
+#ifdef CONFIG_TIME_INTERPOLATION
+void getnstimeofday (struct timespec *tv)
+{
+	unsigned long seq,sec,nsec;
+
+	do {
+		seq = read_seqbegin(&xtime_lock);
+		sec = xtime.tv_sec;
+		nsec = xtime.tv_nsec+time_interpolator_get_offset();
+	} while (unlikely(read_seqretry(&xtime_lock, seq)));
+
+	while (unlikely(nsec >= NSEC_PER_SEC)) {
+		nsec -= NSEC_PER_SEC;
+		++sec;
+	}
+	tv->tv_sec = sec;
+	tv->tv_nsec = nsec;
+}
+EXPORT_SYMBOL_GPL(getnstimeofday);
+
+int do_settimeofday (struct timespec *tv)
+{
+	time_t wtm_sec, sec = tv->tv_sec;
+	long wtm_nsec, nsec = tv->tv_nsec;
+
+	if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
+		return -EINVAL;
+
+	write_seqlock_irq(&xtime_lock);
+	{
+		/*
+		 * This is revolting. We need to set "xtime" correctly. However, the value
+		 * in this location is the value at the most recent update of wall time.
+		 * Discover what correction gettimeofday would have done, and then undo
+		 * it!
+		 */
+		nsec -= time_interpolator_get_offset();
+
+		wtm_sec  = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec);
+		wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec);
+
+		set_normalized_timespec(&xtime, sec, nsec);
+		set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
+
+		time_adjust = 0;		/* stop active adjtime() */
+		time_status |= STA_UNSYNC;
+		time_maxerror = NTP_PHASE_LIMIT;
+		time_esterror = NTP_PHASE_LIMIT;
+		time_interpolator_reset();
+	}
+	write_sequnlock_irq(&xtime_lock);
+	clock_was_set();
+	return 0;
+}
+
+void do_gettimeofday (struct timeval *tv)
+{
+	unsigned long seq, nsec, usec, sec, offset;
+	do {
+		seq = read_seqbegin(&xtime_lock);
+		offset = time_interpolator_get_offset();
+		sec = xtime.tv_sec;
+		nsec = xtime.tv_nsec;
+	} while (unlikely(read_seqretry(&xtime_lock, seq)));
+
+	usec = (nsec + offset) / 1000;
+
+	while (unlikely(usec >= USEC_PER_SEC)) {
+		usec -= USEC_PER_SEC;
+		++sec;
+	}
+
+	tv->tv_sec = sec;
+	tv->tv_usec = usec;
+}
+
+EXPORT_SYMBOL(do_gettimeofday);
+
+
+#else
+/*
+ * Simulate gettimeofday using do_gettimeofday which only allows a timeval
+ * and therefore only yields usec accuracy
+ */
+void getnstimeofday(struct timespec *tv)
+{
+	struct timeval x;
+
+	do_gettimeofday(&x);
+	tv->tv_sec = x.tv_sec;
+	tv->tv_nsec = x.tv_usec * NSEC_PER_USEC;
+}
+#endif
+
+#if (BITS_PER_LONG < 64)
+u64 get_jiffies_64(void)
+{
+	unsigned long seq;
+	u64 ret;
+
+	do {
+		seq = read_seqbegin(&xtime_lock);
+		ret = jiffies_64;
+	} while (read_seqretry(&xtime_lock, seq));
+	return ret;
+}
+
+EXPORT_SYMBOL(get_jiffies_64);
+#endif
+
+EXPORT_SYMBOL(jiffies);
diff --git a/kernel/timer.c b/kernel/timer.c
new file mode 100644
index 000000000000..ecb3d67c0e14
--- /dev/null
+++ b/kernel/timer.c
@@ -0,0 +1,1611 @@
+/*
+ *  linux/kernel/timer.c
+ *
+ *  Kernel internal timers, kernel timekeeping, basic process system calls
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ *  1997-01-28  Modified by Finn Arne Gangstad to make timers scale better.
+ *
+ *  1997-09-10  Updated NTP code according to technical memorandum Jan '96
+ *              "A Kernel Model for Precision Timekeeping" by Dave Mills
+ *  1998-12-24  Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
+ *              serialize accesses to xtime/lost_ticks).
+ *                              Copyright (C) 1998  Andrea Arcangeli
+ *  1999-03-10  Improved NTP compatibility by Ulrich Windl
+ *  2002-05-31	Move sys_sysinfo here and make its locking sane, Robert Love
+ *  2000-10-05  Implemented scalable SMP per-CPU timer handling.
+ *                              Copyright (C) 2000, 2001, 2002  Ingo Molnar
+ *              Designed by David S. Miller, Alexey Kuznetsov and Ingo Molnar
+ */
+
+#include <linux/kernel_stat.h>
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/percpu.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/notifier.h>
+#include <linux/thread_info.h>
+#include <linux/time.h>
+#include <linux/jiffies.h>
+#include <linux/posix-timers.h>
+#include <linux/cpu.h>
+#include <linux/syscalls.h>
+
+#include <asm/uaccess.h>
+#include <asm/unistd.h>
+#include <asm/div64.h>
+#include <asm/timex.h>
+#include <asm/io.h>
+
+#ifdef CONFIG_TIME_INTERPOLATION
+static void time_interpolator_update(long delta_nsec);
+#else
+#define time_interpolator_update(x)
+#endif
+
+/*
+ * per-CPU timer vector definitions:
+ */
+
+#define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6)
+#define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8)
+#define TVN_SIZE (1 << TVN_BITS)
+#define TVR_SIZE (1 << TVR_BITS)
+#define TVN_MASK (TVN_SIZE - 1)
+#define TVR_MASK (TVR_SIZE - 1)
+
+typedef struct tvec_s {
+	struct list_head vec[TVN_SIZE];
+} tvec_t;
+
+typedef struct tvec_root_s {
+	struct list_head vec[TVR_SIZE];
+} tvec_root_t;
+
+struct tvec_t_base_s {
+	spinlock_t lock;
+	unsigned long timer_jiffies;
+	struct timer_list *running_timer;
+	tvec_root_t tv1;
+	tvec_t tv2;
+	tvec_t tv3;
+	tvec_t tv4;
+	tvec_t tv5;
+} ____cacheline_aligned_in_smp;
+
+typedef struct tvec_t_base_s tvec_base_t;
+
+static inline void set_running_timer(tvec_base_t *base,
+					struct timer_list *timer)
+{
+#ifdef CONFIG_SMP
+	base->running_timer = timer;
+#endif
+}
+
+/* Fake initialization */
+static DEFINE_PER_CPU(tvec_base_t, tvec_bases) = { SPIN_LOCK_UNLOCKED };
+
+static void check_timer_failed(struct timer_list *timer)
+{
+	static int whine_count;
+	if (whine_count < 16) {
+		whine_count++;
+		printk("Uninitialised timer!\n");
+		printk("This is just a warning.  Your computer is OK\n");
+		printk("function=0x%p, data=0x%lx\n",
+			timer->function, timer->data);
+		dump_stack();
+	}
+	/*
+	 * Now fix it up
+	 */
+	spin_lock_init(&timer->lock);
+	timer->magic = TIMER_MAGIC;
+}
+
+static inline void check_timer(struct timer_list *timer)
+{
+	if (timer->magic != TIMER_MAGIC)
+		check_timer_failed(timer);
+}
+
+
+static void internal_add_timer(tvec_base_t *base, struct timer_list *timer)
+{
+	unsigned long expires = timer->expires;
+	unsigned long idx = expires - base->timer_jiffies;
+	struct list_head *vec;
+
+	if (idx < TVR_SIZE) {
+		int i = expires & TVR_MASK;
+		vec = base->tv1.vec + i;
+	} else if (idx < 1 << (TVR_BITS + TVN_BITS)) {
+		int i = (expires >> TVR_BITS) & TVN_MASK;
+		vec = base->tv2.vec + i;
+	} else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) {
+		int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK;
+		vec = base->tv3.vec + i;
+	} else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) {
+		int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK;
+		vec = base->tv4.vec + i;
+	} else if ((signed long) idx < 0) {
+		/*
+		 * Can happen if you add a timer with expires == jiffies,
+		 * or you set a timer to go off in the past
+		 */
+		vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK);
+	} else {
+		int i;
+		/* If the timeout is larger than 0xffffffff on 64-bit
+		 * architectures then we use the maximum timeout:
+		 */
+		if (idx > 0xffffffffUL) {
+			idx = 0xffffffffUL;
+			expires = idx + base->timer_jiffies;
+		}
+		i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK;
+		vec = base->tv5.vec + i;
+	}
+	/*
+	 * Timers are FIFO:
+	 */
+	list_add_tail(&timer->entry, vec);
+}
+
+int __mod_timer(struct timer_list *timer, unsigned long expires)
+{
+	tvec_base_t *old_base, *new_base;
+	unsigned long flags;
+	int ret = 0;
+
+	BUG_ON(!timer->function);
+
+	check_timer(timer);
+
+	spin_lock_irqsave(&timer->lock, flags);
+	new_base = &__get_cpu_var(tvec_bases);
+repeat:
+	old_base = timer->base;
+
+	/*
+	 * Prevent deadlocks via ordering by old_base < new_base.
+	 */
+	if (old_base && (new_base != old_base)) {
+		if (old_base < new_base) {
+			spin_lock(&new_base->lock);
+			spin_lock(&old_base->lock);
+		} else {
+			spin_lock(&old_base->lock);
+			spin_lock(&new_base->lock);
+		}
+		/*
+		 * The timer base might have been cancelled while we were
+		 * trying to take the lock(s):
+		 */
+		if (timer->base != old_base) {
+			spin_unlock(&new_base->lock);
+			spin_unlock(&old_base->lock);
+			goto repeat;
+		}
+	} else {
+		spin_lock(&new_base->lock);
+		if (timer->base != old_base) {
+			spin_unlock(&new_base->lock);
+			goto repeat;
+		}
+	}
+
+	/*
+	 * Delete the previous timeout (if there was any), and install
+	 * the new one:
+	 */
+	if (old_base) {
+		list_del(&timer->entry);
+		ret = 1;
+	}
+	timer->expires = expires;
+	internal_add_timer(new_base, timer);
+	timer->base = new_base;
+
+	if (old_base && (new_base != old_base))
+		spin_unlock(&old_base->lock);
+	spin_unlock(&new_base->lock);
+	spin_unlock_irqrestore(&timer->lock, flags);
+
+	return ret;
+}
+
+EXPORT_SYMBOL(__mod_timer);
+
+/***
+ * add_timer_on - start a timer on a particular CPU
+ * @timer: the timer to be added
+ * @cpu: the CPU to start it on
+ *
+ * This is not very scalable on SMP. Double adds are not possible.
+ */
+void add_timer_on(struct timer_list *timer, int cpu)
+{
+	tvec_base_t *base = &per_cpu(tvec_bases, cpu);
+  	unsigned long flags;
+  
+  	BUG_ON(timer_pending(timer) || !timer->function);
+
+	check_timer(timer);
+
+	spin_lock_irqsave(&base->lock, flags);
+	internal_add_timer(base, timer);
+	timer->base = base;
+	spin_unlock_irqrestore(&base->lock, flags);
+}
+
+
+/***
+ * mod_timer - modify a timer's timeout
+ * @timer: the timer to be modified
+ *
+ * mod_timer is a more efficient way to update the expire field of an
+ * active timer (if the timer is inactive it will be activated)
+ *
+ * mod_timer(timer, expires) is equivalent to:
+ *
+ *     del_timer(timer); timer->expires = expires; add_timer(timer);
+ *
+ * Note that if there are multiple unserialized concurrent users of the
+ * same timer, then mod_timer() is the only safe way to modify the timeout,
+ * since add_timer() cannot modify an already running timer.
+ *
+ * The function returns whether it has modified a pending timer or not.
+ * (ie. mod_timer() of an inactive timer returns 0, mod_timer() of an
+ * active timer returns 1.)
+ */
+int mod_timer(struct timer_list *timer, unsigned long expires)
+{
+	BUG_ON(!timer->function);
+
+	check_timer(timer);
+
+	/*
+	 * This is a common optimization triggered by the
+	 * networking code - if the timer is re-modified
+	 * to be the same thing then just return:
+	 */
+	if (timer->expires == expires && timer_pending(timer))
+		return 1;
+
+	return __mod_timer(timer, expires);
+}
+
+EXPORT_SYMBOL(mod_timer);
+
+/***
+ * del_timer - deactive a timer.
+ * @timer: the timer to be deactivated
+ *
+ * del_timer() deactivates a timer - this works on both active and inactive
+ * timers.
+ *
+ * The function returns whether it has deactivated a pending timer or not.
+ * (ie. del_timer() of an inactive timer returns 0, del_timer() of an
+ * active timer returns 1.)
+ */
+int del_timer(struct timer_list *timer)
+{
+	unsigned long flags;
+	tvec_base_t *base;
+
+	check_timer(timer);
+
+repeat:
+ 	base = timer->base;
+	if (!base)
+		return 0;
+	spin_lock_irqsave(&base->lock, flags);
+	if (base != timer->base) {
+		spin_unlock_irqrestore(&base->lock, flags);
+		goto repeat;
+	}
+	list_del(&timer->entry);
+	/* Need to make sure that anybody who sees a NULL base also sees the list ops */
+	smp_wmb();
+	timer->base = NULL;
+	spin_unlock_irqrestore(&base->lock, flags);
+
+	return 1;
+}
+
+EXPORT_SYMBOL(del_timer);
+
+#ifdef CONFIG_SMP
+/***
+ * del_timer_sync - deactivate a timer and wait for the handler to finish.
+ * @timer: the timer to be deactivated
+ *
+ * This function only differs from del_timer() on SMP: besides deactivating
+ * the timer it also makes sure the handler has finished executing on other
+ * CPUs.
+ *
+ * Synchronization rules: callers must prevent restarting of the timer,
+ * otherwise this function is meaningless. It must not be called from
+ * interrupt contexts. The caller must not hold locks which would prevent
+ * completion of the timer's handler.  Upon exit the timer is not queued and
+ * the handler is not running on any CPU.
+ *
+ * The function returns whether it has deactivated a pending timer or not.
+ *
+ * del_timer_sync() is slow and complicated because it copes with timer
+ * handlers which re-arm the timer (periodic timers).  If the timer handler
+ * is known to not do this (a single shot timer) then use
+ * del_singleshot_timer_sync() instead.
+ */
+int del_timer_sync(struct timer_list *timer)
+{
+	tvec_base_t *base;
+	int i, ret = 0;
+
+	check_timer(timer);
+
+del_again:
+	ret += del_timer(timer);
+
+	for_each_online_cpu(i) {
+		base = &per_cpu(tvec_bases, i);
+		if (base->running_timer == timer) {
+			while (base->running_timer == timer) {
+				cpu_relax();
+				preempt_check_resched();
+			}
+			break;
+		}
+	}
+	smp_rmb();
+	if (timer_pending(timer))
+		goto del_again;
+
+	return ret;
+}
+EXPORT_SYMBOL(del_timer_sync);
+
+/***
+ * del_singleshot_timer_sync - deactivate a non-recursive timer
+ * @timer: the timer to be deactivated
+ *
+ * This function is an optimization of del_timer_sync for the case where the
+ * caller can guarantee the timer does not reschedule itself in its timer
+ * function.
+ *
+ * Synchronization rules: callers must prevent restarting of the timer,
+ * otherwise this function is meaningless. It must not be called from
+ * interrupt contexts. The caller must not hold locks which wold prevent
+ * completion of the timer's handler.  Upon exit the timer is not queued and
+ * the handler is not running on any CPU.
+ *
+ * The function returns whether it has deactivated a pending timer or not.
+ */
+int del_singleshot_timer_sync(struct timer_list *timer)
+{
+	int ret = del_timer(timer);
+
+	if (!ret) {
+		ret = del_timer_sync(timer);
+		BUG_ON(ret);
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL(del_singleshot_timer_sync);
+#endif
+
+static int cascade(tvec_base_t *base, tvec_t *tv, int index)
+{
+	/* cascade all the timers from tv up one level */
+	struct list_head *head, *curr;
+
+	head = tv->vec + index;
+	curr = head->next;
+	/*
+	 * We are removing _all_ timers from the list, so we don't  have to
+	 * detach them individually, just clear the list afterwards.
+	 */
+	while (curr != head) {
+		struct timer_list *tmp;
+
+		tmp = list_entry(curr, struct timer_list, entry);
+		BUG_ON(tmp->base != base);
+		curr = curr->next;
+		internal_add_timer(base, tmp);
+	}
+	INIT_LIST_HEAD(head);
+
+	return index;
+}
+
+/***
+ * __run_timers - run all expired timers (if any) on this CPU.
+ * @base: the timer vector to be processed.
+ *
+ * This function cascades all vectors and executes all expired timer
+ * vectors.
+ */
+#define INDEX(N) (base->timer_jiffies >> (TVR_BITS + N * TVN_BITS)) & TVN_MASK
+
+static inline void __run_timers(tvec_base_t *base)
+{
+	struct timer_list *timer;
+
+	spin_lock_irq(&base->lock);
+	while (time_after_eq(jiffies, base->timer_jiffies)) {
+		struct list_head work_list = LIST_HEAD_INIT(work_list);
+		struct list_head *head = &work_list;
+ 		int index = base->timer_jiffies & TVR_MASK;
+ 
+		/*
+		 * Cascade timers:
+		 */
+		if (!index &&
+			(!cascade(base, &base->tv2, INDEX(0))) &&
+				(!cascade(base, &base->tv3, INDEX(1))) &&
+					!cascade(base, &base->tv4, INDEX(2)))
+			cascade(base, &base->tv5, INDEX(3));
+		++base->timer_jiffies; 
+		list_splice_init(base->tv1.vec + index, &work_list);
+repeat:
+		if (!list_empty(head)) {
+			void (*fn)(unsigned long);
+			unsigned long data;
+
+			timer = list_entry(head->next,struct timer_list,entry);
+ 			fn = timer->function;
+ 			data = timer->data;
+
+			list_del(&timer->entry);
+			set_running_timer(base, timer);
+			smp_wmb();
+			timer->base = NULL;
+			spin_unlock_irq(&base->lock);
+			{
+				u32 preempt_count = preempt_count();
+				fn(data);
+				if (preempt_count != preempt_count()) {
+					printk("huh, entered %p with %08x, exited with %08x?\n", fn, preempt_count, preempt_count());
+					BUG();
+				}
+			}
+			spin_lock_irq(&base->lock);
+			goto repeat;
+		}
+	}
+	set_running_timer(base, NULL);
+	spin_unlock_irq(&base->lock);
+}
+
+#ifdef CONFIG_NO_IDLE_HZ
+/*
+ * Find out when the next timer event is due to happen. This
+ * is used on S/390 to stop all activity when a cpus is idle.
+ * This functions needs to be called disabled.
+ */
+unsigned long next_timer_interrupt(void)
+{
+	tvec_base_t *base;
+	struct list_head *list;
+	struct timer_list *nte;
+	unsigned long expires;
+	tvec_t *varray[4];
+	int i, j;
+
+	base = &__get_cpu_var(tvec_bases);
+	spin_lock(&base->lock);
+	expires = base->timer_jiffies + (LONG_MAX >> 1);
+	list = 0;
+
+	/* Look for timer events in tv1. */
+	j = base->timer_jiffies & TVR_MASK;
+	do {
+		list_for_each_entry(nte, base->tv1.vec + j, entry) {
+			expires = nte->expires;
+			if (j < (base->timer_jiffies & TVR_MASK))
+				list = base->tv2.vec + (INDEX(0));
+			goto found;
+		}
+		j = (j + 1) & TVR_MASK;
+	} while (j != (base->timer_jiffies & TVR_MASK));
+
+	/* Check tv2-tv5. */
+	varray[0] = &base->tv2;
+	varray[1] = &base->tv3;
+	varray[2] = &base->tv4;
+	varray[3] = &base->tv5;
+	for (i = 0; i < 4; i++) {
+		j = INDEX(i);
+		do {
+			if (list_empty(varray[i]->vec + j)) {
+				j = (j + 1) & TVN_MASK;
+				continue;
+			}
+			list_for_each_entry(nte, varray[i]->vec + j, entry)
+				if (time_before(nte->expires, expires))
+					expires = nte->expires;
+			if (j < (INDEX(i)) && i < 3)
+				list = varray[i + 1]->vec + (INDEX(i + 1));
+			goto found;
+		} while (j != (INDEX(i)));
+	}
+found:
+	if (list) {
+		/*
+		 * The search wrapped. We need to look at the next list
+		 * from next tv element that would cascade into tv element
+		 * where we found the timer element.
+		 */
+		list_for_each_entry(nte, list, entry) {
+			if (time_before(nte->expires, expires))
+				expires = nte->expires;
+		}
+	}
+	spin_unlock(&base->lock);
+	return expires;
+}
+#endif
+
+/******************************************************************/
+
+/*
+ * Timekeeping variables
+ */
+unsigned long tick_usec = TICK_USEC; 		/* USER_HZ period (usec) */
+unsigned long tick_nsec = TICK_NSEC;		/* ACTHZ period (nsec) */
+
+/* 
+ * The current time 
+ * wall_to_monotonic is what we need to add to xtime (or xtime corrected 
+ * for sub jiffie times) to get to monotonic time.  Monotonic is pegged
+ * at zero at system boot time, so wall_to_monotonic will be negative,
+ * however, we will ALWAYS keep the tv_nsec part positive so we can use
+ * the usual normalization.
+ */
+struct timespec xtime __attribute__ ((aligned (16)));
+struct timespec wall_to_monotonic __attribute__ ((aligned (16)));
+
+EXPORT_SYMBOL(xtime);
+
+/* Don't completely fail for HZ > 500.  */
+int tickadj = 500/HZ ? : 1;		/* microsecs */
+
+
+/*
+ * phase-lock loop variables
+ */
+/* TIME_ERROR prevents overwriting the CMOS clock */
+int time_state = TIME_OK;		/* clock synchronization status	*/
+int time_status = STA_UNSYNC;		/* clock status bits		*/
+long time_offset;			/* time adjustment (us)		*/
+long time_constant = 2;			/* pll time constant		*/
+long time_tolerance = MAXFREQ;		/* frequency tolerance (ppm)	*/
+long time_precision = 1;		/* clock precision (us)		*/
+long time_maxerror = NTP_PHASE_LIMIT;	/* maximum error (us)		*/
+long time_esterror = NTP_PHASE_LIMIT;	/* estimated error (us)		*/
+static long time_phase;			/* phase offset (scaled us)	*/
+long time_freq = (((NSEC_PER_SEC + HZ/2) % HZ - HZ/2) << SHIFT_USEC) / NSEC_PER_USEC;
+					/* frequency offset (scaled ppm)*/
+static long time_adj;			/* tick adjust (scaled 1 / HZ)	*/
+long time_reftime;			/* time at last adjustment (s)	*/
+long time_adjust;
+long time_next_adjust;
+
+/*
+ * this routine handles the overflow of the microsecond field
+ *
+ * The tricky bits of code to handle the accurate clock support
+ * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame.
+ * They were originally developed for SUN and DEC kernels.
+ * All the kudos should go to Dave for this stuff.
+ *
+ */
+static void second_overflow(void)
+{
+    long ltemp;
+
+    /* Bump the maxerror field */
+    time_maxerror += time_tolerance >> SHIFT_USEC;
+    if ( time_maxerror > NTP_PHASE_LIMIT ) {
+	time_maxerror = NTP_PHASE_LIMIT;
+	time_status |= STA_UNSYNC;
+    }
+
+    /*
+     * Leap second processing. If in leap-insert state at
+     * the end of the day, the system clock is set back one
+     * second; if in leap-delete state, the system clock is
+     * set ahead one second. The microtime() routine or
+     * external clock driver will insure that reported time
+     * is always monotonic. The ugly divides should be
+     * replaced.
+     */
+    switch (time_state) {
+
+    case TIME_OK:
+	if (time_status & STA_INS)
+	    time_state = TIME_INS;
+	else if (time_status & STA_DEL)
+	    time_state = TIME_DEL;
+	break;
+
+    case TIME_INS:
+	if (xtime.tv_sec % 86400 == 0) {
+	    xtime.tv_sec--;
+	    wall_to_monotonic.tv_sec++;
+	    /* The timer interpolator will make time change gradually instead
+	     * of an immediate jump by one second.
+	     */
+	    time_interpolator_update(-NSEC_PER_SEC);
+	    time_state = TIME_OOP;
+	    clock_was_set();
+	    printk(KERN_NOTICE "Clock: inserting leap second 23:59:60 UTC\n");
+	}
+	break;
+
+    case TIME_DEL:
+	if ((xtime.tv_sec + 1) % 86400 == 0) {
+	    xtime.tv_sec++;
+	    wall_to_monotonic.tv_sec--;
+	    /* Use of time interpolator for a gradual change of time */
+	    time_interpolator_update(NSEC_PER_SEC);
+	    time_state = TIME_WAIT;
+	    clock_was_set();
+	    printk(KERN_NOTICE "Clock: deleting leap second 23:59:59 UTC\n");
+	}
+	break;
+
+    case TIME_OOP:
+	time_state = TIME_WAIT;
+	break;
+
+    case TIME_WAIT:
+	if (!(time_status & (STA_INS | STA_DEL)))
+	    time_state = TIME_OK;
+    }
+
+    /*
+     * Compute the phase adjustment for the next second. In
+     * PLL mode, the offset is reduced by a fixed factor
+     * times the time constant. In FLL mode the offset is
+     * used directly. In either mode, the maximum phase
+     * adjustment for each second is clamped so as to spread
+     * the adjustment over not more than the number of
+     * seconds between updates.
+     */
+    if (time_offset < 0) {
+	ltemp = -time_offset;
+	if (!(time_status & STA_FLL))
+	    ltemp >>= SHIFT_KG + time_constant;
+	if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE)
+	    ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE;
+	time_offset += ltemp;
+	time_adj = -ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
+    } else {
+	ltemp = time_offset;
+	if (!(time_status & STA_FLL))
+	    ltemp >>= SHIFT_KG + time_constant;
+	if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE)
+	    ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE;
+	time_offset -= ltemp;
+	time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
+    }
+
+    /*
+     * Compute the frequency estimate and additional phase
+     * adjustment due to frequency error for the next
+     * second. When the PPS signal is engaged, gnaw on the
+     * watchdog counter and update the frequency computed by
+     * the pll and the PPS signal.
+     */
+    pps_valid++;
+    if (pps_valid == PPS_VALID) {	/* PPS signal lost */
+	pps_jitter = MAXTIME;
+	pps_stabil = MAXFREQ;
+	time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER |
+			 STA_PPSWANDER | STA_PPSERROR);
+    }
+    ltemp = time_freq + pps_freq;
+    if (ltemp < 0)
+	time_adj -= -ltemp >>
+	    (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE);
+    else
+	time_adj += ltemp >>
+	    (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE);
+
+#if HZ == 100
+    /* Compensate for (HZ==100) != (1 << SHIFT_HZ).
+     * Add 25% and 3.125% to get 128.125; => only 0.125% error (p. 14)
+     */
+    if (time_adj < 0)
+	time_adj -= (-time_adj >> 2) + (-time_adj >> 5);
+    else
+	time_adj += (time_adj >> 2) + (time_adj >> 5);
+#endif
+#if HZ == 1000
+    /* Compensate for (HZ==1000) != (1 << SHIFT_HZ).
+     * Add 1.5625% and 0.78125% to get 1023.4375; => only 0.05% error (p. 14)
+     */
+    if (time_adj < 0)
+	time_adj -= (-time_adj >> 6) + (-time_adj >> 7);
+    else
+	time_adj += (time_adj >> 6) + (time_adj >> 7);
+#endif
+}
+
+/* in the NTP reference this is called "hardclock()" */
+static void update_wall_time_one_tick(void)
+{
+	long time_adjust_step, delta_nsec;
+
+	if ( (time_adjust_step = time_adjust) != 0 ) {
+	    /* We are doing an adjtime thing. 
+	     *
+	     * Prepare time_adjust_step to be within bounds.
+	     * Note that a positive time_adjust means we want the clock
+	     * to run faster.
+	     *
+	     * Limit the amount of the step to be in the range
+	     * -tickadj .. +tickadj
+	     */
+	     if (time_adjust > tickadj)
+		time_adjust_step = tickadj;
+	     else if (time_adjust < -tickadj)
+		time_adjust_step = -tickadj;
+
+	    /* Reduce by this step the amount of time left  */
+	    time_adjust -= time_adjust_step;
+	}
+	delta_nsec = tick_nsec + time_adjust_step * 1000;
+	/*
+	 * Advance the phase, once it gets to one microsecond, then
+	 * advance the tick more.
+	 */
+	time_phase += time_adj;
+	if (time_phase <= -FINENSEC) {
+		long ltemp = -time_phase >> (SHIFT_SCALE - 10);
+		time_phase += ltemp << (SHIFT_SCALE - 10);
+		delta_nsec -= ltemp;
+	}
+	else if (time_phase >= FINENSEC) {
+		long ltemp = time_phase >> (SHIFT_SCALE - 10);
+		time_phase -= ltemp << (SHIFT_SCALE - 10);
+		delta_nsec += ltemp;
+	}
+	xtime.tv_nsec += delta_nsec;
+	time_interpolator_update(delta_nsec);
+
+	/* Changes by adjtime() do not take effect till next tick. */
+	if (time_next_adjust != 0) {
+		time_adjust = time_next_adjust;
+		time_next_adjust = 0;
+	}
+}
+
+/*
+ * Using a loop looks inefficient, but "ticks" is
+ * usually just one (we shouldn't be losing ticks,
+ * we're doing this this way mainly for interrupt
+ * latency reasons, not because we think we'll
+ * have lots of lost timer ticks
+ */
+static void update_wall_time(unsigned long ticks)
+{
+	do {
+		ticks--;
+		update_wall_time_one_tick();
+		if (xtime.tv_nsec >= 1000000000) {
+			xtime.tv_nsec -= 1000000000;
+			xtime.tv_sec++;
+			second_overflow();
+		}
+	} while (ticks);
+}
+
+/*
+ * Called from the timer interrupt handler to charge one tick to the current 
+ * process.  user_tick is 1 if the tick is user time, 0 for system.
+ */
+void update_process_times(int user_tick)
+{
+	struct task_struct *p = current;
+	int cpu = smp_processor_id();
+
+	/* Note: this timer irq context must be accounted for as well. */
+	if (user_tick)
+		account_user_time(p, jiffies_to_cputime(1));
+	else
+		account_system_time(p, HARDIRQ_OFFSET, jiffies_to_cputime(1));
+	run_local_timers();
+	if (rcu_pending(cpu))
+		rcu_check_callbacks(cpu, user_tick);
+	scheduler_tick();
+ 	run_posix_cpu_timers(p);
+}
+
+/*
+ * Nr of active tasks - counted in fixed-point numbers
+ */
+static unsigned long count_active_tasks(void)
+{
+	return (nr_running() + nr_uninterruptible()) * FIXED_1;
+}
+
+/*
+ * Hmm.. Changed this, as the GNU make sources (load.c) seems to
+ * imply that avenrun[] is the standard name for this kind of thing.
+ * Nothing else seems to be standardized: the fractional size etc
+ * all seem to differ on different machines.
+ *
+ * Requires xtime_lock to access.
+ */
+unsigned long avenrun[3];
+
+EXPORT_SYMBOL(avenrun);
+
+/*
+ * calc_load - given tick count, update the avenrun load estimates.
+ * This is called while holding a write_lock on xtime_lock.
+ */
+static inline void calc_load(unsigned long ticks)
+{
+	unsigned long active_tasks; /* fixed-point */
+	static int count = LOAD_FREQ;
+
+	count -= ticks;
+	if (count < 0) {
+		count += LOAD_FREQ;
+		active_tasks = count_active_tasks();
+		CALC_LOAD(avenrun[0], EXP_1, active_tasks);
+		CALC_LOAD(avenrun[1], EXP_5, active_tasks);
+		CALC_LOAD(avenrun[2], EXP_15, active_tasks);
+	}
+}
+
+/* jiffies at the most recent update of wall time */
+unsigned long wall_jiffies = INITIAL_JIFFIES;
+
+/*
+ * This read-write spinlock protects us from races in SMP while
+ * playing with xtime and avenrun.
+ */
+#ifndef ARCH_HAVE_XTIME_LOCK
+seqlock_t xtime_lock __cacheline_aligned_in_smp = SEQLOCK_UNLOCKED;
+
+EXPORT_SYMBOL(xtime_lock);
+#endif
+
+/*
+ * This function runs timers and the timer-tq in bottom half context.
+ */
+static void run_timer_softirq(struct softirq_action *h)
+{
+	tvec_base_t *base = &__get_cpu_var(tvec_bases);
+
+	if (time_after_eq(jiffies, base->timer_jiffies))
+		__run_timers(base);
+}
+
+/*
+ * Called by the local, per-CPU timer interrupt on SMP.
+ */
+void run_local_timers(void)
+{
+	raise_softirq(TIMER_SOFTIRQ);
+}
+
+/*
+ * Called by the timer interrupt. xtime_lock must already be taken
+ * by the timer IRQ!
+ */
+static inline void update_times(void)
+{
+	unsigned long ticks;
+
+	ticks = jiffies - wall_jiffies;
+	if (ticks) {
+		wall_jiffies += ticks;
+		update_wall_time(ticks);
+	}
+	calc_load(ticks);
+}
+  
+/*
+ * The 64-bit jiffies value is not atomic - you MUST NOT read it
+ * without sampling the sequence number in xtime_lock.
+ * jiffies is defined in the linker script...
+ */
+
+void do_timer(struct pt_regs *regs)
+{
+	jiffies_64++;
+	update_times();
+}
+
+#ifdef __ARCH_WANT_SYS_ALARM
+
+/*
+ * For backwards compatibility?  This can be done in libc so Alpha
+ * and all newer ports shouldn't need it.
+ */
+asmlinkage unsigned long sys_alarm(unsigned int seconds)
+{
+	struct itimerval it_new, it_old;
+	unsigned int oldalarm;
+
+	it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0;
+	it_new.it_value.tv_sec = seconds;
+	it_new.it_value.tv_usec = 0;
+	do_setitimer(ITIMER_REAL, &it_new, &it_old);
+	oldalarm = it_old.it_value.tv_sec;
+	/* ehhh.. We can't return 0 if we have an alarm pending.. */
+	/* And we'd better return too much than too little anyway */
+	if ((!oldalarm && it_old.it_value.tv_usec) || it_old.it_value.tv_usec >= 500000)
+		oldalarm++;
+	return oldalarm;
+}
+
+#endif
+
+#ifndef __alpha__
+
+/*
+ * The Alpha uses getxpid, getxuid, and getxgid instead.  Maybe this
+ * should be moved into arch/i386 instead?
+ */
+
+/**
+ * sys_getpid - return the thread group id of the current process
+ *
+ * Note, despite the name, this returns the tgid not the pid.  The tgid and
+ * the pid are identical unless CLONE_THREAD was specified on clone() in
+ * which case the tgid is the same in all threads of the same group.
+ *
+ * This is SMP safe as current->tgid does not change.
+ */
+asmlinkage long sys_getpid(void)
+{
+	return current->tgid;
+}
+
+/*
+ * Accessing ->group_leader->real_parent is not SMP-safe, it could
+ * change from under us. However, rather than getting any lock
+ * we can use an optimistic algorithm: get the parent
+ * pid, and go back and check that the parent is still
+ * the same. If it has changed (which is extremely unlikely
+ * indeed), we just try again..
+ *
+ * NOTE! This depends on the fact that even if we _do_
+ * get an old value of "parent", we can happily dereference
+ * the pointer (it was and remains a dereferencable kernel pointer
+ * no matter what): we just can't necessarily trust the result
+ * until we know that the parent pointer is valid.
+ *
+ * NOTE2: ->group_leader never changes from under us.
+ */
+asmlinkage long sys_getppid(void)
+{
+	int pid;
+	struct task_struct *me = current;
+	struct task_struct *parent;
+
+	parent = me->group_leader->real_parent;
+	for (;;) {
+		pid = parent->tgid;
+#ifdef CONFIG_SMP
+{
+		struct task_struct *old = parent;
+
+		/*
+		 * Make sure we read the pid before re-reading the
+		 * parent pointer:
+		 */
+		rmb();
+		parent = me->group_leader->real_parent;
+		if (old != parent)
+			continue;
+}
+#endif
+		break;
+	}
+	return pid;
+}
+
+asmlinkage long sys_getuid(void)
+{
+	/* Only we change this so SMP safe */
+	return current->uid;
+}
+
+asmlinkage long sys_geteuid(void)
+{
+	/* Only we change this so SMP safe */
+	return current->euid;
+}
+
+asmlinkage long sys_getgid(void)
+{
+	/* Only we change this so SMP safe */
+	return current->gid;
+}
+
+asmlinkage long sys_getegid(void)
+{
+	/* Only we change this so SMP safe */
+	return  current->egid;
+}
+
+#endif
+
+static void process_timeout(unsigned long __data)
+{
+	wake_up_process((task_t *)__data);
+}
+
+/**
+ * schedule_timeout - sleep until timeout
+ * @timeout: timeout value in jiffies
+ *
+ * Make the current task sleep until @timeout jiffies have
+ * elapsed. The routine will return immediately unless
+ * the current task state has been set (see set_current_state()).
+ *
+ * You can set the task state as follows -
+ *
+ * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to
+ * pass before the routine returns. The routine will return 0
+ *
+ * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
+ * delivered to the current task. In this case the remaining time
+ * in jiffies will be returned, or 0 if the timer expired in time
+ *
+ * The current task state is guaranteed to be TASK_RUNNING when this
+ * routine returns.
+ *
+ * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule
+ * the CPU away without a bound on the timeout. In this case the return
+ * value will be %MAX_SCHEDULE_TIMEOUT.
+ *
+ * In all cases the return value is guaranteed to be non-negative.
+ */
+fastcall signed long __sched schedule_timeout(signed long timeout)
+{
+	struct timer_list timer;
+	unsigned long expire;
+
+	switch (timeout)
+	{
+	case MAX_SCHEDULE_TIMEOUT:
+		/*
+		 * These two special cases are useful to be comfortable
+		 * in the caller. Nothing more. We could take
+		 * MAX_SCHEDULE_TIMEOUT from one of the negative value
+		 * but I' d like to return a valid offset (>=0) to allow
+		 * the caller to do everything it want with the retval.
+		 */
+		schedule();
+		goto out;
+	default:
+		/*
+		 * Another bit of PARANOID. Note that the retval will be
+		 * 0 since no piece of kernel is supposed to do a check
+		 * for a negative retval of schedule_timeout() (since it
+		 * should never happens anyway). You just have the printk()
+		 * that will tell you if something is gone wrong and where.
+		 */
+		if (timeout < 0)
+		{
+			printk(KERN_ERR "schedule_timeout: wrong timeout "
+			       "value %lx from %p\n", timeout,
+			       __builtin_return_address(0));
+			current->state = TASK_RUNNING;
+			goto out;
+		}
+	}
+
+	expire = timeout + jiffies;
+
+	init_timer(&timer);
+	timer.expires = expire;
+	timer.data = (unsigned long) current;
+	timer.function = process_timeout;
+
+	add_timer(&timer);
+	schedule();
+	del_singleshot_timer_sync(&timer);
+
+	timeout = expire - jiffies;
+
+ out:
+	return timeout < 0 ? 0 : timeout;
+}
+
+EXPORT_SYMBOL(schedule_timeout);
+
+/* Thread ID - the internal kernel "pid" */
+asmlinkage long sys_gettid(void)
+{
+	return current->pid;
+}
+
+static long __sched nanosleep_restart(struct restart_block *restart)
+{
+	unsigned long expire = restart->arg0, now = jiffies;
+	struct timespec __user *rmtp = (struct timespec __user *) restart->arg1;
+	long ret;
+
+	/* Did it expire while we handled signals? */
+	if (!time_after(expire, now))
+		return 0;
+
+	current->state = TASK_INTERRUPTIBLE;
+	expire = schedule_timeout(expire - now);
+
+	ret = 0;
+	if (expire) {
+		struct timespec t;
+		jiffies_to_timespec(expire, &t);
+
+		ret = -ERESTART_RESTARTBLOCK;
+		if (rmtp && copy_to_user(rmtp, &t, sizeof(t)))
+			ret = -EFAULT;
+		/* The 'restart' block is already filled in */
+	}
+	return ret;
+}
+
+asmlinkage long sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp)
+{
+	struct timespec t;
+	unsigned long expire;
+	long ret;
+
+	if (copy_from_user(&t, rqtp, sizeof(t)))
+		return -EFAULT;
+
+	if ((t.tv_nsec >= 1000000000L) || (t.tv_nsec < 0) || (t.tv_sec < 0))
+		return -EINVAL;
+
+	expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec);
+	current->state = TASK_INTERRUPTIBLE;
+	expire = schedule_timeout(expire);
+
+	ret = 0;
+	if (expire) {
+		struct restart_block *restart;
+		jiffies_to_timespec(expire, &t);
+		if (rmtp && copy_to_user(rmtp, &t, sizeof(t)))
+			return -EFAULT;
+
+		restart = &current_thread_info()->restart_block;
+		restart->fn = nanosleep_restart;
+		restart->arg0 = jiffies + expire;
+		restart->arg1 = (unsigned long) rmtp;
+		ret = -ERESTART_RESTARTBLOCK;
+	}
+	return ret;
+}
+
+/*
+ * sys_sysinfo - fill in sysinfo struct
+ */ 
+asmlinkage long sys_sysinfo(struct sysinfo __user *info)
+{
+	struct sysinfo val;
+	unsigned long mem_total, sav_total;
+	unsigned int mem_unit, bitcount;
+	unsigned long seq;
+
+	memset((char *)&val, 0, sizeof(struct sysinfo));
+
+	do {
+		struct timespec tp;
+		seq = read_seqbegin(&xtime_lock);
+
+		/*
+		 * This is annoying.  The below is the same thing
+		 * posix_get_clock_monotonic() does, but it wants to
+		 * take the lock which we want to cover the loads stuff
+		 * too.
+		 */
+
+		getnstimeofday(&tp);
+		tp.tv_sec += wall_to_monotonic.tv_sec;
+		tp.tv_nsec += wall_to_monotonic.tv_nsec;
+		if (tp.tv_nsec - NSEC_PER_SEC >= 0) {
+			tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC;
+			tp.tv_sec++;
+		}
+		val.uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
+
+		val.loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT);
+		val.loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT);
+		val.loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT);
+
+		val.procs = nr_threads;
+	} while (read_seqretry(&xtime_lock, seq));
+
+	si_meminfo(&val);
+	si_swapinfo(&val);
+
+	/*
+	 * If the sum of all the available memory (i.e. ram + swap)
+	 * is less than can be stored in a 32 bit unsigned long then
+	 * we can be binary compatible with 2.2.x kernels.  If not,
+	 * well, in that case 2.2.x was broken anyways...
+	 *
+	 *  -Erik Andersen <andersee@debian.org>
+	 */
+
+	mem_total = val.totalram + val.totalswap;
+	if (mem_total < val.totalram || mem_total < val.totalswap)
+		goto out;
+	bitcount = 0;
+	mem_unit = val.mem_unit;
+	while (mem_unit > 1) {
+		bitcount++;
+		mem_unit >>= 1;
+		sav_total = mem_total;
+		mem_total <<= 1;
+		if (mem_total < sav_total)
+			goto out;
+	}
+
+	/*
+	 * If mem_total did not overflow, multiply all memory values by
+	 * val.mem_unit and set it to 1.  This leaves things compatible
+	 * with 2.2.x, and also retains compatibility with earlier 2.4.x
+	 * kernels...
+	 */
+
+	val.mem_unit = 1;
+	val.totalram <<= bitcount;
+	val.freeram <<= bitcount;
+	val.sharedram <<= bitcount;
+	val.bufferram <<= bitcount;
+	val.totalswap <<= bitcount;
+	val.freeswap <<= bitcount;
+	val.totalhigh <<= bitcount;
+	val.freehigh <<= bitcount;
+
+ out:
+	if (copy_to_user(info, &val, sizeof(struct sysinfo)))
+		return -EFAULT;
+
+	return 0;
+}
+
+static void __devinit init_timers_cpu(int cpu)
+{
+	int j;
+	tvec_base_t *base;
+       
+	base = &per_cpu(tvec_bases, cpu);
+	spin_lock_init(&base->lock);
+	for (j = 0; j < TVN_SIZE; j++) {
+		INIT_LIST_HEAD(base->tv5.vec + j);
+		INIT_LIST_HEAD(base->tv4.vec + j);
+		INIT_LIST_HEAD(base->tv3.vec + j);
+		INIT_LIST_HEAD(base->tv2.vec + j);
+	}
+	for (j = 0; j < TVR_SIZE; j++)
+		INIT_LIST_HEAD(base->tv1.vec + j);
+
+	base->timer_jiffies = jiffies;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static int migrate_timer_list(tvec_base_t *new_base, struct list_head *head)
+{
+	struct timer_list *timer;
+
+	while (!list_empty(head)) {
+		timer = list_entry(head->next, struct timer_list, entry);
+		/* We're locking backwards from __mod_timer order here,
+		   beware deadlock. */
+		if (!spin_trylock(&timer->lock))
+			return 0;
+		list_del(&timer->entry);
+		internal_add_timer(new_base, timer);
+		timer->base = new_base;
+		spin_unlock(&timer->lock);
+	}
+	return 1;
+}
+
+static void __devinit migrate_timers(int cpu)
+{
+	tvec_base_t *old_base;
+	tvec_base_t *new_base;
+	int i;
+
+	BUG_ON(cpu_online(cpu));
+	old_base = &per_cpu(tvec_bases, cpu);
+	new_base = &get_cpu_var(tvec_bases);
+
+	local_irq_disable();
+again:
+	/* Prevent deadlocks via ordering by old_base < new_base. */
+	if (old_base < new_base) {
+		spin_lock(&new_base->lock);
+		spin_lock(&old_base->lock);
+	} else {
+		spin_lock(&old_base->lock);
+		spin_lock(&new_base->lock);
+	}
+
+	if (old_base->running_timer)
+		BUG();
+	for (i = 0; i < TVR_SIZE; i++)
+		if (!migrate_timer_list(new_base, old_base->tv1.vec + i))
+			goto unlock_again;
+	for (i = 0; i < TVN_SIZE; i++)
+		if (!migrate_timer_list(new_base, old_base->tv2.vec + i)
+		    || !migrate_timer_list(new_base, old_base->tv3.vec + i)
+		    || !migrate_timer_list(new_base, old_base->tv4.vec + i)
+		    || !migrate_timer_list(new_base, old_base->tv5.vec + i))
+			goto unlock_again;
+	spin_unlock(&old_base->lock);
+	spin_unlock(&new_base->lock);
+	local_irq_enable();
+	put_cpu_var(tvec_bases);
+	return;
+
+unlock_again:
+	/* Avoid deadlock with __mod_timer, by backing off. */
+	spin_unlock(&old_base->lock);
+	spin_unlock(&new_base->lock);
+	cpu_relax();
+	goto again;
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
+static int __devinit timer_cpu_notify(struct notifier_block *self, 
+				unsigned long action, void *hcpu)
+{
+	long cpu = (long)hcpu;
+	switch(action) {
+	case CPU_UP_PREPARE:
+		init_timers_cpu(cpu);
+		break;
+#ifdef CONFIG_HOTPLUG_CPU
+	case CPU_DEAD:
+		migrate_timers(cpu);
+		break;
+#endif
+	default:
+		break;
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block __devinitdata timers_nb = {
+	.notifier_call	= timer_cpu_notify,
+};
+
+
+void __init init_timers(void)
+{
+	timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE,
+				(void *)(long)smp_processor_id());
+	register_cpu_notifier(&timers_nb);
+	open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL);
+}
+
+#ifdef CONFIG_TIME_INTERPOLATION
+
+struct time_interpolator *time_interpolator;
+static struct time_interpolator *time_interpolator_list;
+static DEFINE_SPINLOCK(time_interpolator_lock);
+
+static inline u64 time_interpolator_get_cycles(unsigned int src)
+{
+	unsigned long (*x)(void);
+
+	switch (src)
+	{
+		case TIME_SOURCE_FUNCTION:
+			x = time_interpolator->addr;
+			return x();
+
+		case TIME_SOURCE_MMIO64	:
+			return readq((void __iomem *) time_interpolator->addr);
+
+		case TIME_SOURCE_MMIO32	:
+			return readl((void __iomem *) time_interpolator->addr);
+
+		default: return get_cycles();
+	}
+}
+
+static inline u64 time_interpolator_get_counter(void)
+{
+	unsigned int src = time_interpolator->source;
+
+	if (time_interpolator->jitter)
+	{
+		u64 lcycle;
+		u64 now;
+
+		do {
+			lcycle = time_interpolator->last_cycle;
+			now = time_interpolator_get_cycles(src);
+			if (lcycle && time_after(lcycle, now))
+				return lcycle;
+			/* Keep track of the last timer value returned. The use of cmpxchg here
+			 * will cause contention in an SMP environment.
+			 */
+		} while (unlikely(cmpxchg(&time_interpolator->last_cycle, lcycle, now) != lcycle));
+		return now;
+	}
+	else
+		return time_interpolator_get_cycles(src);
+}
+
+void time_interpolator_reset(void)
+{
+	time_interpolator->offset = 0;
+	time_interpolator->last_counter = time_interpolator_get_counter();
+}
+
+#define GET_TI_NSECS(count,i) (((((count) - i->last_counter) & (i)->mask) * (i)->nsec_per_cyc) >> (i)->shift)
+
+unsigned long time_interpolator_get_offset(void)
+{
+	/* If we do not have a time interpolator set up then just return zero */
+	if (!time_interpolator)
+		return 0;
+
+	return time_interpolator->offset +
+		GET_TI_NSECS(time_interpolator_get_counter(), time_interpolator);
+}
+
+#define INTERPOLATOR_ADJUST 65536
+#define INTERPOLATOR_MAX_SKIP 10*INTERPOLATOR_ADJUST
+
+static void time_interpolator_update(long delta_nsec)
+{
+	u64 counter;
+	unsigned long offset;
+
+	/* If there is no time interpolator set up then do nothing */
+	if (!time_interpolator)
+		return;
+
+	/* The interpolator compensates for late ticks by accumulating
+         * the late time in time_interpolator->offset. A tick earlier than
+	 * expected will lead to a reset of the offset and a corresponding
+	 * jump of the clock forward. Again this only works if the
+	 * interpolator clock is running slightly slower than the regular clock
+	 * and the tuning logic insures that.
+         */
+
+	counter = time_interpolator_get_counter();
+	offset = time_interpolator->offset + GET_TI_NSECS(counter, time_interpolator);
+
+	if (delta_nsec < 0 || (unsigned long) delta_nsec < offset)
+		time_interpolator->offset = offset - delta_nsec;
+	else {
+		time_interpolator->skips++;
+		time_interpolator->ns_skipped += delta_nsec - offset;
+		time_interpolator->offset = 0;
+	}
+	time_interpolator->last_counter = counter;
+
+	/* Tuning logic for time interpolator invoked every minute or so.
+	 * Decrease interpolator clock speed if no skips occurred and an offset is carried.
+	 * Increase interpolator clock speed if we skip too much time.
+	 */
+	if (jiffies % INTERPOLATOR_ADJUST == 0)
+	{
+		if (time_interpolator->skips == 0 && time_interpolator->offset > TICK_NSEC)
+			time_interpolator->nsec_per_cyc--;
+		if (time_interpolator->ns_skipped > INTERPOLATOR_MAX_SKIP && time_interpolator->offset == 0)
+			time_interpolator->nsec_per_cyc++;
+		time_interpolator->skips = 0;
+		time_interpolator->ns_skipped = 0;
+	}
+}
+
+static inline int
+is_better_time_interpolator(struct time_interpolator *new)
+{
+	if (!time_interpolator)
+		return 1;
+	return new->frequency > 2*time_interpolator->frequency ||
+	    (unsigned long)new->drift < (unsigned long)time_interpolator->drift;
+}
+
+void
+register_time_interpolator(struct time_interpolator *ti)
+{
+	unsigned long flags;
+
+	/* Sanity check */
+	if (ti->frequency == 0 || ti->mask == 0)
+		BUG();
+
+	ti->nsec_per_cyc = ((u64)NSEC_PER_SEC << ti->shift) / ti->frequency;
+	spin_lock(&time_interpolator_lock);
+	write_seqlock_irqsave(&xtime_lock, flags);
+	if (is_better_time_interpolator(ti)) {
+		time_interpolator = ti;
+		time_interpolator_reset();
+	}
+	write_sequnlock_irqrestore(&xtime_lock, flags);
+
+	ti->next = time_interpolator_list;
+	time_interpolator_list = ti;
+	spin_unlock(&time_interpolator_lock);
+}
+
+void
+unregister_time_interpolator(struct time_interpolator *ti)
+{
+	struct time_interpolator *curr, **prev;
+	unsigned long flags;
+
+	spin_lock(&time_interpolator_lock);
+	prev = &time_interpolator_list;
+	for (curr = *prev; curr; curr = curr->next) {
+		if (curr == ti) {
+			*prev = curr->next;
+			break;
+		}
+		prev = &curr->next;
+	}
+
+	write_seqlock_irqsave(&xtime_lock, flags);
+	if (ti == time_interpolator) {
+		/* we lost the best time-interpolator: */
+		time_interpolator = NULL;
+		/* find the next-best interpolator */
+		for (curr = time_interpolator_list; curr; curr = curr->next)
+			if (is_better_time_interpolator(curr))
+				time_interpolator = curr;
+		time_interpolator_reset();
+	}
+	write_sequnlock_irqrestore(&xtime_lock, flags);
+	spin_unlock(&time_interpolator_lock);
+}
+#endif /* CONFIG_TIME_INTERPOLATION */
+
+/**
+ * msleep - sleep safely even with waitqueue interruptions
+ * @msecs: Time in milliseconds to sleep for
+ */
+void msleep(unsigned int msecs)
+{
+	unsigned long timeout = msecs_to_jiffies(msecs) + 1;
+
+	while (timeout) {
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		timeout = schedule_timeout(timeout);
+	}
+}
+
+EXPORT_SYMBOL(msleep);
+
+/**
+ * msleep_interruptible - sleep waiting for waitqueue interruptions
+ * @msecs: Time in milliseconds to sleep for
+ */
+unsigned long msleep_interruptible(unsigned int msecs)
+{
+	unsigned long timeout = msecs_to_jiffies(msecs) + 1;
+
+	while (timeout && !signal_pending(current)) {
+		set_current_state(TASK_INTERRUPTIBLE);
+		timeout = schedule_timeout(timeout);
+	}
+	return jiffies_to_msecs(timeout);
+}
+
+EXPORT_SYMBOL(msleep_interruptible);
diff --git a/kernel/uid16.c b/kernel/uid16.c
new file mode 100644
index 000000000000..f669941e8b26
--- /dev/null
+++ b/kernel/uid16.c
@@ -0,0 +1,196 @@
+/*
+ *	Wrapper functions for 16bit uid back compatibility. All nicely tied
+ *	together in the faint hope we can take the out in five years time.
+ */
+
+#include <linux/mm.h>
+#include <linux/utsname.h>
+#include <linux/mman.h>
+#include <linux/smp_lock.h>
+#include <linux/notifier.h>
+#include <linux/reboot.h>
+#include <linux/prctl.h>
+#include <linux/init.h>
+#include <linux/highuid.h>
+#include <linux/security.h>
+#include <linux/syscalls.h>
+
+#include <asm/uaccess.h>
+
+asmlinkage long sys_chown16(const char __user * filename, old_uid_t user, old_gid_t group)
+{
+	return sys_chown(filename, low2highuid(user), low2highgid(group));
+}
+
+asmlinkage long sys_lchown16(const char __user * filename, old_uid_t user, old_gid_t group)
+{
+	return sys_lchown(filename, low2highuid(user), low2highgid(group));
+}
+
+asmlinkage long sys_fchown16(unsigned int fd, old_uid_t user, old_gid_t group)
+{
+	return sys_fchown(fd, low2highuid(user), low2highgid(group));
+}
+
+asmlinkage long sys_setregid16(old_gid_t rgid, old_gid_t egid)
+{
+	return sys_setregid(low2highgid(rgid), low2highgid(egid));
+}
+
+asmlinkage long sys_setgid16(old_gid_t gid)
+{
+	return sys_setgid(low2highgid(gid));
+}
+
+asmlinkage long sys_setreuid16(old_uid_t ruid, old_uid_t euid)
+{
+	return sys_setreuid(low2highuid(ruid), low2highuid(euid));
+}
+
+asmlinkage long sys_setuid16(old_uid_t uid)
+{
+	return sys_setuid(low2highuid(uid));
+}
+
+asmlinkage long sys_setresuid16(old_uid_t ruid, old_uid_t euid, old_uid_t suid)
+{
+	return sys_setresuid(low2highuid(ruid), low2highuid(euid),
+		low2highuid(suid));
+}
+
+asmlinkage long sys_getresuid16(old_uid_t __user *ruid, old_uid_t __user *euid, old_uid_t __user *suid)
+{
+	int retval;
+
+	if (!(retval = put_user(high2lowuid(current->uid), ruid)) &&
+	    !(retval = put_user(high2lowuid(current->euid), euid)))
+		retval = put_user(high2lowuid(current->suid), suid);
+
+	return retval;
+}
+
+asmlinkage long sys_setresgid16(old_gid_t rgid, old_gid_t egid, old_gid_t sgid)
+{
+	return sys_setresgid(low2highgid(rgid), low2highgid(egid),
+		low2highgid(sgid));
+}
+
+asmlinkage long sys_getresgid16(old_gid_t __user *rgid, old_gid_t __user *egid, old_gid_t __user *sgid)
+{
+	int retval;
+
+	if (!(retval = put_user(high2lowgid(current->gid), rgid)) &&
+	    !(retval = put_user(high2lowgid(current->egid), egid)))
+		retval = put_user(high2lowgid(current->sgid), sgid);
+
+	return retval;
+}
+
+asmlinkage long sys_setfsuid16(old_uid_t uid)
+{
+	return sys_setfsuid(low2highuid(uid));
+}
+
+asmlinkage long sys_setfsgid16(old_gid_t gid)
+{
+	return sys_setfsgid(low2highgid(gid));
+}
+
+static int groups16_to_user(old_gid_t __user *grouplist,
+    struct group_info *group_info)
+{
+	int i;
+	old_gid_t group;
+
+	for (i = 0; i < group_info->ngroups; i++) {
+		group = high2lowgid(GROUP_AT(group_info, i));
+		if (put_user(group, grouplist+i))
+			return -EFAULT;
+	}
+
+	return 0;
+}
+
+static int groups16_from_user(struct group_info *group_info,
+    old_gid_t __user *grouplist)
+{
+	int i;
+	old_gid_t group;
+
+	for (i = 0; i < group_info->ngroups; i++) {
+		if (get_user(group, grouplist+i))
+			return  -EFAULT;
+		GROUP_AT(group_info, i) = low2highgid(group);
+	}
+
+	return 0;
+}
+
+asmlinkage long sys_getgroups16(int gidsetsize, old_gid_t __user *grouplist)
+{
+	int i = 0;
+
+	if (gidsetsize < 0)
+		return -EINVAL;
+
+	get_group_info(current->group_info);
+	i = current->group_info->ngroups;
+	if (gidsetsize) {
+		if (i > gidsetsize) {
+			i = -EINVAL;
+			goto out;
+		}
+		if (groups16_to_user(grouplist, current->group_info)) {
+			i = -EFAULT;
+			goto out;
+		}
+	}
+out:
+	put_group_info(current->group_info);
+	return i;
+}
+
+asmlinkage long sys_setgroups16(int gidsetsize, old_gid_t __user *grouplist)
+{
+	struct group_info *group_info;
+	int retval;
+
+	if (!capable(CAP_SETGID))
+		return -EPERM;
+	if ((unsigned)gidsetsize > NGROUPS_MAX)
+		return -EINVAL;
+
+	group_info = groups_alloc(gidsetsize);
+	if (!group_info)
+		return -ENOMEM;
+	retval = groups16_from_user(group_info, grouplist);
+	if (retval) {
+		put_group_info(group_info);
+		return retval;
+	}
+
+	retval = set_current_groups(group_info);
+	put_group_info(group_info);
+
+	return retval;
+}
+
+asmlinkage long sys_getuid16(void)
+{
+	return high2lowuid(current->uid);
+}
+
+asmlinkage long sys_geteuid16(void)
+{
+	return high2lowuid(current->euid);
+}
+
+asmlinkage long sys_getgid16(void)
+{
+	return high2lowgid(current->gid);
+}
+
+asmlinkage long sys_getegid16(void)
+{
+	return high2lowgid(current->egid);
+}
diff --git a/kernel/user.c b/kernel/user.c
new file mode 100644
index 000000000000..734575d55769
--- /dev/null
+++ b/kernel/user.c
@@ -0,0 +1,189 @@
+/*
+ * The "user cache".
+ *
+ * (C) Copyright 1991-2000 Linus Torvalds
+ *
+ * We have a per-user structure to keep track of how many
+ * processes, files etc the user has claimed, in order to be
+ * able to have per-user limits for system resources. 
+ */
+
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/bitops.h>
+#include <linux/key.h>
+
+/*
+ * UID task count cache, to get fast user lookup in "alloc_uid"
+ * when changing user ID's (ie setuid() and friends).
+ */
+
+#define UIDHASH_BITS (CONFIG_BASE_SMALL ? 3 : 8)
+#define UIDHASH_SZ		(1 << UIDHASH_BITS)
+#define UIDHASH_MASK		(UIDHASH_SZ - 1)
+#define __uidhashfn(uid)	(((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK)
+#define uidhashentry(uid)	(uidhash_table + __uidhashfn((uid)))
+
+static kmem_cache_t *uid_cachep;
+static struct list_head uidhash_table[UIDHASH_SZ];
+static DEFINE_SPINLOCK(uidhash_lock);
+
+struct user_struct root_user = {
+	.__count	= ATOMIC_INIT(1),
+	.processes	= ATOMIC_INIT(1),
+	.files		= ATOMIC_INIT(0),
+	.sigpending	= ATOMIC_INIT(0),
+	.mq_bytes	= 0,
+	.locked_shm     = 0,
+#ifdef CONFIG_KEYS
+	.uid_keyring	= &root_user_keyring,
+	.session_keyring = &root_session_keyring,
+#endif
+};
+
+/*
+ * These routines must be called with the uidhash spinlock held!
+ */
+static inline void uid_hash_insert(struct user_struct *up, struct list_head *hashent)
+{
+	list_add(&up->uidhash_list, hashent);
+}
+
+static inline void uid_hash_remove(struct user_struct *up)
+{
+	list_del(&up->uidhash_list);
+}
+
+static inline struct user_struct *uid_hash_find(uid_t uid, struct list_head *hashent)
+{
+	struct list_head *up;
+
+	list_for_each(up, hashent) {
+		struct user_struct *user;
+
+		user = list_entry(up, struct user_struct, uidhash_list);
+
+		if(user->uid == uid) {
+			atomic_inc(&user->__count);
+			return user;
+		}
+	}
+
+	return NULL;
+}
+
+/*
+ * Locate the user_struct for the passed UID.  If found, take a ref on it.  The
+ * caller must undo that ref with free_uid().
+ *
+ * If the user_struct could not be found, return NULL.
+ */
+struct user_struct *find_user(uid_t uid)
+{
+	struct user_struct *ret;
+
+	spin_lock(&uidhash_lock);
+	ret = uid_hash_find(uid, uidhashentry(uid));
+	spin_unlock(&uidhash_lock);
+	return ret;
+}
+
+void free_uid(struct user_struct *up)
+{
+	if (up && atomic_dec_and_lock(&up->__count, &uidhash_lock)) {
+		uid_hash_remove(up);
+		key_put(up->uid_keyring);
+		key_put(up->session_keyring);
+		kmem_cache_free(uid_cachep, up);
+		spin_unlock(&uidhash_lock);
+	}
+}
+
+struct user_struct * alloc_uid(uid_t uid)
+{
+	struct list_head *hashent = uidhashentry(uid);
+	struct user_struct *up;
+
+	spin_lock(&uidhash_lock);
+	up = uid_hash_find(uid, hashent);
+	spin_unlock(&uidhash_lock);
+
+	if (!up) {
+		struct user_struct *new;
+
+		new = kmem_cache_alloc(uid_cachep, SLAB_KERNEL);
+		if (!new)
+			return NULL;
+		new->uid = uid;
+		atomic_set(&new->__count, 1);
+		atomic_set(&new->processes, 0);
+		atomic_set(&new->files, 0);
+		atomic_set(&new->sigpending, 0);
+
+		new->mq_bytes = 0;
+		new->locked_shm = 0;
+
+		if (alloc_uid_keyring(new) < 0) {
+			kmem_cache_free(uid_cachep, new);
+			return NULL;
+		}
+
+		/*
+		 * Before adding this, check whether we raced
+		 * on adding the same user already..
+		 */
+		spin_lock(&uidhash_lock);
+		up = uid_hash_find(uid, hashent);
+		if (up) {
+			key_put(new->uid_keyring);
+			key_put(new->session_keyring);
+			kmem_cache_free(uid_cachep, new);
+		} else {
+			uid_hash_insert(new, hashent);
+			up = new;
+		}
+		spin_unlock(&uidhash_lock);
+
+	}
+	return up;
+}
+
+void switch_uid(struct user_struct *new_user)
+{
+	struct user_struct *old_user;
+
+	/* What if a process setreuid()'s and this brings the
+	 * new uid over his NPROC rlimit?  We can check this now
+	 * cheaply with the new uid cache, so if it matters
+	 * we should be checking for it.  -DaveM
+	 */
+	old_user = current->user;
+	atomic_inc(&new_user->processes);
+	atomic_dec(&old_user->processes);
+	switch_uid_keyring(new_user);
+	current->user = new_user;
+	free_uid(old_user);
+	suid_keys(current);
+}
+
+
+static int __init uid_cache_init(void)
+{
+	int n;
+
+	uid_cachep = kmem_cache_create("uid_cache", sizeof(struct user_struct),
+			0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
+
+	for(n = 0; n < UIDHASH_SZ; ++n)
+		INIT_LIST_HEAD(uidhash_table + n);
+
+	/* Insert the root user immediately (init already runs as root) */
+	spin_lock(&uidhash_lock);
+	uid_hash_insert(&root_user, uidhashentry(0));
+	spin_unlock(&uidhash_lock);
+
+	return 0;
+}
+
+module_init(uid_cache_init);
diff --git a/kernel/wait.c b/kernel/wait.c
new file mode 100644
index 000000000000..791681cfea98
--- /dev/null
+++ b/kernel/wait.c
@@ -0,0 +1,246 @@
+/*
+ * Generic waiting primitives.
+ *
+ * (C) 2004 William Irwin, Oracle
+ */
+#include <linux/config.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/wait.h>
+#include <linux/hash.h>
+
+void fastcall add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait)
+{
+	unsigned long flags;
+
+	wait->flags &= ~WQ_FLAG_EXCLUSIVE;
+	spin_lock_irqsave(&q->lock, flags);
+	__add_wait_queue(q, wait);
+	spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL(add_wait_queue);
+
+void fastcall add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait)
+{
+	unsigned long flags;
+
+	wait->flags |= WQ_FLAG_EXCLUSIVE;
+	spin_lock_irqsave(&q->lock, flags);
+	__add_wait_queue_tail(q, wait);
+	spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL(add_wait_queue_exclusive);
+
+void fastcall remove_wait_queue(wait_queue_head_t *q, wait_queue_t *wait)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&q->lock, flags);
+	__remove_wait_queue(q, wait);
+	spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL(remove_wait_queue);
+
+
+/*
+ * Note: we use "set_current_state()" _after_ the wait-queue add,
+ * because we need a memory barrier there on SMP, so that any
+ * wake-function that tests for the wait-queue being active
+ * will be guaranteed to see waitqueue addition _or_ subsequent
+ * tests in this thread will see the wakeup having taken place.
+ *
+ * The spin_unlock() itself is semi-permeable and only protects
+ * one way (it only protects stuff inside the critical region and
+ * stops them from bleeding out - it would still allow subsequent
+ * loads to move into the the critical region).
+ */
+void fastcall
+prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state)
+{
+	unsigned long flags;
+
+	wait->flags &= ~WQ_FLAG_EXCLUSIVE;
+	spin_lock_irqsave(&q->lock, flags);
+	if (list_empty(&wait->task_list))
+		__add_wait_queue(q, wait);
+	/*
+	 * don't alter the task state if this is just going to
+	 * queue an async wait queue callback
+	 */
+	if (is_sync_wait(wait))
+		set_current_state(state);
+	spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL(prepare_to_wait);
+
+void fastcall
+prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
+{
+	unsigned long flags;
+
+	wait->flags |= WQ_FLAG_EXCLUSIVE;
+	spin_lock_irqsave(&q->lock, flags);
+	if (list_empty(&wait->task_list))
+		__add_wait_queue_tail(q, wait);
+	/*
+	 * don't alter the task state if this is just going to
+ 	 * queue an async wait queue callback
+	 */
+	if (is_sync_wait(wait))
+		set_current_state(state);
+	spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL(prepare_to_wait_exclusive);
+
+void fastcall finish_wait(wait_queue_head_t *q, wait_queue_t *wait)
+{
+	unsigned long flags;
+
+	__set_current_state(TASK_RUNNING);
+	/*
+	 * We can check for list emptiness outside the lock
+	 * IFF:
+	 *  - we use the "careful" check that verifies both
+	 *    the next and prev pointers, so that there cannot
+	 *    be any half-pending updates in progress on other
+	 *    CPU's that we haven't seen yet (and that might
+	 *    still change the stack area.
+	 * and
+	 *  - all other users take the lock (ie we can only
+	 *    have _one_ other CPU that looks at or modifies
+	 *    the list).
+	 */
+	if (!list_empty_careful(&wait->task_list)) {
+		spin_lock_irqsave(&q->lock, flags);
+		list_del_init(&wait->task_list);
+		spin_unlock_irqrestore(&q->lock, flags);
+	}
+}
+EXPORT_SYMBOL(finish_wait);
+
+int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
+{
+	int ret = default_wake_function(wait, mode, sync, key);
+
+	if (ret)
+		list_del_init(&wait->task_list);
+	return ret;
+}
+EXPORT_SYMBOL(autoremove_wake_function);
+
+int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *arg)
+{
+	struct wait_bit_key *key = arg;
+	struct wait_bit_queue *wait_bit
+		= container_of(wait, struct wait_bit_queue, wait);
+
+	if (wait_bit->key.flags != key->flags ||
+			wait_bit->key.bit_nr != key->bit_nr ||
+			test_bit(key->bit_nr, key->flags))
+		return 0;
+	else
+		return autoremove_wake_function(wait, mode, sync, key);
+}
+EXPORT_SYMBOL(wake_bit_function);
+
+/*
+ * To allow interruptible waiting and asynchronous (i.e. nonblocking)
+ * waiting, the actions of __wait_on_bit() and __wait_on_bit_lock() are
+ * permitted return codes. Nonzero return codes halt waiting and return.
+ */
+int __sched fastcall
+__wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q,
+			int (*action)(void *), unsigned mode)
+{
+	int ret = 0;
+
+	do {
+		prepare_to_wait(wq, &q->wait, mode);
+		if (test_bit(q->key.bit_nr, q->key.flags))
+			ret = (*action)(q->key.flags);
+	} while (test_bit(q->key.bit_nr, q->key.flags) && !ret);
+	finish_wait(wq, &q->wait);
+	return ret;
+}
+EXPORT_SYMBOL(__wait_on_bit);
+
+int __sched fastcall out_of_line_wait_on_bit(void *word, int bit,
+					int (*action)(void *), unsigned mode)
+{
+	wait_queue_head_t *wq = bit_waitqueue(word, bit);
+	DEFINE_WAIT_BIT(wait, word, bit);
+
+	return __wait_on_bit(wq, &wait, action, mode);
+}
+EXPORT_SYMBOL(out_of_line_wait_on_bit);
+
+int __sched fastcall
+__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q,
+			int (*action)(void *), unsigned mode)
+{
+	int ret = 0;
+
+	do {
+		prepare_to_wait_exclusive(wq, &q->wait, mode);
+		if (test_bit(q->key.bit_nr, q->key.flags)) {
+			if ((ret = (*action)(q->key.flags)))
+				break;
+		}
+	} while (test_and_set_bit(q->key.bit_nr, q->key.flags));
+	finish_wait(wq, &q->wait);
+	return ret;
+}
+EXPORT_SYMBOL(__wait_on_bit_lock);
+
+int __sched fastcall out_of_line_wait_on_bit_lock(void *word, int bit,
+					int (*action)(void *), unsigned mode)
+{
+	wait_queue_head_t *wq = bit_waitqueue(word, bit);
+	DEFINE_WAIT_BIT(wait, word, bit);
+
+	return __wait_on_bit_lock(wq, &wait, action, mode);
+}
+EXPORT_SYMBOL(out_of_line_wait_on_bit_lock);
+
+void fastcall __wake_up_bit(wait_queue_head_t *wq, void *word, int bit)
+{
+	struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit);
+	if (waitqueue_active(wq))
+		__wake_up(wq, TASK_INTERRUPTIBLE|TASK_UNINTERRUPTIBLE, 1, &key);
+}
+EXPORT_SYMBOL(__wake_up_bit);
+
+/**
+ * wake_up_bit - wake up a waiter on a bit
+ * @word: the word being waited on, a kernel virtual address
+ * @bit: the bit of the word being waited on
+ *
+ * There is a standard hashed waitqueue table for generic use. This
+ * is the part of the hashtable's accessor API that wakes up waiters
+ * on a bit. For instance, if one were to have waiters on a bitflag,
+ * one would call wake_up_bit() after clearing the bit.
+ *
+ * In order for this to function properly, as it uses waitqueue_active()
+ * internally, some kind of memory barrier must be done prior to calling
+ * this. Typically, this will be smp_mb__after_clear_bit(), but in some
+ * cases where bitflags are manipulated non-atomically under a lock, one
+ * may need to use a less regular barrier, such fs/inode.c's smp_mb(),
+ * because spin_unlock() does not guarantee a memory barrier.
+ */
+void fastcall wake_up_bit(void *word, int bit)
+{
+	__wake_up_bit(bit_waitqueue(word, bit), word, bit);
+}
+EXPORT_SYMBOL(wake_up_bit);
+
+fastcall wait_queue_head_t *bit_waitqueue(void *word, int bit)
+{
+	const int shift = BITS_PER_LONG == 32 ? 5 : 6;
+	const struct zone *zone = page_zone(virt_to_page(word));
+	unsigned long val = (unsigned long)word << shift | bit;
+
+	return &zone->wait_table[hash_long(val, zone->wait_table_bits)];
+}
+EXPORT_SYMBOL(bit_waitqueue);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
new file mode 100644
index 000000000000..52ef419d2747
--- /dev/null
+++ b/kernel/workqueue.c
@@ -0,0 +1,555 @@
+/*
+ * linux/kernel/workqueue.c
+ *
+ * Generic mechanism for defining kernel helper threads for running
+ * arbitrary tasks in process context.
+ *
+ * Started by Ingo Molnar, Copyright (C) 2002
+ *
+ * Derived from the taskqueue/keventd code by:
+ *
+ *   David Woodhouse <dwmw2@infradead.org>
+ *   Andrew Morton <andrewm@uow.edu.au>
+ *   Kai Petzke <wpp@marie.physik.tu-berlin.de>
+ *   Theodore Ts'o <tytso@mit.edu>
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/signal.h>
+#include <linux/completion.h>
+#include <linux/workqueue.h>
+#include <linux/slab.h>
+#include <linux/cpu.h>
+#include <linux/notifier.h>
+#include <linux/kthread.h>
+
+/*
+ * The per-CPU workqueue (if single thread, we always use cpu 0's).
+ *
+ * The sequence counters are for flush_scheduled_work().  It wants to wait
+ * until until all currently-scheduled works are completed, but it doesn't
+ * want to be livelocked by new, incoming ones.  So it waits until
+ * remove_sequence is >= the insert_sequence which pertained when
+ * flush_scheduled_work() was called.
+ */
+struct cpu_workqueue_struct {
+
+	spinlock_t lock;
+
+	long remove_sequence;	/* Least-recently added (next to run) */
+	long insert_sequence;	/* Next to add */
+
+	struct list_head worklist;
+	wait_queue_head_t more_work;
+	wait_queue_head_t work_done;
+
+	struct workqueue_struct *wq;
+	task_t *thread;
+
+	int run_depth;		/* Detect run_workqueue() recursion depth */
+} ____cacheline_aligned;
+
+/*
+ * The externally visible workqueue abstraction is an array of
+ * per-CPU workqueues:
+ */
+struct workqueue_struct {
+	struct cpu_workqueue_struct cpu_wq[NR_CPUS];
+	const char *name;
+	struct list_head list; 	/* Empty if single thread */
+};
+
+/* All the per-cpu workqueues on the system, for hotplug cpu to add/remove
+   threads to each one as cpus come/go. */
+static DEFINE_SPINLOCK(workqueue_lock);
+static LIST_HEAD(workqueues);
+
+/* If it's single threaded, it isn't in the list of workqueues. */
+static inline int is_single_threaded(struct workqueue_struct *wq)
+{
+	return list_empty(&wq->list);
+}
+
+/* Preempt must be disabled. */
+static void __queue_work(struct cpu_workqueue_struct *cwq,
+			 struct work_struct *work)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&cwq->lock, flags);
+	work->wq_data = cwq;
+	list_add_tail(&work->entry, &cwq->worklist);
+	cwq->insert_sequence++;
+	wake_up(&cwq->more_work);
+	spin_unlock_irqrestore(&cwq->lock, flags);
+}
+
+/*
+ * Queue work on a workqueue. Return non-zero if it was successfully
+ * added.
+ *
+ * We queue the work to the CPU it was submitted, but there is no
+ * guarantee that it will be processed by that CPU.
+ */
+int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work)
+{
+	int ret = 0, cpu = get_cpu();
+
+	if (!test_and_set_bit(0, &work->pending)) {
+		if (unlikely(is_single_threaded(wq)))
+			cpu = 0;
+		BUG_ON(!list_empty(&work->entry));
+		__queue_work(wq->cpu_wq + cpu, work);
+		ret = 1;
+	}
+	put_cpu();
+	return ret;
+}
+
+static void delayed_work_timer_fn(unsigned long __data)
+{
+	struct work_struct *work = (struct work_struct *)__data;
+	struct workqueue_struct *wq = work->wq_data;
+	int cpu = smp_processor_id();
+
+	if (unlikely(is_single_threaded(wq)))
+		cpu = 0;
+
+	__queue_work(wq->cpu_wq + cpu, work);
+}
+
+int fastcall queue_delayed_work(struct workqueue_struct *wq,
+			struct work_struct *work, unsigned long delay)
+{
+	int ret = 0;
+	struct timer_list *timer = &work->timer;
+
+	if (!test_and_set_bit(0, &work->pending)) {
+		BUG_ON(timer_pending(timer));
+		BUG_ON(!list_empty(&work->entry));
+
+		/* This stores wq for the moment, for the timer_fn */
+		work->wq_data = wq;
+		timer->expires = jiffies + delay;
+		timer->data = (unsigned long)work;
+		timer->function = delayed_work_timer_fn;
+		add_timer(timer);
+		ret = 1;
+	}
+	return ret;
+}
+
+static inline void run_workqueue(struct cpu_workqueue_struct *cwq)
+{
+	unsigned long flags;
+
+	/*
+	 * Keep taking off work from the queue until
+	 * done.
+	 */
+	spin_lock_irqsave(&cwq->lock, flags);
+	cwq->run_depth++;
+	if (cwq->run_depth > 3) {
+		/* morton gets to eat his hat */
+		printk("%s: recursion depth exceeded: %d\n",
+			__FUNCTION__, cwq->run_depth);
+		dump_stack();
+	}
+	while (!list_empty(&cwq->worklist)) {
+		struct work_struct *work = list_entry(cwq->worklist.next,
+						struct work_struct, entry);
+		void (*f) (void *) = work->func;
+		void *data = work->data;
+
+		list_del_init(cwq->worklist.next);
+		spin_unlock_irqrestore(&cwq->lock, flags);
+
+		BUG_ON(work->wq_data != cwq);
+		clear_bit(0, &work->pending);
+		f(data);
+
+		spin_lock_irqsave(&cwq->lock, flags);
+		cwq->remove_sequence++;
+		wake_up(&cwq->work_done);
+	}
+	cwq->run_depth--;
+	spin_unlock_irqrestore(&cwq->lock, flags);
+}
+
+static int worker_thread(void *__cwq)
+{
+	struct cpu_workqueue_struct *cwq = __cwq;
+	DECLARE_WAITQUEUE(wait, current);
+	struct k_sigaction sa;
+	sigset_t blocked;
+
+	current->flags |= PF_NOFREEZE;
+
+	set_user_nice(current, -5);
+
+	/* Block and flush all signals */
+	sigfillset(&blocked);
+	sigprocmask(SIG_BLOCK, &blocked, NULL);
+	flush_signals(current);
+
+	/* SIG_IGN makes children autoreap: see do_notify_parent(). */
+	sa.sa.sa_handler = SIG_IGN;
+	sa.sa.sa_flags = 0;
+	siginitset(&sa.sa.sa_mask, sigmask(SIGCHLD));
+	do_sigaction(SIGCHLD, &sa, (struct k_sigaction *)0);
+
+	set_current_state(TASK_INTERRUPTIBLE);
+	while (!kthread_should_stop()) {
+		add_wait_queue(&cwq->more_work, &wait);
+		if (list_empty(&cwq->worklist))
+			schedule();
+		else
+			__set_current_state(TASK_RUNNING);
+		remove_wait_queue(&cwq->more_work, &wait);
+
+		if (!list_empty(&cwq->worklist))
+			run_workqueue(cwq);
+		set_current_state(TASK_INTERRUPTIBLE);
+	}
+	__set_current_state(TASK_RUNNING);
+	return 0;
+}
+
+static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
+{
+	if (cwq->thread == current) {
+		/*
+		 * Probably keventd trying to flush its own queue. So simply run
+		 * it by hand rather than deadlocking.
+		 */
+		run_workqueue(cwq);
+	} else {
+		DEFINE_WAIT(wait);
+		long sequence_needed;
+
+		spin_lock_irq(&cwq->lock);
+		sequence_needed = cwq->insert_sequence;
+
+		while (sequence_needed - cwq->remove_sequence > 0) {
+			prepare_to_wait(&cwq->work_done, &wait,
+					TASK_UNINTERRUPTIBLE);
+			spin_unlock_irq(&cwq->lock);
+			schedule();
+			spin_lock_irq(&cwq->lock);
+		}
+		finish_wait(&cwq->work_done, &wait);
+		spin_unlock_irq(&cwq->lock);
+	}
+}
+
+/*
+ * flush_workqueue - ensure that any scheduled work has run to completion.
+ *
+ * Forces execution of the workqueue and blocks until its completion.
+ * This is typically used in driver shutdown handlers.
+ *
+ * This function will sample each workqueue's current insert_sequence number and
+ * will sleep until the head sequence is greater than or equal to that.  This
+ * means that we sleep until all works which were queued on entry have been
+ * handled, but we are not livelocked by new incoming ones.
+ *
+ * This function used to run the workqueues itself.  Now we just wait for the
+ * helper threads to do it.
+ */
+void fastcall flush_workqueue(struct workqueue_struct *wq)
+{
+	might_sleep();
+
+	if (is_single_threaded(wq)) {
+		/* Always use cpu 0's area. */
+		flush_cpu_workqueue(wq->cpu_wq + 0);
+	} else {
+		int cpu;
+
+		lock_cpu_hotplug();
+		for_each_online_cpu(cpu)
+			flush_cpu_workqueue(wq->cpu_wq + cpu);
+		unlock_cpu_hotplug();
+	}
+}
+
+static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq,
+						   int cpu)
+{
+	struct cpu_workqueue_struct *cwq = wq->cpu_wq + cpu;
+	struct task_struct *p;
+
+	spin_lock_init(&cwq->lock);
+	cwq->wq = wq;
+	cwq->thread = NULL;
+	cwq->insert_sequence = 0;
+	cwq->remove_sequence = 0;
+	INIT_LIST_HEAD(&cwq->worklist);
+	init_waitqueue_head(&cwq->more_work);
+	init_waitqueue_head(&cwq->work_done);
+
+	if (is_single_threaded(wq))
+		p = kthread_create(worker_thread, cwq, "%s", wq->name);
+	else
+		p = kthread_create(worker_thread, cwq, "%s/%d", wq->name, cpu);
+	if (IS_ERR(p))
+		return NULL;
+	cwq->thread = p;
+	return p;
+}
+
+struct workqueue_struct *__create_workqueue(const char *name,
+					    int singlethread)
+{
+	int cpu, destroy = 0;
+	struct workqueue_struct *wq;
+	struct task_struct *p;
+
+	BUG_ON(strlen(name) > 10);
+
+	wq = kmalloc(sizeof(*wq), GFP_KERNEL);
+	if (!wq)
+		return NULL;
+	memset(wq, 0, sizeof(*wq));
+
+	wq->name = name;
+	/* We don't need the distraction of CPUs appearing and vanishing. */
+	lock_cpu_hotplug();
+	if (singlethread) {
+		INIT_LIST_HEAD(&wq->list);
+		p = create_workqueue_thread(wq, 0);
+		if (!p)
+			destroy = 1;
+		else
+			wake_up_process(p);
+	} else {
+		spin_lock(&workqueue_lock);
+		list_add(&wq->list, &workqueues);
+		spin_unlock(&workqueue_lock);
+		for_each_online_cpu(cpu) {
+			p = create_workqueue_thread(wq, cpu);
+			if (p) {
+				kthread_bind(p, cpu);
+				wake_up_process(p);
+			} else
+				destroy = 1;
+		}
+	}
+	unlock_cpu_hotplug();
+
+	/*
+	 * Was there any error during startup? If yes then clean up:
+	 */
+	if (destroy) {
+		destroy_workqueue(wq);
+		wq = NULL;
+	}
+	return wq;
+}
+
+static void cleanup_workqueue_thread(struct workqueue_struct *wq, int cpu)
+{
+	struct cpu_workqueue_struct *cwq;
+	unsigned long flags;
+	struct task_struct *p;
+
+	cwq = wq->cpu_wq + cpu;
+	spin_lock_irqsave(&cwq->lock, flags);
+	p = cwq->thread;
+	cwq->thread = NULL;
+	spin_unlock_irqrestore(&cwq->lock, flags);
+	if (p)
+		kthread_stop(p);
+}
+
+void destroy_workqueue(struct workqueue_struct *wq)
+{
+	int cpu;
+
+	flush_workqueue(wq);
+
+	/* We don't need the distraction of CPUs appearing and vanishing. */
+	lock_cpu_hotplug();
+	if (is_single_threaded(wq))
+		cleanup_workqueue_thread(wq, 0);
+	else {
+		for_each_online_cpu(cpu)
+			cleanup_workqueue_thread(wq, cpu);
+		spin_lock(&workqueue_lock);
+		list_del(&wq->list);
+		spin_unlock(&workqueue_lock);
+	}
+	unlock_cpu_hotplug();
+	kfree(wq);
+}
+
+static struct workqueue_struct *keventd_wq;
+
+int fastcall schedule_work(struct work_struct *work)
+{
+	return queue_work(keventd_wq, work);
+}
+
+int fastcall schedule_delayed_work(struct work_struct *work, unsigned long delay)
+{
+	return queue_delayed_work(keventd_wq, work, delay);
+}
+
+int schedule_delayed_work_on(int cpu,
+			struct work_struct *work, unsigned long delay)
+{
+	int ret = 0;
+	struct timer_list *timer = &work->timer;
+
+	if (!test_and_set_bit(0, &work->pending)) {
+		BUG_ON(timer_pending(timer));
+		BUG_ON(!list_empty(&work->entry));
+		/* This stores keventd_wq for the moment, for the timer_fn */
+		work->wq_data = keventd_wq;
+		timer->expires = jiffies + delay;
+		timer->data = (unsigned long)work;
+		timer->function = delayed_work_timer_fn;
+		add_timer_on(timer, cpu);
+		ret = 1;
+	}
+	return ret;
+}
+
+void flush_scheduled_work(void)
+{
+	flush_workqueue(keventd_wq);
+}
+
+/**
+ * cancel_rearming_delayed_workqueue - reliably kill off a delayed
+ *			work whose handler rearms the delayed work.
+ * @wq:   the controlling workqueue structure
+ * @work: the delayed work struct
+ */
+static void cancel_rearming_delayed_workqueue(struct workqueue_struct *wq,
+					struct work_struct *work)
+{
+	while (!cancel_delayed_work(work))
+		flush_workqueue(wq);
+}
+
+/**
+ * cancel_rearming_delayed_work - reliably kill off a delayed keventd
+ *			work whose handler rearms the delayed work.
+ * @work: the delayed work struct
+ */
+void cancel_rearming_delayed_work(struct work_struct *work)
+{
+	cancel_rearming_delayed_workqueue(keventd_wq, work);
+}
+EXPORT_SYMBOL(cancel_rearming_delayed_work);
+
+int keventd_up(void)
+{
+	return keventd_wq != NULL;
+}
+
+int current_is_keventd(void)
+{
+	struct cpu_workqueue_struct *cwq;
+	int cpu = smp_processor_id();	/* preempt-safe: keventd is per-cpu */
+	int ret = 0;
+
+	BUG_ON(!keventd_wq);
+
+	cwq = keventd_wq->cpu_wq + cpu;
+	if (current == cwq->thread)
+		ret = 1;
+
+	return ret;
+
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+/* Take the work from this (downed) CPU. */
+static void take_over_work(struct workqueue_struct *wq, unsigned int cpu)
+{
+	struct cpu_workqueue_struct *cwq = wq->cpu_wq + cpu;
+	LIST_HEAD(list);
+	struct work_struct *work;
+
+	spin_lock_irq(&cwq->lock);
+	list_splice_init(&cwq->worklist, &list);
+
+	while (!list_empty(&list)) {
+		printk("Taking work for %s\n", wq->name);
+		work = list_entry(list.next,struct work_struct,entry);
+		list_del(&work->entry);
+		__queue_work(wq->cpu_wq + smp_processor_id(), work);
+	}
+	spin_unlock_irq(&cwq->lock);
+}
+
+/* We're holding the cpucontrol mutex here */
+static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
+				  unsigned long action,
+				  void *hcpu)
+{
+	unsigned int hotcpu = (unsigned long)hcpu;
+	struct workqueue_struct *wq;
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+		/* Create a new workqueue thread for it. */
+		list_for_each_entry(wq, &workqueues, list) {
+			if (create_workqueue_thread(wq, hotcpu) < 0) {
+				printk("workqueue for %i failed\n", hotcpu);
+				return NOTIFY_BAD;
+			}
+		}
+		break;
+
+	case CPU_ONLINE:
+		/* Kick off worker threads. */
+		list_for_each_entry(wq, &workqueues, list) {
+			kthread_bind(wq->cpu_wq[hotcpu].thread, hotcpu);
+			wake_up_process(wq->cpu_wq[hotcpu].thread);
+		}
+		break;
+
+	case CPU_UP_CANCELED:
+		list_for_each_entry(wq, &workqueues, list) {
+			/* Unbind so it can run. */
+			kthread_bind(wq->cpu_wq[hotcpu].thread,
+				     smp_processor_id());
+			cleanup_workqueue_thread(wq, hotcpu);
+		}
+		break;
+
+	case CPU_DEAD:
+		list_for_each_entry(wq, &workqueues, list)
+			cleanup_workqueue_thread(wq, hotcpu);
+		list_for_each_entry(wq, &workqueues, list)
+			take_over_work(wq, hotcpu);
+		break;
+	}
+
+	return NOTIFY_OK;
+}
+#endif
+
+void init_workqueues(void)
+{
+	hotcpu_notifier(workqueue_cpu_callback, 0);
+	keventd_wq = create_workqueue("events");
+	BUG_ON(!keventd_wq);
+}
+
+EXPORT_SYMBOL_GPL(__create_workqueue);
+EXPORT_SYMBOL_GPL(queue_work);
+EXPORT_SYMBOL_GPL(queue_delayed_work);
+EXPORT_SYMBOL_GPL(flush_workqueue);
+EXPORT_SYMBOL_GPL(destroy_workqueue);
+
+EXPORT_SYMBOL(schedule_work);
+EXPORT_SYMBOL(schedule_delayed_work);
+EXPORT_SYMBOL(schedule_delayed_work_on);
+EXPORT_SYMBOL(flush_scheduled_work);
author	Linus Torvalds <torvalds@ppc970.osdl.org>	2005-04-16 15:20:36 -0700
committer	Linus Torvalds <torvalds@ppc970.osdl.org>	2005-04-16 15:20:36 -0700
commit	1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch)
tree	0bba044c4ce775e45a88a51686b5d9f90697ea9d /kernel